diff --git a/lib/itmo-assembler.jar b/lib/itmo-assembler.jar index 1465404..0b85be2 100644 Binary files a/lib/itmo-assembler.jar and b/lib/itmo-assembler.jar differ diff --git a/src/algo/AddSequencesShiftingRightTask.java b/src/algo/AddSequencesShiftingRightTask.java index a39060d..7ce0db4 100644 --- a/src/algo/AddSequencesShiftingRightTask.java +++ b/src/algo/AddSequencesShiftingRightTask.java @@ -1,20 +1,13 @@ package algo; -import it.unimi.dsi.fastutil.longs.Long2IntMap; -import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap; import it.unimi.dsi.fastutil.longs.LongOpenHashSet; import ru.ifmo.genetics.dna.Dna; import ru.ifmo.genetics.dna.DnaTools; import ru.ifmo.genetics.dna.kmers.ShortKmer; -import ru.ifmo.genetics.structures.map.ArrayLong2IntHashMap; -import ru.ifmo.genetics.structures.map.BigLong2IntHashMap; -import ru.ifmo.genetics.structures.map.Long2IntHashMap; -import ru.ifmo.genetics.structures.map.MutableLongIntEntry; -import ru.ifmo.genetics.utils.KmerUtils; +import ru.ifmo.genetics.structures.map.*; import structures.Sequence; import java.util.Iterator; -import java.util.List; import java.util.Queue; /** @@ -22,16 +15,16 @@ */ public class AddSequencesShiftingRightTask implements Runnable { - final BigLong2IntHashMap hm; - final Long2IntHashMap openHM; + final BigLong2ShortHashMap hm; + final Long2ShortHashMap openHM; final int k; int freqThreshold; int lenThreshold; final Queue sequences; final LongOpenHashSet used; - public AddSequencesShiftingRightTask(BigLong2IntHashMap hm, - Long2IntHashMap openHM, + public AddSequencesShiftingRightTask(BigLong2ShortHashMap hm, + Long2ShortHashMap openHM, int k, int freqThreshold, int lenThreshold, Queue sequences, LongOpenHashSet used) { this.hm = hm; @@ -45,11 +38,11 @@ public AddSequencesShiftingRightTask(BigLong2IntHashMap hm, @Override public void run() { - Iterator it = openHM.entryIterator(); + Iterator it = openHM.entryIterator(); while (it.hasNext()) { - MutableLongIntEntry entry = it.next(); + MutableLongShortEntry entry = it.next(); long key = entry.getKey(); - int value = entry.getValue(); + short value = entry.getValue(); if (value <= freqThreshold) { continue; } diff --git a/src/algo/ComponentsBuilder.java b/src/algo/ComponentsBuilder.java index 1f05dcc..a9eeecf 100644 --- a/src/algo/ComponentsBuilder.java +++ b/src/algo/ComponentsBuilder.java @@ -4,110 +4,107 @@ import it.unimi.dsi.fastutil.longs.LongArrayFIFOQueue; import org.apache.log4j.Logger; import ru.ifmo.genetics.structures.map.ArrayLong2IntHashMap; +import ru.ifmo.genetics.structures.map.BigLong2ShortHashMap; +import ru.ifmo.genetics.structures.map.MutableLongShortEntry; +import ru.ifmo.genetics.structures.set.BigLongHashSet; import structures.ConnectedComponent; import java.io.FileNotFoundException; import java.io.PrintWriter; import java.util.ArrayList; +import java.util.Iterator; import java.util.List; -/** - * Created by ulyantsev on 06.05.14. - * - */ public class ComponentsBuilder { - public static List splitStrategy(ArrayLong2IntHashMap hm, + public static List splitStrategy(BigLong2ShortHashMap hm, int k, - int b1, - int b2, + int b1, int b2, String statFP, - Logger logger, - int availableProcessors) throws FileNotFoundException { + Logger logger) + throws FileNotFoundException { List ans = new ArrayList(); + BigLongHashSet processedKmers = new BigLongHashSet(hm.capacity()); PrintWriter statPW = new PrintWriter(statFP); - statPW.println("comp.size\tcomp.weight\tfreqThreshold"); + statPW.println("# component.size\tcomponent.weight\tfreqThreshold"); for (int freqThreshold = 0; ; freqThreshold++) { - List components = getComponents(hm, k, freqThreshold, availableProcessors); + List components = getComponents(hm, k, freqThreshold, processedKmers); if (components.size() == 0) { break; } + int added = 0; for (ConnectedComponent comp : components) { if (comp.size() < b1) { banComponent(hm, comp); } else if (comp.size() < b2) { ans.add(comp); + added++; statPW.println(comp.size() + "\t" + comp.getWeight() + "\t" + freqThreshold); banComponent(hm, comp); } } - logger.debug("Freq = " + freqThreshold + ", components count = " + ans.size()); + logger.debug("FreqThreshold = " + freqThreshold + ", " + + "components added = " + added + ", total components = " + ans.size()); } statPW.close(); return ans; } - private static void banComponent(ArrayLong2IntHashMap hm, ConnectedComponent component) { + private static void banComponent(BigLong2ShortHashMap hm, ConnectedComponent component) { for (long kmer : component.kmers) { - hm.add(kmer, -hm.get(kmer)); + hm.put(kmer, (short) 0); } } - private static List getComponents(ArrayLong2IntHashMap hm, - int k, - int freqThreshold, - int availableProcessors) { + private static List getComponents(BigLong2ShortHashMap hm, + int k, int freqThreshold, + BigLongHashSet processedKmers) { List ans = new ArrayList(); - ArrayLong2IntHashMap processedKmers = - new ArrayLong2IntHashMap((int) (Math.log(availableProcessors) / Math.log(2)) + 4); - for (int i = 0; i < hm.hm.length; ++i) { - for (Long2IntMap.Entry entry : hm.hm[i].long2IntEntrySet()) { - long kmer = entry.getLongKey(); - if (processedKmers.get(kmer) > 0) { - continue; - } + processedKmers.reset(); - int value = entry.getIntValue(); - if (value > freqThreshold) { - ConnectedComponent comp = getComponent(hm, k, kmer, freqThreshold, processedKmers); - ans.add(comp); - } + Iterator it = hm.entryIterator(); + while (it.hasNext()) { + MutableLongShortEntry entry = it.next(); + long kmer = entry.getKey(); + short value = entry.getValue(); + if (value > freqThreshold && !processedKmers.contains(kmer)) { + ConnectedComponent comp = getComponent(hm, k, kmer, freqThreshold, processedKmers); + ans.add(comp); } } return ans; } - private static ConnectedComponent getComponent(ArrayLong2IntHashMap hm, - int kValue, - long kmer, + private static ConnectedComponent getComponent(BigLong2ShortHashMap hm, + int k, + long startKmer, int freqThreshold, - ArrayLong2IntHashMap processedKmers) { - if (hm.get(kmer) <= freqThreshold || processedKmers.get(kmer) > 0) { - return null; - } + BigLongHashSet processedKmers) { + ConnectedComponent ans = new ConnectedComponent(); + long weight = 0; - long weight = hm.get(kmer); LongArrayFIFOQueue queue = new LongArrayFIFOQueue(); - queue.enqueue(kmer); - processedKmers.add(kmer, 1); + queue.enqueue(startKmer); + processedKmers.add(startKmer); + ans.add(startKmer); + weight += hm.get(startKmer); while (queue.size() > 0) { - long kmerRepr = queue.dequeue(); - ans.add(kmerRepr); - - for (long neighbour : KmerOperations.possibleNeighbours(kmerRepr, kValue)) { - int value = hm.get(neighbour); - if (value <= freqThreshold || processedKmers.get(neighbour) > 0) { - continue; + long kmer = queue.dequeue(); + + for (long neighbour : KmerOperations.possibleNeighbours(kmer, k)) { + short value = hm.get(neighbour); + if (value > freqThreshold && !processedKmers.contains(neighbour)) { + queue.enqueue(neighbour); + processedKmers.add(neighbour); + ans.add(neighbour); + weight += value; } - weight += value; - processedKmers.add(neighbour, 1); - queue.enqueue(neighbour); } } diff --git a/src/algo/HashMapOperations.java b/src/algo/HashMapOperations.java index 8b9b046..99477c3 100644 --- a/src/algo/HashMapOperations.java +++ b/src/algo/HashMapOperations.java @@ -3,16 +3,14 @@ import it.unimi.dsi.fastutil.longs.Long2IntMap; import org.apache.log4j.Logger; import ru.ifmo.genetics.dna.kmers.ShortKmer; -import ru.ifmo.genetics.structures.map.ArrayLong2IntHashMap; -import ru.ifmo.genetics.structures.map.BigLong2IntHashMap; -import ru.ifmo.genetics.structures.map.MutableLongIntEntry; +import ru.ifmo.genetics.structures.map.*; import java.util.HashMap; import java.util.Iterator; public class HashMapOperations { - public static byte getLeftNucleotide(BigLong2IntHashMap hm, ShortKmer kmer, int freqThreshold) { + public static byte getLeftNucleotide(BigLong2ShortHashMap hm, ShortKmer kmer, int freqThreshold) { byte rightNuc = kmer.nucAt(kmer.length() - 1); byte ansNuc = -1; for (byte nuc = 0; nuc <= 3; nuc++) { @@ -30,7 +28,7 @@ public static byte getLeftNucleotide(BigLong2IntHashMap hm, ShortKmer kmer, int return ansNuc; } - public static byte getRightNucleotide(BigLong2IntHashMap hm, ShortKmer kmer, int freqThreshold) { + public static byte getRightNucleotide(BigLong2ShortHashMap hm, ShortKmer kmer, int freqThreshold) { byte leftNuc = kmer.nucAt(0); byte ansNuc = -1; for (byte nuc = 0; nuc <= 3; nuc++) { @@ -48,25 +46,18 @@ public static byte getRightNucleotide(BigLong2IntHashMap hm, ShortKmer kmer, int return ansNuc; } - public static HashMap getNeighbours(BigLong2IntHashMap hm, - ShortKmer kmer, - int depth) { - - return null; - } - - public static void banBranchingKmers(BigLong2IntHashMap hm, + public static void banBranchingKmers(BigLong2ShortHashMap hm, int freqThreshold, int k, Logger logger) { - int BAN_VALUE = 1000000000; + final short BAN_VALUE = -1; long totalKmers = 0, uniqueKmers = 0, totalBanned = 0, uniqueBanned = 0, totalUnderThreshold = 0, uniqueUnderThreshold = 0; - Iterator it = hm.entryIterator(); + Iterator it = hm.entryIterator(); while (it.hasNext()) { - MutableLongIntEntry entry = it.next(); + MutableLongShortEntry entry = it.next(); long key = entry.getKey(); int value = entry.getValue(); totalKmers += value; @@ -80,24 +71,14 @@ public static void banBranchingKmers(BigLong2IntHashMap hm, ShortKmer kmer = new ShortKmer(key, k); if (getLeftNucleotide(hm, kmer, freqThreshold) == -2 || getRightNucleotide(hm, kmer, freqThreshold) == -2) { - hm.addAndBound(key, BAN_VALUE); + hm.put(key, BAN_VALUE); totalBanned += value; uniqueBanned++; } } logger.info("Total k-mers = " + totalKmers + ", unique k-mers = " + uniqueKmers); - logger.info("Total k-mers [<=] threshold = " + totalUnderThreshold + ", unique = " + uniqueUnderThreshold); + logger.info("Total k-mers under threshold = " + totalUnderThreshold + ", unique = " + uniqueUnderThreshold); logger.info("Total k-mers banned = " + totalBanned + ", unique = " + uniqueBanned); - - it = hm.entryIterator(); - while (it.hasNext()) { - MutableLongIntEntry entry = it.next(); - int value = entry.getValue(); - if (value >= BAN_VALUE) { - long key = entry.getKey(); - hm.addAndBound(key, -(value + 1)); - } - } } } diff --git a/src/algo/SequencesFinders.java b/src/algo/SequencesFinders.java index 6276957..e8441b2 100644 --- a/src/algo/SequencesFinders.java +++ b/src/algo/SequencesFinders.java @@ -2,8 +2,7 @@ import it.unimi.dsi.fastutil.longs.LongOpenHashSet; import ru.ifmo.genetics.executors.BlockingThreadPoolExecutor; -import ru.ifmo.genetics.structures.map.ArrayLong2IntHashMap; -import ru.ifmo.genetics.structures.map.BigLong2IntHashMap; +import ru.ifmo.genetics.structures.map.BigLong2ShortHashMap; import structures.Sequence; import java.util.*; @@ -11,7 +10,7 @@ public class SequencesFinders { - public static Deque thresholdStrategy(BigLong2IntHashMap hm, + public static Deque thresholdStrategy(BigLong2ShortHashMap hm, int availableProcessors, int freqThreshold, int lenThreshold, diff --git a/src/io/BytesDispatcher.java b/src/io/BytesDispatcher.java new file mode 100644 index 0000000..c6b89b5 --- /dev/null +++ b/src/io/BytesDispatcher.java @@ -0,0 +1,69 @@ +package io; + +import org.apache.log4j.Logger; +import ru.ifmo.genetics.dna.Dna; +import ru.ifmo.genetics.io.sources.Source; +import ru.ifmo.genetics.structures.map.BigLong2ShortHashMap; +import ru.ifmo.genetics.utils.Misc; +import ru.ifmo.genetics.utils.NumUtils; +import ru.ifmo.genetics.utils.iterators.ProgressableIterator; +import ru.ifmo.genetics.utils.tool.Tool; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.List; + +public class BytesDispatcher { + final Logger logger = Logger.getLogger("bytes-dispatcher"); + + final InputStream is; + public final int workRangeSize; + long bytesRead = 0; + + final BigLong2ShortHashMap hm; // for debug output + + public BytesDispatcher(InputStream is, int workRangeSize, BigLong2ShortHashMap hmForMonitoring) { + this.is = is; + this.workRangeSize = workRangeSize; + Tool.debug(logger, "Using " + workRangeSize + " bytes as workRangeSize"); + hm = hmForMonitoring; + } + + + + public byte[] getNewEmptyWorkRange() { + return new byte[workRangeSize]; + } + + public synchronized int readWorkRange(byte[] range) { + try { + int read = 0; + while (read < range.length) { + int r = is.read(range, read, range.length - read); + if (r == -1) { + break; + } + read += r; + } + + bytesRead += read; + /* + if ((bytesRead & ((1 << 29) - 1)) == 0) { // 512 Mb + logger.debug("Processed " + (bytesRead >> 20) + " Mb of data:"); + if (hm != null) { + logger.debug("Total hm size = " + NumUtils.groupDigits(hm.size()) + ", " + + "size in hm.maps = {" + NumUtils.groupDigits(hm.maps[0].size()) + ", " + + NumUtils.groupDigits(hm.maps[1].size()) + ", " + + NumUtils.groupDigits(hm.maps[2].size()) + ", " + + NumUtils.groupDigits(hm.maps[3].size()) + ", ...}"); + } + logger.debug("Available memory (without running GC) = " + Misc.availableMemoryWithoutRunningGCAsString()); + } + */ + return read; + } catch (IOException e) { + throw new RuntimeException(e); + } + } +} diff --git a/src/io/BytesWorker.java b/src/io/BytesWorker.java new file mode 100644 index 0000000..8f88948 --- /dev/null +++ b/src/io/BytesWorker.java @@ -0,0 +1,43 @@ +package io; + +import java.util.concurrent.CountDownLatch; + +public abstract class BytesWorker implements Runnable { + + private BytesDispatcher dispatcher = null; + private CountDownLatch latch = null; + + boolean interrupted = false; + + + void setDispatcher(BytesDispatcher dispatcher) { + this.dispatcher = dispatcher; + } + void setLatch(CountDownLatch latch) { + this.latch = latch; + } + + + + public abstract void process(byte[] range, int len); + + @Override + public void run() { + if (dispatcher == null || latch == null) { + throw new RuntimeException("Not full initialization!"); + } + while (!interrupted) { + byte[] range = dispatcher.getNewEmptyWorkRange(); + int r = dispatcher.readWorkRange(range); + if (r <= 0) { + break; + } + process(range, r); + } + latch.countDown(); + } + + public void interrupt() { + interrupted = true; + } +} diff --git a/src/io/IOUtils.java b/src/io/IOUtils.java index 80c4993..92ce722 100644 --- a/src/io/IOUtils.java +++ b/src/io/IOUtils.java @@ -1,92 +1,52 @@ package io; -import it.unimi.dsi.fastutil.longs.Long2IntMap; +import org.apache.log4j.Logger; import ru.ifmo.genetics.dna.Dna; -import ru.ifmo.genetics.dna.DnaQ; -import ru.ifmo.genetics.dna.DnaTools; -import ru.ifmo.genetics.dna.kmers.Kmer; -import ru.ifmo.genetics.dna.kmers.KmerIteratorFactory; import ru.ifmo.genetics.dna.kmers.ShortKmer; -import ru.ifmo.genetics.dna.kmers.ShortKmerIteratorFactory; -import ru.ifmo.genetics.executors.BlockingThreadPoolExecutor; import ru.ifmo.genetics.io.ReadersUtils; import ru.ifmo.genetics.io.sources.NamedSource; import ru.ifmo.genetics.io.sources.Source; import ru.ifmo.genetics.statistics.QuickQuantitativeStatistics; import ru.ifmo.genetics.structures.map.ArrayLong2IntHashMap; +import ru.ifmo.genetics.structures.map.BigLong2LongHashMap; +import ru.ifmo.genetics.structures.map.BigLong2ShortHashMap; +import ru.ifmo.genetics.structures.map.MutableLongShortEntry; +import ru.ifmo.genetics.utils.NumUtils; +import ru.ifmo.genetics.utils.tool.ExecutionFailedException; +import ru.ifmo.genetics.utils.tool.Tool; import java.io.*; import java.util.Iterator; -import java.util.Random; +import java.util.List; import java.util.concurrent.CountDownLatch; -import org.apache.log4j.Logger; -import ru.ifmo.genetics.structures.map.BigLong2IntHashMap; -import ru.ifmo.genetics.structures.map.MutableLongIntEntry; -import ru.ifmo.genetics.tools.ec.DnaQReadDispatcher; -import ru.ifmo.genetics.utils.tool.ExecutionFailedException; -import ru.ifmo.genetics.utils.NumUtils; -import ru.ifmo.genetics.utils.tool.Tool; - public class IOUtils { - static final int BYTES_PER_KMER = 12; - - public static void addFASTASequences(File[] files, - ArrayLong2IntHashMap hm, - int k, - int minSeqLen, - Logger logger) throws IOException { - int totalSeq = 0; - long totalLen = 0; - for (File file : files) { - Source source = ReadersUtils.readDnaLazy(file); - Iterator in = source.iterator(); - - int cnt = 0; - long len = 0; - while (in.hasNext()) { - Dna dna = in.next(); - if (dna.length() >= minSeqLen) { - addSequence(hm, dna, k); - cnt++; - len += dna.length(); - } - } - logger.debug(cnt + " sequences added, summary len = " + len + " from " + file.getName()); - totalSeq += cnt; - totalLen += len; - } - logger.debug("Total sequences count = " + totalSeq); - logger.debug("Total sequences length = " + totalLen); - logger.debug("k-mers HM size = " + hm.size()); - } + static final int READS_WORK_RANGE_SIZE = 1 << 15; // 32 K reads + static final int KMERS_WORK_RANGE_SIZE = 16777220; // ~16 Mb of data + + - private static void addSequence(ArrayLong2IntHashMap hm, Dna dna, int k) { - Iterator it = ShortKmer.kmersOf(dna, k).iterator(); - while (it.hasNext()) { - hm.add(it.next().toLong(), 1); - } - } - public static long printKmers(BigLong2IntHashMap hm, int threshold, + public static long printKmers(BigLong2ShortHashMap hm, int threshold, File outFile, File stFile) throws IOException { - DataOutputStream stream = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(outFile))); + DataOutputStream stream = new DataOutputStream(new BufferedOutputStream( + new FileOutputStream(outFile), 1 << 24)); // 16 Mb buffer - QuickQuantitativeStatistics stats = new QuickQuantitativeStatistics(); + QuickQuantitativeStatistics stats = new QuickQuantitativeStatistics(); long good = 0; - Iterator it = hm.entryIterator(); + Iterator it = hm.entryIterator(); while (it.hasNext()) { - MutableLongIntEntry entry = it.next(); + MutableLongShortEntry entry = it.next(); long key = entry.getKey(); - int value = entry.getValue(); + short value = entry.getValue(); stats.add(value); if (value > threshold) { stream.writeLong(key); - stream.writeInt(value); + stream.writeShort(value); good++; } } @@ -96,181 +56,249 @@ public static long printKmers(BigLong2IntHashMap hm, int threshold, return good; } - public static BigLong2IntHashMap loadKmers(File[] files, - int freqThreshold, - int availableProcessors, - Logger logger) throws IOException { - BigLong2IntHashMap hm = new BigLong2IntHashMap( - (int) (Math.log(availableProcessors) / Math.log(2)) + 4, 8); - addKmers(files, hm, freqThreshold, logger); + + + // ---------------------------- for loading kmers ---------------------------------- + + static class Kmers2HMWorker extends KmersLoadWorker { + Kmers2HMWorker(BigLong2ShortHashMap hm, int freqThreshold) { + this.hm = hm; + this.freqThreshold = freqThreshold; + } + + final BigLong2ShortHashMap hm; + final int freqThreshold; + long kmers = 0, kmersAdded = 0; + long freqSum = 0, freqSumAdded = 0; + + @Override + public void processKmer(long kmer, short freq) { + kmers++; + freqSum += freq; + if (freq > freqThreshold) { + hm.addAndBound(kmer, freq); + kmersAdded++; + freqSumAdded += freq; + } + } + } + + public static BigLong2ShortHashMap loadKmers(File[] files, int freqThreshold, int availableProcessors, Logger logger) + throws ExecutionFailedException { + + BigLong2ShortHashMap hm = new BigLong2ShortHashMap( + (int) (Math.log(availableProcessors) / Math.log(2)) + 4, 12); + + Kmers2HMWorker[] workers = new Kmers2HMWorker[availableProcessors]; + for (int i = 0; i < workers.length; ++i) { + workers[i] = new Kmers2HMWorker(hm, freqThreshold); + } + + run(files, workers, hm, logger); + + // calculating statistics... + long kmers = 0, kmersAdded = 0; + long freqSum = 0, freqSumAdded = 0; + for (Kmers2HMWorker worker : workers) { + kmers += worker.kmers; + kmersAdded += worker.kmersAdded; + freqSum += worker.freqSum; + freqSumAdded += worker.freqSumAdded; + } + + Tool.debug(logger, + "Added/All kmers count = " + NumUtils.groupDigits(kmersAdded) + "/" + NumUtils.groupDigits(kmers) + + " (" + String.format("%.1f", kmersAdded * 100.0 / kmers) + "%)"); + Tool.debug(logger, + "Added/All kmers frequency sum = " + NumUtils.groupDigits(freqSumAdded) + "/" + NumUtils.groupDigits(freqSum) + + " (" + String.format("%.1f", freqSumAdded * 100.0 / freqSum) + "%)"); + logger.debug("k-mers HM size = " + NumUtils.groupDigits(hm.size())); + return hm; } - public static void addKmers(File[] files, - BigLong2IntHashMap hm, - int freqThreshold, - Logger logger) throws IOException { - for (File file : files) { - Tool.info(logger, "Loading k-mer from " + file + "..."); - - FileInputStream fis = new FileInputStream(file); - DataInputStream is = new DataInputStream(new BufferedInputStream(fis)); - - long uniqueKmers = 0, uniqueKmersAdded = 0, totalKmers = 0, totalKmersAdded = 0; - long c = fis.getChannel().size() / 12; - uniqueKmers = c; - for (; c > 0; c--) { - long kmerRepr = is.readLong(); - int freq = is.readInt(); - -// uniqueKmers++; - totalKmers += freq; - if (freq > freqThreshold) { - uniqueKmersAdded++; - totalKmersAdded += freq; - hm.addAndBound(kmerRepr, freq); - } - } - if (is.available() > 2) { - throw new RuntimeException("Size mismatch. Possibly wrong file format/file is corrupted."); + static class KmersPresenceWorker extends KmersLoadWorker { + KmersPresenceWorker(BigLong2LongHashMap hm) { + this.hm = hm; + } + final BigLong2LongHashMap hm; + @Override + public void processKmer(long kmer, short freq) { + if (hm.contains(kmer)) { + hm.addAndBound(kmer, freq); } - is.close(); + } + } - Tool.debug(logger, file + " : " + NumUtils.groupDigits(uniqueKmersAdded) + " / " - + NumUtils.groupDigits(uniqueKmers) + " unique k-mers added/all"); - Tool.debug(logger, file + " : " + NumUtils.groupDigits(totalKmersAdded) + " / " - + NumUtils.groupDigits(totalKmers) + " total k-mers added/all"); - Tool.info(logger, NumUtils.groupDigits(uniqueKmersAdded) + " k-mers loaded from " + file); + public static void calculatePresenceForKmers(File[] files, BigLong2LongHashMap hm, int availableProcessors, Logger logger) + throws ExecutionFailedException { + BytesWorker[] workers = new BytesWorker[availableProcessors]; + for (int i = 0; i < workers.length; ++i) { + workers[i] = new KmersPresenceWorker(hm); } + run(files, workers, null, logger); } - public static BigLong2IntHashMap loadKmers(File[] files, - int freqThreshold, - int availableProcessors) throws InterruptedException { - BigLong2IntHashMap hm = new BigLong2IntHashMap( - (int) (Math.log(availableProcessors) / Math.log(2)) + 4, 8); - addKmers(files, hm, freqThreshold, availableProcessors); - return hm; - } + public static void run(File[] files, BytesWorker[] workers, BigLong2ShortHashMap hmForMonitoring, Logger logger) + throws ExecutionFailedException { + try { + for (File file : files) { + Tool.info(logger, "Loading file " + file + "..."); - public static void addKmers(File[] files, - BigLong2IntHashMap hm, - int freqThreshold, - int availableProcessors) throws InterruptedException { + InputStream is = new FileInputStream(file); + BytesDispatcher dispatcher = new BytesDispatcher(is, KMERS_WORK_RANGE_SIZE, hmForMonitoring); + CountDownLatch latch = new CountDownLatch(workers.length); - BlockingThreadPoolExecutor executor = new BlockingThreadPoolExecutor(availableProcessors); + for (int i = 0; i < workers.length; ++i) { + workers[i].setDispatcher(dispatcher); + workers[i].setLatch(latch); + new Thread(workers[i]).start(); + } - for (File file : files) { - long bytesInFile = file.length(); - long kmersCount = bytesInFile / BYTES_PER_KMER; - long kmersPerThread = kmersCount / availableProcessors + 1; - - long kmersSum = 0; - for (int i = 0; i < availableProcessors; i++) { - long kmersToAdd = Math.min(kmersPerThread, kmersCount - kmersSum); - executor.blockingExecute(new KmersToHMAdditionTask(hm, file, kmersSum, kmersToAdd, freqThreshold)); - kmersSum += kmersToAdd; + try { + latch.await(); + } catch (InterruptedException e) { + Tool.warn(logger, "Main thread interrupted"); + for (BytesWorker worker : workers) { + worker.interrupt(); + } + throw new ExecutionFailedException("Thread was interrupted", e); + } + Tool.debug(logger, NumUtils.sizeInBytesAsString(dispatcher.bytesRead) + " of data processed from " + file); } + } catch (IOException e) { + throw new ExecutionFailedException("Can't load k-mers file", e); } + } + + + // ---------------------------- for loading reads ---------------------------------- - executor.shutdownAndAwaitTermination(); + static class ReadsLoadWorker extends ReadsWorker { + ReadsLoadWorker(BigLong2ShortHashMap hm, int k, int minDnaLen) { + this.hm = hm; + this.k = k; + this.minDnaLen = minDnaLen; + } + + final BigLong2ShortHashMap hm; + final int k; + final int minDnaLen; + int totalSeq = 0, goodSeq = 0; + long totalLen = 0, goodLen = 0; + + @Override + public void process(List reads) { + for (Dna dna : reads) { + totalSeq++; + totalLen += dna.length(); + + if (dna.length() >= minDnaLen) { + for (ShortKmer kmer : ShortKmer.kmersOf(dna, k)) { + hm.addAndBound(kmer.toLong(), (short) 1); + } + goodSeq++; + goodLen += dna.length(); + } + } + } } - public static BigLong2IntHashMap loadBINQReads(File[] files, - int k, - int loadTaskSize, - KmerIteratorFactory factory, - int availableProcessors, - Logger logger) throws IOException { - BigLong2IntHashMap hm = new BigLong2IntHashMap( - (int) (Math.log(availableProcessors) / Math.log(2)) + 4, 8); - addBINQReads(files, hm, k, loadTaskSize, factory, availableProcessors, logger); + public static BigLong2ShortHashMap loadReads(File[] files, int k, int minSeqLen, + int availableProcessors, Logger logger) + throws ExecutionFailedException { + BigLong2ShortHashMap hm = new BigLong2ShortHashMap( + (int) (Math.log(availableProcessors) / Math.log(2)) + 4, 12); + + ReadsLoadWorker[] workers = new ReadsLoadWorker[availableProcessors]; + for (int i = 0; i < workers.length; ++i) { + workers[i] = new ReadsLoadWorker(hm, k, minSeqLen); + } + + run(files, workers, hm, logger); + + // calculating statistics... + int totalSeq = 0, goodSeq = 0; + long totalLen = 0, goodLen = 0; + for (ReadsLoadWorker worker : workers) { + totalSeq += worker.totalSeq; + goodSeq += worker.goodSeq; + totalLen += worker.totalLen; + goodLen += worker.goodLen; + } + Tool.debug(logger, + "Good/Total sequences count = " + NumUtils.groupDigits(goodSeq) + "/" + NumUtils.groupDigits(totalSeq) + + " (" + String.format("%.1f", goodSeq * 100.0 / totalSeq) + "%)"); + Tool.debug(logger, + "Good/Total sequences length = " + NumUtils.groupDigits(goodLen) + "/" + NumUtils.groupDigits(totalLen) + + " (" + String.format("%.1f", goodLen * 100.0 / totalLen) + "%)"); + logger.debug("k-mers HM size = " + NumUtils.groupDigits(hm.size())); + return hm; } - public static void addBINQReads(File[] files, - BigLong2IntHashMap hm, - int k, - int loadTaskSize, - KmerIteratorFactory factory, - int availableProcessors, - Logger logger) throws IOException { - Source source = ReadersUtils.readDnaQLazy(files); - Iterator it = source.iterator(); + static class ReadsPresenceWorker extends ReadsWorker { + ReadsPresenceWorker(BigLong2LongHashMap hm, int k) { + this.hm = hm; + this.k = k; + } - DnaQReadDispatcher dispatcher = new DnaQReadDispatcher(source, loadTaskSize, null); - KmerLoadWorker[] workers = new KmerLoadWorker[availableProcessors]; - CountDownLatch latch = new CountDownLatch(workers.length); + final BigLong2LongHashMap hm; + final int k; - for (int i = 0; i < workers.length; ++i) { - workers[i] = new KmerLoadWorker(dispatcher, latch, new Random(42), - k, hm, factory); - new Thread(workers[i]).start(); + @Override + public void process(List reads) { + for (Dna dna : reads) { + for (ShortKmer kmer : ShortKmer.kmersOf(dna, k)) { + if (hm.contains(kmer.toLong())) { + hm.addAndBound(kmer.toLong(), 1); + } + } + } } + } - try { - latch.await(); - } catch (InterruptedException e) { - logger.warn("Main thread interrupted"); - for (KmerLoadWorker worker : workers) { - worker.interrupt(); - } + public static void calculatePresenceForReads(File[] files, int k, BigLong2LongHashMap hm, int availableProcessors, Logger logger) + throws ExecutionFailedException { + ReadsWorker[] workers = new ReadsWorker[availableProcessors]; + for (int i = 0; i < workers.length; ++i) { + workers[i] = new ReadsPresenceWorker(hm, k); } - logger.info(NumUtils.groupDigits(dispatcher.getReads()) + " reads added"); -// logger.info("k-mers loaded"); + run(files, workers, null, logger); } - public static BigLong2IntHashMap loadReads(File[] files, - int k, - int loadTaskSize, - KmerIteratorFactory factory, - int availableProcessors, - Logger logger) throws IOException, ExecutionFailedException { - BigLong2IntHashMap hm = new BigLong2IntHashMap( - (int) (Math.log(availableProcessors) / Math.log(2)) + 4, 8); - addReads(files, hm, k, loadTaskSize, factory, availableProcessors, logger); - return hm; - } - public static void addReads(File[] files, - BigLong2IntHashMap hm, - int k, - int loadTaskSize, - KmerIteratorFactory factory, - int availableProcessors, - Logger logger) throws ExecutionFailedException { + public static void run(File[] files, ReadsWorker[] workers, BigLong2ShortHashMap hmForMonitoring, Logger logger) + throws ExecutionFailedException { for (File file : files) { - logger.info("Processing file " + file + "..."); + Tool.info(logger, "Loading file " + file + "..."); NamedSource reader = ReadersUtils.readDnaLazy(file); - UniversalReadDispatcher dispatcher = new UniversalReadDispatcher(reader, loadTaskSize, hm); - UniversalLoadWorker[] workers = new UniversalLoadWorker[availableProcessors]; + ReadsDispatcher dispatcher = new ReadsDispatcher(reader, READS_WORK_RANGE_SIZE, hmForMonitoring); CountDownLatch latch = new CountDownLatch(workers.length); - logger.debug("Starting workers..."); for (int i = 0; i < workers.length; ++i) { - workers[i] = new UniversalLoadWorker(dispatcher, latch, k, hm, factory); + workers[i].setDispatcher(dispatcher); + workers[i].setLatch(latch); new Thread(workers[i]).start(); } - try { - logger.debug("Waiting workers..."); latch.await(); } catch (InterruptedException e) { - logger.warn("Main thread interrupted"); - for (UniversalLoadWorker worker : workers) { + Tool.warn(logger, "Main thread interrupted"); + for (ReadsWorker worker : workers) { worker.interrupt(); } + throw new ExecutionFailedException("Thread was interrupted", e); } - logger.info(NumUtils.groupDigits(dispatcher.reads) + " reads added from " + file); + Tool.info(logger, NumUtils.groupDigits(dispatcher.reads) + " reads added from " + file); } -// logger.info("k-mers loaded"); } } diff --git a/src/io/KmerLoadWorker.java b/src/io/KmerLoadWorker.java deleted file mode 100644 index e6a0002..0000000 --- a/src/io/KmerLoadWorker.java +++ /dev/null @@ -1,82 +0,0 @@ -package io; - -import org.apache.log4j.Logger; -import ru.ifmo.genetics.dna.DnaQ; -import ru.ifmo.genetics.dna.kmers.*; -import ru.ifmo.genetics.dna.kmers.KmerIteratorFactory; -import ru.ifmo.genetics.structures.map.ArrayLong2IntHashMap; -import ru.ifmo.genetics.structures.map.BigLong2IntHashMap; -import ru.ifmo.genetics.tools.ec.DnaQReadDispatcher; - -import java.util.List; -import java.util.Random; -import java.util.concurrent.CountDownLatch; - -public class KmerLoadWorker implements Runnable { - - private DnaQReadDispatcher dispatcher; - CountDownLatch latch; - int LEN; - long step; - BigLong2IntHashMap hm; - KmerIteratorFactory factory; - - boolean interrupted = false; - - Random random; - - Logger logger; - - public KmerLoadWorker(DnaQReadDispatcher dispatcher, CountDownLatch latch, Random random, - int LEN, BigLong2IntHashMap hm, - KmerIteratorFactory factory) { - - this.dispatcher = dispatcher; - this.latch = latch; - this.LEN = LEN; - this.hm = hm; - this.random = random; - - - this.factory = factory; - } - - public void add(Kmer kmer) { - long key = kmer.toLong(); - hm.addAndBound(key, 1); - } - - void add(DnaQ dnaq) { - ShortKmer kmer = new ShortKmer(0, LEN); - for (int pos = 0; pos < dnaq.length(); pos++) { - kmer.shiftRight(dnaq.nucAt(pos)); - if (pos >= LEN - 1) { - add(kmer); - } - } - } - - void add(Iterable dnaqs) { - for (DnaQ dnaq : dnaqs) { - add(dnaq); - } - } - - public void interrupt() { - interrupted = true; - } - - public void run() { - logger = Logger.getLogger("worker-" + Thread.currentThread().getId()); - while (!interrupted) { - List list = dispatcher.getWorkRange(); - if (list == null) { - break; - } - - add(list); - } - latch.countDown(); - } - -} diff --git a/src/io/KmersLoadWorker.java b/src/io/KmersLoadWorker.java new file mode 100644 index 0000000..a9a0d53 --- /dev/null +++ b/src/io/KmersLoadWorker.java @@ -0,0 +1,35 @@ +package io; + +import java.io.ByteArrayInputStream; +import java.io.DataInputStream; +import java.io.IOException; + +public abstract class KmersLoadWorker extends BytesWorker { + + final static int KMER_RECORD_SIZE = 10; + + + + public abstract void processKmer(long kmer, short freq); + + @Override + public void process(byte[] range, int len) { + if (len % KMER_RECORD_SIZE != 0) { + throw new RuntimeException("BAD division by work range"); + } + try { + DataInputStream is = new DataInputStream(new ByteArrayInputStream(range, 0, len)); + int c = len / KMER_RECORD_SIZE; + for (int i = 0; i < c; i++) { + long kmer = is.readLong(); + short freq = is.readShort(); + processKmer(kmer, freq); + } + if (is.available() > 0) { + throw new RuntimeException("Bad KMER_RECORD_SIZE!"); + } + } catch (IOException e) { + throw new RuntimeException("Can't load kmers from file", e); + } + } +} diff --git a/src/io/KmersToHMAdditionTask.java b/src/io/KmersToHMAdditionTask.java deleted file mode 100644 index 2f65148..0000000 --- a/src/io/KmersToHMAdditionTask.java +++ /dev/null @@ -1,54 +0,0 @@ -package io; - -import ru.ifmo.genetics.structures.map.ArrayLong2IntHashMap; -import ru.ifmo.genetics.structures.map.BigLong2IntHashMap; - -import java.io.*; - -/** - * Created by ulyantsev on 05.05.14. - * - */ -public class KmersToHMAdditionTask implements Runnable { - - final int BYTES_PER_KMER = 12; - - BigLong2IntHashMap hm; - File kmersFile; - long kmersToSkip; - long kmersToRead; - int maxBadFrequency; - - public KmersToHMAdditionTask(BigLong2IntHashMap hm, - File kmersFile, - long kmersToSkip, - long kmersToRead, - int maxBadFrequency) { - this.hm = hm; - this.kmersFile = kmersFile; - this.kmersToSkip = kmersToSkip; - this.kmersToRead = kmersToRead; - this.maxBadFrequency = maxBadFrequency; - } - - @Override - public void run() { - try { - DataInputStream inputStream = new DataInputStream(new BufferedInputStream(new FileInputStream(kmersFile))); - long reallyReaden = inputStream.skip(BYTES_PER_KMER * kmersToSkip); - assert reallyReaden == BYTES_PER_KMER * kmersToSkip; - - for (long i = 0; i < kmersToRead; i++) { - long kmerRepr = inputStream.readLong(); - int frequency = inputStream.readInt(); - if (frequency > maxBadFrequency) { - hm.addAndBound(kmerRepr, frequency); - } - } - - inputStream.close(); - } catch (IOException e) { - e.printStackTrace(); - } - } -} diff --git a/src/io/ReadsDispatcher.java b/src/io/ReadsDispatcher.java new file mode 100644 index 0000000..76ca6ed --- /dev/null +++ b/src/io/ReadsDispatcher.java @@ -0,0 +1,54 @@ +package io; + +import org.apache.log4j.Logger; +import ru.ifmo.genetics.dna.Dna; +import ru.ifmo.genetics.dna.DnaQ; +import ru.ifmo.genetics.io.sources.Source; +import ru.ifmo.genetics.structures.map.BigLong2ShortHashMap; +import ru.ifmo.genetics.utils.Misc; +import ru.ifmo.genetics.utils.NumUtils; +import ru.ifmo.genetics.utils.iterators.ProgressableIterator; +import ru.ifmo.genetics.utils.tool.Tool; + +import java.io.*; +import java.util.ArrayList; +import java.util.List; + +public class ReadsDispatcher { + final Logger logger = Logger.getLogger("reads-dispatcher"); + + final ProgressableIterator iterator; + public final int workRangeSize; + long reads = 0; + + final BigLong2ShortHashMap hm; // for debug output + + public ReadsDispatcher(Source reader, int workRangeSize, BigLong2ShortHashMap hmForMonitoring) { + this.iterator = reader.iterator(); + this.workRangeSize = workRangeSize; + Tool.debug(logger, "Using " + workRangeSize + " reads as workRangeSize"); + this.hm = hmForMonitoring; + } + + + public synchronized List getWorkRange() { + List list = new ArrayList(workRangeSize); + while ((list.size() < workRangeSize) && iterator.hasNext()) { + list.add(iterator.next()); + ++reads; + + if (reads % 2500000 == 0) { + logger.debug("Processed " + NumUtils.groupDigits(reads) + " reads:"); + if (hm != null) { + logger.debug("Total hm size = " + NumUtils.groupDigits(hm.size()) + ", " + + "size in hm.maps = {" + NumUtils.groupDigits(hm.maps[0].size()) + ", " + + NumUtils.groupDigits(hm.maps[1].size()) + ", " + + NumUtils.groupDigits(hm.maps[2].size()) + ", " + + NumUtils.groupDigits(hm.maps[3].size()) + ", ...}"); + } + logger.debug("Available memory (without running GC) = " + Misc.availableMemoryWithoutRunningGCAsString()); + } + } + return list.isEmpty() ? null : list; + } +} diff --git a/src/io/ReadsWorker.java b/src/io/ReadsWorker.java new file mode 100644 index 0000000..76fc665 --- /dev/null +++ b/src/io/ReadsWorker.java @@ -0,0 +1,46 @@ +package io; + +import ru.ifmo.genetics.dna.Dna; + +import java.util.List; +import java.util.concurrent.CountDownLatch; + +public abstract class ReadsWorker implements Runnable { + + private ReadsDispatcher dispatcher = null; + private CountDownLatch latch = null; + + boolean interrupted = false; + + + void setDispatcher(ReadsDispatcher dispatcher) { + this.dispatcher = dispatcher; + } + void setLatch(CountDownLatch latch) { + this.latch = latch; + } + + + + public abstract void process(List reads); + + + @Override + public void run() { + if (dispatcher == null || latch == null) { + throw new RuntimeException("Not full initialization!"); + } + while (!interrupted) { + List list = dispatcher.getWorkRange(); + if (list == null) { + break; + } + process(list); + } + latch.countDown(); + } + + public void interrupt() { + interrupted = true; + } +} diff --git a/src/io/UniversalLoadWorker.java b/src/io/UniversalLoadWorker.java deleted file mode 100644 index 433edde..0000000 --- a/src/io/UniversalLoadWorker.java +++ /dev/null @@ -1,68 +0,0 @@ -package io; - -import org.apache.log4j.Logger; -import ru.ifmo.genetics.dna.Dna; -import ru.ifmo.genetics.dna.DnaQ; -import ru.ifmo.genetics.dna.DnaTools; -import ru.ifmo.genetics.dna.kmers.Kmer; -import ru.ifmo.genetics.dna.kmers.KmerIteratorFactory; -import ru.ifmo.genetics.dna.kmers.ShortKmer; -import ru.ifmo.genetics.structures.map.ArrayLong2IntHashMap; -import ru.ifmo.genetics.structures.map.BigLong2IntHashMap; - -import java.util.List; -import java.util.Random; -import java.util.concurrent.CountDownLatch; - -public class UniversalLoadWorker implements Runnable { - - private UniversalReadDispatcher dispatcher; - CountDownLatch latch; - int LEN; - long step; - BigLong2IntHashMap hm; - KmerIteratorFactory factory; - - boolean interrupted = false; - - Logger logger; - - public UniversalLoadWorker(UniversalReadDispatcher dispatcher, CountDownLatch latch, - int LEN, BigLong2IntHashMap hm, - KmerIteratorFactory factory) { - - this.dispatcher = dispatcher; - this.latch = latch; - this.LEN = LEN; - this.hm = hm; - - this.factory = factory; - } - - - - void add(Iterable dnas) { - for (Dna dna : dnas) { - for (ShortKmer kmer : ShortKmer.kmersOf(dna, LEN)) { - hm.addAndBound(kmer.toLong(), 1); - } - } - } - - public void interrupt() { - interrupted = true; - } - - public void run() { - logger = Logger.getLogger("worker-" + Thread.currentThread().getId()); - while (!interrupted) { - List list = dispatcher.getWorkRange(); - if (list == null) { - break; - } - - add(list); - } - latch.countDown(); - } -} diff --git a/src/io/UniversalReadDispatcher.java b/src/io/UniversalReadDispatcher.java deleted file mode 100644 index 1721ca7..0000000 --- a/src/io/UniversalReadDispatcher.java +++ /dev/null @@ -1,122 +0,0 @@ -package io; - -import org.apache.log4j.Logger; -import ru.ifmo.genetics.dna.Dna; -import ru.ifmo.genetics.dna.DnaQ; -import ru.ifmo.genetics.io.sources.NamedSource; -import ru.ifmo.genetics.io.sources.Source; -import ru.ifmo.genetics.structures.map.ArrayLong2IntHashMap; -import ru.ifmo.genetics.structures.map.BigLong2IntHashMap; -import ru.ifmo.genetics.utils.Misc; -import ru.ifmo.genetics.utils.NumUtils; -import ru.ifmo.genetics.utils.TextUtils; -import ru.ifmo.genetics.utils.iterators.ProgressableIterator; - -import java.io.IOException; -import java.io.OutputStream; -import java.util.ArrayList; -import java.util.List; -import java.util.NoSuchElementException; - -public class UniversalReadDispatcher { - protected final Logger logger = Logger.getLogger("read-dispatcher"); - - ProgressableIterator iterator; - int workRangeSize; - final OutputStream out; - - long reads = 0; - long totalReadsProcessed; - - long[] ar; - int r = 0; - int w = 0; - final BigLong2IntHashMap hm; // tmp - - public UniversalReadDispatcher(Source reader, int workRangeSize, BigLong2IntHashMap hm) { - this.iterator = reader.iterator(); - this.workRangeSize = workRangeSize; - out = null; - logger.debug("Using " + workRangeSize + " reads as workRangeSize"); - this.hm = hm; - } - - public UniversalReadDispatcher(Source reader, - OutputStream out, - int workRangeSize, - long totalReadsProcessed, - int workersNumber) { - this.iterator = reader.iterator(); - - this.workRangeSize = workRangeSize; - this.out = out; - this.totalReadsProcessed = totalReadsProcessed; - - ar = new long[workersNumber]; - hm = null; - } - - private List readRange(int workRangeSize) throws IOException { - List list = new ArrayList(workRangeSize); - while ((list.size() < workRangeSize) && iterator.hasNext()) { - list.add(iterator.next()); - ++reads; - - if (reads % 1000000 == 0) { - logger.debug("Processed " + NumUtils.groupDigits(reads) + " reads:"); - logger.debug("Total hm size = " + NumUtils.groupDigits(hm.size()) + ", " + - "size in hm.hm = {" + NumUtils.groupDigits(hm.maps[0].size()) + ", " - + NumUtils.groupDigits(hm.maps[1].size()) + ", " - + NumUtils.groupDigits(hm.maps[2].size()) + ", " - + NumUtils.groupDigits(hm.maps[3].size()) + ", ...}"); - logger.debug("Available memory (without running GC) = " + Misc.availableMemoryWithoutRunningGCAsString()); - } - } - return list; - } - - public List getWorkRange(long id) { - List list; - try { - synchronized (iterator) { - list = readRange(workRangeSize); - ar[r] = id; - r = (r + 1) % ar.length; - } - } catch (IOException e) { - throw new RuntimeException(e); - } - return list.isEmpty() ? null : list; - } - - public List getWorkRange() { - List list; - try { - synchronized (iterator) { - list = readRange(workRangeSize); - } - } catch (IOException e) { - throw new RuntimeException(e); - } - return list.isEmpty() ? null : list; - } - - public void writeDnaQs(List list, long id) throws IOException, InterruptedException { - synchronized (out) { - while (ar[w] != id) { - out.wait(); - } - for (DnaQ dnaq : list) { - ru.ifmo.genetics.io.IOUtils.putByteArray(dnaq.toByteArray(), out); - ++totalReadsProcessed; - } - w = (w + 1) % ar.length; - out.notifyAll(); - } - } - - public long getReads() { - return reads; - } - -} diff --git a/src/structures/ConnectedComponent.java b/src/structures/ConnectedComponent.java index 4e3d56d..0d5e908 100644 --- a/src/structures/ConnectedComponent.java +++ b/src/structures/ConnectedComponent.java @@ -48,8 +48,8 @@ public static void saveComponents(Collection components, Str outputStream.close(); } - public static List loadComponents(String fp) throws IOException { - DataInputStream inputStream = new DataInputStream(new BufferedInputStream(new FileInputStream(fp))); + public static List loadComponents(File file) throws IOException { + DataInputStream inputStream = new DataInputStream(new BufferedInputStream(new FileInputStream(file))); int componentsCnt = inputStream.readInt(); List res = new ArrayList(componentsCnt); diff --git a/src/tools/AntibodyFinderMain.java b/src/tools/AntibodyFinderMain.java index df3fd9f..bda933d 100644 --- a/src/tools/AntibodyFinderMain.java +++ b/src/tools/AntibodyFinderMain.java @@ -6,7 +6,7 @@ import ru.ifmo.genetics.dna.kmers.ShortKmer; import ru.ifmo.genetics.dna.kmers.ShortKmerIteratorFactory; import ru.ifmo.genetics.structures.map.ArrayLong2IntHashMap; -import ru.ifmo.genetics.structures.map.BigLong2IntHashMap; +import ru.ifmo.genetics.structures.map.BigLong2ShortHashMap; import ru.ifmo.genetics.utils.tool.ExecutionFailedException; import ru.ifmo.genetics.utils.tool.Parameter; import ru.ifmo.genetics.utils.tool.Tool; @@ -70,14 +70,8 @@ protected void runImpl() throws ExecutionFailedException { String seq = loadConstantFragment(); info("Constant fragment length = " + seq.length()); - BigLong2IntHashMap readsHM; - try { - readsHM = IOUtils.loadBINQReads(readsFiles.get(), LEN, LOAD_TASK_SIZE, - new ShortKmerIteratorFactory(), availableProcessors.get(), this.logger); - } catch (IOException e) { - throw new ExecutionFailedException("Couldn't load k-mers from reads", e); - } - info("Reads HM size = " + readsHM.size()); + BigLong2ShortHashMap readsHM = + IOUtils.loadReads(readsFiles.get(), LEN, 0, availableProcessors.get(), logger); ShortKmer kmer = new ShortKmer(seq.substring(0, LEN)); for (int pos = LEN; pos < seq.length(); pos++) { @@ -93,7 +87,7 @@ protected void runImpl() throws ExecutionFailedException { // } - readsHM.addAndBound(kmer.toLong(), maximalBadFrequency.get() + 1); + readsHM.addAndBound(kmer.toLong(), (short) (maximalBadFrequency.get() + 1)); // deb += " " + kmer.toLong() + " " + readsHM.get(kmer.toLong()); debug(deb); } diff --git a/src/tools/CompareReadsAndComponentsMain.java b/src/tools/CompareReadsAndComponentsMain.java index 093a9b0..1177c4a 100644 --- a/src/tools/CompareReadsAndComponentsMain.java +++ b/src/tools/CompareReadsAndComponentsMain.java @@ -54,7 +54,7 @@ public class CompareReadsAndComponentsMain extends Tool { protected void runImpl() throws ExecutionFailedException { debug("Lets load components"); try { - components = ConnectedComponent.loadComponents(componentsFile.get().getAbsolutePath()); + components = ConnectedComponent.loadComponents(componentsFile.get()); } catch (IOException e) { throw new ExecutionFailedException("Couldn't load components", e); } diff --git a/src/tools/ComponentCutterMain.java b/src/tools/ComponentCutterMain.java index e620832..4e6b5cd 100644 --- a/src/tools/ComponentCutterMain.java +++ b/src/tools/ComponentCutterMain.java @@ -2,6 +2,7 @@ import algo.ComponentsBuilder; import ru.ifmo.genetics.statistics.Timer; +import ru.ifmo.genetics.structures.map.BigLong2ShortHashMap; import ru.ifmo.genetics.utils.Misc; import ru.ifmo.genetics.utils.tool.values.InValue; import structures.ConnectedComponent; @@ -65,24 +66,19 @@ public class ComponentCutterMain extends Tool { @Override protected void runImpl() throws ExecutionFailedException { Timer t = new Timer(); - info("Loading sequences from files..."); - ArrayLong2IntHashMap hm = - new ArrayLong2IntHashMap((int) (Math.log(availableProcessors.get()) / Math.log(2)) + 4); - try { - IOUtils.addFASTASequences(sequencesFiles.get(), hm, k.get(), minLen.get(), this.logger); - } catch (IOException e) { - e.printStackTrace(); - return; - } + debug("Loading sequences from files..."); + BigLong2ShortHashMap hm = IOUtils.loadReads(sequencesFiles.get(), k.get(), minLen.get(), + availableProcessors.get(), logger); debug("Memory used = " + Misc.usedMemoryAsString() + ", time = " + t); + info("Searching for components..."); List components; try { String statFP = workDir + File.separator + "components-stat-" + minComponentSize.get() + "-" + maxComponentSize.get() + ".txt"; components = ComponentsBuilder.splitStrategy(hm, k.get(), minComponentSize.get(), - maxComponentSize.get(), statFP, this.logger, availableProcessors.get()); + maxComponentSize.get(), statFP, logger); } catch (FileNotFoundException e) { e.printStackTrace(); return; diff --git a/src/tools/DistanceMatrixBuilderMain.java b/src/tools/DistanceMatrixBuilderMain.java index d6a9aea..9f94774 100644 --- a/src/tools/DistanceMatrixBuilderMain.java +++ b/src/tools/DistanceMatrixBuilderMain.java @@ -6,10 +6,7 @@ import ru.ifmo.genetics.utils.tool.inputParameterBuilder.FileMVParameterBuilder; import ru.ifmo.genetics.utils.tool.inputParameterBuilder.FileParameterBuilder; import ru.ifmo.genetics.utils.tool.inputParameterBuilder.IntParameterBuilder; -import ru.ifmo.genetics.utils.tool.values.FilesFromOneFileYielder; -import ru.ifmo.genetics.utils.tool.values.IfYielder; -import ru.ifmo.genetics.utils.tool.values.InValue; -import ru.ifmo.genetics.utils.tool.values.SimpleFixingInValue; +import ru.ifmo.genetics.utils.tool.values.*; import java.io.*; @@ -64,17 +61,7 @@ public class DistanceMatrixBuilderMain extends Tool { { setFix(kmersCounter.k, k); setFix(kmersCounter.inputFiles, inputFiles); - setFix(kmersCounter.maximalBadFrequency, - new IfYielder(new InValue() { - @Override - public Boolean get() { - return maximalBadFrequency.get() >= 1; - } - }, - new SimpleFixingInValue(1), - maximalBadFrequency - ) - ); + setFix(kmersCounter.maximalBadFrequency, maximalBadFrequency); setFixDefault(kmersCounter.outputDir); addSubTool(kmersCounter); } @@ -101,10 +88,10 @@ public Boolean get() { public FeaturesCalculatorMain featuresCalculator = new FeaturesCalculatorMain(); { setFix(featuresCalculator.k, k); - setFix(featuresCalculator.componentsFiles, new FilesFromOneFileYielder(compCutter.componentsFileOut)); + setFix(featuresCalculator.componentsFile, compCutter.componentsFileOut); setFix(featuresCalculator.readsFiles, inputFiles); - setFix(featuresCalculator.threshold, kmersCounter.maximalBadFrequency); setFixDefault(featuresCalculator.kmersFiles); + setFixDefault(featuresCalculator.threshold); addSubTool(featuresCalculator); } diff --git a/src/tools/FeaturesCalculatorMain.java b/src/tools/FeaturesCalculatorMain.java index 591cc57..18c8c06 100644 --- a/src/tools/FeaturesCalculatorMain.java +++ b/src/tools/FeaturesCalculatorMain.java @@ -2,13 +2,14 @@ import ru.ifmo.genetics.io.ReadersUtils; import ru.ifmo.genetics.statistics.Timer; -import ru.ifmo.genetics.structures.map.BigLong2IntHashMap; +import ru.ifmo.genetics.structures.map.BigLong2LongHashMap; +import ru.ifmo.genetics.structures.map.BigLong2ShortHashMap; +import ru.ifmo.genetics.utils.Misc; import ru.ifmo.genetics.utils.NumUtils; import ru.ifmo.genetics.utils.tool.values.InMemoryValue; import ru.ifmo.genetics.utils.tool.values.InValue; import structures.ConnectedComponent; import io.IOUtils; -import ru.ifmo.genetics.dna.kmers.ShortKmerIteratorFactory; import ru.ifmo.genetics.utils.tool.ExecutionFailedException; import ru.ifmo.genetics.utils.tool.Parameter; import ru.ifmo.genetics.utils.tool.Tool; @@ -23,7 +24,6 @@ public class FeaturesCalculatorMain extends Tool { public static final String NAME = "features-calculator"; public static final String DESCRIPTION = "Calculates features values for input reads files"; - static final int LOAD_TASK_SIZE = 1 << 15; public final Parameter k = addParameter(new IntParameterBuilder("k") .mandatory() @@ -31,10 +31,10 @@ public class FeaturesCalculatorMain extends Tool { .withDescription("k-mer size") .create()); - public final Parameter componentsFiles = addParameter(new FileMVParameterBuilder("components-files") + public final Parameter componentsFile = addParameter(new FileParameterBuilder("components-file") .mandatory() .withShortOpt("cm") - .withDescription("files with connected components (one component is considered as one feature)") + .withDescription("file with connected components (one component is considered as one feature)") .create()); public final Parameter readsFiles = addParameter(new FileMVParameterBuilder("reads") @@ -50,7 +50,7 @@ public class FeaturesCalculatorMain extends Tool { public final Parameter threshold = addParameter(new IntParameterBuilder("threshold") .withShortOpt("b") - .withDescription("threshold for k-mers from ") + .withDescription("maximal frequency for a k-mer to be assumed erroneous") .withDefaultValue(0) .create()); @@ -63,25 +63,37 @@ public class FeaturesCalculatorMain extends Tool { @Override protected void runImpl() throws ExecutionFailedException { Timer t = new Timer(); - List> models = new ArrayList>(); - List modelsDirs = new ArrayList(); - - for (File componentsFile : componentsFiles.get()) { - try { - List components = - ConnectedComponent.loadComponents(componentsFile.getAbsolutePath()); - models.add(components); - info(NumUtils.groupDigits(components.size()) + " components loaded from " + componentsFile); - } catch (IOException e) { - throw new ExecutionFailedException("Couldn't load components", e); - } - String compName = FileUtils.removeExtension(componentsFile.getName(), ".bin"); - String modelDir = String.format(workDir + File.separator + "%03d-" + compName, models.size()); - modelsDirs.add(modelDir); - (new File(modelDir)).mkdir(); - debug("Dir created for vectors: " + modelDir); + debug("Loading components..."); + List components; + + try { + components = ConnectedComponent.loadComponents(componentsFile.get()); + info(NumUtils.groupDigits(components.size()) + " components loaded from " + componentsFile.get()); + } catch (IOException e) { + throw new ExecutionFailedException("Couldn't load components", e); + } + + String compName = FileUtils.removeExtension(componentsFile.get().getName(), ".bin"); + File outDir = new File(workDir.get(), "vectors"); + outDir.mkdirs(); + debug(outDir + " directory was created for components file " + componentsFile.get().getName()); + + + // preparing + long kmers = 0; + for (ConnectedComponent component : components) { + kmers += component.size(); + } + BigLong2LongHashMap hm = new BigLong2LongHashMap( + (int) (Math.log(availableProcessors.get()) / Math.log(2)) + 4, 12); + for (ConnectedComponent component : components) { + for (long kmer : component.kmers) { + hm.put(kmer, 0); + } } + debug("Memory used (before processing files) = " + Misc.usedMemoryAsString() + ", time = " + t); + int featuresFilesCount = (readsFiles.get() == null ? 0 : readsFiles.get().length) + (kmersFiles.get() == null ? 0 : kmersFiles.get().length); @@ -90,52 +102,29 @@ protected void runImpl() throws ExecutionFailedException { if (readsFiles.get() != null) { for (File readsFile : readsFiles.get()) { - BigLong2IntHashMap readsHM; - try { - readsHM = IOUtils.loadReads(new File[]{readsFile}, k.get(), LOAD_TASK_SIZE, - new ShortKmerIteratorFactory(), availableProcessors.get(), this.logger); - } catch (IOException e) { - throw new ExecutionFailedException("Couldn't load k-mers from " + readsFile, e); - } - - for (int i = 0; i < models.size(); i++) { - File outFile = new File(modelsDirs.get(i), - ReadersUtils.readDnaLazy(readsFile).name() + ".vec"); - try { - buildAndPrintVector(readsHM, models.get(i), outFile); - } catch (FileNotFoundException e) { - e.printStackTrace(); - return; - } - info("Features for " + readsFile + " printed to " + outFile); - featuresFiles[curFiles] = outFile; - curFiles++; - } + hm.resetValues(); + IOUtils.calculatePresenceForReads(new File[]{readsFile}, k.get(), hm, + availableProcessors.get(), logger); + + File outFile = new File(outDir, ReadersUtils.readDnaLazy(readsFile).name() + ".vec"); + buildAndPrintVector(components, hm, threshold.get(), outFile); + info("Features for file " + readsFile.getName() + " printed to " + outFile); + featuresFiles[curFiles] = outFile; + curFiles++; } } - if (kmersFiles.get() != null && kmersFiles.get().length != 0) { + if (kmersFiles.get() != null) { for (File kmersFile : kmersFiles.get()) { - BigLong2IntHashMap kmersHM; - try { - kmersHM = IOUtils.loadKmers(new File[]{kmersFile}, - threshold.get(), availableProcessors.get(), this.logger); - } catch (IOException e) { - throw new ExecutionFailedException("Couldn't load k-mers from " + kmersFile, e); - } - - for (int i = 0; i < models.size(); i++) { - File outFile = new File(modelsDirs.get(i), - ReadersUtils.readDnaLazy(kmersFile).name() + ".vec"); - try { - buildAndPrintVector(kmersHM, models.get(i), outFile); - } catch (FileNotFoundException e) { - e.printStackTrace(); - } - info("Features for " + kmersFile + " printed to " + outFile); - featuresFiles[curFiles] = outFile; - curFiles++; - } + hm.resetValues(); + IOUtils.calculatePresenceForKmers(new File[]{kmersFile}, hm, + availableProcessors.get(), logger); + + File outFile = new File(outDir, FileUtils.removeExtension(kmersFile.getName()) + ".vec"); + buildAndPrintVector(components, hm, threshold.get(), outFile); + info("Features for file " + kmersFile + " printed to " + outFile); + featuresFiles[curFiles] = outFile; + curFiles++; } } @@ -143,21 +132,28 @@ protected void runImpl() throws ExecutionFailedException { debug("Features-calculator has finished! Time = " + t); } - private static void buildAndPrintVector(BigLong2IntHashMap readsHM, - List components, - File outFile) throws FileNotFoundException { + private static void buildAndPrintVector(List components, + BigLong2LongHashMap hm, int threshold, File outFile) + throws ExecutionFailedException { List vector = new ArrayList(); for (ConnectedComponent component : components) { long kmersInComponent = 0; for (long kmer : component.kmers) { - int value = readsHM.getWithZero(kmer); - kmersInComponent += value; + long value = hm.getWithZero(kmer); + if (value > threshold) { + kmersInComponent += value; + } } vector.add(kmersInComponent); } - PrintWriter vectorPW = new PrintWriter(outFile); + PrintWriter vectorPW = null; + try { + vectorPW = new PrintWriter(outFile); + } catch (FileNotFoundException e) { + throw new ExecutionFailedException("Can't create file " + outFile, e); + } for (long x : vector) { vectorPW.println(x); } diff --git a/src/tools/KmersCounterMain.java b/src/tools/KmersCounterMain.java index b9f3fb0..fa196d1 100644 --- a/src/tools/KmersCounterMain.java +++ b/src/tools/KmersCounterMain.java @@ -3,18 +3,12 @@ import io.IOUtils; import ru.ifmo.genetics.dna.kmers.ShortKmerIteratorFactory; import ru.ifmo.genetics.io.ReadersUtils; -import ru.ifmo.genetics.statistics.QuantitativeStatistics; -import ru.ifmo.genetics.statistics.QuickQuantitativeStatistics; import ru.ifmo.genetics.statistics.Timer; -import ru.ifmo.genetics.structures.map.ArrayLong2IntHashMap; -import ru.ifmo.genetics.structures.map.BigLong2IntHashMap; -import ru.ifmo.genetics.structures.map.MutableLongIntEntry; -import ru.ifmo.genetics.utils.FileUtils; +import ru.ifmo.genetics.structures.map.BigLong2ShortHashMap; import ru.ifmo.genetics.utils.Misc; import ru.ifmo.genetics.utils.tool.*; import ru.ifmo.genetics.utils.tool.Parameter; import ru.ifmo.genetics.utils.tool.Tool; -import ru.ifmo.genetics.utils.tool.inputParameterBuilder.BoolParameterBuilder; import ru.ifmo.genetics.utils.tool.inputParameterBuilder.FileMVParameterBuilder; import ru.ifmo.genetics.utils.tool.inputParameterBuilder.FileParameterBuilder; import ru.ifmo.genetics.utils.tool.inputParameterBuilder.IntParameterBuilder; @@ -23,20 +17,14 @@ import ru.ifmo.genetics.utils.NumUtils; import java.io.File; -import java.io.FileNotFoundException; import java.io.IOException; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; public class KmersCounterMain extends Tool { public static final String NAME = "kmer-counter"; public static final String DESCRIPTION = "Count k-mers in given reads with ArrayLong2IntHashMap"; -// "\nBinary output format: 64 bits to k-mer itself + 32 bits to frequency"; - static final int LOAD_TASK_SIZE = 1 << 15; // 32 K reads public final Parameter k = addParameter(new IntParameterBuilder("k") .mandatory() @@ -68,23 +56,6 @@ public class KmersCounterMain extends Tool { .create()); - /* - To think about that params - - public final Parameter plainText = addParameter(new BoolParameterBuilder("plainText") - .withShortOpt("pt") - .optional() - .withDescription("print k-mers in plain text, not in binary (64 + 32) format") - .withDefaultValue(false) - .create()); - - public final Parameter notPrintNames = addParameter(new BoolParameterBuilder("not-print-names") - .withShortOpt("npn") - .optional() - .withDescription("print only k-mers counts for all possible k-mers (k < 15)") - .withDefaultValue(false) - .create()); - */ private final InMemoryValue resultingKmerFilesPr = new InMemoryValue(); public final InValue resultingKmerFiles = @@ -93,18 +64,12 @@ public class KmersCounterMain extends Tool { @Override protected void runImpl() throws ExecutionFailedException { - int LEN = k.get(); - Timer t = new Timer(); - BigLong2IntHashMap hm; - try { - hm = IOUtils.loadReads(inputFiles.get(), LEN, LOAD_TASK_SIZE, - new ShortKmerIteratorFactory(), availableProcessors.get(), this.logger); - } catch (IOException e) { - throw new ExecutionFailedException("Couldn't load k-mers", e); - } + BigLong2ShortHashMap hm = IOUtils.loadReads(inputFiles.get(), k.get(), 0, + availableProcessors.get(), logger); debug("Memory used = " + Misc.usedMemoryAsString() + ", time = " + t); + File outDir = outputDir.get(); if (!outDir.exists()) { outDir.mkdirs(); diff --git a/src/tools/SeqBuilderMain.java b/src/tools/SeqBuilderMain.java index 99fe5ef..da7a0d0 100644 --- a/src/tools/SeqBuilderMain.java +++ b/src/tools/SeqBuilderMain.java @@ -3,8 +3,8 @@ import algo.SequencesFinders; import io.IOUtils; import ru.ifmo.genetics.statistics.*; -import ru.ifmo.genetics.structures.map.BigLong2IntHashMap; -import ru.ifmo.genetics.structures.map.MutableLongIntEntry; +import ru.ifmo.genetics.structures.map.BigLong2ShortHashMap; +import ru.ifmo.genetics.structures.map.MutableLongShortEntry; import ru.ifmo.genetics.utils.Misc; import ru.ifmo.genetics.utils.tool.ExecutionFailedException; import ru.ifmo.genetics.utils.tool.Parameter; @@ -80,21 +80,15 @@ protected void runImpl() throws ExecutionFailedException { } Timer t = new Timer(); - - BigLong2IntHashMap hm; - try { - hm = IOUtils.loadKmers(inputFiles.get(), 0, availableProcessors.get(), this.logger); - } catch (IOException e) { - e.printStackTrace(); - return; - } + BigLong2ShortHashMap hm = + IOUtils.loadKmers(inputFiles.get(), maximalBadFrequency.get(), availableProcessors.get(), logger); debug("Memory used = " + Misc.usedMemoryAsString() + ", time = " + t); long totalKmers = 0; int[] stat = new int[STAT_LEN]; - Iterator it = hm.entryIterator(); + Iterator it = hm.entryIterator(); while (it.hasNext()) { - MutableLongIntEntry entry = it.next(); + MutableLongShortEntry entry = it.next(); int value = entry.getValue(); totalKmers += value; if (value >= stat.length) { @@ -149,7 +143,6 @@ protected void runImpl() throws ExecutionFailedException { File destination = new File(fp); outputFilePr.set(destination); - //info("hm brackets = " + hm.hm.length); Deque sequences; try { sequences = SequencesFinders.thresholdStrategy(hm, availableProcessors.get(), diff --git a/src/tools/SupergraphSeqBuilderMain.java b/src/tools/SupergraphSeqBuilderMain.java index 790500a..4c540d8 100644 --- a/src/tools/SupergraphSeqBuilderMain.java +++ b/src/tools/SupergraphSeqBuilderMain.java @@ -5,9 +5,7 @@ import ru.ifmo.genetics.dna.kmers.KmerIteratorFactory; import ru.ifmo.genetics.dna.kmers.ShortKmer; import ru.ifmo.genetics.dna.kmers.ShortKmerIteratorFactory; -import ru.ifmo.genetics.structures.map.ArrayLong2IntHashMap; -import ru.ifmo.genetics.structures.map.BigLong2IntHashMap; -import ru.ifmo.genetics.structures.map.MutableLongIntEntry; +import ru.ifmo.genetics.structures.map.*; import ru.ifmo.genetics.utils.Misc; import ru.ifmo.genetics.utils.NumUtils; import ru.ifmo.genetics.utils.tool.ExecutionFailedException; @@ -24,7 +22,6 @@ public class SupergraphSeqBuilderMain extends Tool { public static final String NAME = "supergraph-sequence-builder"; public static final String DESCRIPTION = "NOT COMPLETED"; - static final int LOAD_TASK_SIZE = 1 << 15; private final int STAT_LEN = 1024; @@ -91,7 +88,7 @@ protected void runImpl() throws ExecutionFailedException { debug("MAXIMAL_SIZE = " + MAX_SIZE); - BigLong2IntHashMap superHM = new BigLong2IntHashMap( + BigLong2ShortHashMap superHM = new BigLong2ShortHashMap( (int) (Math.log(availableProcessors.get()) / Math.log(2)) + 4, 8); for (File file : inputFiles.get()) { @@ -105,27 +102,22 @@ protected void runImpl() throws ExecutionFailedException { } } - private void addToSupergraph(BigLong2IntHashMap superHM, File readsFile) throws ExecutionFailedException { - BigLong2IntHashMap hm = null; - try { - hm = IOUtils.loadBINQReads(new File[]{readsFile}, this.k.get(), LOAD_TASK_SIZE, - kmerIteratorFactory.get(), availableProcessors.get(), this.logger); - } catch (IOException e) { - throw new ExecutionFailedException("Couldn't load kmers from " + readsFile.getPath(), e); - } + private void addToSupergraph(BigLong2ShortHashMap superHM, File readsFile) throws ExecutionFailedException { + BigLong2ShortHashMap hm = + IOUtils.loadReads(new File[]{readsFile}, k.get(), 0, availableProcessors.get(), logger); int freqThreshold = getThreshold(hm); long uniqueKmers = 0, uniqueAdded = 0, newKmers = 0; - Iterator it = hm.entryIterator(); + Iterator it = hm.entryIterator(); while (it.hasNext()) { - MutableLongIntEntry entry = it.next(); + MutableLongShortEntry entry = it.next(); uniqueKmers++; int value = entry.getValue(); if (value > freqThreshold) { uniqueAdded++; - int old = superHM.addAndBound(entry.getKey(), 1); + int old = superHM.addAndBound(entry.getKey(), (short)1); if (old == 0) { newKmers++; } @@ -137,16 +129,16 @@ private void addToSupergraph(BigLong2IntHashMap superHM, File readsFile) throws ", new k-mers added = " + newKmers); } - private int getThreshold(BigLong2IntHashMap hm) { + private int getThreshold(BigLong2ShortHashMap hm) { if (maximalBadFrequency.get() != null) { return maximalBadFrequency.get(); } long totalKmers = 0; int[] stat = new int[STAT_LEN]; - Iterator it = hm.entryIterator(); + Iterator it = hm.entryIterator(); while (it.hasNext()) { - MutableLongIntEntry entry = it.next(); + MutableLongShortEntry entry = it.next(); int v = entry.getValue(); totalKmers += v; if (v >= stat.length) { @@ -184,7 +176,7 @@ private int getThreshold(BigLong2IntHashMap hm) { return 0; } - public void calcSequences(BigLong2IntHashMap hm, String fastaFP) throws FileNotFoundException { + public void calcSequences(BigLong2ShortHashMap hm, String fastaFP) throws FileNotFoundException { int freqThreshold = supergraphFreq.get(); int lenThreshold = sequenceLen.get(); int kValue = k.get(); @@ -201,9 +193,9 @@ public void calcSequences(BigLong2IntHashMap hm, String fastaFP) throws FileNotF PrintWriter fastaPW = new PrintWriter(fastaFP); - Iterator it = hm.entryIterator(); + Iterator it = hm.entryIterator(); while (it.hasNext()) { - MutableLongIntEntry entry = it.next(); + MutableLongShortEntry entry = it.next(); long key = entry.getKey(); int value = entry.getValue(); if (value <= freqThreshold) { @@ -225,7 +217,7 @@ public void calcSequences(BigLong2IntHashMap hm, String fastaFP) throws FileNotF minWeight = Math.min(minWeight, value); maxWeight = Math.max(maxWeight, value); - hm.addAndBound(kmerRepr, -(value + 1)); + hm.put(kmerRepr, BAN_VALUE); byte rightNuc = getRightNucleotide(kmer, hm, freqThreshold); if (rightNuc < 0) { @@ -266,15 +258,15 @@ public void calcSequences(BigLong2IntHashMap hm, String fastaFP) throws FileNotF fastaPW.close(); } - private void banKmers(BigLong2IntHashMap hm, int freqThreshold, int k) { - int BAN_VALUE = 1000000000; + final short BAN_VALUE = -1; + private void banKmers(BigLong2ShortHashMap hm, int freqThreshold, int k) { long totalKmers = 0, uniqueKmers = 0, totalBanned = 0, uniqueBanned = 0, totalUnderThreshold = 0, uniqueUnderThreshold = 0; - Iterator it = hm.entryIterator(); + Iterator it = hm.entryIterator(); while (it.hasNext()) { - MutableLongIntEntry entry = it.next(); + MutableLongShortEntry entry = it.next(); long key = entry.getKey(); int value = entry.getValue(); totalKmers += value; @@ -288,7 +280,7 @@ private void banKmers(BigLong2IntHashMap hm, int freqThreshold, int k) { ShortKmer kmer = new ShortKmer(key, k); if (getLeftNucleotide(kmer, hm, freqThreshold) == -2 || getRightNucleotide(kmer, hm, freqThreshold) == -2) { - hm.addAndBound(key, BAN_VALUE); + hm.put(key, BAN_VALUE); totalBanned += value; uniqueBanned++; } @@ -297,19 +289,9 @@ private void banKmers(BigLong2IntHashMap hm, int freqThreshold, int k) { info("Total k-mers = " + totalKmers + ", unique k-mers = " + uniqueKmers); info("Total k-mers [<=] threshold = " + totalUnderThreshold + ", unique = " + uniqueUnderThreshold); info("Total k-mers banned = " + totalBanned + ", unique = " + uniqueBanned); - - it = hm.entryIterator(); - while (it.hasNext()) { - MutableLongIntEntry entry = it.next(); - long key = entry.getKey(); - int value = entry.getValue(); - if (value >= BAN_VALUE) { - hm.addAndBound(key, -(value + 1)); - } - } } - private static byte getLeftNucleotide(ShortKmer kmer, BigLong2IntHashMap hm, int freqThreshold) { + private static byte getLeftNucleotide(ShortKmer kmer, BigLong2ShortHashMap hm, int freqThreshold) { byte rightNuc = kmer.nucAt(kmer.length() - 1); byte ansNuc = -1; for (byte nuc = 0; nuc <= 3; nuc++) { @@ -325,7 +307,7 @@ private static byte getLeftNucleotide(ShortKmer kmer, BigLong2IntHashMap hm, int return ansNuc; } - private static byte getRightNucleotide(ShortKmer kmer, BigLong2IntHashMap hm, int freqThreshold) { + private static byte getRightNucleotide(ShortKmer kmer, BigLong2ShortHashMap hm, int freqThreshold) { byte leftNuc = kmer.nucAt(0); byte ansNuc = -1; for (byte nuc = 0; nuc <= 3; nuc++) { diff --git a/src/tools/ViewMain.java b/src/tools/ViewMain.java index 047edb7..552e62f 100644 --- a/src/tools/ViewMain.java +++ b/src/tools/ViewMain.java @@ -6,9 +6,7 @@ import ru.ifmo.genetics.dna.Dna; import ru.ifmo.genetics.dna.kmers.ShortKmer; import ru.ifmo.genetics.dna.kmers.ShortKmerIteratorFactory; -import ru.ifmo.genetics.structures.map.ArrayLong2IntHashMap; -import ru.ifmo.genetics.structures.map.BigLong2IntHashMap; -import ru.ifmo.genetics.structures.map.MutableLongIntEntry; +import ru.ifmo.genetics.structures.map.*; import ru.ifmo.genetics.utils.tool.ExecutionFailedException; import ru.ifmo.genetics.utils.tool.Parameter; import ru.ifmo.genetics.utils.tool.Tool; @@ -83,19 +81,14 @@ protected void runImpl() throws ExecutionFailedException { } if (kmersFile.get() != null) { - BigLong2IntHashMap kmersHM; - try { - kmersHM = IOUtils.loadKmers(new File[]{kmersFile.get()}, - 0, availableProcessors.get(), this.logger); - } catch (IOException e) { - throw new ExecutionFailedException("Couldn't load k-mers from " + kmersFile, e); - } - logger.info("Printing kmers..."); + BigLong2ShortHashMap kmersHM = + IOUtils.loadKmers(new File[]{kmersFile.get()}, 0, availableProcessors.get(), logger); + logger.info("Printing kmers..."); out.println("Kmer\tCount"); - Iterator it = kmersHM.entryIterator(); + Iterator it = kmersHM.entryIterator(); while (it.hasNext()) { - MutableLongIntEntry entry = it.next(); + MutableLongShortEntry entry = it.next(); out.println(new ShortKmer(entry.getKey(), k.get()) + "\t" + entry.getValue()); } } @@ -104,7 +97,7 @@ protected void runImpl() throws ExecutionFailedException { if (componentsFile.get() != null) { List components; try { - components = ConnectedComponent.loadComponents(componentsFile.get().getPath()); + components = ConnectedComponent.loadComponents(componentsFile.get()); info(components.size() + " components loaded from " + componentsFile.get()); } catch (IOException e) { throw new ExecutionFailedException("Couldn't load components", e);