Skip to content

Commit

Permalink
Great improvements io IO operations. Using new structures with 10 byt…
Browse files Browse the repository at this point in the history
…es/(kmer with info)
  • Loading branch information
svkazakov committed Nov 15, 2014
1 parent 165eb50 commit 57b7c83
Show file tree
Hide file tree
Showing 25 changed files with 652 additions and 827 deletions.
Binary file modified lib/itmo-assembler.jar
Binary file not shown.
23 changes: 8 additions & 15 deletions src/algo/AddSequencesShiftingRightTask.java
Original file line number Diff line number Diff line change
@@ -1,37 +1,30 @@
package algo;

import it.unimi.dsi.fastutil.longs.Long2IntMap;
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import ru.ifmo.genetics.dna.Dna;
import ru.ifmo.genetics.dna.DnaTools;
import ru.ifmo.genetics.dna.kmers.ShortKmer;
import ru.ifmo.genetics.structures.map.ArrayLong2IntHashMap;
import ru.ifmo.genetics.structures.map.BigLong2IntHashMap;
import ru.ifmo.genetics.structures.map.Long2IntHashMap;
import ru.ifmo.genetics.structures.map.MutableLongIntEntry;
import ru.ifmo.genetics.utils.KmerUtils;
import ru.ifmo.genetics.structures.map.*;
import structures.Sequence;

import java.util.Iterator;
import java.util.List;
import java.util.Queue;

/**
* Task class to search and add simple sequences to queue <code>sequences</code>.
*/
public class AddSequencesShiftingRightTask implements Runnable {

final BigLong2IntHashMap hm;
final Long2IntHashMap openHM;
final BigLong2ShortHashMap hm;
final Long2ShortHashMap openHM;
final int k;
int freqThreshold;
int lenThreshold;
final Queue<Sequence> sequences;
final LongOpenHashSet used;

public AddSequencesShiftingRightTask(BigLong2IntHashMap hm,
Long2IntHashMap openHM,
public AddSequencesShiftingRightTask(BigLong2ShortHashMap hm,
Long2ShortHashMap openHM,
int k, int freqThreshold, int lenThreshold,
Queue<Sequence> sequences, LongOpenHashSet used) {
this.hm = hm;
Expand All @@ -45,11 +38,11 @@ public AddSequencesShiftingRightTask(BigLong2IntHashMap hm,

@Override
public void run() {
Iterator<MutableLongIntEntry> it = openHM.entryIterator();
Iterator<MutableLongShortEntry> it = openHM.entryIterator();
while (it.hasNext()) {
MutableLongIntEntry entry = it.next();
MutableLongShortEntry entry = it.next();
long key = entry.getKey();
int value = entry.getValue();
short value = entry.getValue();
if (value <= freqThreshold) {
continue;
}
Expand Down
99 changes: 48 additions & 51 deletions src/algo/ComponentsBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,110 +4,107 @@
import it.unimi.dsi.fastutil.longs.LongArrayFIFOQueue;
import org.apache.log4j.Logger;
import ru.ifmo.genetics.structures.map.ArrayLong2IntHashMap;
import ru.ifmo.genetics.structures.map.BigLong2ShortHashMap;
import ru.ifmo.genetics.structures.map.MutableLongShortEntry;
import ru.ifmo.genetics.structures.set.BigLongHashSet;
import structures.ConnectedComponent;

import java.io.FileNotFoundException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
* Created by ulyantsev on 06.05.14.
*
*/
public class ComponentsBuilder {
public static List<ConnectedComponent> splitStrategy(ArrayLong2IntHashMap hm,
public static List<ConnectedComponent> splitStrategy(BigLong2ShortHashMap hm,
int k,
int b1,
int b2,
int b1, int b2,
String statFP,
Logger logger,
int availableProcessors) throws FileNotFoundException {
Logger logger)
throws FileNotFoundException {
List<ConnectedComponent> ans = new ArrayList<ConnectedComponent>();
BigLongHashSet processedKmers = new BigLongHashSet(hm.capacity());

PrintWriter statPW = new PrintWriter(statFP);
statPW.println("comp.size\tcomp.weight\tfreqThreshold");
statPW.println("# component.size\tcomponent.weight\tfreqThreshold");
for (int freqThreshold = 0; ; freqThreshold++) {
List<ConnectedComponent> components = getComponents(hm, k, freqThreshold, availableProcessors);
List<ConnectedComponent> components = getComponents(hm, k, freqThreshold, processedKmers);
if (components.size() == 0) {
break;
}
int added = 0;
for (ConnectedComponent comp : components) {
if (comp.size() < b1) {
banComponent(hm, comp);
} else if (comp.size() < b2) {
ans.add(comp);
added++;
statPW.println(comp.size() + "\t" + comp.getWeight() + "\t" + freqThreshold);
banComponent(hm, comp);
}
}
logger.debug("Freq = " + freqThreshold + ", components count = " + ans.size());
logger.debug("FreqThreshold = " + freqThreshold + ", " +
"components added = " + added + ", total components = " + ans.size());
}
statPW.close();

return ans;
}

private static void banComponent(ArrayLong2IntHashMap hm, ConnectedComponent component) {
private static void banComponent(BigLong2ShortHashMap hm, ConnectedComponent component) {
for (long kmer : component.kmers) {
hm.add(kmer, -hm.get(kmer));
hm.put(kmer, (short) 0);
}
}

private static List<ConnectedComponent> getComponents(ArrayLong2IntHashMap hm,
int k,
int freqThreshold,
int availableProcessors) {
private static List<ConnectedComponent> getComponents(BigLong2ShortHashMap hm,
int k, int freqThreshold,
BigLongHashSet processedKmers) {
List<ConnectedComponent> ans = new ArrayList<ConnectedComponent>();

ArrayLong2IntHashMap processedKmers =
new ArrayLong2IntHashMap((int) (Math.log(availableProcessors) / Math.log(2)) + 4);
for (int i = 0; i < hm.hm.length; ++i) {
for (Long2IntMap.Entry entry : hm.hm[i].long2IntEntrySet()) {
long kmer = entry.getLongKey();
if (processedKmers.get(kmer) > 0) {
continue;
}
processedKmers.reset();

int value = entry.getIntValue();
if (value > freqThreshold) {
ConnectedComponent comp = getComponent(hm, k, kmer, freqThreshold, processedKmers);
ans.add(comp);
}
Iterator<MutableLongShortEntry> it = hm.entryIterator();
while (it.hasNext()) {
MutableLongShortEntry entry = it.next();
long kmer = entry.getKey();
short value = entry.getValue();
if (value > freqThreshold && !processedKmers.contains(kmer)) {
ConnectedComponent comp = getComponent(hm, k, kmer, freqThreshold, processedKmers);
ans.add(comp);
}
}

return ans;
}

private static ConnectedComponent getComponent(ArrayLong2IntHashMap hm,
int kValue,
long kmer,
private static ConnectedComponent getComponent(BigLong2ShortHashMap hm,
int k,
long startKmer,
int freqThreshold,
ArrayLong2IntHashMap processedKmers) {
if (hm.get(kmer) <= freqThreshold || processedKmers.get(kmer) > 0) {
return null;
}
BigLongHashSet processedKmers) {

ConnectedComponent ans = new ConnectedComponent();
long weight = 0;

long weight = hm.get(kmer);
LongArrayFIFOQueue queue = new LongArrayFIFOQueue();

queue.enqueue(kmer);
processedKmers.add(kmer, 1);
queue.enqueue(startKmer);
processedKmers.add(startKmer);
ans.add(startKmer);
weight += hm.get(startKmer);

while (queue.size() > 0) {
long kmerRepr = queue.dequeue();
ans.add(kmerRepr);

for (long neighbour : KmerOperations.possibleNeighbours(kmerRepr, kValue)) {
int value = hm.get(neighbour);
if (value <= freqThreshold || processedKmers.get(neighbour) > 0) {
continue;
long kmer = queue.dequeue();

for (long neighbour : KmerOperations.possibleNeighbours(kmer, k)) {
short value = hm.get(neighbour);
if (value > freqThreshold && !processedKmers.contains(neighbour)) {
queue.enqueue(neighbour);
processedKmers.add(neighbour);
ans.add(neighbour);
weight += value;
}
weight += value;
processedKmers.add(neighbour, 1);
queue.enqueue(neighbour);
}
}

Expand Down
37 changes: 9 additions & 28 deletions src/algo/HashMapOperations.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,14 @@
import it.unimi.dsi.fastutil.longs.Long2IntMap;
import org.apache.log4j.Logger;
import ru.ifmo.genetics.dna.kmers.ShortKmer;
import ru.ifmo.genetics.structures.map.ArrayLong2IntHashMap;
import ru.ifmo.genetics.structures.map.BigLong2IntHashMap;
import ru.ifmo.genetics.structures.map.MutableLongIntEntry;
import ru.ifmo.genetics.structures.map.*;

import java.util.HashMap;
import java.util.Iterator;

public class HashMapOperations {

public static byte getLeftNucleotide(BigLong2IntHashMap hm, ShortKmer kmer, int freqThreshold) {
public static byte getLeftNucleotide(BigLong2ShortHashMap hm, ShortKmer kmer, int freqThreshold) {
byte rightNuc = kmer.nucAt(kmer.length() - 1);
byte ansNuc = -1;
for (byte nuc = 0; nuc <= 3; nuc++) {
Expand All @@ -30,7 +28,7 @@ public static byte getLeftNucleotide(BigLong2IntHashMap hm, ShortKmer kmer, int
return ansNuc;
}

public static byte getRightNucleotide(BigLong2IntHashMap hm, ShortKmer kmer, int freqThreshold) {
public static byte getRightNucleotide(BigLong2ShortHashMap hm, ShortKmer kmer, int freqThreshold) {
byte leftNuc = kmer.nucAt(0);
byte ansNuc = -1;
for (byte nuc = 0; nuc <= 3; nuc++) {
Expand All @@ -48,25 +46,18 @@ public static byte getRightNucleotide(BigLong2IntHashMap hm, ShortKmer kmer, int
return ansNuc;
}

public static HashMap<ShortKmer, Integer> getNeighbours(BigLong2IntHashMap hm,
ShortKmer kmer,
int depth) {

return null;
}

public static void banBranchingKmers(BigLong2IntHashMap hm,
public static void banBranchingKmers(BigLong2ShortHashMap hm,
int freqThreshold,
int k,
Logger logger) {
int BAN_VALUE = 1000000000;
final short BAN_VALUE = -1;
long totalKmers = 0, uniqueKmers = 0,
totalBanned = 0, uniqueBanned = 0,
totalUnderThreshold = 0, uniqueUnderThreshold = 0;

Iterator<MutableLongIntEntry> it = hm.entryIterator();
Iterator<MutableLongShortEntry> it = hm.entryIterator();
while (it.hasNext()) {
MutableLongIntEntry entry = it.next();
MutableLongShortEntry entry = it.next();
long key = entry.getKey();
int value = entry.getValue();
totalKmers += value;
Expand All @@ -80,24 +71,14 @@ public static void banBranchingKmers(BigLong2IntHashMap hm,
ShortKmer kmer = new ShortKmer(key, k);
if (getLeftNucleotide(hm, kmer, freqThreshold) == -2 ||
getRightNucleotide(hm, kmer, freqThreshold) == -2) {
hm.addAndBound(key, BAN_VALUE);
hm.put(key, BAN_VALUE);
totalBanned += value;
uniqueBanned++;
}
}

logger.info("Total k-mers = " + totalKmers + ", unique k-mers = " + uniqueKmers);
logger.info("Total k-mers [<=] threshold = " + totalUnderThreshold + ", unique = " + uniqueUnderThreshold);
logger.info("Total k-mers under threshold = " + totalUnderThreshold + ", unique = " + uniqueUnderThreshold);
logger.info("Total k-mers banned = " + totalBanned + ", unique = " + uniqueBanned);

it = hm.entryIterator();
while (it.hasNext()) {
MutableLongIntEntry entry = it.next();
int value = entry.getValue();
if (value >= BAN_VALUE) {
long key = entry.getKey();
hm.addAndBound(key, -(value + 1));
}
}
}
}
5 changes: 2 additions & 3 deletions src/algo/SequencesFinders.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,15 @@

import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import ru.ifmo.genetics.executors.BlockingThreadPoolExecutor;
import ru.ifmo.genetics.structures.map.ArrayLong2IntHashMap;
import ru.ifmo.genetics.structures.map.BigLong2IntHashMap;
import ru.ifmo.genetics.structures.map.BigLong2ShortHashMap;
import structures.Sequence;

import java.util.*;
import java.util.concurrent.ConcurrentLinkedDeque;

public class SequencesFinders {

public static Deque<Sequence> thresholdStrategy(BigLong2IntHashMap hm,
public static Deque<Sequence> thresholdStrategy(BigLong2ShortHashMap hm,
int availableProcessors,
int freqThreshold,
int lenThreshold,
Expand Down
69 changes: 69 additions & 0 deletions src/io/BytesDispatcher.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
package io;

import org.apache.log4j.Logger;
import ru.ifmo.genetics.dna.Dna;
import ru.ifmo.genetics.io.sources.Source;
import ru.ifmo.genetics.structures.map.BigLong2ShortHashMap;
import ru.ifmo.genetics.utils.Misc;
import ru.ifmo.genetics.utils.NumUtils;
import ru.ifmo.genetics.utils.iterators.ProgressableIterator;
import ru.ifmo.genetics.utils.tool.Tool;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

public class BytesDispatcher {
final Logger logger = Logger.getLogger("bytes-dispatcher");

final InputStream is;
public final int workRangeSize;
long bytesRead = 0;

final BigLong2ShortHashMap hm; // for debug output

public BytesDispatcher(InputStream is, int workRangeSize, BigLong2ShortHashMap hmForMonitoring) {
this.is = is;
this.workRangeSize = workRangeSize;
Tool.debug(logger, "Using " + workRangeSize + " bytes as workRangeSize");
hm = hmForMonitoring;
}



public byte[] getNewEmptyWorkRange() {
return new byte[workRangeSize];
}

public synchronized int readWorkRange(byte[] range) {
try {
int read = 0;
while (read < range.length) {
int r = is.read(range, read, range.length - read);
if (r == -1) {
break;
}
read += r;
}

bytesRead += read;
/*
if ((bytesRead & ((1 << 29) - 1)) == 0) { // 512 Mb
logger.debug("Processed " + (bytesRead >> 20) + " Mb of data:");
if (hm != null) {
logger.debug("Total hm size = " + NumUtils.groupDigits(hm.size()) + ", " +
"size in hm.maps = {" + NumUtils.groupDigits(hm.maps[0].size()) + ", "
+ NumUtils.groupDigits(hm.maps[1].size()) + ", "
+ NumUtils.groupDigits(hm.maps[2].size()) + ", "
+ NumUtils.groupDigits(hm.maps[3].size()) + ", ...}");
}
logger.debug("Available memory (without running GC) = " + Misc.availableMemoryWithoutRunningGCAsString());
}
*/
return read;
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}
Loading

0 comments on commit 57b7c83

Please sign in to comment.