diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/integer/VByte.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/integer/VByte.java index 02ff89b7..eade2599 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/integer/VByte.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/compact/integer/VByte.java @@ -34,6 +34,7 @@ import java.nio.ByteBuffer; import org.rdfhdt.hdt.util.Mutable; +import org.rdfhdt.hdt.util.io.BigByteBuffer; import org.rdfhdt.hdt.util.io.BigMappedByteBuffer; /** @@ -142,7 +143,7 @@ public static int encode(byte[] data, int offset, int value) { return i; } - + public static int decode(byte[] data, int offset, Mutable value) { long out = 0; int i=0; @@ -157,6 +158,21 @@ public static int decode(byte[] data, int offset, Mutable value) { value.setValue(out); return i; } + + public static int decode(BigByteBuffer data, long offset, Mutable value) { + long out = 0; + int i = 0; + int shift=0; + while( (0x80 & data.get(offset+i))==0) { + out |= (data.get(offset+i) & 127) << shift; + i++; + shift+=7; + } + out |= (data.get(offset+i) & 127) << shift; + i++; + value.setValue(out); + return i; + } public static void show(byte[] data, int len) { for(int i=0;i getAllMappings(); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionaryCat.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionaryCat.java index cf533dd0..7a8333b8 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionaryCat.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionaryCat.java @@ -36,12 +36,10 @@ public class FourSectionDictionaryCat implements DictionaryCat { - private String location; - private int DEFAULT_BLOCK_SIZE = 16; - private int BLOCK_PER_BUFFER = 1000000; + private final HashMap allMappings = new HashMap<>(); + private final String location; private long numShared; - private HashMap allMappings = new HashMap<>(); private CatMappingBack mappingS; @@ -49,7 +47,7 @@ public FourSectionDictionaryCat(String location) { this.location = location; } - public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener listener){ + public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener listener) throws IOException { allMappings.put("P1",new CatMapping(location,"P1",dictionary1.getPredicates().getNumberOfElements())); allMappings.put("P2",new CatMapping(location,"P2",dictionary2.getPredicates().getNumberOfElements())); allMappings.put("S1",new CatMapping(location,"S1",dictionary1.getSubjects().getNumberOfElements())); @@ -64,7 +62,6 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener int numCommonPredicates = 0; CatIntersection commonP1P2 = new CatIntersection(new CatWrapper(dictionary1.getPredicates().getSortedEntries(),"P1"),new CatWrapper(dictionary2.getPredicates().getSortedEntries(),"P2")); - long maxPredicates = dictionary1.getPredicates().getNumberOfElements()+dictionary2.getPredicates().getNumberOfElements(); while (commonP1P2.hasNext()){ commonP1P2.next(); numCommonPredicates++; @@ -76,8 +73,6 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener addPredicatesList.add(new CatWrapper(dictionary1.getPredicates().getSortedEntries(),"P1")); addPredicatesList.add(new CatWrapper(dictionary2.getPredicates().getSortedEntries(),"P2")); CatUnion itAddPredicates = new CatUnion(addPredicatesList); -// while (itAddPredicates.hasNext()) -// System.out.println(itAddPredicates.next().entity); SectionUtil.createSection(location,numPredicates, 4,itAddPredicates, new CatUnion(new ArrayList<>()),allMappings,0, listener); System.out.println("SUBJECTS-------------------"); ArrayList> skipSubjectList = new ArrayList<>(); @@ -160,7 +155,6 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener i2.next(); numCommonS1O2++; } - Iterator it = dictionary2.getSubjects().getSortedEntries(); i2 = new CatIntersection(new CatWrapper(dictionary1.getObjects().getSortedEntries(),"O1"), new CatWrapper(dictionary2.getSubjects().getSortedEntries(),"S2")); int numCommonO1S2=0; while (i2.hasNext()){ @@ -210,7 +204,7 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener } try { InputStream in = new FileInputStream(location + "section" + j); - int b = 0; + int b; while ((b = in.read(buf)) >= 0) { outFinal.write(buf, 0, b); outFinal.flush(); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionaryDiff.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionaryDiff.java index c1c07631..4c41b3dd 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionaryDiff.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionaryDiff.java @@ -4,17 +4,28 @@ import org.rdfhdt.hdt.compact.bitmap.ModifiableBitmap; import org.rdfhdt.hdt.dictionary.Dictionary; import org.rdfhdt.hdt.dictionary.DictionaryDiff; -import org.rdfhdt.hdt.dictionary.impl.utilCat.*; +import org.rdfhdt.hdt.dictionary.impl.utilCat.CatElement; +import org.rdfhdt.hdt.dictionary.impl.utilCat.CatIntersection; +import org.rdfhdt.hdt.dictionary.impl.utilCat.CatMapping; +import org.rdfhdt.hdt.dictionary.impl.utilCat.CatUnion; +import org.rdfhdt.hdt.dictionary.impl.utilCat.SectionUtil; import org.rdfhdt.hdt.dictionary.impl.utilDiff.DiffWrapper; -import org.rdfhdt.hdt.hdt.HDTVocabulary; import org.rdfhdt.hdt.listener.ProgressListener; import org.rdfhdt.hdt.options.ControlInfo; import org.rdfhdt.hdt.options.ControlInformation; -import java.io.*; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; import java.nio.file.Files; import java.nio.file.Paths; -import java.util.*; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; public class FourSectionDictionaryDiff implements DictionaryDiff { @@ -29,7 +40,7 @@ public FourSectionDictionaryDiff(String location) { } @Override - public void diff(Dictionary dictionary, Map bitmaps, ProgressListener listener) { + public void diff(Dictionary dictionary, Map bitmaps, ProgressListener listener) throws IOException { allMappings.put("predicate", new CatMapping(location, "predicate", dictionary.getPredicates().getNumberOfElements())); allMappings.put("subject", new CatMapping(location, "subject", dictionary.getSubjects().getNumberOfElements())); allMappings.put("object", new CatMapping(location, "object", dictionary.getObjects().getNumberOfElements())); @@ -60,11 +71,7 @@ public void diff(Dictionary dictionary, Map bitmaps, P listSkipSubj.add(itSkipSubs); SharedWrapper sharedWrapper = new SharedWrapper(0, bitmaps.get("SH_S"), bitmaps.get("SH_O"), dictionary.getShared().getSortedEntries()); - long numNewSubj = 0; - while (sharedWrapper.hasNext()) { - sharedWrapper.next(); - numNewSubj++; - } + long numNewSubj = sharedWrapper.count(); sharedWrapper = new SharedWrapper(0, bitmaps.get("SH_S"), bitmaps.get("SH_O"), dictionary.getShared().getSortedEntries()); listSkipSubj.add(sharedWrapper); @@ -84,11 +91,7 @@ public void diff(Dictionary dictionary, Map bitmaps, P // flag = 1 for objects sharedWrapper = new SharedWrapper(1, bitmaps.get("SH_S"), bitmaps.get("SH_O"), dictionary.getShared().getSortedEntries()); - long numNewObj = 0; - while (sharedWrapper.hasNext()) { - numNewObj++; - sharedWrapper.next(); - } + long numNewObj = sharedWrapper.count(); sharedWrapper = new SharedWrapper(1, bitmaps.get("SH_S"), bitmaps.get("SH_O"), dictionary.getShared().getSortedEntries()); listSkipObjs.add(sharedWrapper); @@ -215,6 +218,15 @@ public boolean hasNext() { public CatElement next() { return next; } + + public int count() { + int i = 0; + while (hasNext()) { + // next(); + i++; + } + return i; + } } @Override diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryCat.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryCat.java index d9b692bb..e65c6a99 100755 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryCat.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryCat.java @@ -47,20 +47,20 @@ public class MultipleSectionDictionaryCat implements DictionaryCat { - private String location; - private int DEFAULT_BLOCK_SIZE = 16; - private int BLOCK_PER_BUFFER = 1000000; + private static final int DEFAULT_BLOCK_SIZE = 16; + private static final int BLOCK_PER_BUFFER = 1000000; + private static final String NO_DT_OBJECTS = "NO_DATATYPE"; + private final String location; private long numShared; - private HashMap allMappings = new HashMap<>(); + private final HashMap allMappings = new HashMap<>(); private CatMappingBack mappingS; - private String NO_DT_OBJECTS = "NO_DATATYPE"; public MultipleSectionDictionaryCat(String location) { this.location = location; } - public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener listener){ + public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener listener) throws IOException { // Initialize all mappings ...... @@ -73,27 +73,26 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener allMappings.put("O2",new CatMapping(location, "O2",dictionary2.getNAllObjects())); allMappings.put("SH1",new CatMapping(location,"SH1",dictionary1.getShared().getNumberOfElements())); allMappings.put("SH2",new CatMapping(location,"SH2",dictionary2.getShared().getNumberOfElements())); - Iterator hmIterator1 = dictionary1.getAllObjects().entrySet().iterator(); + Iterator> hmIterator1 = dictionary1.getAllObjects().entrySet().iterator(); int countSubSections1 = 0; int countSubSections2 = 0; while (hmIterator1.hasNext()){ - Map.Entry entry = (Map.Entry)hmIterator1.next(); + Map.Entry entry = hmIterator1.next(); String prefix = "sub"+countSubSections1; if((entry.getKey()).equals(NO_DT_OBJECTS)) - prefix = (String)entry.getKey(); + prefix = entry.getKey(); allMappings.put(prefix+"1",new CatMapping(location,prefix+"1", - ((DictionarySection)entry.getValue()).getNumberOfElements())); + entry.getValue().getNumberOfElements())); countSubSections1++; } - Iterator hmIterator2 = dictionary2.getAllObjects().entrySet().iterator(); + Iterator> hmIterator2 = dictionary2.getAllObjects().entrySet().iterator(); while (hmIterator2.hasNext()){ - Map.Entry entry = (Map.Entry)hmIterator2.next(); + Map.Entry entry = hmIterator2.next(); String prefix = "sub"+countSubSections2; if((entry.getKey()).equals(NO_DT_OBJECTS)) - prefix = (String)entry.getKey(); - allMappings.put(prefix+"2",new CatMapping(location,prefix+"2", - ((DictionarySection)entry.getValue()).getNumberOfElements())); + prefix = entry.getKey(); + allMappings.put(prefix+"2",new CatMapping(location,prefix+"2", entry.getValue().getNumberOfElements())); countSubSections2++; } @@ -104,7 +103,6 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener int numCommonPredicates = 0; CatIntersection commonP1P2 = new CatIntersection(new CatWrapper(dictionary1.getPredicates().getSortedEntries(),"P1"), new CatWrapper(dictionary2.getPredicates().getSortedEntries(),"P2")); - long maxPredicates = dictionary1.getPredicates().getNumberOfElements()+dictionary2.getPredicates().getNumberOfElements(); while (commonP1P2.hasNext()){ commonP1P2.next(); numCommonPredicates++; @@ -210,16 +208,16 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener if(hmIterator1.hasNext()){ if(!skip1) { - Map.Entry entry1 = (Map.Entry) hmIterator1.next(); - section1 = (DictionarySection)entry1.getValue(); - dataType1 = entry1.getKey().toString(); + Map.Entry entry1 = hmIterator1.next(); + section1 = entry1.getValue(); + dataType1 = entry1.getKey(); } } if(hmIterator2.hasNext()){ if(!skip2){ - Map.Entry entry2 = (Map.Entry)hmIterator2.next(); - section2 = (DictionarySection)entry2.getValue(); - dataType2 = entry2.getKey().toString(); + Map.Entry entry2 = hmIterator2.next(); + section2 = entry2.getValue(); + dataType2 = entry2.getKey(); } } if(section1 != null && section2 != null && dataType1.equals(dataType2)) { @@ -298,9 +296,9 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener ArrayList> countObjectsList = new ArrayList<>(); if(hmIterator1.hasNext()){ if(!skip1) { - Map.Entry entry = (Map.Entry) hmIterator1.next(); - dataType1 = (String) entry.getKey(); - section1 = ((DictionarySection) entry.getValue()); + Map.Entry entry = hmIterator1.next(); + dataType1 = entry.getKey(); + section1 = entry.getValue(); prefix1 = "sub" + countSubSections1; if (dataType1.equals(NO_DT_OBJECTS)) prefix1 = dataType1; @@ -309,9 +307,9 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener } if(hmIterator2.hasNext()){ if(!skip2) { - Map.Entry entry = (Map.Entry) hmIterator2.next(); - dataType2 = (String) entry.getKey(); - section2 = ((DictionarySection) entry.getValue()); + Map.Entry entry = hmIterator2.next(); + dataType2 = entry.getKey(); + section2 = entry.getValue(); prefix2 = "sub" + countSubSections2; if (dataType2.equals(NO_DT_OBJECTS)) prefix2 = dataType2; @@ -427,7 +425,6 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener numCommonS1O2++; } } - Iterator it = dictionary2.getSubjects().getSortedEntries(); int numCommonO1S2 = 0; if(dictionary1.getAllObjects().containsKey(NO_DT_OBJECTS)) { CatIntersection i2 = new CatIntersection(new CatWrapper(dictionary1.getAllObjects().get(NO_DT_OBJECTS).getSortedEntries(), NO_DT_OBJECTS + "1"), new CatWrapper(dictionary2.getSubjects().getSortedEntries(), "S2")); @@ -484,7 +481,7 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener } } InputStream in = new FileInputStream(location + "section" + i); - int b = 0; + int b; while ((b = in.read(buf)) >= 0) { outFinal.write(buf, 0, b); outFinal.flush(); @@ -505,8 +502,8 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener countSubSections1 = 0; countSubSections2 = 0; while (hmIterator1.hasNext()){ - Map.Entry entry = (Map.Entry)hmIterator1.next(); - String dataType = (String)entry.getKey(); + Map.Entry entry = hmIterator1.next(); + String dataType = entry.getKey(); String prefix = "sub"+countSubSections1; if(dataType.equals(NO_DT_OBJECTS)) prefix = dataType+"1"; @@ -529,8 +526,8 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener oldId = 0; hmIterator2 = dictionary2.getAllObjects().entrySet().iterator(); while (hmIterator2.hasNext()){ - Map.Entry entry = (Map.Entry)hmIterator2.next(); - String dataType = (String)entry.getKey(); + Map.Entry entry = hmIterator2.next(); + String dataType = entry.getKey(); String prefix = "sub"+countSubSections2; if(dataType.equals(NO_DT_OBJECTS)) prefix = dataType+"2"; @@ -548,8 +545,6 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener } } } - //printMappings(); - //System.out.println("Num shared: "+numShared); //calculate the inverse mapping for the subjects, i.e. from the new dictionary subject section to the old ones mappingS = new CatMappingBack(location,numSubjects+numShared); @@ -577,43 +572,30 @@ public void cat(Dictionary dictionary1, Dictionary dictionary2, ProgressListener } } } - public void printMappings(){ - Iterator iterMap = allMappings.entrySet().iterator(); - while (iterMap.hasNext()){ - Map.Entry entry = (Map.Entry)iterMap.next(); - CatMapping mapping = (CatMapping)entry.getValue(); - System.out.println(entry.getKey()); - for(int i=0;i mappings, ProgressListener listener) { - CRCOutputStream out_buffer = null; - long numberElements = 0; - try { - out_buffer = new CRCOutputStream(new FileOutputStream(location+"section_buffer_"+type), new CRC32()); - } catch (FileNotFoundException e) { - e.printStackTrace(); + private void catSection(long numEntries, int type, CatUnion itAdd , CatUnion itSkip , HashMap mappings, ProgressListener listener) throws IOException { + long numberElements = 0; + String name; + switch (type) { + case 2: + name = "subject"; + break; + case 3: + name = "object"; + break; + case 4: + name = "predicate"; + break; + default: + name = ""; + break; } - - try { - String name = ""; - switch (type) { - case 2: - name = "subject"; - break; - case 3: - name = "object"; - break; - case 4: - name = "predicate"; - } - long storedBuffersSize = 0; - long numBlocks = 0; - SequenceLog64BigDisk blocks = new SequenceLog64BigDisk(location+"SequenceLog64BigDisk"+type,64, numEntries/16); - ByteArrayOutputStream byteOut = new ByteArrayOutputStream(16*1024); + long storedBuffersSize = 0; + long numBlocks = 0; + SequenceLog64BigDisk blocks; + ByteArrayOutputStream byteOut; + try (CRCOutputStream outBuffer = new CRCOutputStream(new FileOutputStream(location+"section_buffer_"+type), new CRC32())) { + blocks = new SequenceLog64BigDisk(location+"SequenceLog64BigDisk"+type,64, numEntries/16); + byteOut = new ByteArrayOutputStream(16*1024); if (numEntries > 0) { CharSequence previousStr=null; @@ -624,32 +606,29 @@ public long catSection(long numEntries, int type, CatUnion itAdd , CatUnion itSk while (itAdd.hasNext()){ ListenerUtil.notifyCond(listener, "Analyze section "+name+" ", numberElements, numberElements, numEntries); CatElement nextElement = itAdd.next(); - Boolean skip = false; - if(skipElement!= null && nextElement.entity.toString().equals(skipElement.entity.toString())) - skip = true; - else { + if (skipElement!= null && nextElement.entity.toString().equals(skipElement.entity.toString())) { + if(itSkip.hasNext()) + skipElement = itSkip.next(); + else + skipElement = null; + } else { for (int i = 0; i < nextElement.IDs.size(); i++) { long id = nextElement.IDs.get(i).pos; String iter = nextElement.IDs.get(i).iter.toString(); mappings.get(iter).set(id - 1, numberElements + 1, type); } - } - if(skip){ - if(itSkip.hasNext()) - skipElement = itSkip.next(); - else - skipElement = null; - }else{ + String str = nextElement.entity.toString(); if (numberElements % DEFAULT_BLOCK_SIZE == 0) { blocks.append(storedBuffersSize + byteOut.size()); numBlocks++; // if a buffer is filled, flush the byteOut and store it - if (((numBlocks - 1) % BLOCK_PER_BUFFER == 0) && ((numBlocks - 1) / BLOCK_PER_BUFFER != 0)) { + if (((numBlocks - 1) % BLOCK_PER_BUFFER == 0) && ((numBlocks - 1) / BLOCK_PER_BUFFER != 0) || byteOut.size() > 200000) { storedBuffersSize += byteOut.size(); byteOut.flush(); - IOUtil.writeBuffer(out_buffer, byteOut.toByteArray(), 0, byteOut.toByteArray().length, null); + byte[] array = byteOut.toByteArray(); + IOUtil.writeBuffer(outBuffer, array, 0, array.length, null); byteOut.close(); byteOut = new ByteArrayOutputStream(16 * 1024); } @@ -676,39 +655,35 @@ public long catSection(long numEntries, int type, CatUnion itAdd , CatUnion itSk blocks.aggressiveTrimToSize(); byteOut.flush(); //section.addBuffer(buffer, byteOut.toByteArray()); - IOUtil.writeBuffer(out_buffer, byteOut.toByteArray(), 0, byteOut.toByteArray().length, null); - out_buffer.writeCRC(); - out_buffer.close(); + IOUtil.writeBuffer(outBuffer, byteOut.toByteArray(), 0, byteOut.toByteArray().length, null); + outBuffer.writeCRC(); + } //Save the section conforming to the HDT format - CRCOutputStream out = new CRCOutputStream(new FileOutputStream(location+"section"+type), new CRC8()); + try (CRCOutputStream out = new CRCOutputStream(new FileOutputStream(location+"section"+type), new CRC8())) { //write the index type out.write(2); //write the number of strings VByte.encode(out, numberElements); //write the datasize - VByte.encode(out, storedBuffersSize+byteOut.size()); + VByte.encode(out, storedBuffersSize + byteOut.size()); //wirte the blocksize VByte.encode(out, DEFAULT_BLOCK_SIZE); //write CRC out.writeCRC(); //write the blocks - blocks.save(out, null); // Write blocks directly to output, they have their own CRC check. + blocks.save(out, null); // Write blocks directly to output, they have their own CRC check. blocks.close(); //write out_buffer byte[] buf = new byte[100000]; - InputStream in = new FileInputStream(location+"section_buffer_"+type); - int b = 0; - while ( (b = in.read(buf)) >= 0) { + InputStream in = new FileInputStream(location + "section_buffer_" + type); + int b; + while ((b = in.read(buf)) >= 0) { out.write(buf, 0, b); out.flush(); } - out.close(); - Files.delete(Paths.get(location+"section_buffer_"+type)); - Files.delete(Paths.get(location+"SequenceLog64BigDisk"+type)); - } catch (IOException e) { - e.printStackTrace(); } - return numberElements; + Files.deleteIfExists(Paths.get(location+"section_buffer_"+type)); + Files.deleteIfExists(Paths.get(location+"SequenceLog64BigDisk"+type)); } public CatMappingBack getMappingS() { diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryDiff.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryDiff.java index bdf6c8ab..c8691ba6 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryDiff.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/MultipleSectionDictionaryDiff.java @@ -5,15 +5,22 @@ import org.rdfhdt.hdt.dictionary.Dictionary; import org.rdfhdt.hdt.dictionary.DictionaryDiff; import org.rdfhdt.hdt.dictionary.DictionarySection; -import org.rdfhdt.hdt.dictionary.impl.utilCat.*; +import org.rdfhdt.hdt.dictionary.impl.utilCat.CatElement; +import org.rdfhdt.hdt.dictionary.impl.utilCat.CatIntersection; +import org.rdfhdt.hdt.dictionary.impl.utilCat.CatMapping; +import org.rdfhdt.hdt.dictionary.impl.utilCat.CatUnion; +import org.rdfhdt.hdt.dictionary.impl.utilCat.SectionUtil; import org.rdfhdt.hdt.dictionary.impl.utilDiff.DiffWrapper; -import org.rdfhdt.hdt.hdt.HDTVocabulary; import org.rdfhdt.hdt.listener.ProgressListener; import org.rdfhdt.hdt.options.ControlInfo; import org.rdfhdt.hdt.options.ControlInformation; import org.rdfhdt.hdt.util.io.IOUtil; -import java.io.*; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; import java.nio.file.Files; import java.nio.file.Paths; import java.util.ArrayList; @@ -23,8 +30,8 @@ public class MultipleSectionDictionaryDiff implements DictionaryDiff { - private String location; - private HashMap allMappings = new HashMap<>(); + private final String location; + private final HashMap allMappings = new HashMap<>(); private CatMapping mappingBack; public long numShared; public MultipleSectionDictionaryDiff(String location){ @@ -82,11 +89,7 @@ public void diff(Dictionary dictionary, Map bitmaps, P SharedWrapper sharedWrapper = new SharedWrapper(0, bitmaps.get("SH_S"), bitmaps.get("SH_O"), dictionary.getShared().getSortedEntries()); - long numNewSubj = 0; - while (sharedWrapper.hasNext()){ - numNewSubj++; - sharedWrapper.next(); - } + long numNewSubj = sharedWrapper.count(); sharedWrapper = new SharedWrapper(0, bitmaps.get("SH_S"), bitmaps.get("SH_O"), dictionary.getShared().getSortedEntries()); listSkipSubj.add(sharedWrapper); @@ -235,7 +238,7 @@ public void diff(Dictionary dictionary, Map bitmaps, P } } - private long createNoDataTypeSection(Map bitmaps,Dictionary dictionary,long numObjectsAlreadyAdded,int type){ + private long createNoDataTypeSection(Map bitmaps,Dictionary dictionary,long numObjectsAlreadyAdded,int type) throws IOException { Bitmap objectsBitMap = bitmaps.get("NO_DATATYPE"); Iterator objects = dictionary.getAllObjects().get("NO_DATATYPE").getSortedEntries(); @@ -246,11 +249,7 @@ private long createNoDataTypeSection(Map bitmaps,Dicti // flag = 1 for objects SharedWrapper sharedWrapper = new SharedWrapper(1, bitmaps.get("SH_S"), bitmaps.get("SH_O"),dictionary.getShared().getSortedEntries()); - long numNewObj = 0; - while (sharedWrapper.hasNext()){ - numNewObj++; - sharedWrapper.next(); - } + long numNewObj = sharedWrapper.count(); sharedWrapper = new SharedWrapper(1, bitmaps.get("SH_S"), bitmaps.get("SH_O"),dictionary.getShared().getSortedEntries()); listSkipObjs.add(sharedWrapper); @@ -259,7 +258,7 @@ private long createNoDataTypeSection(Map bitmaps,Dicti SectionUtil.createSection(location,numObject,type,new CatUnion(listSkipObjs),new CatUnion(new ArrayList<>()),allMappings,numObjectsAlreadyAdded,null); return numObject; } - private class SharedWrapper implements Iterator { + private static class SharedWrapper implements Iterator { private final Bitmap bitmapSub; private final Bitmap bitmapObj; private final Iterator sectionIter; @@ -296,6 +295,15 @@ public boolean hasNext() { public CatElement next() { return next; } + + public int count() { + int i = 0; + while (hasNext()) { + // next(); + i++; + } + return i; + } } public HashMap getAllMappings() { return allMappings; diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/PFCDictionarySectionBig.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/PFCDictionarySectionBig.java index 6199abb9..c22a87f5 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/PFCDictionarySectionBig.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/section/PFCDictionarySectionBig.java @@ -34,6 +34,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; +import java.nio.file.Files; import java.util.Iterator; import org.rdfhdt.hdt.compact.integer.VByte; @@ -50,7 +51,7 @@ import org.rdfhdt.hdt.util.crc.CRC8; import org.rdfhdt.hdt.util.crc.CRCInputStream; import org.rdfhdt.hdt.util.crc.CRCOutputStream; -import org.rdfhdt.hdt.util.io.IOUtil; +import org.rdfhdt.hdt.util.io.BigByteBuffer; import org.rdfhdt.hdt.util.string.ByteStringUtil; import org.rdfhdt.hdt.util.string.CompactString; import org.rdfhdt.hdt.util.string.ReplazableString; @@ -74,11 +75,11 @@ public class PFCDictionarySectionBig implements DictionarySectionPrivate { public static final int DEFAULT_BLOCK_SIZE = 16; public static final int BLOCK_PER_BUFFER = 1000000; - byte [][] data; + BigByteBuffer[] data; long [] posFirst; protected SequenceLog64Big blocks; protected int blocksize; - protected int numstrings; + protected long numstrings; protected long size; static int filecounter = 0; @@ -97,7 +98,7 @@ public void load(TempDictionarySection other, ProgressListener listener) { this.blocks = new SequenceLog64Big(BitUtil.log2(other.size()), other.getNumberOfElements()/blocksize); log.info("numbits:{}", BitUtil.log2(other.size())); Iterator it = other.getSortedEntries(); - this.load((Iterator)it, other.getNumberOfElements(), listener); + this.load(it, other.getNumberOfElements(), listener); } @@ -176,7 +177,7 @@ public void load(Iterator it, long numentries, ProgressL long numBlocks = blocks.getNumberOfElements(); //System.out.println("numblocks:"+numBlocks); - long numBuffers = -1; + long numBuffers; if(numBlocks > 0){ // non empty section numBuffers = 1+numBlocks/BLOCK_PER_BUFFER; @@ -184,15 +185,17 @@ public void load(Iterator it, long numentries, ProgressL // else empty section then it's zero numBuffers = 0; } - data = new byte[(int)numBuffers][]; + data = new BigByteBuffer[(int)numBuffers]; posFirst = new long[(int)numBuffers]; while(block it, long numentries, ProgressL finally { try { out.close(); - file.delete(); + Files.delete(file.toPath()); } catch (IOException e) { log.error("Unexpected exception.", e); } @@ -224,19 +227,19 @@ public void load(Iterator it, long numentries, ProgressL /** * Locate the block of a string doing binary search. */ - protected int locateBlock(CharSequence str) { - int low = 0; - int high = (int)blocks.getNumberOfElements() - 1; - int max = high; + protected long locateBlock(CharSequence str) { + long low = 0; + long high = blocks.getNumberOfElements() - 1; + long max = high; while (low <= high) { - int mid = (low + high) >>> 1; + long mid = (low + high) >>> 1; int cmp; if(mid==max) { cmp = -1; } else { - cmp = ByteStringUtil.strcmp(str, data[mid/BLOCK_PER_BUFFER], (int)(blocks.get(mid)-posFirst[mid/BLOCK_PER_BUFFER])); + cmp = ByteStringUtil.strcmp(str, data[(int)(mid/BLOCK_PER_BUFFER)], blocks.get(mid)-posFirst[(int)(mid/BLOCK_PER_BUFFER)]); //System.out.println("Comparing against block: "+ mid + " which is "+ ByteStringUtil.asString(data[mid], 0)+ " Result: "+cmp); } @@ -278,30 +281,30 @@ public long locate(CharSequence str) { return 0; } - protected int locateInBlock(long blocknum, CharSequence str) { + protected long locateInBlock(long blocknum, CharSequence str) { ReplazableString tempString = new ReplazableString(); Mutable delta = new Mutable<>(0L); - int idInBlock = 0; + long idInBlock = 0; int cshared=0; - - byte [] block = data[(int) (blocknum/BLOCK_PER_BUFFER)]; - int pos = (int) (blocks.get(blocknum)-posFirst[(int) (blocknum/BLOCK_PER_BUFFER)]); + + BigByteBuffer block = data[(int) (blocknum/BLOCK_PER_BUFFER)]; + long pos = (blocks.get(blocknum)-posFirst[(int) (blocknum/BLOCK_PER_BUFFER)]); // Read the first string in the block - int slen = ByteStringUtil.strlen(block, pos); + int slen = (int) ByteStringUtil.strlen(block, pos); tempString.append(block, pos, slen); pos+=slen+1; idInBlock++; - while( (idInBlock=cshared) @@ -326,7 +329,7 @@ protected int locateInBlock(long blocknum, CharSequence str) { } // Not found - if(pos==block.length || idInBlock== blocksize) { + if(pos==block.size() || idInBlock== blocksize) { idInBlock=0; } @@ -346,22 +349,22 @@ public CharSequence extract(long id) { // Locate block long blockid = (id-1)/blocksize; long nstring = (id-1)%blocksize; - - byte [] block = data[(int) (blockid/BLOCK_PER_BUFFER)]; - int pos = (int) (blocks.get(blockid)-posFirst[(int) (blockid/BLOCK_PER_BUFFER)]); + + BigByteBuffer block = data[(int) (blockid/BLOCK_PER_BUFFER)]; + long pos = (blocks.get(blockid)-posFirst[(int) (blockid/BLOCK_PER_BUFFER)]); // Copy first string - int len = ByteStringUtil.strlen(block, pos); + int len = (int) ByteStringUtil.strlen(block, pos); Mutable delta = new Mutable<>(0L); ReplazableString tempString = new ReplazableString(); tempString.append(block, pos, len); // Copy strings until we find our's. - for(int i=0;i getSortedEntries() { - return new Iterator() { - int pos; + return new Iterator<>() { + long pos; @Override public boolean hasNext() { @@ -421,10 +424,9 @@ public void save(OutputStream output, ProgressListener listener) throws IOExcept VByte.encode(out, numstrings); long datasize=0; - - for (int i =0; i=blocks.getNumberOfElements()) { return 0; } ReplazableString tempString = new ReplazableString(); - int idInBlock = 0; + long idInBlock = 0; int cshared=0; // dumpBlock(block); BigMappedByteBuffer buffer = buffers[(int) (block/BLOCKS_PER_BYTEBUFFER)].duplicate(); - buffer.position((int)(blocks.get(block)-posFirst[(int) (block/BLOCKS_PER_BYTEBUFFER)])); + buffer.position(blocks.get(block)-posFirst[(int) (block/BLOCKS_PER_BYTEBUFFER)]); try { if(!buffer.hasRemaining()) { @@ -279,7 +270,7 @@ public CharSequence extract(long id) { long block = (id-1)/blocksize; BigMappedByteBuffer buffer = buffers[(int) (block/BLOCKS_PER_BYTEBUFFER)].duplicate(); - buffer.position((int)(blocks.get(block)-posFirst[(int) (block/BLOCKS_PER_BYTEBUFFER)])); + buffer.position(blocks.get(block)-posFirst[(int) (block/BLOCKS_PER_BYTEBUFFER)]); try { ReplazableString tempString = new ReplazableString(); @@ -318,7 +309,7 @@ public Iterator getSortedEntries() { if (buffers[0]==null){ return Collections.emptyIterator(); } else { - return new Iterator() { + return new Iterator<>() { long id = 0; final ReplazableString tempString = new ReplazableString(); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/CatUnion.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/CatUnion.java index d1b32eb5..e11ebc63 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/CatUnion.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/CatUnion.java @@ -15,11 +15,11 @@ public class CatUnion implements Iterator { ArrayList list; - private List> listIters; + private final List> listIters; public CatUnion(List> listIters) { - this.list = new ArrayList(); + this.list = new ArrayList<>(); this.listIters = new ArrayList<>(listIters); int count = 0; for (Iterator iter : this.listIters) { @@ -34,19 +34,14 @@ public CatUnion(List> listIters) { @Override public boolean hasNext() { - if (list.size() > 0) { - return true; - } else { - return false; - } + return list.size() > 0; } @Override public CatElement next() { - CharSequence element = null; ArrayList ids = new ArrayList<>(); list.sort(new IteratorPlusElementComparator()); - element = list.get(0).element.entity; + CharSequence element = list.get(0).element.entity; CompactString elementCompactString = new CompactString(element); ListIterator iteratorPlusElementIterator = list.listIterator(); while (iteratorPlusElementIterator.hasNext()) { diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/SectionUtil.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/SectionUtil.java index 9f018e67..8805809f 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/SectionUtil.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/dictionary/impl/utilCat/SectionUtil.java @@ -17,76 +17,67 @@ public class SectionUtil { - - private static int DEFAULT_BLOCK_SIZE = 16; - private static int BLOCK_PER_BUFFER = 1000000; + private static final int DEFAULT_BLOCK_SIZE = 16; + private static final int BLOCK_PER_BUFFER = 1000000; public static void createSection(String location,long numEntries, int type, CatUnion itAdd , - CatUnion itSkip , HashMap mappings,long offset, ProgressListener listener) { - CRCOutputStream out_buffer = null; - try { - out_buffer = new CRCOutputStream(new FileOutputStream(location+"section_buffer_"+type), new CRC32()); - } catch (FileNotFoundException e) { - e.printStackTrace(); + CatUnion itSkip , HashMap mappings,long offset, ProgressListener listener) throws IOException { + String name = ""; + switch (type) { + case 2: + name = "subject"; + break; + case 3: + name = "object"; + break; + case 4: + name = "predicate"; } - - try { - String name = ""; - switch (type) { - case 2: - name = "subject"; - break; - case 3: - name = "object"; - break; - case 4: - name = "predicate"; - } - long storedBuffersSize = 0; - long numBlocks = 0; - long numberElements = 0; - SequenceLog64BigDisk blocks = new SequenceLog64BigDisk(location+"SequenceLog64BigDisk"+type,64, numEntries/16); - ByteArrayOutputStream byteOut = new ByteArrayOutputStream(16*1024); + long storedBuffersSize = 0; + long numBlocks = 0; + long numberElements = 0; + SequenceLog64BigDisk blocks; + ByteArrayOutputStream byteOut; + try (CRCOutputStream outBuffer = new CRCOutputStream(new FileOutputStream(location+"section_buffer_"+type), new CRC32())) { + blocks = new SequenceLog64BigDisk(location + "SequenceLog64BigDisk" + type, 64, numEntries / 16); + byteOut = new ByteArrayOutputStream(16 * 1024); if (numEntries > 0) { - CharSequence previousStr=null; + CharSequence previousStr = null; CatElement skipElement = null; - if(itSkip.hasNext()){ + if (itSkip.hasNext()) { skipElement = itSkip.next(); } - while (itAdd.hasNext()){ - ListenerUtil.notifyCond(listener, "Analyze section "+name+" ", numberElements, numberElements, numEntries); + while (itAdd.hasNext()) { + ListenerUtil.notifyCond(listener, "Analyze section " + name + " ", numberElements, numberElements, numEntries); CatElement nextElement = itAdd.next(); - Boolean skip = false; - if(skipElement!= null && nextElement.entity.toString().equals(skipElement.entity.toString())) - skip = true; - else { + if (skipElement != null && nextElement.entity.toString().equals(skipElement.entity.toString())) { + if (itSkip.hasNext()) { + skipElement = itSkip.next(); + } else { + skipElement = null; + } + } else { for (int i = 0; i < nextElement.IDs.size(); i++) { long id = nextElement.IDs.get(i).pos; String iter = nextElement.IDs.get(i).iter.toString(); - if(iter.equals("shared")) - mappings.get(iter).set(id - 1, offset+numberElements + 1, type); + if (iter.equals("shared")) + mappings.get(iter).set(id - 1, offset + numberElements + 1, type); else mappings.get(iter).set(id - 1, numberElements + 1, type); } - } - if(skip){ - if(itSkip.hasNext()) - skipElement = itSkip.next(); - else - skipElement = null; - }else{ String str = nextElement.entity.toString(); if (numberElements % DEFAULT_BLOCK_SIZE == 0) { blocks.append(storedBuffersSize + byteOut.size()); numBlocks++; // if a buffer is filled, flush the byteOut and store it - if (((numBlocks - 1) % BLOCK_PER_BUFFER == 0) && ((numBlocks - 1) / BLOCK_PER_BUFFER != 0) || byteOut.size()>200000) { + if (((numBlocks - 1) % BLOCK_PER_BUFFER == 0) && ((numBlocks - 1) / BLOCK_PER_BUFFER != 0) || byteOut.size() > 200000) { storedBuffersSize += byteOut.size(); byteOut.flush(); - IOUtil.writeBuffer(out_buffer, byteOut.toByteArray(), 0, byteOut.toByteArray().length, null); + byte[] arr = byteOut.toByteArray(); + IOUtil.writeBuffer(outBuffer, arr, 0, arr.length, null); byteOut.close(); byteOut = new ByteArrayOutputStream(16 * 1024); } @@ -108,46 +99,41 @@ public static void createSection(String location,long numEntries, int type, CatU } } // Ending block pointer. - blocks.append(storedBuffersSize+byteOut.size()); + blocks.append(storedBuffersSize + byteOut.size()); // Trim text/blocks blocks.aggressiveTrimToSize(); byteOut.flush(); //section.addBuffer(buffer, byteOut.toByteArray()); - IOUtil.writeBuffer(out_buffer, byteOut.toByteArray(), 0, byteOut.toByteArray().length, null); - out_buffer.writeCRC(); - out_buffer.close(); - //Save the section conforming to the HDT format - CRCOutputStream out = new CRCOutputStream(new FileOutputStream(location+"section"+type), new CRC8()); + byte[] arr = byteOut.toByteArray(); + IOUtil.writeBuffer(outBuffer, arr, 0, arr.length, null); + outBuffer.writeCRC(); + } + //Save the section conforming to the HDT format + try (CRCOutputStream out = new CRCOutputStream(new FileOutputStream(location + "section" + type), new CRC8())) { //write the index type out.write(2); //write the number of strings VByte.encode(out, numberElements); //write the datasize - VByte.encode(out, storedBuffersSize+byteOut.size()); + VByte.encode(out, storedBuffersSize + byteOut.size()); //wirte the blocksize VByte.encode(out, DEFAULT_BLOCK_SIZE); //write CRC out.writeCRC(); //write the blocks - blocks.save(out, null); // Write blocks directly to output, they have their own CRC check. + blocks.save(out, null); // Write blocks directly to output, they have their own CRC check. blocks.close(); //write out_buffer byte[] buf = new byte[100000]; - InputStream in = new FileInputStream(location+"section_buffer_"+type); - int b = 0; - while ( (b = in.read(buf)) >= 0) { + InputStream in = new FileInputStream(location + "section_buffer_" + type); + int b; + while ((b = in.read(buf)) >= 0) { out.write(buf, 0, b); out.flush(); } - out.close(); - try { - Files.delete(Paths.get(location + "section_buffer_" + type)); - Files.delete(Paths.get(location + "SequenceLog64BigDisk" + type)); - } catch (Exception e) { - // swallow this exception intentionally. See javadoc on Files.delete for details. - } - } catch (IOException e) { - e.printStackTrace(); } + + Files.deleteIfExists(Paths.get(location + "section_buffer_" + type)); + Files.deleteIfExists(Paths.get(location + "SequenceLog64BigDisk" + type)); } } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java index 9a8a509d..9671074b 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/HDTImpl.java @@ -28,27 +28,22 @@ package org.rdfhdt.hdt.hdt.impl; -import java.io.BufferedInputStream; -import java.io.BufferedOutputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.nio.file.Files; -import java.nio.file.Paths; -import java.util.Date; -import java.util.Iterator; -import java.util.Map; -import java.util.zip.GZIPInputStream; - import org.rdfhdt.hdt.compact.bitmap.Bitmap; import org.rdfhdt.hdt.compact.bitmap.BitmapFactory; import org.rdfhdt.hdt.compact.bitmap.ModifiableBitmap; -import org.rdfhdt.hdt.dictionary.*; -import org.rdfhdt.hdt.dictionary.impl.*; +import org.rdfhdt.hdt.dictionary.Dictionary; +import org.rdfhdt.hdt.dictionary.DictionaryCat; +import org.rdfhdt.hdt.dictionary.DictionaryDiff; +import org.rdfhdt.hdt.dictionary.DictionaryFactory; +import org.rdfhdt.hdt.dictionary.DictionaryPrivate; +import org.rdfhdt.hdt.dictionary.DictionarySection; +import org.rdfhdt.hdt.dictionary.TempDictionary; +import org.rdfhdt.hdt.dictionary.impl.FourSectionDictionary; +import org.rdfhdt.hdt.dictionary.impl.FourSectionDictionaryBig; +import org.rdfhdt.hdt.dictionary.impl.FourSectionDictionaryCat; +import org.rdfhdt.hdt.dictionary.impl.MultipleSectionDictionary; +import org.rdfhdt.hdt.dictionary.impl.MultipleSectionDictionaryBig; +import org.rdfhdt.hdt.dictionary.impl.MultipleSectionDictionaryCat; import org.rdfhdt.hdt.enums.ResultEstimationType; import org.rdfhdt.hdt.enums.TripleComponentRole; import org.rdfhdt.hdt.exceptions.IllegalFormatException; @@ -69,8 +64,20 @@ import org.rdfhdt.hdt.options.ControlInformation; import org.rdfhdt.hdt.options.HDTOptions; import org.rdfhdt.hdt.options.HDTSpecification; -import org.rdfhdt.hdt.triples.*; -import org.rdfhdt.hdt.triples.impl.*; +import org.rdfhdt.hdt.triples.DictionaryEntriesDiff; +import org.rdfhdt.hdt.triples.IteratorTripleID; +import org.rdfhdt.hdt.triples.IteratorTripleString; +import org.rdfhdt.hdt.triples.TempTriples; +import org.rdfhdt.hdt.triples.TripleID; +import org.rdfhdt.hdt.triples.TripleString; +import org.rdfhdt.hdt.triples.Triples; +import org.rdfhdt.hdt.triples.TriplesFactory; +import org.rdfhdt.hdt.triples.TriplesPrivate; +import org.rdfhdt.hdt.triples.impl.BitmapTriples; +import org.rdfhdt.hdt.triples.impl.BitmapTriplesCat; +import org.rdfhdt.hdt.triples.impl.BitmapTriplesIteratorCat; +import org.rdfhdt.hdt.triples.impl.BitmapTriplesIteratorDiff; +import org.rdfhdt.hdt.triples.impl.BitmapTriplesIteratorMapDiff; import org.rdfhdt.hdt.util.StopWatch; import org.rdfhdt.hdt.util.StringUtil; import org.rdfhdt.hdt.util.io.CountInputStream; @@ -79,6 +86,22 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.Date; +import java.util.Iterator; +import java.util.Map; +import java.util.zip.GZIPInputStream; + /** * Basic implementation of HDT interface * @@ -86,7 +109,7 @@ public class HDTImpl implements HDTPrivate { private static final Logger log = LoggerFactory.getLogger(HDTImpl.class); - private HDTOptions spec; + private final HDTOptions spec; protected HeaderPrivate header; protected DictionaryPrivate dictionary; @@ -504,9 +527,7 @@ public void loadOrCreateIndex(ProgressListener listener) { triples.loadIndex(in, ci, listener); } } catch (Exception e) { - if(e instanceof FileNotFoundException) { -// System.out.println("The .hdt.index doesn't exist, generating a new one."); - } else { + if (!(e instanceof FileNotFoundException)) { System.out.println("Error reading .hdt.index, generating a new one. The error was: "+e.getMessage()); e.printStackTrace(); } @@ -577,7 +598,7 @@ public boolean isMapped() { * @param hdt2 * @param listener */ - public void cat(String location, HDT hdt1, HDT hdt2, ProgressListener listener){ + public void cat(String location, HDT hdt1, HDT hdt2, ProgressListener listener) throws IOException { try { System.out.println("Generating dictionary"); FourSectionDictionaryCat dictionaryCat = new FourSectionDictionaryCat(location); @@ -638,96 +659,88 @@ public void cat(String location, HDT hdt1, HDT hdt2, ProgressListener listener){ System.out.println("Generating header"); this.header = HeaderFactory.createHeader(spec); this.populateHeaderStructure("http://wdaqua.eu/hdtCat/"); - } catch (FileNotFoundException e) { - e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } - public void catCustom(String location, HDT hdt1, HDT hdt2, ProgressListener listener){ - try { - System.out.println("Generating dictionary"); - DictionaryCat dictionaryCat = new MultipleSectionDictionaryCat(location); - dictionaryCat.cat(hdt1.getDictionary(),hdt2.getDictionary(), listener); - //map the generated dictionary - ControlInfo ci2 = new ControlInformation(); - CountInputStream fis = new CountInputStream(new BufferedInputStream(new FileInputStream(location + "dictionary"))); - HDTSpecification spec = new HDTSpecification(); - spec.setOptions("tempDictionary.impl=multHash;dictionary.type=dictionaryMultiObj;"); - MultipleSectionDictionaryBig dictionary = new MultipleSectionDictionaryBig(spec); - fis.mark(1024); - ci2.load(fis); - fis.reset(); - dictionary.mapFromFile(fis, new File(location + "dictionary"),null); - this.dictionary = dictionary; - - System.out.println("Generating triples"); - BitmapTriplesIteratorCat it = new BitmapTriplesIteratorCat(hdt1.getTriples(),hdt2.getTriples(),dictionaryCat); - BitmapTriplesCat bitmapTriplesCat = new BitmapTriplesCat(location); - bitmapTriplesCat.cat(it,listener); - //Delete the mappings since they are not necessary anymore - Iterator iter = hdt1.getDictionary().getAllObjects().entrySet().iterator(); - int countSubSections = 0; - while (iter.hasNext()){ - Map.Entry entry = (Map.Entry)iter.next(); - String dataType = (String)entry.getKey(); - String prefix = "sub"+countSubSections; - if(dataType.equals("NO_DATATYPE")) - prefix = dataType; - Files.delete(Paths.get(location+prefix+"1")); - Files.delete(Paths.get(location+prefix+"1"+"Types")); - countSubSections++; - } - iter = hdt2.getDictionary().getAllObjects().entrySet().iterator(); - countSubSections = 0; - while (iter.hasNext()){ - Map.Entry entry = (Map.Entry)iter.next(); - String dataType = (String)entry.getKey(); - String prefix = "sub"+countSubSections; - if(dataType.equals("NO_DATATYPE")) - prefix = dataType; - Files.delete(Paths.get(location+prefix+"2")); - Files.delete(Paths.get(location+prefix+"2"+"Types")); - countSubSections++; - } - Files.delete(Paths.get(location+"P1")); - Files.delete(Paths.get(location+"P1"+"Types")); - Files.delete(Paths.get(location+"P2")); - Files.delete(Paths.get(location+"P2"+"Types")); - Files.delete(Paths.get(location+"SH1")); - Files.delete(Paths.get(location+"SH1"+"Types")); - Files.delete(Paths.get(location+"SH2")); - Files.delete(Paths.get(location+"SH2"+"Types")); - Files.delete(Paths.get(location+"S1")); - Files.delete(Paths.get(location+"S1"+"Types")); - Files.delete(Paths.get(location+"S2")); - Files.delete(Paths.get(location+"S2"+"Types")); - Files.delete(Paths.get(location+"O1")); - Files.delete(Paths.get(location+"O1"+"Types")); - Files.delete(Paths.get(location+"O2")); - Files.delete(Paths.get(location+"O2"+"Types")); - //map the triples - CountInputStream fis2 = new CountInputStream(new BufferedInputStream(new FileInputStream(location + "triples"))); - ci2 = new ControlInformation(); - ci2.clear(); - fis2.mark(1024); - ci2.load(fis2); - fis2.reset(); - triples = TriplesFactory.createTriples(ci2); - triples.mapFromFile(fis2,new File(location + "triples"),null); - Files.delete(Paths.get(location+"mapping_back_1")); - Files.delete(Paths.get(location+"mapping_back_2")); - Files.delete(Paths.get(location+"mapping_back_type_1")); - Files.delete(Paths.get(location+"mapping_back_type_2")); - System.out.println("Generating header"); - this.header = HeaderFactory.createHeader(spec); - this.populateHeaderStructure("http://wdaqua.eu/hdtCat/"); - } catch (FileNotFoundException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); + public void catCustom(String location, HDT hdt1, HDT hdt2, ProgressListener listener) throws IOException { + System.out.println("Generating dictionary"); + DictionaryCat dictionaryCat = new MultipleSectionDictionaryCat(location); + dictionaryCat.cat(hdt1.getDictionary(),hdt2.getDictionary(), listener); + //map the generated dictionary + ControlInfo ci2 = new ControlInformation(); + CountInputStream fis = new CountInputStream(new BufferedInputStream(new FileInputStream(location + "dictionary"))); + HDTSpecification spec = new HDTSpecification(); + spec.setOptions("tempDictionary.impl=multHash;dictionary.type=dictionaryMultiObj;"); + MultipleSectionDictionaryBig dictionary = new MultipleSectionDictionaryBig(spec); + fis.mark(1024); + ci2.load(fis); + fis.reset(); + dictionary.mapFromFile(fis, new File(location + "dictionary"),null); + this.dictionary = dictionary; + + System.out.println("Generating triples"); + BitmapTriplesIteratorCat it = new BitmapTriplesIteratorCat(hdt1.getTriples(),hdt2.getTriples(),dictionaryCat); + BitmapTriplesCat bitmapTriplesCat = new BitmapTriplesCat(location); + bitmapTriplesCat.cat(it,listener); + //Delete the mappings since they are not necessary anymore + Iterator> iter = hdt1.getDictionary().getAllObjects().entrySet().iterator(); + int countSubSections = 0; + while (iter.hasNext()){ + Map.Entry entry = iter.next(); + String dataType = entry.getKey(); + String prefix = "sub"+countSubSections; + if(dataType.equals("NO_DATATYPE")) + prefix = dataType; + Files.delete(Paths.get(location+prefix+"1")); + Files.delete(Paths.get(location+prefix+"1"+"Types")); + countSubSections++; } + iter = hdt2.getDictionary().getAllObjects().entrySet().iterator(); + countSubSections = 0; + while (iter.hasNext()){ + Map.Entry entry = iter.next(); + String dataType = entry.getKey(); + String prefix = "sub"+countSubSections; + if(dataType.equals("NO_DATATYPE")) + prefix = dataType; + Files.delete(Paths.get(location+prefix+"2")); + Files.delete(Paths.get(location+prefix+"2"+"Types")); + countSubSections++; + } + Files.delete(Paths.get(location+"P1")); + Files.delete(Paths.get(location+"P1"+"Types")); + Files.delete(Paths.get(location+"P2")); + Files.delete(Paths.get(location+"P2"+"Types")); + Files.delete(Paths.get(location+"SH1")); + Files.delete(Paths.get(location+"SH1"+"Types")); + Files.delete(Paths.get(location+"SH2")); + Files.delete(Paths.get(location+"SH2"+"Types")); + Files.delete(Paths.get(location+"S1")); + Files.delete(Paths.get(location+"S1"+"Types")); + Files.delete(Paths.get(location+"S2")); + Files.delete(Paths.get(location+"S2"+"Types")); + Files.delete(Paths.get(location+"O1")); + Files.delete(Paths.get(location+"O1"+"Types")); + Files.delete(Paths.get(location+"O2")); + Files.delete(Paths.get(location+"O2"+"Types")); + //map the triples + CountInputStream fis2 = new CountInputStream(new BufferedInputStream(new FileInputStream(location + "triples"))); + ci2 = new ControlInformation(); + ci2.clear(); + fis2.mark(1024); + ci2.load(fis2); + fis2.reset(); + triples = TriplesFactory.createTriples(ci2); + triples.mapFromFile(fis2,new File(location + "triples"),null); + Files.delete(Paths.get(location+"mapping_back_1")); + Files.delete(Paths.get(location+"mapping_back_2")); + Files.delete(Paths.get(location+"mapping_back_type_1")); + Files.delete(Paths.get(location+"mapping_back_type_2")); + System.out.println("Generating header"); + this.header = HeaderFactory.createHeader(spec); + this.populateHeaderStructure("http://wdaqua.eu/hdtCat/"); } public void diff(HDT hdt1, HDT hdt2, ProgressListener listener) throws IOException { diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/BigByteBuffer.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/BigByteBuffer.java new file mode 100644 index 00000000..48e40f3a --- /dev/null +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/io/BigByteBuffer.java @@ -0,0 +1,226 @@ +package org.rdfhdt.hdt.util.io; + +import org.rdfhdt.hdt.listener.ProgressListener; +import org.rdfhdt.hdt.util.listener.ListenerUtil; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.ArrayList; +import java.util.List; + +public class BigByteBuffer { + static final int BUFFER_SIZE = 1024 * 8; + static int maxBufferSize = Integer.MAX_VALUE - 5; + + /** + * allocate a large byte buffer + * + * @param size the size to allocate + * @return BigByteBuffer + */ + public static BigByteBuffer allocate(long size) { + if (size < 0) { + throw new IllegalArgumentException("Can't allocate ByteBuffer with a negative size: " + size); + } + if (size == 0) { + return new BigByteBuffer(List.of()); + } + int bufferCount = (int) ((size - 1) / maxBufferSize + 1); + + List buffers = new ArrayList<>(bufferCount); + + int lastNodeSize = (int) (size % maxBufferSize); + if (lastNodeSize == 0) { + for (int i = 0; i < bufferCount; i++) { + buffers.add(new byte[maxBufferSize]); + } + } else { + for (int i = 0; i < bufferCount - 1; i++) { + buffers.add(new byte[maxBufferSize]); + } + buffers.add(new byte[lastNodeSize]); + } + return new BigByteBuffer(buffers); + } + + private final List buffers; + + /** + * cat multiple buffers + * + * @param buffers the buffers + */ + private BigByteBuffer(List buffers) { + this.buffers = buffers; + } + + List getBuffers() { + return buffers; + } + + /** + * @return the capacity of the big buffer + */ + public long size() { + return buffers.stream().mapToLong(l -> l.length).sum(); + } + + private int getBufferOffset(long index) { + return (int) (index % maxBufferSize); + } + + private int getBufferIndex(long index) { + return (int) (index / maxBufferSize); + } + + /** + * get a byte at a particular index + * + * @param index the byte index + * @return byte + */ + public byte get(long index) { + int buffer = getBufferIndex(index); + int inBufferIndex = getBufferOffset(index); + + if (buffer < 0 || buffer >= buffers.size()) + throw new IndexOutOfBoundsException(); + + return buffers.get(buffer)[inBufferIndex]; + } + + /** + * set a byte at a particular index + * + * @param index the byte index + * @param value the byte to set + */ + public void set(long index, byte value) { + int buffer = getBufferIndex(index); + int inBufferIndex = getBufferOffset(index); + + if (buffer < 0 || buffer >= buffers.size()) + throw new IndexOutOfBoundsException(); + + buffers.get(buffer)[inBufferIndex] = value; + } + + /** + * set a byte at a particular index + * + * @param index the byte index + * @param value the byte to set + * @param offset the offset in the array + * @param length the length in the array after the offset + */ + public void set(long index, byte[] value, int offset, int length) { + int buffer1 = getBufferIndex(index); + int buffer2 = getBufferIndex(index + length - 1); + + if (buffer1 == buffer2) { + // same array + byte[] b = buffers.get(buffer1); + System.arraycopy(value, offset, b, getBufferOffset(index), length); + } else { + byte[] b1 = buffers.get(buffer1); + byte[] b2 = buffers.get(buffer2); + + int toRead = b1.length - getBufferOffset(index); + + System.arraycopy(value, offset, b1, getBufferOffset(index), toRead); + System.arraycopy(value, offset + toRead, b2, getBufferOffset(index + toRead), length - toRead); + } + } + + + /** + * read a particular number of bytes in the buffer + * + * @param dst the destination array + * @param position index to start reading + * @param offset the offset in the offset + * @param length the length to read + */ + public void get(byte[] dst, long position, int offset, int length) { + int buffer1 = getBufferIndex(position); + int buffer2 = getBufferIndex(position + length - 1); + + if (buffer1 == buffer2) { + // all the bytes are in the same buffer + byte[] b = buffers.get(buffer1); + System.arraycopy(b, getBufferOffset(position), dst, offset, length); + } else { + // we are using 2 buffers + byte[] b1 = buffers.get(buffer1); + byte[] b2 = buffers.get(buffer2); + + int toRead = b1.length - getBufferOffset(position); + + System.arraycopy(b1, getBufferOffset(position), dst, offset, toRead); + System.arraycopy(b2, getBufferOffset(position + toRead), dst, offset + toRead, length - toRead); + } + } + + /** + * read a stream into this ByteBuffer + * @param input the input stream to read + * @param index the index to start writing + * @param length the length to read + * @param listener listener to notify the state + * @throws IOException any error with the stream + */ + public void readStream(InputStream input, long index, long length, ProgressListener listener) throws IOException { + long remaining = length; + long currentIndex = index; + int b = getBufferIndex(index); + ListenerUtil.notify(listener, "Reading buffer", 0, length); + while (remaining > 0) { + int offset = getBufferOffset(currentIndex); + byte[] buffer = buffers.get(b); + + int read = (int) Math.min(buffer.length - offset, currentIndex + remaining); + readStreamInto(input, buffer, offset, read, listener, currentIndex - index, length); + remaining -= read; + currentIndex += read; + ListenerUtil.notify(listener, "Reading buffer", length - remaining, length); + b++; + } + } + + private void readStreamInto(InputStream input, byte[] dst, int start, int length, ProgressListener listener, long offset, long end) throws IOException { + int nRead; + int pos = 0; + + while ((nRead = input.read(dst, start, length - pos)) > 0) { + pos += nRead; + ListenerUtil.notify(listener, "Reading buffer", pos + offset, end); + } + if (pos != length) { + throw new IOException("EOF while reading array from InputStream"); + } + } + + /** + * Write a part of this byte buffer into a stream + * @param stream the stream to fill + * @param offset the offset to start + * @param length the length to write + * @param listener listener to notify the state + * @throws IOException exception while writing into the output stream + */ + public void writeStream(OutputStream stream, long offset, long length, ProgressListener listener) throws IOException { + byte[] buffer = new byte[BUFFER_SIZE]; + + long end = offset + length; + long write = 0; + ListenerUtil.notify(listener, "Writing buffer", 0, length); + for (long index = offset; index < end; index += BUFFER_SIZE) { + int toWrite = (int) Math.min(end - index, BUFFER_SIZE); + get(buffer, index, 0, toWrite); + stream.write(buffer, 0, toWrite); + write += toWrite; + ListenerUtil.notify(listener, "Writing buffer", write, length); + } + } +} diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ByteStringUtil.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ByteStringUtil.java index c670f980..7e57abb5 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ByteStringUtil.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/util/string/ByteStringUtil.java @@ -32,6 +32,7 @@ import java.nio.charset.Charset; import org.rdfhdt.hdt.exceptions.NotImplementedException; +import org.rdfhdt.hdt.util.io.BigByteBuffer; import org.rdfhdt.hdt.util.io.BigMappedByteBuffer; import static java.nio.charset.StandardCharsets.UTF_8; @@ -65,7 +66,7 @@ public static String asString(ByteBuffer buff, int offset) { } return new String(arr, STRING_ENCODING); } - + public static int strlen(byte [] buff, int off) { int len = buff.length; int pos = off; @@ -74,6 +75,15 @@ public static int strlen(byte [] buff, int off) { } return pos-off; } + + public static long strlen(BigByteBuffer buff, long off) { + long len = buff.size(); + long pos = off; + while(pos genParam() { + List list = new ArrayList<>(); + for (HdtDiffTest.DictionaryTestData data : HdtDiffTest.DICTIONARY_TEST_DATA) { + list.add(new Object[]{data.dictionaryType, data.dictionaryTempType}); + } + return list; + } + + + @Rule + public TemporaryFolder tempDir = new TemporaryFolder(); + private final HDTSpecification spec; + + public HdtCatRandomTest(String dictionaryType, String tempDictionaryImpl) { + spec = new HDTSpecification(); + spec.set("dictionary.type", dictionaryType); + spec.set("tempDictionary.impl", tempDictionaryImpl); + } + + @Test + @Ignore("large") + public void largeFakeTest() throws ParserException, IOException { + File root = tempDir.newFolder(); + String location = new File(root, "catHdt").getAbsolutePath(); + String hdt1F = new File(root, "hdt1").getAbsolutePath(); + String hdt2F = new File(root, "hdt2").getAbsolutePath(); + String catOutput = new File(root, "catResult").getAbsolutePath(); + + LargeFakeDataSetStreamSupplier supplier = LargeFakeDataSetStreamSupplier.createSupplierWithMaxTriples(1_000_000, 484); + supplier.maxFakeType = 4; + supplier.maxElementSplit = 1000; + supplier.createAndSaveFakeHDT(spec, hdt1F); + supplier.createAndSaveFakeHDT(spec, hdt2F); + + HDT cat = HDTManager.catHDT(location, hdt1F, hdt2F, spec, null); + cat.saveToHDT(catOutput, null); + cat.close(); + + HDT loadedHDT = HDTManager.loadIndexedHDT(catOutput, null, spec); + loadedHDT.close(); + + HDT mappedHDT = HDTManager.mapIndexedHDT(catOutput, spec, null); + mappedHDT.close(); + } + +} diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java new file mode 100644 index 00000000..faf9a394 --- /dev/null +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java @@ -0,0 +1,188 @@ +package org.rdfhdt.hdt.util; + +import org.rdfhdt.hdt.enums.RDFNotation; +import org.rdfhdt.hdt.exceptions.ParserException; +import org.rdfhdt.hdt.hdt.HDT; +import org.rdfhdt.hdt.hdt.HDTManager; +import org.rdfhdt.hdt.options.HDTOptions; +import org.rdfhdt.hdt.triples.TripleString; + +import java.io.FileWriter; +import java.io.IOException; +import java.nio.charset.Charset; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Iterator; +import java.util.Random; + +public class LargeFakeDataSetStreamSupplier { + + private static final Charset DEFAULT_CHARSET = Charset.defaultCharset(); + + /** + * create a lowercase name from a number, to create string without any number in it + * + * @param i id + * @return string + */ + public static String stringNameOfInt(int i) { + String table = "abcdefghijklmnopqrstuvwxyz"; + StringBuilder out = new StringBuilder(); + int c = i; + do { + out.append(table.charAt(c % table.length())); + c /= table.length(); + } while (c != 0); + return out.toString(); + } + + /** + * estimate the size of a triple + * + * @param triple the triple + * @return the size in byte to store the triple + */ + public static long estimateTripleSize(TripleString triple) { + try { + return triple.asNtriple().toString().getBytes(DEFAULT_CHARSET).length; + } catch (IOException e) { + throw new RuntimeException("Can't estimate the size of the triple " + triple, e); + } + } + + public static LargeFakeDataSetStreamSupplier createSupplierWithMaxSize(long maxSize, long seed) { + return new LargeFakeDataSetStreamSupplier(maxSize, Long.MAX_VALUE, seed); + } + + public static LargeFakeDataSetStreamSupplier createSupplierWithMaxTriples(long maxTriples, long seed) { + return new LargeFakeDataSetStreamSupplier(Long.MAX_VALUE, maxTriples, seed); + } + + private final long seed; + private Random random; + private final long maxSize; + private final long maxTriples; + public int maxFakeType = 10; + public int maxElementSplit = Integer.MAX_VALUE; + + private LargeFakeDataSetStreamSupplier(long maxSize, long maxTriples, long seed) { + this.maxSize = maxSize; + this.maxTriples = maxTriples; + this.seed = seed; + reset(); + } + + public void reset() { + random = new Random(seed); + } + + public Iterator createTripleStringStream() { + return new FakeStatementIterator(); + } + + public void createNTFile(String file) throws IOException { + try (FileWriter writer = new FileWriter(file)) { + for (Iterator it = createTripleStringStream(); it.hasNext(); ) { + it.next().dumpNtriple(writer); + } + } + } + + public HDT createFakeHDTTwoPass(HDTOptions spec) throws ParserException, IOException { + Path f = Paths.get("tempNtFile.nt").toAbsolutePath(); + try { + createNTFile(f.toString()); + spec.set("loader.type", "two-pass"); + return HDTManager.generateHDT(f.toString(), "http://w", RDFNotation.NTRIPLES, spec, null); + } finally { + Files.deleteIfExists(f); + } + } + public HDT createFakeHDT(HDTOptions spec) throws ParserException, IOException { + return HDTManager.generateHDT(createTripleStringStream(), "http://w", spec, null); + } + + public void createAndSaveFakeHDT(HDTOptions spec, String location) throws ParserException, IOException { + HDT hdt = createFakeHDT(spec); + hdt.saveToHDT(location, null); + hdt.close(); + } + public void createAndSaveFakeHDTTwoPass(HDTOptions spec, String location) throws ParserException, IOException { + HDT hdt = createFakeHDTTwoPass(spec); + hdt.saveToHDT(location, null); + hdt.close(); + } + + private CharSequence createSubject() { + return createPredicate(); + } + + private CharSequence createPredicate() { + return ""; + } + + private CharSequence createType() { + return ""; + } + + private CharSequence createValue() { + if (random.nextBoolean()) { + return createPredicate(); + } + + String text = "\"" + stringNameOfInt(random.nextInt(maxElementSplit)) + "\""; + if (random.nextBoolean()) { + // language node + return text + "@" + stringNameOfInt(random.nextInt(maxElementSplit)); + } else { + // typed node + return text + "^^" + createType(); + } + } + + private class FakeStatementIterator implements Iterator { + private long size; + private long count; + private TripleString next; + + @Override + public boolean hasNext() { + if (size >= maxSize || count >= maxTriples) { + return false; + } + if (next != null) { + return true; + } + + next = new TripleString( + createSubject(), + createPredicate(), + createValue() + ); + + long estimation = estimateTripleSize( + new TripleString( + next.getSubject().toString(), + next.getPredicate().toString(), + next.getObject().toString() + ) + ); + size += estimation; + count++; + + return size < maxSize && count < maxTriples; + } + + @Override + public TripleString next() { + if (!hasNext()) { + return null; + } + TripleString next = this.next; + this.next = null; + return next; + } + } + +} \ No newline at end of file diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/BigByteBufferTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/BigByteBufferTest.java new file mode 100644 index 00000000..69db5ad8 --- /dev/null +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/io/BigByteBufferTest.java @@ -0,0 +1,209 @@ +package org.rdfhdt.hdt.util.io; + +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Ignore; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.rdfhdt.hdt.compact.bitmap.BitmapFactory; +import org.rdfhdt.hdt.compact.bitmap.ModifiableBitmap; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Objects; +import java.util.Random; +import java.util.function.Consumer; + +public class BigByteBufferTest { + + public static void assertArrayEquals(byte[] arr, int start, byte[] arr2, int start2, int length) { + for (int i = 0; i < length; i++) { + Assert.assertEquals("index diff " + i, arr[start + i], arr2[start2 + i]); + } + } + + private int oldSize; + + @Rule + public TemporaryFolder tempDir = new TemporaryFolder(); + + @Before + public void prepare() { + // save the size if we want to update it + oldSize = BigByteBuffer.maxBufferSize; + } + @After + public void complete() { + BigByteBuffer.maxBufferSize = oldSize; + } + + @Test + public void capacityBuffer() { + long size = 1000; + BigByteBuffer.maxBufferSize = (int) (size / 7); + + BigByteBuffer buffer = BigByteBuffer.allocate(size); + Assert.assertEquals(8, buffer.getBuffers().size()); + Assert.assertEquals(size, buffer.size()); + } + @Test + @Ignore("large, should be run with at least 3G or ram -Xmx3G") + public void capacityBufferLarge() { + long size = Integer.MAX_VALUE * 12L / 10; + System.out.println(Runtime.getRuntime().maxMemory()); + BigByteBuffer buffer = BigByteBuffer.allocate(size); + Assert.assertEquals(2, buffer.getBuffers().size()); + Assert.assertEquals(size, buffer.size()); + } + + @Test + public void get() { + int size = 10000; + BigByteBuffer.maxBufferSize = size / 7; + + BigByteBuffer buffer = BigByteBuffer.allocate(size); + + RandomEntryFluxSupplier supplier = new RandomEntryFluxSupplier(72); + supplier.generate(size / 10, size, e -> buffer.set(e.index, e.value)); + supplier.reset(); + supplier.generate(size / 10, size, e -> Assert.assertEquals(e.value, buffer.get(e.index))); + } + @Test + public void getArr() { + int size = 10000; + final byte[] real = new byte[size]; + BigByteBuffer.maxBufferSize = size * 2 / 3; + + BigByteBuffer buffer = BigByteBuffer.allocate(size); + Assert.assertEquals(2, buffer.getBuffers().size()); + + RandomEntryFluxSupplier supplier = new RandomEntryFluxSupplier(72); + supplier.generate(size / 10, size, e -> { + buffer.set(e.index, e.value); + real[(int) e.index] = e.value; + }); + + byte[] test = new byte[size]; + buffer.get(test, 0, 0, size); + + Assert.assertArrayEquals(real, test); + + buffer.get(test, size / 2, 0, size / 2); + + assertArrayEquals(real, size / 2, test, 0, size / 2); + + buffer.get(test, size / 3, 0, size / 3); + + assertArrayEquals(real, size / 3, test, 0, size / 3); + + buffer.get(test, size * 2 / 3, 0, size / 3); + + assertArrayEquals(real, size * 2 / 3, test, 0, size / 3); + } + + @Test + public void readFileTest() throws IOException { + final String rawFileName = Objects.requireNonNull(getClass().getClassLoader().getResource("dbpedia.hdt"), "can't find dbpedia hdt").getFile(); + + Path path = Paths.get(rawFileName); + + long size = Files.size(path); + + BigByteBuffer.maxBufferSize = (int) (size * 2 / 3); // test with huge split + + BigByteBuffer buffer = BigByteBuffer.allocate(size); + + String file = Objects.requireNonNull(getClass().getClassLoader().getResource("dbpedia.hdt"), "Can't find dbpedia.hdt").getFile(); + + try (InputStream stream = IOUtil.getFileInputStream(file)) { + buffer.readStream(stream, 0, size, null); + } + + byte[] real = Files.readAllBytes(Paths.get(file)); + byte[] test = new byte[(int) buffer.size()]; + + int delta = (int) (size / 10); + + for (int i = 0; i < test.length; i += delta) { + buffer.get(test, i, 0, test.length - i); + assertArrayEquals(real, i, test, 0, test.length - i); + } + } + @Test + public void writeFileTest() throws IOException { + int size = BigByteBuffer.BUFFER_SIZE * 10; + BigByteBuffer.maxBufferSize = size * 2 / 3; + + BigByteBuffer buffer = BigByteBuffer.allocate(size); + + RandomEntryFluxSupplier supplier = new RandomEntryFluxSupplier(274); + supplier.generate(size / 10, size, e -> buffer.set(e.index, e.value)); + supplier.reset(); + + File f = tempDir.newFile(); + + int deltaf = size / 10; + for (int start = 0; start < size; start += deltaf) { + try (OutputStream stream = new FileOutputStream(f)) { + buffer.writeStream(stream, start, buffer.size() - start, null); + } + + byte[] test = Files.readAllBytes(f.toPath()); + byte[] real = new byte[size]; + + int delta = size / 10; + + for (int i = 0; i < buffer.size() - start; i += delta) { + buffer.get(real, start + i, 0, (int) buffer.size() - start - i); + assertArrayEquals(test, i, real, 0, (int) buffer.size() - start - i); + } + + Files.deleteIfExists(f.toPath()); + } + } + + + private static class Entry { + long index; + byte value; + + public Entry(long index, byte value) { + this.index = index; + this.value = value; + } + } + private static class RandomEntryFluxSupplier { + private Random random; + private final long seed; + + public RandomEntryFluxSupplier(long seed) { + this.seed = seed; + reset(); + } + + public void reset() { + random = new Random(seed); + } + + public void generate(long count, int max, Consumer e) { + ModifiableBitmap bitmap = BitmapFactory.createRWBitmap(max); + for (long i = 0; i < count; i++) { + long index = random.nextInt(max); + byte value = (byte) (random.nextInt() & 255); + if (bitmap.access(index)) { + continue; + } + bitmap.set(index, true); + e.accept(new Entry(index, value)); + } + } + } +}