From fea21828efa9a600abc14da11dabc5d6bdd642ba Mon Sep 17 00:00:00 2001 From: qaate47 Date: Mon, 16 May 2022 17:44:31 +0200 Subject: [PATCH] allow using simple parser in RDF2HDT cli/generateHDT and reduce string copy in triple string read --- .../org/rdfhdt/hdt/triples/TripleString.java | 87 ++++++++++++++----- .../org/rdfhdt/hdt/util/UnicodeEscape.java | 33 ++++--- .../java/org/rdfhdt/hdt/tools/RDF2HDT.java | 11 ++- .../org/rdfhdt/hdt/hdt/HDTManagerImpl.java | 11 ++- .../hdt/hdt/impl/TempHDTImporterOnePass.java | 8 +- .../hdt/hdt/impl/TempHDTImporterTwoPass.java | 10 ++- .../org/rdfhdt/hdt/rdf/RDFParserFactory.java | 18 ++-- .../rdfhdt/hdt/rdf/parsers/RDFParserDir.java | 11 ++- .../rdfhdt/hdt/rdf/parsers/RDFParserList.java | 11 ++- .../rdfhdt/hdt/rdf/parsers/RDFParserRAR.java | 12 ++- .../hdt/rdf/parsers/RDFParserSimple.java | 27 ++++-- .../rdfhdt/hdt/rdf/parsers/RDFParserTar.java | 20 +++-- .../rdfhdt/hdt/rdf/parsers/RDFParserZip.java | 22 +++-- .../hdt/rdf/parsers/RDFParserSimpleTest.java | 74 +++++++++++++++- .../util/LargeFakeDataSetStreamSupplier.java | 6 +- 15 files changed, 283 insertions(+), 78 deletions(-) diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/triples/TripleString.java b/hdt-api/src/main/java/org/rdfhdt/hdt/triples/TripleString.java index f2c7887a..8d4b4005 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/triples/TripleString.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/triples/TripleString.java @@ -204,46 +204,91 @@ public boolean hasEmpty() { * @throws ParserException if the line is not RDF complient */ public void read(String line) throws ParserException { + read(line, 0, line.length()); + } + + private int searchNextTabOrSpace(String line, int start, int end) { + // searching space + int sindex = line.indexOf(' ', start); + if (sindex != -1 && sindex < end) { + return sindex; + } + + // not found, searching tabs + int tindex = line.indexOf('\t', start); + if (tindex != -1 && tindex < end) { + return tindex; + } + + // not found + return -1; + } + + /** + * Read from a line, where each component is separated by space. + * @param line line to read + * @throws ParserException if the line is not RDF complient + */ + public void read(String line, int start, int end) throws ParserException { int split, posa, posb; this.clear(); - line = line.replace("\\t"," "); - // SET SUBJECT - posa = 0; - posb = split = line.indexOf(' ', posa); + posa = start; + posb = split = searchNextTabOrSpace(line, posa, end); - if(posb==-1) return; // Not found, error. - if(line.charAt(posa)=='<') posa++; // Remove < - if(line.charAt(posb-1)=='>') posb--; // Remove > + if (posb == -1) { + // Not found, error. + return; + } + if (line.charAt(posa) == '<') { + posa++; // Remove < + if (line.charAt(posb-1) == '>') { + posb--; // Remove > + } + } - this.setSubject(UnicodeEscape.unescapeString(line.substring(posa, posb))); + this.setSubject(UnicodeEscape.unescapeString(line, posa, posb)); // SET PREDICATE - posa = split+1; - posb = split = line.indexOf(' ', posa); + posa = split + 1; + posb = split = searchNextTabOrSpace(line, posa, end); - if(posb==-1) return; - if(line.charAt(posa)=='<') posa++; - if(posb>posa && line.charAt(posb-1)=='>') posb--; + if (posb == -1) { + return; + } + if (line.charAt(posa) == '<') { + posa++; + if (posb > posa && line.charAt(posb - 1) == '>') { + posb--; + } + } - this.setPredicate(UnicodeEscape.unescapeString(line.substring(posa, posb))); + this.setPredicate(UnicodeEscape.unescapeString(line, posa, posb)); // SET OBJECT - posa = split+1; - posb = line.length(); + posa = split + 1; + posb = end; - if(line.charAt(posb-1)=='.') posb--; // Remove trailing from NTRIPLES. - if(line.charAt(posb-1)==' ') posb--; + // Remove trailing from NTRIPLES. + if (line.charAt(posb-1) == '.') { + posb--; + } + char prev = line.charAt(posb-1); + if (prev == ' ' || prev == '\t') { + posb--; + } - if(line.charAt(posa)=='<') { + if (line.charAt(posa) == '<') { posa++; // Remove trailing > only if < appears, so "some"^^ is kept as-is. - if(posb>posa && line.charAt(posb-1)=='>') posb--; + if (posb > posa && line.charAt(posb-1)=='>') { + posb--; + } } - this.setObject(UnicodeEscape.unescapeString(line.substring(posa, posb))); + this.setObject(UnicodeEscape.unescapeString(line, posa, posb)); } /* diff --git a/hdt-api/src/main/java/org/rdfhdt/hdt/util/UnicodeEscape.java b/hdt-api/src/main/java/org/rdfhdt/hdt/util/UnicodeEscape.java index 6bb4a008..88015230 100644 --- a/hdt-api/src/main/java/org/rdfhdt/hdt/util/UnicodeEscape.java +++ b/hdt-api/src/main/java/org/rdfhdt/hdt/util/UnicodeEscape.java @@ -140,32 +140,45 @@ else if (cInt >= 0x10000 && cInt <= 0x10FFFF) { appendable.append(label.subSequence(last+1, label.length())); } - + /** * Unescapes an escaped Unicode string. Any Unicode sequences * (\uxxxx and \Uxxxxxxxx) are restored to the * value indicated by the hexadecimal argument and any backslash-escapes * (\", \\, etc.) are decoded to their original form. - * + * * @param s An escaped Unicode string. * @return The unescaped string. * @throws IllegalArgumentException If the supplied string is not a * correctly escaped N-Triples string. */ public static String unescapeString(String s) { - int backSlashIdx = s.indexOf('\\'); + return unescapeString(s, 0, s.length()); + } + /** + * Unescapes an escaped Unicode string. Any Unicode sequences + * (\uxxxx and \Uxxxxxxxx) are restored to the + * value indicated by the hexadecimal argument and any backslash-escapes + * (\", \\, etc.) are decoded to their original form. + * + * @param s An escaped Unicode string. + * @return The unescaped string. + * @throws IllegalArgumentException If the supplied string is not a + * correctly escaped N-Triples string. + */ + public static String unescapeString(String s, int start, int sLength) { + int backSlashIdx = s.indexOf('\\', start); - if (backSlashIdx == -1) { + if (backSlashIdx == -1 || backSlashIdx >= sLength) { // No escaped characters found - return s; + return s.substring(start, sLength); } - int startIdx = 0; - int sLength = s.length(); + int startIdx = start; StringBuilder sb = new StringBuilder(sLength); - while (backSlashIdx != -1) { - sb.append(s.substring(startIdx, backSlashIdx)); + while (backSlashIdx != -1 && backSlashIdx < sLength) { + sb.append(s, startIdx, backSlashIdx); if (backSlashIdx + 1 >= sLength) { throw new IllegalArgumentException("Unescaped backslash in: " + s); @@ -238,7 +251,7 @@ else if (c == 'U') { backSlashIdx = s.indexOf('\\', startIdx); } - sb.append(s.substring(startIdx)); + sb.append(s, startIdx, sLength); return sb.toString(); } diff --git a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java index 3f073277..494235f9 100644 --- a/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java +++ b/hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java @@ -71,9 +71,12 @@ public class RDF2HDT implements ProgressListener { @Parameter(names = "-index", description = "Generate also external indices to solve all queries") public boolean generateIndex; - + @Parameter(names = "-quiet", description = "Do not show progress of the conversion") public boolean quiet; + + @Parameter(names = "-canonicalntfile", description = "Only for NTriples input. Use a Fast NT file parser the input should be in a canonical form. See https://www.w3.org/TR/n-triples/#h2_canonical-ntriples") + public boolean ntSimpleLoading; public void execute() throws ParserException, IOException { HDTSpecification spec; @@ -88,7 +91,7 @@ public void execute() throws ParserException, IOException { if(baseURI==null) { baseURI = "file://"+rdfInput; } - + RDFNotation notation=null; if(rdfType!=null) { try { @@ -107,6 +110,10 @@ public void execute() throws ParserException, IOException { } } + if (ntSimpleLoading) { + spec.set("parser.ntSimpleParser", "true"); + } + StopWatch sw = new StopWatch(); HDT hdt = HDTManager.generateHDT(rdfInput, baseURI,notation , spec, this); System.out.println("File converted in: "+sw.stopAndShow()); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java index 4f77fb63..0608d0c2 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/HDTManagerImpl.java @@ -25,6 +25,11 @@ public class HDTManagerImpl extends HDTManager { + private boolean useSimple(HDTOptions spec) { + String value = spec.get("parser.ntSimpleParser"); + return value != null && !value.isEmpty() && !value.equals("false"); + } + @Override public HDTOptions doReadOptions(String file) throws IOException { return new HDTSpecification(file); @@ -90,9 +95,9 @@ public HDT doGenerateHDT(String rdfFileName, String baseURI, RDFNotation rdfNota String loaderType = spec.get("loader.type"); TempHDTImporter loader; if ("two-pass".equals(loaderType)) { - loader = new TempHDTImporterTwoPass(); + loader = new TempHDTImporterTwoPass(useSimple(spec)); } else { - loader = new TempHDTImporterOnePass(); + loader = new TempHDTImporterOnePass(useSimple(spec)); } // Create TempHDT @@ -118,7 +123,7 @@ public HDT doGenerateHDT(String rdfFileName, String baseURI, RDFNotation rdfNota @Override public HDT doGenerateHDT(Iterator triples, String baseURI, HDTOptions spec, ProgressListener listener) throws IOException { //choose the importer - TempHDTImporterOnePass loader = new TempHDTImporterOnePass(); + TempHDTImporterOnePass loader = new TempHDTImporterOnePass(false); // Create TempHDT TempHDT modHdt = loader.loadFromTriples(spec, triples, baseURI, listener); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/TempHDTImporterOnePass.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/TempHDTImporterOnePass.java index 8b2e9a2b..9a5a40f1 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/TempHDTImporterOnePass.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/TempHDTImporterOnePass.java @@ -73,11 +73,17 @@ public void processTriple(TripleString triple, long pos) { } } + private final boolean useSimple; + + public TempHDTImporterOnePass(boolean useSimple) { + this.useSimple = useSimple; + } + @Override public TempHDT loadFromRDF(HDTOptions specs, String filename, String baseUri, RDFNotation notation, ProgressListener listener) throws ParserException { - RDFParserCallback parser = RDFParserFactory.getParserCallback(notation); + RDFParserCallback parser = RDFParserFactory.getParserCallback(notation, useSimple); // Create Modifiable Instance TempHDT modHDT = new TempHDTImpl(specs, baseUri, ModeOfLoading.ONE_PASS); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/TempHDTImporterTwoPass.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/TempHDTImporterTwoPass.java index dae279ce..811fd8a6 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/TempHDTImporterTwoPass.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/hdt/impl/TempHDTImporterTwoPass.java @@ -99,11 +99,17 @@ public void processTriple(TripleString triple, long pos) { } } - @Override + private final boolean useSimple; + + public TempHDTImporterTwoPass(boolean useSimple) { + this.useSimple = useSimple; + } + + @Override public TempHDT loadFromRDF(HDTOptions specs, String filename, String baseUri, RDFNotation notation, ProgressListener listener) throws ParserException { - RDFParserCallback parser = RDFParserFactory.getParserCallback(notation); + RDFParserCallback parser = RDFParserFactory.getParserCallback(notation, useSimple); // Create Modifiable Instance and parser TempHDT modHDT = new TempHDTImpl(specs, baseUri, ModeOfLoading.TWO_PASS); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/RDFParserFactory.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/RDFParserFactory.java index 3d458377..246396c9 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/RDFParserFactory.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/RDFParserFactory.java @@ -34,6 +34,7 @@ import org.rdfhdt.hdt.rdf.parsers.RDFParserList; import org.rdfhdt.hdt.rdf.parsers.RDFParserRAR; import org.rdfhdt.hdt.rdf.parsers.RDFParserRIOT; +import org.rdfhdt.hdt.rdf.parsers.RDFParserSimple; import org.rdfhdt.hdt.rdf.parsers.RDFParserTar; import org.rdfhdt.hdt.rdf.parsers.RDFParserZip; @@ -43,24 +44,29 @@ */ public class RDFParserFactory { public static RDFParserCallback getParserCallback(RDFNotation notation) { - + return getParserCallback(notation, false); + } + public static RDFParserCallback getParserCallback(RDFNotation notation, boolean useSimple) { switch(notation) { - case NTRIPLES: + case NTRIPLES: + if (useSimple) { + return new RDFParserSimple(); + } case NQUAD: case TURTLE: case N3: case RDFXML: return new RDFParserRIOT(); case DIR: - return new RDFParserDir(); + return new RDFParserDir(useSimple); case LIST: return new RDFParserList(); case ZIP: - return new RDFParserZip(); + return new RDFParserZip(useSimple); case TAR: - return new RDFParserTar(); + return new RDFParserTar(useSimple); case RAR: - return new RDFParserRAR(); + return new RDFParserRAR(useSimple); case HDT: return new RDFParserHDT(); case JSONLD: diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserDir.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserDir.java index 04f89e88..df77e3ba 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserDir.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserDir.java @@ -20,6 +20,15 @@ */ public class RDFParserDir implements RDFParserCallback { private static final Logger log = LoggerFactory.getLogger(RDFParserDir.class); + private final boolean simple; + + public RDFParserDir(boolean simple) { + this.simple = simple; + } + + public RDFParserDir() { + this(false); + } @Override public void doParse(String fileName, String baseUri, RDFNotation notation, boolean keepBNode, RDFCallback callback) throws ParserException { @@ -46,7 +55,7 @@ private void doParse(Path p, String baseUri, RDFNotation notation, boolean keepB try { // get the notation of the file childNotation = RDFNotation.guess(child.toFile()); - rdfParserCallback = RDFParserFactory.getParserCallback(childNotation); + rdfParserCallback = RDFParserFactory.getParserCallback(childNotation, simple); } catch (IllegalArgumentException e) { log.warn("Ignore file {}", child, e); return; diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserList.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserList.java index 43e95d0f..40c20570 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserList.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserList.java @@ -45,6 +45,15 @@ * */ public class RDFParserList implements RDFParserCallback { + private final boolean simple; + + public RDFParserList(boolean simple) { + this.simple = simple; + } + + public RDFParserList() { + this(false); + } /* (non-Javadoc) * @see hdt.rdf.RDFParserCallback#doParse(java.lang.String, java.lang.String, hdt.enums.RDFNotation, hdt.rdf.RDFParserCallback.RDFCallback) @@ -88,7 +97,7 @@ private void doParse(BufferedReader reader, String baseUri, RDFNotation notation RDFNotation guessnot = RDFNotation.guess(line); System.out.println("Parse from list: "+line+" as "+guessnot); - RDFParserCallback parser = RDFParserFactory.getParserCallback(guessnot); + RDFParserCallback parser = RDFParserFactory.getParserCallback(guessnot, simple); parser.doParse(line, baseUri, guessnot, keepBNode, callback); } diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserRAR.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserRAR.java index 69c1242c..b32f1acb 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserRAR.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserRAR.java @@ -32,7 +32,15 @@ public class RDFParserRAR implements RDFParserCallback { private final static String [] cmdList = { "unrar", "vb" , ""}; private final static String [] cmdExtractFile = { "unrar", "p", "-inul", "", "" }; private static Boolean available; - + private boolean simple; + + public RDFParserRAR(boolean simple) { + this.simple = simple; + } + public RDFParserRAR() { + this(false); + } + // List files in rar // unrar vb FILE.rar @@ -83,7 +91,7 @@ public void doParse(String rarFile, String baseUri, RDFNotation notation, boolea if(guessnot!=null) { // Create log.info("Parse from rar: {} as {}", fileName, guessnot); - RDFParserCallback parser = RDFParserFactory.getParserCallback(guessnot); + RDFParserCallback parser = RDFParserFactory.getParserCallback(guessnot, simple); cmdExtract[4]=fileName; ProcessBuilder extractProcessBuilder = new ProcessBuilder(cmdExtract); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserSimple.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserSimple.java index cfcec133..54919cdf 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserSimple.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserSimple.java @@ -85,11 +85,28 @@ private void doParse(BufferedReader reader, String baseUri, RDFNotation notation long numLine = 1; TripleString triple = new TripleString(); while((line=reader.readLine())!=null) { - - line = line.trim().replaceAll("\\t"," "); - if(!line.startsWith("#")) { - triple.read(line); - if(!triple.hasEmpty()) { + // trim, find start + int start = 0; + while (start < line.length()) { + char c = line.charAt(start); + if (c != ' ' && c != '\t') { + break; + } + start++; + } + // trim, find end + int end = line.length() - 1; + while (end >= 0 ) { + char c = line.charAt(end); + if (c != ' ' && c != '\t') { + break; + } + end--; + } + // check that we have at least one element and this line isn't a comment + if (start + 1 < end && line.charAt(start) != '#') { + triple.read(line, start, end); + if (!triple.hasEmpty()) { //System.out.println(triple); callback.processTriple(triple, 0); } else { diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserTar.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserTar.java index 5d44ed36..f9709ff1 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserTar.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserTar.java @@ -1,8 +1,5 @@ package org.rdfhdt.hdt.rdf.parsers; -import java.io.FileNotFoundException; -import java.io.InputStream; - import org.apache.commons.compress.archivers.ArchiveStreamFactory; import org.apache.commons.compress.archivers.tar.TarArchiveEntry; import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; @@ -15,6 +12,8 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.InputStream; + /** * Parses a tar file (optionally .tgz or .tar.gz or .tar.bz2) directly, processing each file that contains rdf separately. * @@ -27,8 +26,16 @@ public class RDFParserTar implements RDFParserCallback { private static final Logger log = LoggerFactory.getLogger(RDFParserTar.class); + private final boolean simple; + + public RDFParserTar(boolean simple) { + this.simple = simple; + } + + public RDFParserTar() { + this(false); + } - /* (non-Javadoc) * @see hdt.rdf.RDFParserCallback#doParse(java.lang.String, java.lang.String, hdt.enums.RDFNotation, hdt.rdf.RDFParserCallback.Callback) */ @@ -60,7 +67,7 @@ public void doParse(InputStream input, String baseUri, RDFNotation notation, boo try { RDFNotation guessnot = RDFNotation.guess(entry.getName()); log.info("Parse from tar: {} as {}", entry.getName(), guessnot); - RDFParserCallback parser = RDFParserFactory.getParserCallback(guessnot); + RDFParserCallback parser = RDFParserFactory.getParserCallback(guessnot, simple); parser.doParse(nonCloseIn, baseUri, guessnot, keepBNode, callback); }catch (IllegalArgumentException | ParserException e1) { @@ -68,9 +75,6 @@ public void doParse(InputStream input, String baseUri, RDFNotation notation, boo } } } - } catch (FileNotFoundException e) { - e.printStackTrace(); - throw new ParserException(); } catch (Exception e) { e.printStackTrace(); throw new ParserException(); diff --git a/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserZip.java b/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserZip.java index a9474f77..7a508324 100644 --- a/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserZip.java +++ b/hdt-java-core/src/main/java/org/rdfhdt/hdt/rdf/parsers/RDFParserZip.java @@ -24,7 +24,16 @@ public class RDFParserZip implements RDFParserCallback { - + private final boolean simple; + + public RDFParserZip(boolean simple) { + this.simple = simple; + } + + public RDFParserZip() { + this(false); + } + /* (non-Javadoc) * @see hdt.rdf.RDFParserCallback#doParse(java.lang.String, java.lang.String, hdt.enums.RDFNotation, hdt.rdf.RDFParserCallback.Callback) */ @@ -54,24 +63,19 @@ public void doParse(InputStream input, String baseUri, RDFNotation notation, boo try { RDFNotation guessnot = RDFNotation.guess(zipEntry.getName()); System.out.println("Parse from zip: "+zipEntry.getName()+" as "+guessnot); - RDFParserCallback parser = RDFParserFactory.getParserCallback(guessnot); + RDFParserCallback parser = RDFParserFactory.getParserCallback(guessnot, simple); parser.doParse(nonCloseIn, baseUri, guessnot, keepBNode, callback); - }catch (IllegalArgumentException e1) { - e1.printStackTrace(); - }catch (ParserException e1) { + } catch (IllegalArgumentException | ParserException e1) { e1.printStackTrace(); } } } // Don't close passed stream. - } catch (FileNotFoundException e) { - e.printStackTrace(); - throw new ParserException(); } catch (Exception e) { e.printStackTrace(); - throw new ParserException(); + throw new ParserException(e); } } diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/rdf/parsers/RDFParserSimpleTest.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/rdf/parsers/RDFParserSimpleTest.java index 662b85b5..72a30761 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/rdf/parsers/RDFParserSimpleTest.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/rdf/parsers/RDFParserSimpleTest.java @@ -1,11 +1,77 @@ package org.rdfhdt.hdt.rdf.parsers; +import org.junit.Assert; +import org.junit.Test; +import org.rdfhdt.hdt.enums.RDFNotation; +import org.rdfhdt.hdt.exceptions.ParserException; import org.rdfhdt.hdt.rdf.RDFParserCallback; +import org.rdfhdt.hdt.triples.TripleString; +import org.rdfhdt.hdt.util.LargeFakeDataSetStreamSupplier; +import org.rdfhdt.hdt.util.StopWatch; + +import java.io.IOException; +import java.io.PipedInputStream; +import java.io.PipedOutputStream; +import java.io.PrintStream; +import java.util.Iterator; public class RDFParserSimpleTest extends AbstractNTriplesParserTest { + @Override + protected RDFParserCallback createParser() { + return new RDFParserSimple(); + } + + @Test + public void ingestTest() throws IOException, InterruptedException, ParserException { + LargeFakeDataSetStreamSupplier supplier = + LargeFakeDataSetStreamSupplier.createSupplierWithMaxTriples(1_000_001, 42); + LargeFakeDataSetStreamSupplier supplier2 = + LargeFakeDataSetStreamSupplier.createSupplierWithMaxTriples(1_000_001, 42); + + PipedOutputStream out = new PipedOutputStream(); + PipedInputStream in = new PipedInputStream(); + in.connect(out); + + RuntimeException[] re = new RuntimeException[1]; + Thread t = new Thread(() -> { + try { + Iterator it = supplier.createTripleStringStream(); + PrintStream ps = new PrintStream(out); + while (it.hasNext()) { + TripleString next = it.next(); + next.dumpNtriple(ps); + ps.flush(); + } + out.close(); + } catch (RuntimeException tt) { + re[0] = tt; + } catch (Throwable tt) { + re[0] = new RuntimeException(tt); + } + }); + t.start(); + + RDFParserCallback parser = createParser(); + + Iterator it = supplier2.createTripleStringStream(); + + int[] count = new int[1]; + StopWatch watch = new StopWatch(); + watch.reset(); + parser.doParse(in, "http://example.org/#", RDFNotation.NTRIPLES, true, + (triple, pos) -> { + Assert.assertTrue(it.hasNext()); + Assert.assertEquals(it.next(), triple); + if (count[0] % 100_000 == 0) { + System.out.println(count[0] + " triples " +watch.stopAndShow()); + } + count[0]++; + } + ); - @Override - protected RDFParserCallback createParser() { - return new RDFParserSimple(); - } + t.join(); + if (re[0] != null) { + throw re[0]; + } + } } diff --git a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java index faf9a394..e096411d 100644 --- a/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java +++ b/hdt-java-core/src/test/java/org/rdfhdt/hdt/util/LargeFakeDataSetStreamSupplier.java @@ -119,11 +119,11 @@ private CharSequence createSubject() { } private CharSequence createPredicate() { - return ""; + return "http://w" + random.nextInt(maxElementSplit) + "i.test.org/#Obj" + random.nextInt(maxElementSplit); } private CharSequence createType() { - return ""; + return "http://wti.test.org/#Obj" + random.nextInt(maxFakeType); } private CharSequence createValue() { @@ -137,7 +137,7 @@ private CharSequence createValue() { return text + "@" + stringNameOfInt(random.nextInt(maxElementSplit)); } else { // typed node - return text + "^^" + createType(); + return text + "^^<" + createType() + ">"; } }