Skip to content

Commit

Permalink
Merge pull request #163 from ate47/simple_parser
Browse files Browse the repository at this point in the history
allow using nt simple parser in RDF2HDT cli/generateHDT
  • Loading branch information
D063520 authored May 17, 2022
2 parents 306f784 + fea2182 commit 162627c
Show file tree
Hide file tree
Showing 15 changed files with 283 additions and 78 deletions.
87 changes: 66 additions & 21 deletions hdt-api/src/main/java/org/rdfhdt/hdt/triples/TripleString.java
Original file line number Diff line number Diff line change
Expand Up @@ -204,46 +204,91 @@ public boolean hasEmpty() {
* @throws ParserException if the line is not RDF compliant
*/
public void read(String line) throws ParserException {
    // The whole line is the region to parse: from index 0 up to its length.
    final int lineEnd = line.length();
    this.read(line, 0, lineEnd);
}

/**
 * Find the first component separator (a space or a tab) inside the region
 * {@code [start, end)} of a line.
 *
 * <p>Both separators are checked in one left-to-right scan, so the separator
 * that occurs first is returned whichever of the two it is. Searching for a
 * space first and only then for a tab would skip over an earlier tab on lines
 * that mix both separators.</p>
 *
 * @param line  line to search in
 * @param start index to start the search from (inclusive); negative values
 *              are treated as 0
 * @param end   index to stop the search at (exclusive); values past the end
 *              of the line are clamped to the line length
 * @return the index of the first space or tab in the region, or -1 if the
 *         region contains neither
 */
private static int searchNextTabOrSpace(String line, int start, int end) {
    // Clamp the region so that the scan never leaves the line and never
    // inspects characters located after the requested end.
    final int from = Math.max(start, 0);
    final int limit = Math.min(end, line.length());

    for (int i = from; i < limit; i++) {
        final char c = line.charAt(i);
        if (c == ' ' || c == '\t') {
            return i;
        }
    }

    // not found
    return -1;
}

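/**
* Read a triple from the region {@code [start, end)} of a line, where each component is separated by a space or a tab.
* @param line line to read
* @param start index of the first character of the region to read (inclusive)
* @param end index following the last character of the region to read (exclusive)
* @throws ParserException if the line is not RDF compliant
*/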
public void read(String line, int start, int end) throws ParserException {
int split, posa, posb;
this.clear();

line = line.replace("\\t"," ");

// SET SUBJECT
posa = 0;
posb = split = line.indexOf(' ', posa);
posa = start;
posb = split = searchNextTabOrSpace(line, posa, end);

if(posb==-1) return; // Not found, error.
if(line.charAt(posa)=='<') posa++; // Remove <
if(line.charAt(posb-1)=='>') posb--; // Remove >
if (posb == -1) {
// Not found, error.
return;
}
if (line.charAt(posa) == '<') {
posa++; // Remove <
if (line.charAt(posb-1) == '>') {
posb--; // Remove >
}
}

this.setSubject(UnicodeEscape.unescapeString(line.substring(posa, posb)));
this.setSubject(UnicodeEscape.unescapeString(line, posa, posb));

// SET PREDICATE
posa = split+1;
posb = split = line.indexOf(' ', posa);
posa = split + 1;
posb = split = searchNextTabOrSpace(line, posa, end);

if(posb==-1) return;
if(line.charAt(posa)=='<') posa++;
if(posb>posa && line.charAt(posb-1)=='>') posb--;
if (posb == -1) {
return;
}
if (line.charAt(posa) == '<') {
posa++;
if (posb > posa && line.charAt(posb - 1) == '>') {
posb--;
}
}

this.setPredicate(UnicodeEscape.unescapeString(line.substring(posa, posb)));
this.setPredicate(UnicodeEscape.unescapeString(line, posa, posb));

// SET OBJECT
posa = split+1;
posb = line.length();
posa = split + 1;
posb = end;

if(line.charAt(posb-1)=='.') posb--; // Remove trailing <space> <dot> from NTRIPLES.
if(line.charAt(posb-1)==' ') posb--;
// Remove trailing <space> <dot> from NTRIPLES.
if (line.charAt(posb-1) == '.') {
posb--;
}
char prev = line.charAt(posb-1);
if (prev == ' ' || prev == '\t') {
posb--;
}

if(line.charAt(posa)=='<') {
if (line.charAt(posa) == '<') {
posa++;

// Remove trailing > only if < appears, so "some"^^<http://datatype> is kept as-is.
if(posb>posa && line.charAt(posb-1)=='>') posb--;
if (posb > posa && line.charAt(posb-1)=='>') {
posb--;
}
}

this.setObject(UnicodeEscape.unescapeString(line.substring(posa, posb)));
this.setObject(UnicodeEscape.unescapeString(line, posa, posb));
}

/*
Expand Down
33 changes: 23 additions & 10 deletions hdt-api/src/main/java/org/rdfhdt/hdt/util/UnicodeEscape.java
Original file line number Diff line number Diff line change
Expand Up @@ -140,32 +140,45 @@ else if (cInt >= 0x10000 && cInt <= 0x10FFFF) {

appendable.append(label.subSequence(last+1, label.length()));
}

/**
* Unescapes an escaped Unicode string. Any Unicode sequences
* (<code>&#x5C;uxxxx</code> and <code>&#x5C;Uxxxxxxxx</code>) are restored to the
* value indicated by the hexadecimal argument and any backslash-escapes
* (<code>\"</code>, <code>\\</code>, etc.) are decoded to their original form.
*
*
* @param s An escaped Unicode string.
* @return The unescaped string.
* @throws IllegalArgumentException If the supplied string is not a
* correctly escaped N-Triples string.
*/
public static String unescapeString(String s) {
int backSlashIdx = s.indexOf('\\');
return unescapeString(s, 0, s.length());
}
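/**
* Unescapes the region {@code [start, sLength)} of an escaped Unicode string. Any Unicode sequences
* (<code>&#x5C;uxxxx</code> and <code>&#x5C;Uxxxxxxxx</code>) are restored to the
* value indicated by the hexadecimal argument and any backslash-escapes
* (<code>\"</code>, <code>\\</code>, etc.) are decoded to their original form.
*
* @param s An escaped Unicode string.
* @param start Index of the first character of the region to unescape (inclusive).
* @param sLength Index following the last character of the region to unescape (exclusive);
* despite its name this is an end index into {@code s}, not a number of characters.
* @return The unescaped content of the region.
* @throws IllegalArgumentException If the supplied string is not a
* correctly escaped N-Triples string.
*/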
public static String unescapeString(String s, int start, int sLength) {
int backSlashIdx = s.indexOf('\\', start);

if (backSlashIdx == -1) {
if (backSlashIdx == -1 || backSlashIdx >= sLength) {
// No escaped characters found
return s;
return s.substring(start, sLength);
}

int startIdx = 0;
int sLength = s.length();
int startIdx = start;
StringBuilder sb = new StringBuilder(sLength);

while (backSlashIdx != -1) {
sb.append(s.substring(startIdx, backSlashIdx));
while (backSlashIdx != -1 && backSlashIdx < sLength) {
sb.append(s, startIdx, backSlashIdx);

if (backSlashIdx + 1 >= sLength) {
throw new IllegalArgumentException("Unescaped backslash in: " + s);
Expand Down Expand Up @@ -238,7 +251,7 @@ else if (c == 'U') {
backSlashIdx = s.indexOf('\\', startIdx);
}

sb.append(s.substring(startIdx));
sb.append(s, startIdx, sLength);

return sb.toString();
}
Expand Down
11 changes: 9 additions & 2 deletions hdt-java-cli/src/main/java/org/rdfhdt/hdt/tools/RDF2HDT.java
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,12 @@ public class RDF2HDT implements ProgressListener {

@Parameter(names = "-index", description = "Generate also external indices to solve all queries")
public boolean generateIndex;

@Parameter(names = "-quiet", description = "Do not show progress of the conversion")
public boolean quiet;

@Parameter(names = "-canonicalntfile", description = "Only for NTriples input. Use a Fast NT file parser the input should be in a canonical form. See https://www.w3.org/TR/n-triples/#h2_canonical-ntriples")
public boolean ntSimpleLoading;

public void execute() throws ParserException, IOException {
HDTSpecification spec;
Expand All @@ -88,7 +91,7 @@ public void execute() throws ParserException, IOException {
if(baseURI==null) {
baseURI = "file://"+rdfInput;
}

RDFNotation notation=null;
if(rdfType!=null) {
try {
Expand All @@ -107,6 +110,10 @@ public void execute() throws ParserException, IOException {
}
}

if (ntSimpleLoading) {
spec.set("parser.ntSimpleParser", "true");
}

StopWatch sw = new StopWatch();
HDT hdt = HDTManager.generateHDT(rdfInput, baseURI,notation , spec, this);
System.out.println("File converted in: "+sw.stopAndShow());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@

public class HDTManagerImpl extends HDTManager {

/**
 * Tell whether the options ask for the simple N-Triples parser.
 *
 * <p>The option {@code parser.ntSimpleParser} is considered enabled for any
 * value that is set, non-empty and not exactly {@code "false"}.</p>
 *
 * @param spec options to read the {@code parser.ntSimpleParser} key from
 * @return true if the option is enabled, false otherwise
 */
private boolean useSimple(HDTOptions spec) {
    final String option = spec.get("parser.ntSimpleParser");

    // Missing or empty value: the option was not requested.
    if (option == null || option.isEmpty()) {
        return false;
    }

    // Every other value enables the option, except the literal "false".
    return !"false".equals(option);
}

@Override
public HDTOptions doReadOptions(String file) throws IOException {
return new HDTSpecification(file);
Expand Down Expand Up @@ -90,9 +95,9 @@ public HDT doGenerateHDT(String rdfFileName, String baseURI, RDFNotation rdfNota
String loaderType = spec.get("loader.type");
TempHDTImporter loader;
if ("two-pass".equals(loaderType)) {
loader = new TempHDTImporterTwoPass();
loader = new TempHDTImporterTwoPass(useSimple(spec));
} else {
loader = new TempHDTImporterOnePass();
loader = new TempHDTImporterOnePass(useSimple(spec));
}

// Create TempHDT
Expand All @@ -118,7 +123,7 @@ public HDT doGenerateHDT(String rdfFileName, String baseURI, RDFNotation rdfNota
@Override
public HDT doGenerateHDT(Iterator<TripleString> triples, String baseURI, HDTOptions spec, ProgressListener listener) throws IOException {
//choose the importer
TempHDTImporterOnePass loader = new TempHDTImporterOnePass();
TempHDTImporterOnePass loader = new TempHDTImporterOnePass(false);

// Create TempHDT
TempHDT modHdt = loader.loadFromTriples(spec, triples, baseURI, listener);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,17 @@ public void processTriple(TripleString triple, long pos) {
}
}

// Flag handed to RDFParserFactory.getParserCallback(notation, useSimple) when loading from an RDF file.
private final boolean useSimple;

/**
* Create a one-pass importer.
* @param useSimple flag forwarded to {@code RDFParserFactory.getParserCallback(notation, useSimple)}
* to select the parser used by {@code loadFromRDF}
*/
public TempHDTImporterOnePass(boolean useSimple) {
this.useSimple = useSimple;
}

@Override
public TempHDT loadFromRDF(HDTOptions specs, String filename, String baseUri, RDFNotation notation, ProgressListener listener)
throws ParserException {

RDFParserCallback parser = RDFParserFactory.getParserCallback(notation);
RDFParserCallback parser = RDFParserFactory.getParserCallback(notation, useSimple);

// Create Modifiable Instance
TempHDT modHDT = new TempHDTImpl(specs, baseUri, ModeOfLoading.ONE_PASS);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,11 +99,17 @@ public void processTriple(TripleString triple, long pos) {
}
}

@Override
// Flag handed to RDFParserFactory.getParserCallback(notation, useSimple) when loading from an RDF file.
private final boolean useSimple;

/**
* Create a two-pass importer.
* @param useSimple flag forwarded to {@code RDFParserFactory.getParserCallback(notation, useSimple)}
* to select the parser used by {@code loadFromRDF}
*/
public TempHDTImporterTwoPass(boolean useSimple) {
this.useSimple = useSimple;
}

@Override
public TempHDT loadFromRDF(HDTOptions specs, String filename, String baseUri, RDFNotation notation, ProgressListener listener)
throws ParserException {

RDFParserCallback parser = RDFParserFactory.getParserCallback(notation);
RDFParserCallback parser = RDFParserFactory.getParserCallback(notation, useSimple);

// Create Modifiable Instance and parser
TempHDT modHDT = new TempHDTImpl(specs, baseUri, ModeOfLoading.TWO_PASS);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import org.rdfhdt.hdt.rdf.parsers.RDFParserList;
import org.rdfhdt.hdt.rdf.parsers.RDFParserRAR;
import org.rdfhdt.hdt.rdf.parsers.RDFParserRIOT;
import org.rdfhdt.hdt.rdf.parsers.RDFParserSimple;
import org.rdfhdt.hdt.rdf.parsers.RDFParserTar;
import org.rdfhdt.hdt.rdf.parsers.RDFParserZip;

Expand All @@ -43,24 +44,29 @@
*/
public class RDFParserFactory {
public static RDFParserCallback getParserCallback(RDFNotation notation) {

return getParserCallback(notation, false);
}
public static RDFParserCallback getParserCallback(RDFNotation notation, boolean useSimple) {
switch(notation) {
case NTRIPLES:
case NTRIPLES:
if (useSimple) {
return new RDFParserSimple();
}
case NQUAD:
case TURTLE:
case N3:
case RDFXML:
return new RDFParserRIOT();
case DIR:
return new RDFParserDir();
return new RDFParserDir(useSimple);
case LIST:
return new RDFParserList();
case ZIP:
return new RDFParserZip();
return new RDFParserZip(useSimple);
case TAR:
return new RDFParserTar();
return new RDFParserTar(useSimple);
case RAR:
return new RDFParserRAR();
return new RDFParserRAR(useSimple);
case HDT:
return new RDFParserHDT();
case JSONLD:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,15 @@
*/
public class RDFParserDir implements RDFParserCallback {
private static final Logger log = LoggerFactory.getLogger(RDFParserDir.class);
// Flag handed to RDFParserFactory.getParserCallback(childNotation, simple) when picking the parser of a child file.
private final boolean simple;

/**
* Create a directory parser.
* @param simple flag forwarded to {@code RDFParserFactory.getParserCallback(childNotation, simple)}
* for the child files this parser reads
*/
public RDFParserDir(boolean simple) {
this.simple = simple;
}

/**
* Create a directory parser with the {@code simple} flag set to false.
*/
public RDFParserDir() {
this(false);
}

@Override
public void doParse(String fileName, String baseUri, RDFNotation notation, boolean keepBNode, RDFCallback callback) throws ParserException {
Expand All @@ -46,7 +55,7 @@ private void doParse(Path p, String baseUri, RDFNotation notation, boolean keepB
try {
// get the notation of the file
childNotation = RDFNotation.guess(child.toFile());
rdfParserCallback = RDFParserFactory.getParserCallback(childNotation);
rdfParserCallback = RDFParserFactory.getParserCallback(childNotation, simple);
} catch (IllegalArgumentException e) {
log.warn("Ignore file {}", child, e);
return;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,15 @@
*
*/
public class RDFParserList implements RDFParserCallback {
// Flag handed to RDFParserFactory.getParserCallback(guessnot, simple) when picking the parser of a listed entry.
private final boolean simple;

/**
* Create a list parser.
* @param simple flag forwarded to {@code RDFParserFactory.getParserCallback(guessnot, simple)}
* for each entry read from the list
*/
public RDFParserList(boolean simple) {
this.simple = simple;
}

/**
* Create a list parser with the {@code simple} flag set to false.
*/
public RDFParserList() {
this(false);
}

/* (non-Javadoc)
* @see hdt.rdf.RDFParserCallback#doParse(java.lang.String, java.lang.String, hdt.enums.RDFNotation, hdt.rdf.RDFParserCallback.RDFCallback)
Expand Down Expand Up @@ -88,7 +97,7 @@ private void doParse(BufferedReader reader, String baseUri, RDFNotation notation

RDFNotation guessnot = RDFNotation.guess(line);
System.out.println("Parse from list: "+line+" as "+guessnot);
RDFParserCallback parser = RDFParserFactory.getParserCallback(guessnot);
RDFParserCallback parser = RDFParserFactory.getParserCallback(guessnot, simple);

parser.doParse(line, baseUri, guessnot, keepBNode, callback);
}
Expand Down
Loading

0 comments on commit 162627c

Please sign in to comment.