Skip to content

Commit

Permalink
Showing 8 changed files with 233 additions and 118 deletions.
45 changes: 10 additions & 35 deletions src/org/exist/xquery/functions/fn/FunAnalyzeString.java
Original file line number Diff line number Diff line change
@@ -21,6 +21,8 @@

import javax.xml.XMLConstants;

import static org.exist.xquery.regex.RegexUtil.*;

/**
* XPath and XQuery 3.0 F+O fn:analyze-string()
*
@@ -103,14 +105,15 @@ public Sequence eval(final Sequence[] args, final Sequence contextSequence) thro
return (NodeValue)builder.getDocument().getDocumentElement();
}

private void analyzeString(final MemTreeBuilder builder, final String input, final String pattern, final String flags) throws XPathException {
final Pattern ptn;
if (flags != null) {
final int iFlags = parseStringFlags(flags);
ptn = PatternFactory.getInstance().getPattern(pattern, iFlags);
} else {
ptn = PatternFactory.getInstance().getPattern(pattern);
private void analyzeString(final MemTreeBuilder builder, final String input, String pattern, final String flags) throws XPathException {

final int iFlags = parseFlags(this, flags);

if(!hasLiteral(iFlags)) {
pattern = translateRegexp(this, pattern, hasIgnoreWhitespace(iFlags), hasCaseInsensitive(iFlags));
}

final Pattern ptn = PatternFactory.getInstance().getPattern(pattern, iFlags);

final Matcher matcher = ptn.matcher(input);

@@ -181,32 +184,4 @@ private void nonMatch(final MemTreeBuilder builder, final String nonMatch) {
builder.characters(nonMatch);
builder.endElement();
}

/**
 * Converts a string of regular expression flag letters into a
 * {@link Pattern} flag bitmask.
 *
 * Recognised letters are 's' (DOTALL), 'm' (MULTILINE), 'i' (CASE_INSENSITIVE),
 * 'x' (COMMENTS) and 'q' (LITERAL). Any other character is skipped
 * without raising an error.
 *
 * @param flags the flag letters to convert
 * @return the bitwise OR of the {@link Pattern} constants for the recognised letters
 */
private int parseStringFlags(final String flags) {
    int mask = 0;
    for (int idx = 0; idx < flags.length(); idx++) {
        final char letter = flags.charAt(idx);
        if (letter == 's') {
            mask |= Pattern.DOTALL;
        } else if (letter == 'm') {
            mask |= Pattern.MULTILINE;
        } else if (letter == 'i') {
            mask |= Pattern.CASE_INSENSITIVE;
        } else if (letter == 'x') {
            mask |= Pattern.COMMENTS;
        } else if (letter == 'q') {
            mask |= Pattern.LITERAL;
        }
        // unrecognised letters fall through untouched
    }
    return mask;
}
}
83 changes: 10 additions & 73 deletions src/org/exist/xquery/functions/fn/FunMatches.java
Original file line number Diff line number Diff line change
@@ -30,8 +30,6 @@
import org.exist.storage.NativeValueIndex;
import org.exist.util.PatternFactory;
import org.exist.xquery.pragmas.Optimize;
import org.exist.xquery.regex.JDK15RegexTranslator;
import org.exist.xquery.regex.RegexSyntaxException;
import org.exist.xquery.*;
import org.exist.xquery.util.Error;
import org.exist.xquery.value.BooleanValue;
@@ -48,6 +46,8 @@
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import static org.exist.xquery.regex.RegexUtil.*;

/**
* Implements the fn:matches() function.
*
@@ -214,7 +214,7 @@ public NodeSet preSelect(Sequence contextSequence, boolean useContext) throws XP
final int flags;
if(getSignature().getArgumentCount() == 3) {
final String flagsArg = getArgument(2).eval(contextSequence).getStringValue();
flags = parseFlags(flagsArg);
flags = parseFlags(this, flagsArg);
} else {
flags = 0;
}
@@ -232,7 +232,7 @@ public NodeSet preSelect(Sequence contextSequence, boolean useContext) throws XP
} else {
final boolean ignoreWhitespace = hasIgnoreWhitespace(flags);
final boolean caseBlind = !caseSensitive;
pattern = translateRegexp(getArgument(1).eval(contextSequence).getStringValue(), ignoreWhitespace, caseBlind);
pattern = translateRegexp(this, getArgument(1).eval(contextSequence).getStringValue(), ignoreWhitespace, caseBlind);
}
}

@@ -250,18 +250,6 @@ public NodeSet preSelect(Sequence contextSequence, boolean useContext) throws XP
return preselectResult;
}

/**
 * Tests whether the {@link Pattern#LITERAL} bit is set.
 *
 * @param flags a {@link Pattern} flag bitmask
 * @return true if the literal bit is present in {@code flags}
 */
protected boolean hasLiteral(final int flags) {
    final int literalBit = flags & Pattern.LITERAL;
    return literalBit != 0;
}

/**
 * Tests whether either of the case-insensitivity bits
 * ({@link Pattern#CASE_INSENSITIVE} or {@link Pattern#UNICODE_CASE}) is set.
 *
 * @param flags a {@link Pattern} flag bitmask
 * @return true if at least one of the two bits is present in {@code flags}
 */
protected boolean hasCaseInsensitive(final int flags) {
    final int caseBits = Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;
    return (flags & caseBits) != 0;
}

/**
 * Tests whether the {@link Pattern#COMMENTS} bit is set.
 *
 * @param flags a {@link Pattern} flag bitmask
 * @return true if the comments bit is present in {@code flags}
 */
protected boolean hasIgnoreWhitespace(final int flags) {
    final int commentsBit = flags & Pattern.COMMENTS;
    return commentsBit != 0;
}

@Override
public int getDependencies() {
final Expression stringArg = getArgument(0);
@@ -386,7 +374,7 @@ private Sequence evalWithIndex(Sequence contextSequence, Item contextItem, Seque
final int flags;
if(getSignature().getArgumentCount() == 3) {
final String flagsArg = getArgument(2).eval(contextSequence, contextItem).getStringValue();
flags = parseFlags(flagsArg);
flags = parseFlags(this, flagsArg);
} else {
flags = 0;
}
@@ -406,7 +394,7 @@ private Sequence evalWithIndex(Sequence contextSequence, Item contextItem, Seque
} else {
final boolean ignoreWhitespace = hasIgnoreWhitespace(flags);
final boolean caseBlind = !caseSensitive;
pattern = translateRegexp(getArgument(1).eval(contextSequence, contextItem).getStringValue(), ignoreWhitespace, caseBlind);
pattern = translateRegexp(this, getArgument(1).eval(contextSequence, contextItem).getStringValue(), ignoreWhitespace, caseBlind);
}
}

@@ -480,27 +468,6 @@ private Sequence evalFallback(NodeSet nodes, String pattern, int flags, int inde
return result;
}

/**
 * Rewrites a regular expression into Java regex syntax by delegating to
 * {@code JDK15RegexTranslator.translate}.
 *
 * @param pattern the regular expression to translate
 * @param ignoreWhitespace true if whitespace is to be ignored ('x' flag)
 * @param caseBlind true if case is to be ignored ('i' flag)
 * @return the translated regular expression
 * @throws XPathException if the translator rejects the pattern with a
 *     {@code RegexSyntaxException}; the original exception is kept as the cause
 */
protected String translateRegexp(final String pattern, final boolean ignoreWhitespace, final boolean caseBlind) throws XPathException {
    final int xmlVersion = 11;
    final String javaRegex;
    try {
        javaRegex = JDK15RegexTranslator.translate(pattern, xmlVersion, true, ignoreWhitespace, caseBlind);
    } catch (final RegexSyntaxException syntaxError) {
        final String reason = "Conversion from XPath2 to Java regular expression syntax failed: " + syntaxError.getMessage();
        throw new XPathException(this, reason, syntaxError);
    }
    return javaRegex;
}

/**
* @param contextSequence
* @param contextItem
@@ -513,7 +480,7 @@ private Sequence evalGeneric(Sequence contextSequence, Item contextItem, Sequenc

final int flags;
if(getSignature().getArgumentCount() == 3) {
flags = parseFlags(getArgument(2).eval(contextSequence, contextItem).getStringValue());
flags = parseFlags(this, getArgument(2).eval(contextSequence, contextItem).getStringValue());
} else {
flags = 0;
}
@@ -529,7 +496,7 @@ private Sequence evalGeneric(Sequence contextSequence, Item contextItem, Sequenc
} else {
final boolean ignoreWhitespace = hasIgnoreWhitespace(flags);
final boolean caseBlind = hasCaseInsensitive(flags);
pattern = translateRegexp(getArgument(1).eval(contextSequence, contextItem).getStringValue(), ignoreWhitespace, caseBlind);
pattern = translateRegexp(this, getArgument(1).eval(contextSequence, contextItem).getStringValue(), ignoreWhitespace, caseBlind);
}
}

@@ -560,43 +527,13 @@ private boolean match(String string, String pattern, int flags) throws XPathExce
}
}

/**
 * Converts a string of regular expression flag letters into a
 * {@link Pattern} flag bitmask.
 *
 * 'm' maps to MULTILINE, 'i' to CASE_INSENSITIVE plus UNICODE_CASE,
 * 'x' to COMMENTS, 's' to DOTALL and 'q' to LITERAL.
 *
 * @param s the flag letters to convert
 * @return the bitwise OR of the {@link Pattern} constants for the given letters
 * @throws XPathException if {@code s} contains any other character
 */
protected final static int parseFlags(final String s) throws XPathException {
    int mask = 0;
    for (final char flag : s.toCharArray()) {
        if (flag == 'm') {
            mask |= Pattern.MULTILINE;
        } else if (flag == 'i') {
            mask |= Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;
        } else if (flag == 'x') {
            mask |= Pattern.COMMENTS;
        } else if (flag == 's') {
            mask |= Pattern.DOTALL;
        } else if (flag == 'q') {
            mask |= Pattern.LITERAL;
        } else {
            throw new XPathException("err:FORX0001: Invalid regular expression flag: " + flag);
        }
    }
    return mask;
}

/**
 * Delegates to the superclass reset and then clears the
 * {@code hasUsedIndex} flag.
 */
@Override
public void reset() {
    super.reset();
    this.hasUsedIndex = false;
}

/*
* (non-Javadoc)
*
* @see org.exist.xquery.AbstractExpression#resetState()
*/
@Override
public void resetState(boolean postOptimization) {
super.resetState(postOptimization);
if (!postOptimization)
6 changes: 4 additions & 2 deletions src/org/exist/xquery/functions/fn/FunReplace.java
Original file line number Diff line number Diff line change
@@ -46,6 +46,8 @@
import org.exist.xquery.value.StringValue;
import org.exist.xquery.value.Type;

import static org.exist.xquery.regex.RegexUtil.*;

/**
* @author Wolfgang Meier (wolfgang@exist-db.org)
*/
@@ -166,7 +168,7 @@ public Sequence eval(Sequence contextSequence, Item contextItem) throws XPathExc
} else {
final int flags;
if (getSignature().getArgumentCount() == 4) {
flags = parseFlags(getArgument(3).eval(contextSequence, contextItem).getStringValue());
flags = parseFlags(this, getArgument(3).eval(contextSequence, contextItem).getStringValue());
} else {
flags = 0;
}
@@ -187,7 +189,7 @@ public Sequence eval(Sequence contextSequence, Item contextItem) throws XPathExc
.replace("\\", "\\\\")
.replace("$", "\\$");
} else {
pattern = translateRegexp(patternSeq.getStringValue(), hasIgnoreWhitespace(flags), hasCaseInsensitive(flags));
pattern = translateRegexp(this, patternSeq.getStringValue(), hasIgnoreWhitespace(flags), hasCaseInsensitive(flags));
}

//An error is raised [err:FORX0004] if the value of $replacement contains a "$" character that is not immediately followed by a digit 0-9 and not immediately preceded by a "\".
5 changes: 3 additions & 2 deletions src/org/exist/xquery/functions/fn/FunTokenize.java
Original file line number Diff line number Diff line change
@@ -40,6 +40,7 @@
import org.exist.xquery.value.ValueSequence;

import static org.exist.xquery.FunctionDSL.*;
import static org.exist.xquery.regex.RegexUtil.*;

/**
* @author Wolfgang Meier (wolfgang@exist-db.org)
@@ -100,7 +101,7 @@ public Sequence eval(final Sequence contextSequence, final Item contextItem) thr
} else {
final int flags;
if (getSignature().getArgumentCount() == 3) {
flags = parseFlags(getArgument(2).eval(contextSequence, contextItem)
flags = parseFlags(this, getArgument(2).eval(contextSequence, contextItem)
.getStringValue());
} else {
flags = 0;
@@ -117,7 +118,7 @@ public Sequence eval(final Sequence contextSequence, final Item contextItem) thr
} else {
final boolean ignoreWhitespace = hasIgnoreWhitespace(flags);
final boolean caseBlind = !hasCaseInsensitive(flags);
pattern = translateRegexp(getArgument(1).eval(contextSequence, contextItem).getStringValue(), ignoreWhitespace, caseBlind);
pattern = translateRegexp(this, getArgument(1).eval(contextSequence, contextItem).getStringValue(), ignoreWhitespace, caseBlind);
}
}

62 changes: 57 additions & 5 deletions src/org/exist/xquery/regex/JDK15RegexTranslator.java
Original file line number Diff line number Diff line change
@@ -5,6 +5,7 @@

import org.exist.util.FastStringBuffer;
import org.exist.util.UTF16CharacterSet;
import org.exist.util.XMLString;
import org.exist.xquery.value.StringValue;

/**
@@ -18,6 +19,8 @@
* characters, since JDK 1.5 handles these natively.
*
* Copied from Saxon-HE 9.2 package net.sf.saxon.regex.
*
* Updated for Non-capturing Groups in XQuery 3.0 by Adam Retter
*/
public class JDK15RegexTranslator extends RegexTranslator {

@@ -450,11 +453,20 @@ protected boolean translateAtom() throws RegexSyntaxException {
return false;
case '(':
copyCurChar();
final int thisCapture = ++currentCapture;
translateRegExp();
expect(')');
captures.add(thisCapture);
copyCurChar();
final boolean nonCapturing = isNonCapturing();
if(nonCapturing) {
copyCurChar(); // ?
copyCurChar(); // :
translateRegExp();
expect(')');
copyCurChar();
} else {
final int thisCapture = ++currentCapture;
translateRegExp();
expect(')');
captures.add(thisCapture);
copyCurChar();
}
return true;
case '\\':
advance();
@@ -506,6 +518,46 @@ protected boolean translateAtom() throws RegexSyntaxException {
return true;
}

/**
 * Looks ahead from the current position to decide whether the group being
 * opened is a non-capturing group, i.e. whether the next significant
 * characters are {@code ?} followed by {@code :}.
 *
 * Only local copies of the position and current character are advanced;
 * no field is assigned by this method. When {@code ignoreWhitespace} is set,
 * whitespace before the {@code ?} and between the {@code ?} and the
 * {@code :} is skipped.
 *
 * @return true if {@code ?:} is found; false otherwise, including when the
 *     look-ahead runs out of characters
 */
private boolean isNonCapturing() {
// NOTE(review): assumes pos is the index of the character following curChar — confirm against advance()
int localPos = pos;
// NOTE(review): every read below is guarded by localPos + 1 < length, so the final
// character of regExp is never examined by the look-ahead — confirm this is intended
if (localPos + 1 < length) {
char localChar = curChar;

if (ignoreWhitespace) {
// skip whitespace preceding the '?'
while (XMLString.isWhiteSpace(localChar)) {
if (localPos + 1 < length) {
localChar = regExp.charAt(localPos++);
} else {
return false;
}
}
}

if(localChar == '?') {

if (localPos + 1 < length) {
// step to the character after the '?'
localChar = regExp.charAt(localPos++);

if (ignoreWhitespace) {
// skip whitespace between the '?' and the ':'
while (XMLString.isWhiteSpace(localChar)) {
if (localPos + 1 < length) {
localChar = regExp.charAt(localPos++);
} else {
return false;
}
}
}

if(localChar == ':') {
return true;
}
}
}
}

return false;
}

private static CharClass makeNameCharClass(byte mask) {
final List ranges = new ArrayList();
// Add colon to the set of characters matched
3 changes: 2 additions & 1 deletion src/org/exist/xquery/regex/RegexTranslator.java
Original file line number Diff line number Diff line change
@@ -4,6 +4,7 @@
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.exist.util.FastStringBuffer;
import org.exist.util.UTF16CharacterSet;
@@ -28,7 +29,7 @@ public abstract class RegexTranslator {
protected char curChar;
protected boolean eos = false;
protected int currentCapture = 0;
protected HashSet captures = new HashSet(); //IntHashSet
protected Set<Integer> captures = new HashSet<>();
protected final FastStringBuffer result = new FastStringBuffer(64);

protected void translateTop() throws RegexSyntaxException {
132 changes: 132 additions & 0 deletions src/org/exist/xquery/regex/RegexUtil.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
/*
* eXist Open Source Native XML Database
* Copyright (C) 2001-2017 The eXist Project
* http://exist-db.org
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/

package org.exist.xquery.regex;

import org.exist.xquery.ErrorCodes;
import org.exist.xquery.Expression;
import org.exist.xquery.XPathException;
import org.exist.xquery.value.StringValue;

import javax.annotation.Nullable;
import java.util.regex.Pattern;

/**
 * Static helpers for working with XQuery regular expressions: parsing the
 * flags argument, inspecting the resulting Java flags, and translating an
 * XQuery regular expression into Java regular expression syntax.
 *
 * @author Adam Retter &lt;adam@exist-db.org&gt;
 */
public final class RegexUtil {

    /**
     * Not instantiable: this class only exposes static helpers.
     */
    private RegexUtil() {
    }

    /**
     * Parses the flags for an XQuery Regular Expression.
     *
     * The recognised flag letters are 'm' (multi-line), 'i' (case-insensitive,
     * which also enables Unicode-aware case folding), 'x' (ignore whitespace),
     * 's' (dot-all) and 'q' (literal).
     *
     * @param context The calling expression, attached to any error that is raised.
     * @param strFlags The XQuery Regular Expression flags, or null if there are none.
     *
     * @return The flags for a Java Regular Expression; 0 if {@code strFlags} is null or empty.
     *
     * @throws XPathException with error code FORX0001 if {@code strFlags} contains
     *     a character that is not one of the recognised flag letters.
     */
    public static int parseFlags(final Expression context, @Nullable final String strFlags) throws XPathException {
        int flags = 0;
        if (strFlags != null) {
            for (int i = 0; i < strFlags.length(); i++) {
                final char ch = strFlags.charAt(i);
                switch (ch) {
                    case 'm':
                        flags |= Pattern.MULTILINE;
                        break;

                    case 'i':
                        flags = flags | Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;
                        break;

                    case 'x':
                        flags |= Pattern.COMMENTS;
                        break;

                    case 's':
                        flags |= Pattern.DOTALL;
                        break;

                    case 'q':
                        flags |= Pattern.LITERAL;
                        break;

                    default:
                        throw new XPathException(context, ErrorCodes.FORX0001, "Invalid regular expression flag: " + ch, new StringValue(String.valueOf(ch)));
                }
            }
        }
        return flags;
    }

    /**
     * Determines if the Java Regular Expression flags have the literal flag set.
     *
     * @param flags The Java Regular Expression flags
     *
     * @return true if the literal flag is set
     */
    public static boolean hasLiteral(final int flags) {
        return (flags & Pattern.LITERAL) != 0;
    }

    /**
     * Determines if the Java Regular Expression flags have the case-insensitive flag set.
     *
     * Either {@link Pattern#CASE_INSENSITIVE} or {@link Pattern#UNICODE_CASE}
     * being present counts as case-insensitive.
     *
     * @param flags The Java Regular Expression flags
     *
     * @return true if the case-insensitive flag is set
     */
    public static boolean hasCaseInsensitive(final int flags) {
        return (flags & Pattern.CASE_INSENSITIVE) != 0 || (flags & Pattern.UNICODE_CASE) != 0;
    }

    /**
     * Determines if the Java Regular Expression flags have the ignore-whitespace flag set.
     *
     * @param flags The Java Regular Expression flags
     *
     * @return true if the ignore-whitespace flag is set
     */
    public static boolean hasIgnoreWhitespace(final int flags) {
        return (flags & Pattern.COMMENTS) != 0;
    }

    /**
     * Translates the Regular Expression from XPath3 syntax to Java regex
     * syntax.
     *
     * @param context The calling expression, attached to any error that is raised.
     * @param pattern a String containing a regular expression in the syntax of XPath F&amp;O 3.0.
     * @param ignoreWhitespace true if whitespace is to be ignored ('x' flag)
     * @param caseBlind true if case is to be ignored ('i' flag)
     *
     * @return The Java Regular Expression
     *
     * @throws XPathException with error code FORX0002 if the XQuery Regular Expression is invalid.
     */
    public static String translateRegexp(final Expression context, final String pattern, final boolean ignoreWhitespace, final boolean caseBlind) throws XPathException {
        // convert pattern to Java regex syntax
        try {
            // NOTE(review): 11 presumably selects XML 1.1 rules in the translator — confirm against JDK15RegexTranslator
            final int xmlVersion = 11;
            return JDK15RegexTranslator.translate(pattern, xmlVersion, true, ignoreWhitespace, caseBlind);
        } catch (final RegexSyntaxException e) {
            // the offending pattern travels with the error; the syntax exception is preserved as the cause
            throw new XPathException(context, ErrorCodes.FORX0002, "Conversion from XPath F&O 3.0 regular expression syntax to Java regular expression syntax failed: " + e.getMessage(), new StringValue(pattern), e);
        }
    }
}
15 changes: 15 additions & 0 deletions test/src/xquery/regex.xml
Original file line number Diff line number Diff line change
@@ -77,6 +77,21 @@
<code>fn:replace("a/b/c", "/", "$", "q")</code>
<expected>a$b$c</expected>
</test>
<test output="text">
<task>fn:replace-capturing-1</task>
<code>fn:replace("hello", "hel(lo)", "$1")</code>
<expected>lo</expected>
</test>
<test output="text">
<task>fn:replace-non-capturing-1</task>
<code>fn:replace("hello", "hel(?:lo)", "$1")</code>
<error>FORX0001</error>
</test>
<test output="text">
<task>fn:replace-non-capturing-2</task>
<code>fn:replace("hello", "h(?:el(lo))", "$1")</code>
<expected>lo</expected>
</test>

<test output="text">
<task>fn:tokenize-qflag-1</task>

0 comments on commit e6030ca

Please sign in to comment.