Skip to content

Commit

Permalink
[SPARK-48977][SQL] Optimize string searching under UTF8_LCASE collation
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?
Modify string search under UTF8_LCASE collation by utilizing UTF8String character iterator to reduce one order of algorithmic complexity.

### Why are the changes needed?
Optimize implementation for `contains`, `startsWith`, `endsWith`, `locate` expressions.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Existing tests.

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes apache#47444 from uros-db/optimize-search.

Authored-by: Uros Bojanic <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
  • Loading branch information
uros-db authored and cloud-fan committed Jul 31, 2024
1 parent 5954ed1 commit 0e07873
Showing 1 changed file with 73 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -85,13 +85,44 @@ private static int lowercaseMatchLengthFrom(
final UTF8String lowercasePattern,
int startPos) {
assert startPos >= 0;
for (int len = 0; len <= target.numChars() - startPos; ++len) {
if (lowerCaseCodePoints(target.substring(startPos, startPos + len))
.equals(lowercasePattern)) {
return len;
// Use code point iterators for efficient string search.
Iterator<Integer> targetIterator = target.codePointIterator();
Iterator<Integer> patternIterator = lowercasePattern.codePointIterator();
// Skip to startPos in the target string.
for (int i = 0; i < startPos; ++i) {
if (targetIterator.hasNext()) {
targetIterator.next();
} else {
return MATCH_NOT_FOUND;
}
}
return MATCH_NOT_FOUND;
// Compare the characters in the target and pattern strings.
int matchLength = 0, codePointBuffer = -1, targetCodePoint, patternCodePoint;
while (targetIterator.hasNext() && patternIterator.hasNext()) {
if (codePointBuffer != -1) {
targetCodePoint = codePointBuffer;
codePointBuffer = -1;
} else {
// Use buffered lowercase code point iteration to handle one-to-many case mappings.
targetCodePoint = getLowercaseCodePoint(targetIterator.next());
if (targetCodePoint == CODE_POINT_COMBINED_LOWERCASE_I_DOT) {
targetCodePoint = CODE_POINT_LOWERCASE_I;
codePointBuffer = CODE_POINT_COMBINING_DOT;
}
++matchLength;
}
patternCodePoint = patternIterator.next();
if (targetCodePoint != patternCodePoint) {
return MATCH_NOT_FOUND;
}
}
// If the pattern string has more characters, or the match is found at the middle of a
// character that maps to multiple characters in lowercase, then match is not found.
if (patternIterator.hasNext() || codePointBuffer != -1) {
return MATCH_NOT_FOUND;
}
// If all characters are equal, return the length of the match in the target string.
return matchLength;
}

/**
Expand Down Expand Up @@ -155,13 +186,45 @@ private static int lowercaseMatchLengthUntil(
final UTF8String target,
final UTF8String lowercasePattern,
int endPos) {
assert endPos <= target.numChars();
for (int len = 0; len <= endPos; ++len) {
if (lowerCaseCodePoints(target.substring(endPos - len, endPos)).equals(lowercasePattern)) {
return len;
assert endPos >= 0;
// Use code point iterators for efficient string search.
Iterator<Integer> targetIterator = target.reverseCodePointIterator();
Iterator<Integer> patternIterator = lowercasePattern.reverseCodePointIterator();
// Skip to startPos in the target string.
for (int i = endPos; i < target.numChars(); ++i) {
if (targetIterator.hasNext()) {
targetIterator.next();
} else {
return MATCH_NOT_FOUND;
}
}
return MATCH_NOT_FOUND;
// Compare the characters in the target and pattern strings.
int matchLength = 0, codePointBuffer = -1, targetCodePoint, patternCodePoint;
while (targetIterator.hasNext() && patternIterator.hasNext()) {
if (codePointBuffer != -1) {
targetCodePoint = codePointBuffer;
codePointBuffer = -1;
} else {
// Use buffered lowercase code point iteration to handle one-to-many case mappings.
targetCodePoint = getLowercaseCodePoint(targetIterator.next());
if (targetCodePoint == CODE_POINT_COMBINED_LOWERCASE_I_DOT) {
targetCodePoint = CODE_POINT_COMBINING_DOT;
codePointBuffer = CODE_POINT_LOWERCASE_I;
}
++matchLength;
}
patternCodePoint = patternIterator.next();
if (targetCodePoint != patternCodePoint) {
return MATCH_NOT_FOUND;
}
}
// If the pattern string has more characters, or the match is found at the middle of a
// character that maps to multiple characters in lowercase, then match is not found.
if (patternIterator.hasNext() || codePointBuffer != -1) {
return MATCH_NOT_FOUND;
}
// If all characters are equal, return the length of the match in the target string.
return matchLength;
}

/**
Expand Down

0 comments on commit 0e07873

Please sign in to comment.