[SPARK-48977][SQL] Optimize string searching under UTF8_LCASE collation

### What changes were proposed in this pull request? Modify string search under UTF8_LCASE collation by utilizing UTF8String character iterator to reduce one order of algorithmic complexity. ### Why are the changes needed? Optimize implementation for `contains`, `startsWith`, `endsWith`, `locate` expressions. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Existing tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes apache#47444 from uros-db/optimize-search. Authored-by: Uros Bojanic <[email protected]> Signed-off-by: Wenchen Fan <[email protected]>
rohan-flutterint · Jul 31, 2024 · 0e07873 · 0e07873
1 parent 5954ed1
commit 0e07873
Showing 1 changed file with 73 additions and 10 deletions.
diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -85,13 +85,44 @@ private static int lowercaseMatchLengthFrom(
       final UTF8String lowercasePattern,
       int startPos) {
     assert startPos >= 0;
-    for (int len = 0; len <= target.numChars() - startPos; ++len) {
-      if (lowerCaseCodePoints(target.substring(startPos, startPos + len))
-          .equals(lowercasePattern)) {
-        return len;
+    // Use code point iterators for efficient string search.
+    Iterator<Integer> targetIterator = target.codePointIterator();
+    Iterator<Integer> patternIterator = lowercasePattern.codePointIterator();
+    // Skip to startPos in the target string.
+    for (int i = 0; i < startPos; ++i) {
+      if (targetIterator.hasNext()) {
+        targetIterator.next();
+      } else {
+        return MATCH_NOT_FOUND;
       }
     }
-    return MATCH_NOT_FOUND;
+    // Compare the characters in the target and pattern strings.
+    int matchLength = 0, codePointBuffer = -1, targetCodePoint, patternCodePoint;
+    while (targetIterator.hasNext() && patternIterator.hasNext()) {
+      if (codePointBuffer != -1) {
+        targetCodePoint = codePointBuffer;
+        codePointBuffer = -1;
+      } else {
+        // Use buffered lowercase code point iteration to handle one-to-many case mappings.
+        targetCodePoint = getLowercaseCodePoint(targetIterator.next());
+        if (targetCodePoint == CODE_POINT_COMBINED_LOWERCASE_I_DOT) {
+          targetCodePoint = CODE_POINT_LOWERCASE_I;
+          codePointBuffer = CODE_POINT_COMBINING_DOT;
+        }
+        ++matchLength;
+      }
+      patternCodePoint = patternIterator.next();
+      if (targetCodePoint != patternCodePoint) {
+        return MATCH_NOT_FOUND;
+      }
+    }
+    // If the pattern string has more characters, or the match is found at the middle of a
+    // character that maps to multiple characters in lowercase, then match is not found.
+    if (patternIterator.hasNext() || codePointBuffer != -1) {
+      return MATCH_NOT_FOUND;
+    }
+    // If all characters are equal, return the length of the match in the target string.
+    return matchLength;
   }
 
   /**
@@ -155,13 +186,45 @@ private static int lowercaseMatchLengthUntil(
       final UTF8String target,
       final UTF8String lowercasePattern,
       int endPos) {
-    assert endPos <= target.numChars();
-    for (int len = 0; len <= endPos; ++len) {
-      if (lowerCaseCodePoints(target.substring(endPos - len, endPos)).equals(lowercasePattern)) {
-        return len;
+    assert endPos >= 0;
+    // Use code point iterators for efficient string search.
+    Iterator<Integer> targetIterator = target.reverseCodePointIterator();
+    Iterator<Integer> patternIterator = lowercasePattern.reverseCodePointIterator();
+    // Skip to startPos in the target string.
+    for (int i = endPos; i < target.numChars(); ++i) {
+      if (targetIterator.hasNext()) {
+        targetIterator.next();
+      } else {
+        return MATCH_NOT_FOUND;
       }
     }
-    return MATCH_NOT_FOUND;
+    // Compare the characters in the target and pattern strings.
+    int matchLength = 0, codePointBuffer = -1, targetCodePoint, patternCodePoint;
+    while (targetIterator.hasNext() && patternIterator.hasNext()) {
+      if (codePointBuffer != -1) {
+        targetCodePoint = codePointBuffer;
+        codePointBuffer = -1;
+      } else {
+        // Use buffered lowercase code point iteration to handle one-to-many case mappings.
+        targetCodePoint = getLowercaseCodePoint(targetIterator.next());
+        if (targetCodePoint == CODE_POINT_COMBINED_LOWERCASE_I_DOT) {
+          targetCodePoint = CODE_POINT_COMBINING_DOT;
+          codePointBuffer = CODE_POINT_LOWERCASE_I;
+        }
+        ++matchLength;
+      }
+      patternCodePoint = patternIterator.next();
+      if (targetCodePoint != patternCodePoint) {
+        return MATCH_NOT_FOUND;
+      }
+    }
+    // If the pattern string has more characters, or the match is found at the middle of a
+    // character that maps to multiple characters in lowercase, then match is not found.
+    if (patternIterator.hasNext() || codePointBuffer != -1) {
+      return MATCH_NOT_FOUND;
+    }
+    // If all characters are equal, return the length of the match in the target string.
+    return matchLength;
   }
 
   /**