diff --git a/CHANGELOG-3.0.md b/CHANGELOG-3.0.md index c5f9611910fa9..bc5e63dbdf8ce 100644 --- a/CHANGELOG-3.0.md +++ b/CHANGELOG-3.0.md @@ -37,6 +37,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Stop minimizing automata used for case-insensitive matches ([#17268](https://github.com/opensearch-project/OpenSearch/pull/17268)) - Refactor the `:server` module `org.opensearch.client` to `org.opensearch.transport.client` to eliminate top level split packages for JPMS support ([#17272](https://github.com/opensearch-project/OpenSearch/pull/17272)) - Use Lucene `BM25Similarity` as default since the `LegacyBM25Similarity` is marked as deprecated ([#17306](https://github.com/opensearch-project/OpenSearch/pull/17306)) +- Wildcard field index only 3gram of the input data [#17349](https://github.com/opensearch-project/OpenSearch/pull/17349) ### Deprecated diff --git a/qa/rolling-upgrade/src/test/resources/rest-api-spec/test/mixed_cluster/40_wildcard.yml b/qa/rolling-upgrade/src/test/resources/rest-api-spec/test/mixed_cluster/40_wildcard.yml new file mode 100644 index 0000000000000..e06854af7e924 --- /dev/null +++ b/qa/rolling-upgrade/src/test/resources/rest-api-spec/test/mixed_cluster/40_wildcard.yml @@ -0,0 +1,200 @@ +# refactored from rest-api-spec/src/main/resources/rest-api-spec/test/search/270_wildcard_fieldtype_queries.yml +--- +"search on mixed state": + # "term query matches exact value" + - do: + search: + index: test + body: + query: + term: + my_field: "AbCd" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "5" } + + - do: + search: + index: test + body: + query: + term: + my_field.doc_values: "AbCd" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "5" } + + # term query matches lowercase-normalized value + - do: + search: + index: test + body: + query: + term: + my_field.lower: "abcd" + - match: { hits.total.value: 2 } + - match: { hits.hits.0._id: "5" } + - match: { hits.hits.1._id: "7" } + + 
- do: + search: + index: test + body: + query: + term: + my_field.lower: "ABCD" + - match: { hits.total.value: 2 } + - match: { hits.hits.0._id: "5" } + - match: { hits.hits.1._id: "7" } + + - do: + search: + index: test + body: + query: + term: + my_field: "abcd" + - match: { hits.total.value: 0 } + + # wildcard query matches + - do: + search: + index: test + body: + query: + wildcard: + my_field: + value: "*Node*Exception*" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "1" } + + # wildcard query matches lowercase-normalized field + - do: + search: + index: test + body: + query: + wildcard: + my_field.lower: + value: "*node*exception*" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "1" } + + - do: + search: + index: test + body: + query: + wildcard: + my_field.lower: + value: "*NODE*EXCEPTION*" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "1" } + + - do: + search: + index: test + body: + query: + wildcard: + my_field: + value: "*node*exception*" + - match: { hits.total.value: 0 } + + # prefix query matches + - do: + search: + index: test + body: + query: + prefix: + my_field: + value: "[2024-06-08T" + - match: { hits.total.value: 3 } + + # regexp query matches + - do: + search: + index: test + body: + query: + regexp: + my_field: + value: ".*06-08.*cluster-manager node.*" + - match: { hits.total.value: 2 } + + # regexp query matches lowercase-normalized field + - do: + search: + index: test + body: + query: + regexp: + my_field.lower: + value: ".*06-08.*Cluster-Manager Node.*" + - match: { hits.total.value: 2 } + + - do: + search: + index: test + body: + query: + regexp: + my_field: + value: ".*06-08.*Cluster-Manager Node.*" + - match: { hits.total.value: 0 } + + # wildcard match-all works + - do: + search: + index: test + body: + query: + wildcard: + my_field: + value: "*" + - match: { hits.total.value: 6 } + + # regexp match-all works + - do: + search: + index: test + body: + query: + regexp: + my_field: + 
value: ".*" + - match: { hits.total.value: 6 } + + # terms query on wildcard field matches + - do: + search: + index: test + body: + query: + terms: { my_field: [ "AbCd" ] } + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "5" } + + # case insensitive query on wildcard field + - do: + search: + index: test + body: + query: + wildcard: + my_field: + value: "AbCd" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "5" } + + - do: + search: + index: test + body: + query: + wildcard: + my_field: + value: "AbCd" + case_insensitive: true + - match: { hits.total.value: 2 } + - match: { hits.hits.0._id: "5" } + - match: { hits.hits.1._id: "7" } diff --git a/qa/rolling-upgrade/src/test/resources/rest-api-spec/test/old_cluster/40_wildcard.yml b/qa/rolling-upgrade/src/test/resources/rest-api-spec/test/old_cluster/40_wildcard.yml new file mode 100644 index 0000000000000..b19882c69ddd7 --- /dev/null +++ b/qa/rolling-upgrade/src/test/resources/rest-api-spec/test/old_cluster/40_wildcard.yml @@ -0,0 +1,235 @@ +# refactored from rest-api-spec/src/main/resources/rest-api-spec/test/search/270_wildcard_fieldtype_queries.yml +--- +"Create index with Wildcard field": + - do: + indices.create: + index: test + body: + mappings: + properties: + my_field: + type: wildcard + fields: + lower: + type: wildcard + normalizer: lowercase + doc_values: + type: wildcard + doc_values: true + + - do: + bulk: + refresh: true + body: + - '{"index": {"_index": "test", "_id":1}}' + - '{"my_field": "org.opensearch.transport.NodeDisconnectedException: [node_s0][127.0.0.1:39953][disconnected] disconnected"}' + - '{"index": {"_index": "test", "_id":2}}' + - '{"my_field": "[2024-06-08T06:31:37,443][INFO ][o.o.c.c.Coordinator ] [node_s2] cluster-manager node [{node_s0}{Nj7FjR7hRP2lh_zur8KN_g}{OTGOoWmmSsWP_RQ3tIKJ9g}{127.0.0.1}{127.0.0.1:39953}{imr}{shard_indexing_pressure_enabled=true}] failed, restarting discovery"}' + - '{"index": {"_index": "test", "_id":3}}' + - '{"my_field": 
"[2024-06-08T06:31:37,451][INFO ][o.o.c.s.ClusterApplierService] [node_s2] cluster-manager node changed {previous [{node_s0}{Nj7FjR7hRP2lh_zur8KN_g}{OTGOoWmmSsWP_RQ3tIKJ9g}{127.0.0.1}{127.0.0.1:39953}{imr}{shard_indexing_pressure_enabled=true}], current []}, term: 1, version: 24, reason: becoming candidate: onLeaderFailure"}' + - '{"index": {"_index": "test", "_id":4}}' + - '{"my_field": "[2024-06-08T06:31:37,452][WARN ][o.o.c.NodeConnectionsService] [node_s1] failed to connect to {node_s0}{Nj7FjR7hRP2lh_zur8KN_g}{OTGOoWmmSsWP_RQ3tIKJ9g}{127.0.0.1}{127.0.0.1:39953}{imr}{shard_indexing_pressure_enabled=true} (tried [1] times)"}' + - '{"index": {"_index": "test", "_id":5}}' + - '{"my_field": "AbCd"}' + - '{"index": {"_index": "test", "_id":6}}' + - '{"other_field": "test"}' + - '{"index": {"_index": "test", "_id":7}}' + - '{"my_field": "ABCD"}' + + # "term query matches exact value" + - do: + search: + index: test + body: + query: + term: + my_field: "AbCd" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "5" } + + - do: + search: + index: test + body: + query: + term: + my_field.doc_values: "AbCd" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "5" } + + # term query matches lowercase-normalized value + - do: + search: + index: test + body: + query: + term: + my_field.lower: "abcd" + - match: { hits.total.value: 2 } + - match: { hits.hits.0._id: "5" } + - match: { hits.hits.1._id: "7" } + + - do: + search: + index: test + body: + query: + term: + my_field.lower: "ABCD" + - match: { hits.total.value: 2 } + - match: { hits.hits.0._id: "5" } + - match: { hits.hits.1._id: "7" } + + - do: + search: + index: test + body: + query: + term: + my_field: "abcd" + - match: { hits.total.value: 0 } + + # wildcard query matches + - do: + search: + index: test + body: + query: + wildcard: + my_field: + value: "*Node*Exception*" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "1" } + + # wildcard query matches lowercase-normalized 
field + - do: + search: + index: test + body: + query: + wildcard: + my_field.lower: + value: "*node*exception*" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "1" } + + - do: + search: + index: test + body: + query: + wildcard: + my_field.lower: + value: "*NODE*EXCEPTION*" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "1" } + + - do: + search: + index: test + body: + query: + wildcard: + my_field: + value: "*node*exception*" + - match: { hits.total.value: 0 } + + # prefix query matches + - do: + search: + index: test + body: + query: + prefix: + my_field: + value: "[2024-06-08T" + - match: { hits.total.value: 3 } + + # regexp query matches + - do: + search: + index: test + body: + query: + regexp: + my_field: + value: ".*06-08.*cluster-manager node.*" + - match: { hits.total.value: 2 } + + # regexp query matches lowercase-normalized field + - do: + search: + index: test + body: + query: + regexp: + my_field.lower: + value: ".*06-08.*Cluster-Manager Node.*" + - match: { hits.total.value: 2 } + + - do: + search: + index: test + body: + query: + regexp: + my_field: + value: ".*06-08.*Cluster-Manager Node.*" + - match: { hits.total.value: 0 } + + # wildcard match-all works + - do: + search: + index: test + body: + query: + wildcard: + my_field: + value: "*" + - match: { hits.total.value: 6 } + + # regexp match-all works + - do: + search: + index: test + body: + query: + regexp: + my_field: + value: ".*" + - match: { hits.total.value: 6 } + + # terms query on wildcard field matches + - do: + search: + index: test + body: + query: + terms: { my_field: [ "AbCd" ] } + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "5" } + + # case insensitive query on wildcard field + - do: + search: + index: test + body: + query: + wildcard: + my_field: + value: "AbCd" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "5" } + + - do: + search: + index: test + body: + query: + wildcard: + my_field: + value: "AbCd" + 
case_insensitive: true + - match: { hits.total.value: 2 } + - match: { hits.hits.0._id: "5" } + - match: { hits.hits.1._id: "7" } diff --git a/qa/rolling-upgrade/src/test/resources/rest-api-spec/test/upgraded_cluster/40_wildcard.yml b/qa/rolling-upgrade/src/test/resources/rest-api-spec/test/upgraded_cluster/40_wildcard.yml new file mode 100644 index 0000000000000..29518931a5b8b --- /dev/null +++ b/qa/rolling-upgrade/src/test/resources/rest-api-spec/test/upgraded_cluster/40_wildcard.yml @@ -0,0 +1,200 @@ +# refactored from rest-api-spec/src/main/resources/rest-api-spec/test/search/270_wildcard_fieldtype_queries.yml +--- +"search after upgrade": + # "term query matches exact value" + - do: + search: + index: test + body: + query: + term: + my_field: "AbCd" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "5" } + + - do: + search: + index: test + body: + query: + term: + my_field.doc_values: "AbCd" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "5" } + + # term query matches lowercase-normalized value + - do: + search: + index: test + body: + query: + term: + my_field.lower: "abcd" + - match: { hits.total.value: 2 } + - match: { hits.hits.0._id: "5" } + - match: { hits.hits.1._id: "7" } + + - do: + search: + index: test + body: + query: + term: + my_field.lower: "ABCD" + - match: { hits.total.value: 2 } + - match: { hits.hits.0._id: "5" } + - match: { hits.hits.1._id: "7" } + + - do: + search: + index: test + body: + query: + term: + my_field: "abcd" + - match: { hits.total.value: 0 } + + # wildcard query matches + - do: + search: + index: test + body: + query: + wildcard: + my_field: + value: "*Node*Exception*" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "1" } + + # wildcard query matches lowercase-normalized field + - do: + search: + index: test + body: + query: + wildcard: + my_field.lower: + value: "*node*exception*" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "1" } + + - do: + search: + 
index: test + body: + query: + wildcard: + my_field.lower: + value: "*NODE*EXCEPTION*" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "1" } + + - do: + search: + index: test + body: + query: + wildcard: + my_field: + value: "*node*exception*" + - match: { hits.total.value: 0 } + + # prefix query matches + - do: + search: + index: test + body: + query: + prefix: + my_field: + value: "[2024-06-08T" + - match: { hits.total.value: 3 } + + # regexp query matches + - do: + search: + index: test + body: + query: + regexp: + my_field: + value: ".*06-08.*cluster-manager node.*" + - match: { hits.total.value: 2 } + + # regexp query matches lowercase-normalized field + - do: + search: + index: test + body: + query: + regexp: + my_field.lower: + value: ".*06-08.*Cluster-Manager Node.*" + - match: { hits.total.value: 2 } + + - do: + search: + index: test + body: + query: + regexp: + my_field: + value: ".*06-08.*Cluster-Manager Node.*" + - match: { hits.total.value: 0 } + + # wildcard match-all works + - do: + search: + index: test + body: + query: + wildcard: + my_field: + value: "*" + - match: { hits.total.value: 6 } + + # regexp match-all works + - do: + search: + index: test + body: + query: + regexp: + my_field: + value: ".*" + - match: { hits.total.value: 6 } + + # terms query on wildcard field matches + - do: + search: + index: test + body: + query: + terms: { my_field: [ "AbCd" ] } + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "5" } + + # case insensitive query on wildcard field + - do: + search: + index: test + body: + query: + wildcard: + my_field: + value: "AbCd" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "5" } + + - do: + search: + index: test + body: + query: + wildcard: + my_field: + value: "AbCd" + case_insensitive: true + - match: { hits.total.value: 2 } + - match: { hits.hits.0._id: "5" } + - match: { hits.hits.1._id: "7" } diff --git 
a/server/src/main/java/org/opensearch/index/mapper/WildcardFieldMapper.java b/server/src/main/java/org/opensearch/index/mapper/WildcardFieldMapper.java index 07dbe695bbbbb..20c5ce87ad1c7 100644 --- a/server/src/main/java/org/opensearch/index/mapper/WildcardFieldMapper.java +++ b/server/src/main/java/org/opensearch/index/mapper/WildcardFieldMapper.java @@ -159,6 +159,7 @@ public WildcardFieldMapper build(BuilderContext context) { } + public static final int NGRAM_SIZE = 3; public static final String CONTENT_TYPE = "wildcard"; public static final TypeParser PARSER = new TypeParser((n, c) -> new WildcardFieldMapper.Builder(n, c.getIndexAnalyzers())); @@ -230,97 +231,49 @@ protected void parseCreateField(ParseContext context) throws IOException { /** * Tokenizer to emit tokens to support wildcard first-phase matching. *

- * Will emit all substrings of length 1,2, and 3, with 0-valued anchors for the prefix/suffix. + * Will emit only substrings of length 3, with 0-valued anchors for the prefix/suffix. *

* For example, given the string "lucene", output the following terms: *

- * [0, 'l'] + * [0, 0, 'l'] * [0, 'l', 'u'] - * ['l'] - * ['l', 'u'] * ['l', 'u', 'c'] - * ['u'] - * ['u','c'] * ['u','c','e'] - * ['c'] - * ['c', 'e'] * ['c', 'e', 'n'] - * ['e'] - * ['e', 'n'] * ['e', 'n', 'e'] - * ['n'] - * ['n', 'e'] * ['n', 'e', 0] - * ['e'] - * ['e', 0] + * ['e', 0, 0] *

* Visible for testing. */ static final class WildcardFieldTokenizer extends Tokenizer { private final CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class); - private final char[] buffer = new char[3]; // Ring buffer for up to 3 chars - private int offset = 0; // Position in the buffer - private int length = 2; // First token is anchor + first char + private final char[] buffer = new char[NGRAM_SIZE]; // Ring buffer for up to 3 chars + private int offset = NGRAM_SIZE - 1; // next position in buffer to store next input char @Override public void reset() throws IOException { super.reset(); - buffer[0] = 0; - int firstChar = input.read(); - if (firstChar != -1) { - buffer[1] = (char) firstChar; - int secondChar = input.read(); - if (secondChar != -1) { - buffer[2] = (char) secondChar; - } else { - buffer[2] = 0; - } - } else { - buffer[1] = 0; + for (int i = 0; i < NGRAM_SIZE - 1; i++) { + buffer[i] = 0; } - } @Override public boolean incrementToken() throws IOException { - charTermAttribute.setLength(length); - int numZeroes = 0; - for (int i = 0; i < length; i++) { - char curChar = buffer[(i + offset) % 3]; - if (curChar == 0) { - numZeroes++; - } - charTermAttribute.buffer()[i] = buffer[(i + offset) % 3]; - } - if (numZeroes == 2) { - // Two zeroes usually means we're done. - if (length == 3 && charTermAttribute.buffer()[1] != 0) { - // The only case where we're not done is if the input has exactly 1 character, so the buffer - // contains 0, char, 0. In that case, we return char now, then return char, 0 on the next iteration - charTermAttribute.buffer()[0] = charTermAttribute.buffer()[1]; - charTermAttribute.buffer()[1] = 0; - charTermAttribute.setLength(1); - length = 2; - offset = 1; - return true; - } - return false; - } - if (length == 3) { - // Read the next character, overwriting the current offset - int nextChar = input.read(); - if (nextChar != -1) { - buffer[offset] = (char) nextChar; - } else { - // End of input. 
Pad with extra 0 to trigger the logic above. - buffer[offset] = 0; - } - offset = (offset + 1) % 3; - length = 1; - } else { - length = length + 1; + charTermAttribute.setLength(NGRAM_SIZE); + int c = input.read(); + c = c == -1 ? 0 : c; + + buffer[offset++ % NGRAM_SIZE] = (char) c; + boolean has_next = false; + for (int i = 0; i < NGRAM_SIZE; i++) { + char curChar = buffer[(offset + i) % NGRAM_SIZE]; + charTermAttribute.buffer()[i] = curChar; + has_next |= curChar != 0; } - return true; + + return has_next; } } @@ -479,8 +432,8 @@ public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, bo Query approximation; if (requiredNGrams.isEmpty()) { // This only happens when all characters are wildcard characters (* or ?), - // or it's the empty string. - if (value.length() == 0 || value.contains("?")) { + // or it's only contains sequential characters less than NGRAM_SIZE (which defaults to 3). + if (findNonWildcardSequence(value, 0) != value.length() || value.length() == 0 || value.contains("?")) { approximation = this.existsQuery(context); } else { return existsQuery(context); @@ -502,15 +455,20 @@ static Set getRequiredNGrams(String value, boolean regexpMode) { int pos = 0; String rawSequence = null; String currentSequence = null; + char[] buffer = new char[NGRAM_SIZE]; if (!value.startsWith("?") && !value.startsWith("*")) { // Can add prefix term rawSequence = getNonWildcardSequence(value, 0); currentSequence = performEscape(rawSequence, regexpMode); - if (currentSequence.length() == 1) { - terms.add(new String(new char[] { 0, currentSequence.charAt(0) })); - } else { - terms.add(new String(new char[] { 0, currentSequence.charAt(0), currentSequence.charAt(1) })); + + // buffer[0] is automatically set to 0 + Arrays.fill(buffer, (char) 0); + int startIdx = Math.max(NGRAM_SIZE - currentSequence.length(), 1); + for (int j = 0; j < currentSequence.length() && j < NGRAM_SIZE - 1; j++) { + buffer[startIdx + j] = currentSequence.charAt(j); } + + 
terms.add(new String(buffer)); } else { pos = findNonWildcardSequence(value, pos); rawSequence = getNonWildcardSequence(value, pos); @@ -518,23 +476,27 @@ static Set getRequiredNGrams(String value, boolean regexpMode) { while (pos < value.length()) { boolean isEndOfValue = pos + rawSequence.length() == value.length(); currentSequence = performEscape(rawSequence, regexpMode); - if (!currentSequence.isEmpty() && currentSequence.length() < 3 && !isEndOfValue && pos > 0) { - // If this is a prefix or suffix of length < 3, then we already have a longer token including the anchor. - terms.add(currentSequence); - } else { - for (int i = 0; i < currentSequence.length() - 2; i++) { - terms.add(currentSequence.substring(i, i + 3)); - } + + for (int i = 0; i < currentSequence.length() - NGRAM_SIZE + 1; i++) { + terms.add(currentSequence.substring(i, i + 3)); } if (isEndOfValue) { // This is the end of the input. We can attach a suffix anchor. - if (currentSequence.length() == 1) { - terms.add(new String(new char[] { currentSequence.charAt(0), 0 })); - } else { - char a = currentSequence.charAt(currentSequence.length() - 2); - char b = currentSequence.charAt(currentSequence.length() - 1); - terms.add(new String(new char[] { a, b, 0 })); + // special case when we should generate '0xxxxxxx0', where we have (NGRAM_SIZE - 2) * x + Arrays.fill(buffer, (char) 0); + if (pos == 0 && currentSequence.length() == NGRAM_SIZE - 2) { + for (int i = 0; i < currentSequence.length(); i++) { + buffer[i + 1] = currentSequence.charAt(i); + } + terms.add(new String(buffer)); + Arrays.fill(buffer, (char) 0); + } + int rightStartIdx = NGRAM_SIZE - currentSequence.length() - 2; + rightStartIdx = rightStartIdx < 0 ? 
NGRAM_SIZE - 2 : rightStartIdx; + for (int j = 0; j < currentSequence.length() && j < NGRAM_SIZE - 1; j++) { + buffer[rightStartIdx - j] = currentSequence.charAt(currentSequence.length() - j - 1); } + terms.add(new String(buffer)); } pos = findNonWildcardSequence(value, pos + rawSequence.length()); rawSequence = getNonWildcardSequence(value, pos); diff --git a/server/src/test/java/org/opensearch/index/mapper/WildcardFieldMapperTests.java b/server/src/test/java/org/opensearch/index/mapper/WildcardFieldMapperTests.java index b19e3687cf944..25aacb41f029d 100644 --- a/server/src/test/java/org/opensearch/index/mapper/WildcardFieldMapperTests.java +++ b/server/src/test/java/org/opensearch/index/mapper/WildcardFieldMapperTests.java @@ -82,22 +82,11 @@ public void testTokenizer() throws IOException { List.of( WildcardFieldTypeTests.prefixAnchored("p"), WildcardFieldTypeTests.prefixAnchored("pi"), - "p", - "pi", "pic", - "i", - "ic", "ick", - "c", - "ck", "ckl", - "k", - "kl", "kle", - "l", - "le", WildcardFieldTypeTests.suffixAnchored("le"), - "e", WildcardFieldTypeTests.suffixAnchored("e") ), terms @@ -111,7 +100,14 @@ public void testTokenizer() throws IOException { terms.add(charTermAttribute.toString()); } } - assertEquals(List.of(WildcardFieldTypeTests.prefixAnchored("a"), "a", WildcardFieldTypeTests.suffixAnchored("a")), terms); + assertEquals( + List.of( + WildcardFieldTypeTests.prefixAnchored("a"), + WildcardFieldTypeTests.suffixAnchored((char) 0 + "a"), + WildcardFieldTypeTests.suffixAnchored("a") + ), + terms + ); } public void testEnableDocValues() throws IOException { @@ -188,13 +184,8 @@ public void testNormalizer() throws IOException { List.of( WildcardFieldTypeTests.prefixAnchored("a"), WildcardFieldTypeTests.prefixAnchored("ab"), - "a", - "ab", "abc", - "b", - "bc", WildcardFieldTypeTests.suffixAnchored("bc"), - "c", WildcardFieldTypeTests.suffixAnchored("c") ), terms @@ -242,13 +233,8 @@ public void testNullValue() throws IOException { List.of( 
WildcardFieldTypeTests.prefixAnchored("u"), WildcardFieldTypeTests.prefixAnchored("ur"), - "u", - "ur", "uri", - "r", - "ri", WildcardFieldTypeTests.suffixAnchored("ri"), - "i", WildcardFieldTypeTests.suffixAnchored("i") ), terms @@ -281,16 +267,9 @@ public void testDefaults() throws Exception { List.of( WildcardFieldTypeTests.prefixAnchored("1"), WildcardFieldTypeTests.prefixAnchored("12"), - "1", - "12", "123", - "2", - "23", "234", - "3", - "34", WildcardFieldTypeTests.suffixAnchored("34"), - "4", WildcardFieldTypeTests.suffixAnchored("4") ), terms diff --git a/server/src/test/java/org/opensearch/index/mapper/WildcardFieldTypeTests.java b/server/src/test/java/org/opensearch/index/mapper/WildcardFieldTypeTests.java index 1a813495e9033..851e791660d82 100644 --- a/server/src/test/java/org/opensearch/index/mapper/WildcardFieldTypeTests.java +++ b/server/src/test/java/org/opensearch/index/mapper/WildcardFieldTypeTests.java @@ -20,11 +20,19 @@ public class WildcardFieldTypeTests extends FieldTypeTestCase { static String prefixAnchored(String val) { - return (char) 0 + val; + String ret = (char) 0 + val; + if (ret.length() < WildcardFieldMapper.NGRAM_SIZE) { + ret = prefixAnchored(ret); + } + return ret; } static String suffixAnchored(String val) { - return val + (char) 0; + String ret = val + (char) 0; + if (ret.length() < WildcardFieldMapper.NGRAM_SIZE) { + ret = suffixAnchored(ret); + } + return ret; } public void testTermQuery() { @@ -104,13 +112,14 @@ public void testEscapedWildcardQuery() { ft.wildcardQuery("\\**\\*", null, null) ); - assertEquals(new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "\\*"), ft.wildcardQuery("\\*", null, null)); - - expectedTerms.remove(suffixAnchored("*")); + expectedTerms.add(prefixAnchored("*" + (char) 0)); builder = new BooleanQuery.Builder(); for (String term : expectedTerms) { builder.add(new TermQuery(new Term("field", term)), BooleanClause.Occur.FILTER); } + assertEquals(new 
WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "\\*"), ft.wildcardQuery("\\*", null, null)); + builder = new BooleanQuery.Builder(); + builder.add(new TermQuery(new Term("field", prefixAnchored("*"))), BooleanClause.Occur.FILTER); assertEquals(new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "\\**"), ft.wildcardQuery("\\**", null, null)); } @@ -119,7 +128,6 @@ public void testMultipleWildcardsInQuery() { MappedFieldType ft = new WildcardFieldMapper.WildcardFieldType("field"); Set expectedTerms = new HashSet<>(); expectedTerms.add(prefixAnchored("a")); - expectedTerms.add("cd"); expectedTerms.add("efg"); expectedTerms.add(suffixAnchored("h")); BooleanQuery.Builder builder = new BooleanQuery.Builder(); @@ -153,27 +161,27 @@ public void testRegexpQuery() { assertTrue(actualMatchingQuery.getSecondPhaseMatcher().test("foo_apple_foo")); assertFalse(actualMatchingQuery.getSecondPhaseMatcher().test("foo_apply_foo")); - pattern = "ab(zz|cd|ef.*)(hi|jk)"; + pattern = "abc(zzz|def|ghi.*)(jkl|mno)"; builder = new BooleanQuery.Builder(); - builder.add(new TermQuery(new Term("field", "ab")), BooleanClause.Occur.FILTER); + builder.add(new TermQuery(new Term("field", "abc")), BooleanClause.Occur.FILTER); builder.add( - new BooleanQuery.Builder().add(new TermQuery(new Term("field", "zz")), BooleanClause.Occur.SHOULD) - .add(new TermQuery(new Term("field", "cd")), BooleanClause.Occur.SHOULD) - .add(new TermQuery(new Term("field", "ef")), BooleanClause.Occur.SHOULD) + new BooleanQuery.Builder().add(new TermQuery(new Term("field", "zzz")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("field", "def")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("field", "ghi")), BooleanClause.Occur.SHOULD) .build(), BooleanClause.Occur.FILTER ); builder.add( - new BooleanQuery.Builder().add(new TermQuery(new Term("field", "hi")), BooleanClause.Occur.SHOULD) - .add(new TermQuery(new Term("field", "jk")), BooleanClause.Occur.SHOULD) 
+ new BooleanQuery.Builder().add(new TermQuery(new Term("field", "jkl")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("field", "mno")), BooleanClause.Occur.SHOULD) .build(), BooleanClause.Occur.FILTER ); actual = ft.regexpQuery(pattern, 0, 0, 1000, null, null); assertEquals(new WildcardFieldMapper.WildcardMatchingQuery("field", builder.build(), "/" + pattern + "/"), actual); actualMatchingQuery = (WildcardFieldMapper.WildcardMatchingQuery) actual; - assertTrue(actualMatchingQuery.getSecondPhaseMatcher().test("abcdjk")); - assertTrue(actualMatchingQuery.getSecondPhaseMatcher().test("abefqwertyhi")); + assertTrue(actualMatchingQuery.getSecondPhaseMatcher().test("abcdefmno")); + assertTrue(actualMatchingQuery.getSecondPhaseMatcher().test("abcghiqwertyjkl")); } public void testWildcardMatchAll() {