diff --git a/CHANGELOG-3.0.md b/CHANGELOG-3.0.md index c5f9611910fa9..bc5e63dbdf8ce 100644 --- a/CHANGELOG-3.0.md +++ b/CHANGELOG-3.0.md @@ -37,6 +37,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Stop minimizing automata used for case-insensitive matches ([#17268](https://github.com/opensearch-project/OpenSearch/pull/17268)) - Refactor the `:server` module `org.opensearch.client` to `org.opensearch.transport.client` to eliminate top level split packages for JPMS support ([#17272](https://github.com/opensearch-project/OpenSearch/pull/17272)) - Use Lucene `BM25Similarity` as default since the `LegacyBM25Similarity` is marked as deprecated ([#17306](https://github.com/opensearch-project/OpenSearch/pull/17306)) +- Wildcard field index only 3gram of the input data [#17349](https://github.com/opensearch-project/OpenSearch/pull/17349) ### Deprecated diff --git a/qa/rolling-upgrade/src/test/resources/rest-api-spec/test/mixed_cluster/40_wildcard.yml b/qa/rolling-upgrade/src/test/resources/rest-api-spec/test/mixed_cluster/40_wildcard.yml new file mode 100644 index 0000000000000..e06854af7e924 --- /dev/null +++ b/qa/rolling-upgrade/src/test/resources/rest-api-spec/test/mixed_cluster/40_wildcard.yml @@ -0,0 +1,200 @@ +# refactored from rest-api-spec/src/main/resources/rest-api-spec/test/search/270_wildcard_fieldtype_queries.yml +--- +"search on mixed state": + # "term query matches exact value" + - do: + search: + index: test + body: + query: + term: + my_field: "AbCd" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "5" } + + - do: + search: + index: test + body: + query: + term: + my_field.doc_values: "AbCd" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "5" } + + # term query matches lowercase-normalized value + - do: + search: + index: test + body: + query: + term: + my_field.lower: "abcd" + - match: { hits.total.value: 2 } + - match: { hits.hits.0._id: "5" } + - match: { hits.hits.1._id: "7" } + + 
- do: + search: + index: test + body: + query: + term: + my_field.lower: "ABCD" + - match: { hits.total.value: 2 } + - match: { hits.hits.0._id: "5" } + - match: { hits.hits.1._id: "7" } + + - do: + search: + index: test + body: + query: + term: + my_field: "abcd" + - match: { hits.total.value: 0 } + + # wildcard query matches + - do: + search: + index: test + body: + query: + wildcard: + my_field: + value: "*Node*Exception*" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "1" } + + # wildcard query matches lowercase-normalized field + - do: + search: + index: test + body: + query: + wildcard: + my_field.lower: + value: "*node*exception*" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "1" } + + - do: + search: + index: test + body: + query: + wildcard: + my_field.lower: + value: "*NODE*EXCEPTION*" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "1" } + + - do: + search: + index: test + body: + query: + wildcard: + my_field: + value: "*node*exception*" + - match: { hits.total.value: 0 } + + # prefix query matches + - do: + search: + index: test + body: + query: + prefix: + my_field: + value: "[2024-06-08T" + - match: { hits.total.value: 3 } + + # regexp query matches + - do: + search: + index: test + body: + query: + regexp: + my_field: + value: ".*06-08.*cluster-manager node.*" + - match: { hits.total.value: 2 } + + # regexp query matches lowercase-normalized field + - do: + search: + index: test + body: + query: + regexp: + my_field.lower: + value: ".*06-08.*Cluster-Manager Node.*" + - match: { hits.total.value: 2 } + + - do: + search: + index: test + body: + query: + regexp: + my_field: + value: ".*06-08.*Cluster-Manager Node.*" + - match: { hits.total.value: 0 } + + # wildcard match-all works + - do: + search: + index: test + body: + query: + wildcard: + my_field: + value: "*" + - match: { hits.total.value: 6 } + + # regexp match-all works + - do: + search: + index: test + body: + query: + regexp: + my_field: + 
value: ".*" + - match: { hits.total.value: 6 } + + # terms query on wildcard field matches + - do: + search: + index: test + body: + query: + terms: { my_field: [ "AbCd" ] } + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "5" } + + # case insensitive query on wildcard field + - do: + search: + index: test + body: + query: + wildcard: + my_field: + value: "AbCd" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "5" } + + - do: + search: + index: test + body: + query: + wildcard: + my_field: + value: "AbCd" + case_insensitive: true + - match: { hits.total.value: 2 } + - match: { hits.hits.0._id: "5" } + - match: { hits.hits.1._id: "7" } diff --git a/qa/rolling-upgrade/src/test/resources/rest-api-spec/test/old_cluster/40_wildcard.yml b/qa/rolling-upgrade/src/test/resources/rest-api-spec/test/old_cluster/40_wildcard.yml new file mode 100644 index 0000000000000..b19882c69ddd7 --- /dev/null +++ b/qa/rolling-upgrade/src/test/resources/rest-api-spec/test/old_cluster/40_wildcard.yml @@ -0,0 +1,235 @@ +# refactored from rest-api-spec/src/main/resources/rest-api-spec/test/search/270_wildcard_fieldtype_queries.yml +--- +"Create index with Wildcard field": + - do: + indices.create: + index: test + body: + mappings: + properties: + my_field: + type: wildcard + fields: + lower: + type: wildcard + normalizer: lowercase + doc_values: + type: wildcard + doc_values: true + + - do: + bulk: + refresh: true + body: + - '{"index": {"_index": "test", "_id":1}}' + - '{"my_field": "org.opensearch.transport.NodeDisconnectedException: [node_s0][127.0.0.1:39953][disconnected] disconnected"}' + - '{"index": {"_index": "test", "_id":2}}' + - '{"my_field": "[2024-06-08T06:31:37,443][INFO ][o.o.c.c.Coordinator ] [node_s2] cluster-manager node [{node_s0}{Nj7FjR7hRP2lh_zur8KN_g}{OTGOoWmmSsWP_RQ3tIKJ9g}{127.0.0.1}{127.0.0.1:39953}{imr}{shard_indexing_pressure_enabled=true}] failed, restarting discovery"}' + - '{"index": {"_index": "test", "_id":3}}' + - '{"my_field": 
"[2024-06-08T06:31:37,451][INFO ][o.o.c.s.ClusterApplierService] [node_s2] cluster-manager node changed {previous [{node_s0}{Nj7FjR7hRP2lh_zur8KN_g}{OTGOoWmmSsWP_RQ3tIKJ9g}{127.0.0.1}{127.0.0.1:39953}{imr}{shard_indexing_pressure_enabled=true}], current []}, term: 1, version: 24, reason: becoming candidate: onLeaderFailure"}' + - '{"index": {"_index": "test", "_id":4}}' + - '{"my_field": "[2024-06-08T06:31:37,452][WARN ][o.o.c.NodeConnectionsService] [node_s1] failed to connect to {node_s0}{Nj7FjR7hRP2lh_zur8KN_g}{OTGOoWmmSsWP_RQ3tIKJ9g}{127.0.0.1}{127.0.0.1:39953}{imr}{shard_indexing_pressure_enabled=true} (tried [1] times)"}' + - '{"index": {"_index": "test", "_id":5}}' + - '{"my_field": "AbCd"}' + - '{"index": {"_index": "test", "_id":6}}' + - '{"other_field": "test"}' + - '{"index": {"_index": "test", "_id":7}}' + - '{"my_field": "ABCD"}' + + # "term query matches exact value" + - do: + search: + index: test + body: + query: + term: + my_field: "AbCd" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "5" } + + - do: + search: + index: test + body: + query: + term: + my_field.doc_values: "AbCd" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "5" } + + # term query matches lowercase-normalized value + - do: + search: + index: test + body: + query: + term: + my_field.lower: "abcd" + - match: { hits.total.value: 2 } + - match: { hits.hits.0._id: "5" } + - match: { hits.hits.1._id: "7" } + + - do: + search: + index: test + body: + query: + term: + my_field.lower: "ABCD" + - match: { hits.total.value: 2 } + - match: { hits.hits.0._id: "5" } + - match: { hits.hits.1._id: "7" } + + - do: + search: + index: test + body: + query: + term: + my_field: "abcd" + - match: { hits.total.value: 0 } + + # wildcard query matches + - do: + search: + index: test + body: + query: + wildcard: + my_field: + value: "*Node*Exception*" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "1" } + + # wildcard query matches lowercase-normalized 
field + - do: + search: + index: test + body: + query: + wildcard: + my_field.lower: + value: "*node*exception*" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "1" } + + - do: + search: + index: test + body: + query: + wildcard: + my_field.lower: + value: "*NODE*EXCEPTION*" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "1" } + + - do: + search: + index: test + body: + query: + wildcard: + my_field: + value: "*node*exception*" + - match: { hits.total.value: 0 } + + # prefix query matches + - do: + search: + index: test + body: + query: + prefix: + my_field: + value: "[2024-06-08T" + - match: { hits.total.value: 3 } + + # regexp query matches + - do: + search: + index: test + body: + query: + regexp: + my_field: + value: ".*06-08.*cluster-manager node.*" + - match: { hits.total.value: 2 } + + # regexp query matches lowercase-normalized field + - do: + search: + index: test + body: + query: + regexp: + my_field.lower: + value: ".*06-08.*Cluster-Manager Node.*" + - match: { hits.total.value: 2 } + + - do: + search: + index: test + body: + query: + regexp: + my_field: + value: ".*06-08.*Cluster-Manager Node.*" + - match: { hits.total.value: 0 } + + # wildcard match-all works + - do: + search: + index: test + body: + query: + wildcard: + my_field: + value: "*" + - match: { hits.total.value: 6 } + + # regexp match-all works + - do: + search: + index: test + body: + query: + regexp: + my_field: + value: ".*" + - match: { hits.total.value: 6 } + + # terms query on wildcard field matches + - do: + search: + index: test + body: + query: + terms: { my_field: [ "AbCd" ] } + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "5" } + + # case insensitive query on wildcard field + - do: + search: + index: test + body: + query: + wildcard: + my_field: + value: "AbCd" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "5" } + + - do: + search: + index: test + body: + query: + wildcard: + my_field: + value: "AbCd" + 
case_insensitive: true + - match: { hits.total.value: 2 } + - match: { hits.hits.0._id: "5" } + - match: { hits.hits.1._id: "7" } diff --git a/qa/rolling-upgrade/src/test/resources/rest-api-spec/test/upgraded_cluster/40_wildcard.yml b/qa/rolling-upgrade/src/test/resources/rest-api-spec/test/upgraded_cluster/40_wildcard.yml new file mode 100644 index 0000000000000..29518931a5b8b --- /dev/null +++ b/qa/rolling-upgrade/src/test/resources/rest-api-spec/test/upgraded_cluster/40_wildcard.yml @@ -0,0 +1,200 @@ +# refactored from rest-api-spec/src/main/resources/rest-api-spec/test/search/270_wildcard_fieldtype_queries.yml +--- +"search after upgrade": + # "term query matches exact value" + - do: + search: + index: test + body: + query: + term: + my_field: "AbCd" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "5" } + + - do: + search: + index: test + body: + query: + term: + my_field.doc_values: "AbCd" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "5" } + + # term query matches lowercase-normalized value + - do: + search: + index: test + body: + query: + term: + my_field.lower: "abcd" + - match: { hits.total.value: 2 } + - match: { hits.hits.0._id: "5" } + - match: { hits.hits.1._id: "7" } + + - do: + search: + index: test + body: + query: + term: + my_field.lower: "ABCD" + - match: { hits.total.value: 2 } + - match: { hits.hits.0._id: "5" } + - match: { hits.hits.1._id: "7" } + + - do: + search: + index: test + body: + query: + term: + my_field: "abcd" + - match: { hits.total.value: 0 } + + # wildcard query matches + - do: + search: + index: test + body: + query: + wildcard: + my_field: + value: "*Node*Exception*" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "1" } + + # wildcard query matches lowercase-normalized field + - do: + search: + index: test + body: + query: + wildcard: + my_field.lower: + value: "*node*exception*" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "1" } + + - do: + search: + 
index: test + body: + query: + wildcard: + my_field.lower: + value: "*NODE*EXCEPTION*" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "1" } + + - do: + search: + index: test + body: + query: + wildcard: + my_field: + value: "*node*exception*" + - match: { hits.total.value: 0 } + + # prefix query matches + - do: + search: + index: test + body: + query: + prefix: + my_field: + value: "[2024-06-08T" + - match: { hits.total.value: 3 } + + # regexp query matches + - do: + search: + index: test + body: + query: + regexp: + my_field: + value: ".*06-08.*cluster-manager node.*" + - match: { hits.total.value: 2 } + + # regexp query matches lowercase-normalized field + - do: + search: + index: test + body: + query: + regexp: + my_field.lower: + value: ".*06-08.*Cluster-Manager Node.*" + - match: { hits.total.value: 2 } + + - do: + search: + index: test + body: + query: + regexp: + my_field: + value: ".*06-08.*Cluster-Manager Node.*" + - match: { hits.total.value: 0 } + + # wildcard match-all works + - do: + search: + index: test + body: + query: + wildcard: + my_field: + value: "*" + - match: { hits.total.value: 6 } + + # regexp match-all works + - do: + search: + index: test + body: + query: + regexp: + my_field: + value: ".*" + - match: { hits.total.value: 6 } + + # terms query on wildcard field matches + - do: + search: + index: test + body: + query: + terms: { my_field: [ "AbCd" ] } + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "5" } + + # case insensitive query on wildcard field + - do: + search: + index: test + body: + query: + wildcard: + my_field: + value: "AbCd" + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "5" } + + - do: + search: + index: test + body: + query: + wildcard: + my_field: + value: "AbCd" + case_insensitive: true + - match: { hits.total.value: 2 } + - match: { hits.hits.0._id: "5" } + - match: { hits.hits.1._id: "7" } diff --git 
a/server/src/main/java/org/opensearch/index/mapper/WildcardFieldMapper.java b/server/src/main/java/org/opensearch/index/mapper/WildcardFieldMapper.java index 07dbe695bbbbb..20c5ce87ad1c7 100644 --- a/server/src/main/java/org/opensearch/index/mapper/WildcardFieldMapper.java +++ b/server/src/main/java/org/opensearch/index/mapper/WildcardFieldMapper.java @@ -159,6 +159,7 @@ public WildcardFieldMapper build(BuilderContext context) { } + public static final int NGRAM_SIZE = 3; public static final String CONTENT_TYPE = "wildcard"; public static final TypeParser PARSER = new TypeParser((n, c) -> new WildcardFieldMapper.Builder(n, c.getIndexAnalyzers())); @@ -230,97 +231,49 @@ protected void parseCreateField(ParseContext context) throws IOException { /** * Tokenizer to emit tokens to support wildcard first-phase matching. *
- * Will emit all substrings of length 1,2, and 3, with 0-valued anchors for the prefix/suffix. + * Will emit all substrings of length 3 only, with 0-valued anchors for the prefix/suffix. *
* For example, given the string "lucene", output the following terms: *
- * [0, 'l'] + * [0, 0, 'l'] * [0, 'l', 'u'] - * ['l'] - * ['l', 'u'] * ['l', 'u', 'c'] - * ['u'] - * ['u','c'] * ['u','c','e'] - * ['c'] - * ['c', 'e'] * ['c', 'e', 'n'] - * ['e'] - * ['e', 'n'] * ['e', 'n', 'e'] - * ['n'] - * ['n', 'e'] * ['n', 'e', 0] - * ['e'] - * ['e', 0] + * ['e', 0, 0] *
* Visible for testing.
*/
static final class WildcardFieldTokenizer extends Tokenizer {
private final CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class);
- private final char[] buffer = new char[3]; // Ring buffer for up to 3 chars
- private int offset = 0; // Position in the buffer
- private int length = 2; // First token is anchor + first char
+ private final char[] buffer = new char[NGRAM_SIZE]; // Ring buffer of the NGRAM_SIZE most recent chars; 0 is the anchor/padding value
+ private int offset = NGRAM_SIZE - 1; // Slot in buffer that receives the next input char; always kept in [0, NGRAM_SIZE)
@Override
public void reset() throws IOException {
super.reset();
- buffer[0] = 0;
- int firstChar = input.read();
- if (firstChar != -1) {
- buffer[1] = (char) firstChar;
- int secondChar = input.read();
- if (secondChar != -1) {
- buffer[2] = (char) secondChar;
- } else {
- buffer[2] = 0;
- }
- } else {
- buffer[1] = 0;
+ // Reinitialise ALL state (every slot and the write position) so a reused, possibly half-consumed tokenizer starts clean.
+ java.util.Arrays.fill(buffer, (char) 0);
+ offset = NGRAM_SIZE - 1;
}
-
}
@Override
public boolean incrementToken() throws IOException {
- charTermAttribute.setLength(length);
- int numZeroes = 0;
- for (int i = 0; i < length; i++) {
- char curChar = buffer[(i + offset) % 3];
- if (curChar == 0) {
- numZeroes++;
- }
- charTermAttribute.buffer()[i] = buffer[(i + offset) % 3];
- }
- if (numZeroes == 2) {
- // Two zeroes usually means we're done.
- if (length == 3 && charTermAttribute.buffer()[1] != 0) {
- // The only case where we're not done is if the input has exactly 1 character, so the buffer
- // contains 0, char, 0. In that case, we return char now, then return char, 0 on the next iteration
- charTermAttribute.buffer()[0] = charTermAttribute.buffer()[1];
- charTermAttribute.buffer()[1] = 0;
- charTermAttribute.setLength(1);
- length = 2;
- offset = 1;
- return true;
- }
- return false;
- }
- if (length == 3) {
- // Read the next character, overwriting the current offset
- int nextChar = input.read();
- if (nextChar != -1) {
- buffer[offset] = (char) nextChar;
- } else {
- // End of input. Pad with extra 0 to trigger the logic above.
- buffer[offset] = 0;
- }
- offset = (offset + 1) % 3;
- length = 1;
- } else {
- length = length + 1;
+ charTermAttribute.setLength(NGRAM_SIZE);
+ int c = input.read();
+ c = c == -1 ? 0 : c; // End of input is fed in as the 0 anchor so the trailing grams get padded
+ buffer[offset] = (char) c;
+ offset = (offset + 1) % NGRAM_SIZE; // Wrap eagerly: offset stays bounded and can never overflow into a negative index
+ boolean hasNext = false;
+ for (int i = 0; i < NGRAM_SIZE; i++) {
+ char curChar = buffer[(offset + i) % NGRAM_SIZE]; // offset now points at the oldest char in the ring
+ charTermAttribute.buffer()[i] = curChar;
+ hasNext |= curChar != 0;
}
- return true;
+ // A gram consisting only of 0s means the input and its padding are exhausted; report end of stream instead of a token.
+ return hasNext;
}
}
@@ -479,8 +432,8 @@ public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, bo
Query approximation;
if (requiredNGrams.isEmpty()) {
// This only happens when all characters are wildcard characters (* or ?),
- // or it's the empty string.
- if (value.length() == 0 || value.contains("?")) {
+ // or it contains only runs of non-wildcard characters shorter than NGRAM_SIZE (which is 3).
+ if (findNonWildcardSequence(value, 0) != value.length() || value.length() == 0 || value.contains("?")) {
approximation = this.existsQuery(context);
} else {
return existsQuery(context);
@@ -502,15 +455,20 @@ static Set