From 8231c65659b76ccc897badaac94d1885de947896 Mon Sep 17 00:00:00 2001 From: "Michael B. Klein" Date: Mon, 22 Jul 2024 16:39:44 +0000 Subject: [PATCH 1/2] Update hybrid search weights --- chat/src/handlers/opensearch_neural_search.py | 8 ++++---- chat/src/helpers/hybrid_query.py | 19 ++++++------------- chat/src/helpers/response.py | 11 +---------- chat/test/helpers/test_hybrid_query.py | 10 ++++------ 4 files changed, 15 insertions(+), 33 deletions(-) diff --git a/chat/src/handlers/opensearch_neural_search.py b/chat/src/handlers/opensearch_neural_search.py index 2540f428..0530eab0 100644 --- a/chat/src/handlers/opensearch_neural_search.py +++ b/chat/src/handlers/opensearch_neural_search.py @@ -28,19 +28,19 @@ def __init__( self.text_field = text_field def similarity_search( - self, query: str, k: int = 10, subquery: Any = None, **kwargs: Any + self, query: str, k: int = 10, **kwargs: Any ) -> List[Document]: """Return docs most similar to the embedding vector.""" docs_with_scores = self.similarity_search_with_score( - query, k, subquery, **kwargs + query, k, **kwargs ) return [doc[0] for doc in docs_with_scores] def similarity_search_with_score( - self, query: str, k: int = 10, subquery: Any = None, **kwargs: Any + self, query: str, k: int = 10, **kwargs: Any ) -> List[Tuple[Document, float]]: """Return docs most similar to query.""" - dsl = hybrid_query(query=query, model_id=self.model_id, vector_field=self.vector_field, k=k, subquery=subquery, **kwargs) + dsl = hybrid_query(query=query, model_id=self.model_id, vector_field=self.vector_field, k=k, **kwargs) response = self.client.search(index=self.index, body=dsl, params={"search_pipeline": self.search_pipeline} if self.search_pipeline else None) documents_with_scores = [ ( diff --git a/chat/src/helpers/hybrid_query.py b/chat/src/helpers/hybrid_query.py index 6b724ece..1af5d95f 100644 --- a/chat/src/helpers/hybrid_query.py +++ b/chat/src/helpers/hybrid_query.py @@ -11,12 +11,7 @@ def filter(query: dict): } } -def hybrid_query(query: str, model_id: str, vector_field: str = "embedding", k: int = 10, subquery: Any = None, **kwargs: Any): - if subquery: - weights = [0.5, 0.3, 0.2] - else: - weights = [0.7, 0.3] - +def hybrid_query(query: str, model_id: str, vector_field: str = "embedding", k: int = 10, **kwargs: Any): result = { "size": k, "query": { @@ -24,9 +19,10 @@ def hybrid_query(query: str, model_id: str, vector_field: str = "embedding", k: "queries": [ filter({ "query_string": { - "default_operator": "AND", - "fields": ["title^5", "all_controlled_labels", "all_ids^5"], - "query": query + "operator": "AND", + "fields": ["all_titles^5", "all_controlled_labels", "all_ids^5"], + "query": query, + "analyzer": "ENGLISH" } }), filter({ @@ -47,7 +43,7 @@ def hybrid_query(query: str, model_id: str, vector_field: str = "embedding", k: "normalization-processor": { "combination": { "parameters": { - "weights": weights + "weights": [0.25, 0.75] }, "technique": "arithmetic_mean" }, @@ -60,9 +56,6 @@ def hybrid_query(query: str, model_id: str, vector_field: str = "embedding", k: } } - if subquery: - result["query"]["hybrid"]["queries"].append(filter(subquery)) - for key, value in kwargs.items(): result[key] = value diff --git a/chat/src/helpers/response.py b/chat/src/helpers/response.py index fdcf5348..dea2098b 100644 --- a/chat/src/helpers/response.py +++ b/chat/src/helpers/response.py @@ -39,16 +39,7 @@ def get_and_send_original_question(docs): def prepare_response(self): try: - subquery = { - "match": { - "all_titles": { - "query": self.config.question, - "operator": "AND", - "analyzer": "english" - } - } - } - retriever = self.config.opensearch.as_retriever(search_type="similarity", search_kwargs={"k": self.config.k, "subquery": subquery, "_source": {"excludes": ["embedding"]}}) + retriever = self.config.opensearch.as_retriever(search_type="similarity", search_kwargs={"k": self.config.k, "_source": {"excludes": ["embedding"]}}) chain = ( {"context": retriever, "question": RunnablePassthrough()} | self.original_question_passthrough() diff --git a/chat/test/helpers/test_hybrid_query.py b/chat/test/helpers/test_hybrid_query.py index 97588037..4e38861e 100644 --- a/chat/test/helpers/test_hybrid_query.py +++ b/chat/test/helpers/test_hybrid_query.py @@ -6,19 +6,17 @@ class TestFunction(TestCase): def test_hybrid_query(self): - subquery = { "term": { "title": { "value": "The Title" } } } - dsl = hybrid_query("Question?", "MODEL_ID", k=10, subquery=subquery) + dsl = hybrid_query("Question?", "MODEL_ID", k=10) subject = dsl["query"]["hybrid"]["queries"] checks = [ (lambda x: x["query_string"]["query"], "Question?"), - (lambda x: x["neural"]["embedding"]["model_id"], "MODEL_ID"), - (lambda x: x["term"]["title"]["value"], "The Title") + (lambda x: x["neural"]["embedding"]["model_id"], "MODEL_ID") ] - self.assertEqual(len(subject), 3) + self.assertEqual(len(subject), 2) - for i in range(3): + for i in range(2): lookup, expected = checks[i] queries = subject[i]["bool"]["must"] self.assertEqual(lookup(queries[0]), expected) From 09b3004d084b0ee6cd50bd74d7ff1f743c2e91c6 Mon Sep 17 00:00:00 2001 From: Brendan Quinn Date: Mon, 22 Jul 2024 19:37:33 +0000 Subject: [PATCH 2/2] Lowercase stopwords analyzer, sam template outputs --- chat/src/helpers/hybrid_query.py | 8 +++----- template.yaml | 4 ++++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/chat/src/helpers/hybrid_query.py b/chat/src/helpers/hybrid_query.py index 1af5d95f..23ccf4a5 100644 --- a/chat/src/helpers/hybrid_query.py +++ b/chat/src/helpers/hybrid_query.py @@ -19,10 +19,10 @@ def hybrid_query(query: str, model_id: str, vector_field: str = "embedding", k: "queries": [ filter({ "query_string": { - "operator": "AND", + "default_operator": "AND", "fields": ["all_titles^5", "all_controlled_labels", "all_ids^5"], "query": query, - "analyzer": "ENGLISH" + "analyzer": "english" } }), filter({ @@ -58,7 +58,5 @@ def hybrid_query(query: str, model_id: str, vector_field: str = "embedding", k: for key, value in kwargs.items(): result[key] = value - + return result - - \ No newline at end of file diff --git a/template.yaml b/template.yaml index dad56a3d..cbeba51c 100644 --- a/template.yaml +++ b/template.yaml @@ -1159,3 +1159,7 @@ Resources: AuthorizationType: NONE RouteKey: GET /docs/v2/{proxy+} Target: !Sub "integrations/${docsIntegration}" +Outputs: + Endpoint: + Description: "The base API endpoint for the stack" + Value: !Sub "https://${CustomDomainHost}.${CustomDomainZone}/api/v2" \ No newline at end of file