From db9477e78598addede42b7d20fbe3335d8b8e33d Mon Sep 17 00:00:00 2001 From: nora-errouhly Date: Wed, 26 Feb 2025 10:25:46 +0000 Subject: [PATCH 1/5] remove bm25 query --- redbox-core/redbox/retriever/queries.py | 34 +------------------------ 1 file changed, 1 insertion(+), 33 deletions(-) diff --git a/redbox-core/redbox/retriever/queries.py b/redbox-core/redbox/retriever/queries.py index af525892..8b73a370 100644 --- a/redbox-core/redbox/retriever/queries.py +++ b/redbox-core/redbox/retriever/queries.py @@ -115,39 +115,7 @@ def build_document_query( "size": ai_settings.rag_k, "query": { "bool": { - "should": [ - { - "match": { - "text": { - "query": query, - "boost": ai_settings.match_boost, - } - }, - }, - { - "match": { - "metadata.name": { - "query": query, - "boost": ai_settings.match_name_boost, - } - } - }, - { - "match": { - "metadata.description": { - "query": query, - "boost": ai_settings.match_description_boost, - } - } - }, - { - "match": { - "metadata.keywords": { - "query": query, - "boost": ai_settings.match_keywords_boost, - } - } - }, + "must": [ { "knn": { "vector_field": { From 752b74ca439144e3d926326283cb81e81e1390b9 Mon Sep 17 00:00:00 2001 From: nora-errouhly Date: Wed, 26 Feb 2025 11:02:40 +0000 Subject: [PATCH 2/5] understanding queries --- notebooks/composite_query_opensearch.ipynb | 479 +++++++++++++++++++++ 1 file changed, 479 insertions(+) create mode 100644 notebooks/composite_query_opensearch.ipynb diff --git a/notebooks/composite_query_opensearch.ipynb b/notebooks/composite_query_opensearch.ipynb new file mode 100644 index 00000000..f23d8ef1 --- /dev/null +++ b/notebooks/composite_query_opensearch.ipynb @@ -0,0 +1,479 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Context" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When sending an irrelevant query such as \"what is the door about\", the search still returns irrelevant chunks from the selected document. The LLM intermittently hallucinate and answer the question based on the irrelevant chunks. This notebook is about how to optimize search but further analysis is required to ensure the LLM does not hallucinate." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "from opensearchpy import OpenSearch, RequestsHttpConnection, client\n", + "#\"http://admin:admin@opensearch:9200\"\n", + "client = OpenSearch(\n", + " hosts=[{\"host\": \"localhost\", \"port\": \"9200\"}],\n", + " http_auth=(\"admin\", \"admin\"),\n", + " use_ssl=False, #to run locally, changed from True to False\n", + " connection_class=RequestsHttpConnection,\n", + " retry_on_timeout=True\n", + " )\n", + "\n", + "query = {\n", + " \"size\": 1000,\n", + " \"track_total_hits\": True,\n", + " \"query\" : {\n", + " \"match_all\" : {}\n", + " }\n", + "}\n", + "\n", + "#redbox-data-integration-chunk-current\n", + "\n", + "response = client.search(index='redbox-data-integration-chunk-current', body=query)\n", + "print(response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define QUERY" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_community.embeddings import BedrockEmbeddings\n", + "embedding_model = BedrockEmbeddings(region_name='eu-west-2', model_id=\"amazon.titan-embed-text-v2:0\")\n", + "#query = \"Data feminism begins by examining how power operates in the world today\" #66\n", + "#query = \"goodbye\" #score is 0\n", + "#query = \"what is this door about\" #score is 3.3\n", + "query = \"I don't know.\"\n", + "query_vector = embedding_model.embed_query(query)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.indices.get_mapping(index='redbox-data-integration-chunk-current')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from pydantic import BaseModel\n", + "class AISettings(BaseModel):\n", + " \"\"\"Prompts and other AI settings\"\"\"\n", + "\n", + " # LLM settings\n", + " context_window_size: int = 128_000\n", + " llm_max_tokens: int = 1024\n", + "\n", + " # Prompts and LangGraph settings\n", + " max_document_tokens: int = 1_000_000\n", + " self_route_enabled: bool = False\n", + " map_max_concurrency: int = 128\n", + " stuff_chunk_context_ratio: float = 0.75\n", + " recursion_limit: int = 50\n", + "\n", + "\n", + "\n", + "\n", + "\n", + " # Elasticsearch RAG and boost values\n", + " rag_k: int = 30\n", + " rag_num_candidates: int = 10\n", + " rag_gauss_scale_size: int = 3\n", + " rag_gauss_scale_decay: float = 0.5\n", + " rag_gauss_scale_min: float = 1.1\n", + " rag_gauss_scale_max: float = 2.0\n", + " elbow_filter_enabled: bool = False\n", + " match_boost: float = 1.0\n", + " match_name_boost: float = 2.0\n", + " match_description_boost: float = 0.5\n", + " match_keywords_boost: float = 0.5\n", + " knn_boost: float = 2.0\n", + " similarity_threshold: float = 0.7\n", + "\n", + " # this is also the azure_openai_model\n", + " #chat_backend: ChatLLMBackend = ChatLLMBackend()\n", + "\n", + " # settings for tool call\n", + " tool_govuk_retrieved_results: int = 100\n", + " tool_govuk_returned_results: int = 5" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "ai_settings = AISettings()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "query_filter = [{\n", + " \"bool\": {\n", + " \"should\": [\n", + " {\"terms\": {\"metadata.file_name.keyword\": ['natasha.boyse@digital.trade.gov.uk/1_The_power_chapter.pdf']}},\n", + " {\"terms\": {\"metadata.uri.keyword\": ['natasha.boyse@digital.trade.gov.uk/1_The_power_chapter.pdf']}}\n", + " ]\n", + " }\n", + " }, {\"term\": {\"metadata.chunk_resolution.keyword\": \"normal\"}}]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Composite query" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "final_query = {\"size\": ai_settings.rag_k,\n", + " \"query\": {\n", + " \"bool\": {\n", + " \"should\": [\n", + " {\n", + " \"match\": {\n", + " \"text\": {\n", + " \"query\": query,\n", + " \"boost\": ai_settings.match_boost,\n", + " }\n", + " },\n", + " },\n", + " {\n", + " \"match\": {\n", + " \"metadata.name\": {\n", + " \"query\": query,\n", + " \"boost\": ai_settings.match_name_boost,\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"match\": {\n", + " \"metadata.description\": {\n", + " \"query\": query,\n", + " \"boost\": ai_settings.match_description_boost,\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"match\": {\n", + " \"metadata.keywords\": {\n", + " \"query\": query,\n", + " \"boost\": ai_settings.match_keywords_boost,\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"knn\": {\n", + " \"vector_field\": {\n", + " \"vector\": query_vector,\n", + " \"k\": ai_settings.rag_num_candidates,\n", + " \"boost\": ai_settings.knn_boost}\n", + " }\n", + " },\n", + " ],\n", + " \"filter\": query_filter,\n", + " }\n", + " },\n", + " }\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "final_response = client.search(index='redbox-data-integration-chunk-current', body=final_query)\n", + "final_response" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Keyword query only" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "BM25 search on document text, title, description and keywords" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "keyword_final_query = {\"size\": ai_settings.rag_k,\n", + " \"query\": {\n", + " \"bool\": {\n", + " \"should\": [\n", + " {\n", + " \"match\": {\n", + " \"text\": {\n", + " \"query\": query,\n", + " \"boost\": ai_settings.match_boost,\n", + " }\n", + " },\n", + " },\n", + " {\n", + " \"match\": {\n", + " \"metadata.name\": {\n", + " \"query\": query,\n", + " \"boost\": ai_settings.match_name_boost,\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"match\": {\n", + " \"metadata.description\": {\n", + " \"query\": query,\n", + " \"boost\": ai_settings.match_description_boost,\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"match\": {\n", + " \"metadata.keywords\": {\n", + " \"query\": query,\n", + " \"boost\": ai_settings.match_keywords_boost,\n", + " }\n", + " }\n", + " },\n", + " ],\n", + " \"filter\": query_filter,\n", + " }\n", + " },\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response_keyword = client.search(index='redbox-data-integration-chunk-current', body=keyword_final_query)\n", + "response_keyword" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "BM25 query on document text only" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "text_keyword_final_query = {\"size\": ai_settings.rag_k,\n", + " #\"min_score\":0.01,\n", + " \"query\": {\n", + " \"bool\": {\n", + " \"should\": [\n", + " {\n", + " \"match\": {\n", + " \"text\": {\n", + " \"query\": query,\n", + " \"boost\": ai_settings.match_boost,\n", + " #\"analyzer\": \"stop\",\n", + " \n", + " }\n", + " },\n", + " }\n", + " ],\n", + " \"filter\": query_filter,\n", + " }\n", + " },\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response_text_keyword = client.search(index='redbox-data-integration-chunk-current', body=text_keyword_final_query)\n", + "response_text_keyword" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Semantic query" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "knn_final_query = {\"size\": ai_settings.rag_k,\n", + " #\"min_score\": 1.9,\n", + " \"query\": {\n", + " \"bool\": {\n", + " \"must\": [\n", + " {\n", + " \"knn\": {\n", + " \"vector_field\": {\n", + " \"vector\": query_vector,\n", + " \"k\": ai_settings.rag_num_candidates,\n", + " \"boost\": ai_settings.knn_boost,\n", + " \n", + " }\n", + " }\n", + " },\n", + " ],\n", + " \"filter\": query_filter,\n", + " }\n", + " },\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response_knn = client.search(index='redbox-data-integration-chunk-current', body=knn_final_query)\n", + "response_knn" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response_knn[\"hits\"][\"hits\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Recommendations" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Keyword search based on BM25 does not remove stop words. This lead to inflated scores returning irrelevant results. Analyzer function in Opensearch should be used to remove stopwords. However, when applying analyzer only on query, it still returns irrelevant chunks. This could be due to the fact that we should also remove stop words from indexed documents. However, removing stop words would impact semantic search. Therefore, adding a new field for 'Text' attribute is required for keyword search where STOP analyzer is performed\n", + "\n", + "- Even when score is 0, Keyword search returns the chunks. We should set min_score for keyword search to a low value to filter out irrelevant chunks. When the query is irrelevant, semantic search does not return any chunks. Perhaps, there is in-built cutoff threeshold for the relevance score in Opensearch KNN but not in BM25. This need to be verified.\n", + "\n", + "- Relevance scores from BM25 are added to relevance scores from Semantic seach (cosine similarity). Scores from BM25 can be as high as 66 while score from Opensearch scaled cosine similarity is between 0 and 2.\n", + "Thereore, the impact of keyword is greater than semantic search. It doesn'\\t make sense to add both scores. In addition, there are scaling factors (Boost parameters) used as a multiplier to each score, to add more weight to to Semantic search and keyword search for the titles of the documents. The impact of such boosting scores need to be investigated. Further research on the best approach to implement hybrid search is required.\n", + "\n", + "- In the short term, we can remove keyword search and keep semantic search. This would address the issues with irrelevant queries, increasing the recall and therefore accuracy of the search. However, semantic search does not handle well acronyms. A long-term solution using hybrid search is required.\n", + "\n", + "- When implementing the short term solution, integration testing should be done to verify that the RAG and gadget/agent works when no chunks are returned:\n", + " 1) Ensuring code handles the edge case where no chunks returned.\n", + " 2) Ensuring LLM do not hallucinate if there are no chunks returned\n", + " 2) Ensuring agent/gadget select other tools (for example gov.uk) to attempt answering the question" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## References" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"The bool query takes a more-matches-is-better approach, so the score from each matching must or should clause will be added together to provide the final _score for each document.\"\n", + "https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-bool-query.html#:~:text=The%20bool%20query%20takes%20a,final%20_score%20for%20each%20document.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Stop analyzers\n", + "https://opensearch.org/docs/2.0/opensearch/query-dsl/text-analyzers/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"The search term is analyzed by the same analyzer that was used for the specific document field at the time it was indexed. This means that your search term goes through the same analysis process as the document’s field.\"\n", + "https://opensearch.org/docs/latest/query-dsl/term-vs-full-text/\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From fbf4513441384d0325ad3e7c078572650d5105e6 Mon Sep 17 00:00:00 2001 From: nora-errouhly Date: Wed, 26 Feb 2025 11:03:47 +0000 Subject: [PATCH 3/5] move notebook --- read_local_opensearch.ipynb | 101 ------------------------------------ 1 file changed, 101 deletions(-) delete mode 100644 read_local_opensearch.ipynb diff --git a/read_local_opensearch.ipynb b/read_local_opensearch.ipynb deleted file mode 100644 index dad5dfec..00000000 --- a/read_local_opensearch.ipynb +++ /dev/null @@ -1,101 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Read data from Opensearch" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "from json import load\n", - "from opensearchpy import OpenSearch, RequestsHttpConnection, client\n", - "import os\n", - "from dotenv import load_dotenv\n", - "\n", - "load_dotenv()\n", - "password = os.environ.get(\"OPENSEARCH_INITIAL_ADMIN_PASSWORD\")\n", - "\n", - "client = OpenSearch(\n", - " hosts=[{\"host\": \"localhost\", \"port\": \"9200\"}],\n", - " http_auth=(\"admin\", password),\n", - " use_ssl=False, #to run locally, changed from True to False\n", - " connection_class=RequestsHttpConnection,\n", - " retry_on_timeout=True\n", - " )\n", - "\n", - "query = {\n", - " \"size\": 1000,\n", - " \"track_total_hits\": True,\n", - " \"query\" : {\n", - " \"match_all\" : {}\n", - " }\n", - "}\n", - "\n", - "#query 1000 records from database\n", - "\n", - "response = client.search(index='redbox-data-integration-chunk-current', body=query)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Read index and alias names from Opensearch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Read index and alias names from database\n", - "client.indices.get_alias(name = \"*\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Delete Index" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "client.indices.delete(\"redbox-data-integration-chunk\") #index" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "py312", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.7" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From 7de1c33d1a8c8f3dd8e71edf771e2d48d43b2403 Mon Sep 17 00:00:00 2001 From: nora-errouhly Date: Wed, 26 Feb 2025 11:04:12 +0000 Subject: [PATCH 4/5] moving notebook to notebooks folder --- notebooks/read_local_opensearch.ipynb | 101 ++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 notebooks/read_local_opensearch.ipynb diff --git a/notebooks/read_local_opensearch.ipynb b/notebooks/read_local_opensearch.ipynb new file mode 100644 index 00000000..dad5dfec --- /dev/null +++ b/notebooks/read_local_opensearch.ipynb @@ -0,0 +1,101 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Read data from Opensearch" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "from json import load\n", + "from opensearchpy import OpenSearch, RequestsHttpConnection, client\n", + "import os\n", + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv()\n", + "password = os.environ.get(\"OPENSEARCH_INITIAL_ADMIN_PASSWORD\")\n", + "\n", + "client = OpenSearch(\n", + " hosts=[{\"host\": \"localhost\", \"port\": \"9200\"}],\n", + " http_auth=(\"admin\", password),\n", + " use_ssl=False, #to run locally, changed from True to False\n", + " connection_class=RequestsHttpConnection,\n", + " retry_on_timeout=True\n", + " )\n", + "\n", + "query = {\n", + " \"size\": 1000,\n", + " \"track_total_hits\": True,\n", + " \"query\" : {\n", + " \"match_all\" : {}\n", + " }\n", + "}\n", + "\n", + "#query 1000 records from database\n", + "\n", + "response = client.search(index='redbox-data-integration-chunk-current', body=query)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read index and alias names from Opensearch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Read index and alias names from database\n", + "client.indices.get_alias(name = \"*\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Delete Index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.indices.delete(\"redbox-data-integration-chunk\") #index" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "py312", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 7cfa3f06955296554fad769d7c87b980aebef31c Mon Sep 17 00:00:00 2001 From: nora-errouhly Date: Thu, 27 Feb 2025 15:58:17 +0000 Subject: [PATCH 5/5] setting similar score threshold --- redbox-core/redbox/retriever/queries.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/redbox-core/redbox/retriever/queries.py b/redbox-core/redbox/retriever/queries.py index f03ab5d2..0afe42c4 100644 --- a/redbox-core/redbox/retriever/queries.py +++ b/redbox-core/redbox/retriever/queries.py @@ -113,6 +113,7 @@ def build_document_query( return { "size": ai_settings.rag_k, + "min_score": 0.65, "query": { "bool": { "must": [ @@ -121,7 +122,7 @@ def build_document_query( "vector_field": { "vector": query_vector, "k": ai_settings.rag_num_candidates, - "boost": ai_settings.knn_boost, + #"boost": ai_settings.knn_boost, "filter": query_filter, } }