diff --git a/notebooks/composite_query_opensearch.ipynb b/notebooks/composite_query_opensearch.ipynb new file mode 100644 index 00000000..f23d8ef1 --- /dev/null +++ b/notebooks/composite_query_opensearch.ipynb @@ -0,0 +1,479 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Context" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When sending an irrelevant query such as \"what is the door about\", the search still returns irrelevant chunks from the selected document. The LLM intermittently hallucinate and answer the question based on the irrelevant chunks. This notebook is about how to optimize search but further analysis is required to ensure the LLM does not hallucinate." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "from opensearchpy import OpenSearch, RequestsHttpConnection, client\n", + "#\"http://admin:admin@opensearch:9200\"\n", + "client = OpenSearch(\n", + " hosts=[{\"host\": \"localhost\", \"port\": \"9200\"}],\n", + " http_auth=(\"admin\", \"admin\"),\n", + " use_ssl=False, #to run locally, changed from True to False\n", + " connection_class=RequestsHttpConnection,\n", + " retry_on_timeout=True\n", + " )\n", + "\n", + "query = {\n", + " \"size\": 1000,\n", + " \"track_total_hits\": True,\n", + " \"query\" : {\n", + " \"match_all\" : {}\n", + " }\n", + "}\n", + "\n", + "#redbox-data-integration-chunk-current\n", + "\n", + "response = client.search(index='redbox-data-integration-chunk-current', body=query)\n", + "print(response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define QUERY" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_community.embeddings import BedrockEmbeddings\n", + "embedding_model = BedrockEmbeddings(region_name='eu-west-2', model_id=\"amazon.titan-embed-text-v2:0\")\n", + "#query = \"Data feminism begins by examining how power operates in the world today\" #66\n", + "#query = \"goodbye\" #score is 0\n", + "#query = \"what is this door about\" #score is 3.3\n", + "query = \"I don't know.\"\n", + "query_vector = embedding_model.embed_query(query)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.indices.get_mapping(index='redbox-data-integration-chunk-current')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from pydantic import BaseModel\n", + "class AISettings(BaseModel):\n", + " \"\"\"Prompts and other AI settings\"\"\"\n", + "\n", + " # LLM settings\n", + " context_window_size: int = 128_000\n", + " llm_max_tokens: int = 1024\n", + "\n", + " # Prompts and LangGraph settings\n", + " max_document_tokens: int = 1_000_000\n", + " self_route_enabled: bool = False\n", + " map_max_concurrency: int = 128\n", + " stuff_chunk_context_ratio: float = 0.75\n", + " recursion_limit: int = 50\n", + "\n", + "\n", + "\n", + "\n", + "\n", + " # Elasticsearch RAG and boost values\n", + " rag_k: int = 30\n", + " rag_num_candidates: int = 10\n", + " rag_gauss_scale_size: int = 3\n", + " rag_gauss_scale_decay: float = 0.5\n", + " rag_gauss_scale_min: float = 1.1\n", + " rag_gauss_scale_max: float = 2.0\n", + " elbow_filter_enabled: bool = False\n", + " match_boost: float = 1.0\n", + " match_name_boost: float = 2.0\n", + " match_description_boost: float = 0.5\n", + " match_keywords_boost: float = 0.5\n", + " knn_boost: float = 2.0\n", + " similarity_threshold: float = 0.7\n", + "\n", + " # this is also the azure_openai_model\n", + " #chat_backend: ChatLLMBackend = ChatLLMBackend()\n", + "\n", + " # settings for tool call\n", + " tool_govuk_retrieved_results: int = 100\n", + " tool_govuk_returned_results: int = 5" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "ai_settings = AISettings()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "query_filter = [{\n", + " \"bool\": {\n", + " \"should\": [\n", + " {\"terms\": {\"metadata.file_name.keyword\": ['natasha.boyse@digital.trade.gov.uk/1_The_power_chapter.pdf']}},\n", + " {\"terms\": {\"metadata.uri.keyword\": ['natasha.boyse@digital.trade.gov.uk/1_The_power_chapter.pdf']}}\n", + " ]\n", + " }\n", + " }, {\"term\": {\"metadata.chunk_resolution.keyword\": \"normal\"}}]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Composite query" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "final_query = {\"size\": ai_settings.rag_k,\n", + " \"query\": {\n", + " \"bool\": {\n", + " \"should\": [\n", + " {\n", + " \"match\": {\n", + " \"text\": {\n", + " \"query\": query,\n", + " \"boost\": ai_settings.match_boost,\n", + " }\n", + " },\n", + " },\n", + " {\n", + " \"match\": {\n", + " \"metadata.name\": {\n", + " \"query\": query,\n", + " \"boost\": ai_settings.match_name_boost,\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"match\": {\n", + " \"metadata.description\": {\n", + " \"query\": query,\n", + " \"boost\": ai_settings.match_description_boost,\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"match\": {\n", + " \"metadata.keywords\": {\n", + " \"query\": query,\n", + " \"boost\": ai_settings.match_keywords_boost,\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"knn\": {\n", + " \"vector_field\": {\n", + " \"vector\": query_vector,\n", + " \"k\": ai_settings.rag_num_candidates,\n", + " \"boost\": ai_settings.knn_boost}\n", + " }\n", + " },\n", + " ],\n", + " \"filter\": query_filter,\n", + " }\n", + " },\n", + " }\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "final_response = client.search(index='redbox-data-integration-chunk-current', body=final_query)\n", + "final_response" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Keyword query only" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "BM25 search on document text, title, description and keywords" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "keyword_final_query = {\"size\": ai_settings.rag_k,\n", + " \"query\": {\n", + " \"bool\": {\n", + " \"should\": [\n", + " {\n", + " \"match\": {\n", + " \"text\": {\n", + " \"query\": query,\n", + " \"boost\": ai_settings.match_boost,\n", + " }\n", + " },\n", + " },\n", + " {\n", + " \"match\": {\n", + " \"metadata.name\": {\n", + " \"query\": query,\n", + " \"boost\": ai_settings.match_name_boost,\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"match\": {\n", + " \"metadata.description\": {\n", + " \"query\": query,\n", + " \"boost\": ai_settings.match_description_boost,\n", + " }\n", + " }\n", + " },\n", + " {\n", + " \"match\": {\n", + " \"metadata.keywords\": {\n", + " \"query\": query,\n", + " \"boost\": ai_settings.match_keywords_boost,\n", + " }\n", + " }\n", + " },\n", + " ],\n", + " \"filter\": query_filter,\n", + " }\n", + " },\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response_keyword = client.search(index='redbox-data-integration-chunk-current', body=keyword_final_query)\n", + "response_keyword" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "BM25 query on document text only" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "text_keyword_final_query = {\"size\": ai_settings.rag_k,\n", + " #\"min_score\":0.01,\n", + " \"query\": {\n", + " \"bool\": {\n", + " \"should\": [\n", + " {\n", + " \"match\": {\n", + " \"text\": {\n", + " \"query\": query,\n", + " \"boost\": ai_settings.match_boost,\n", + " #\"analyzer\": \"stop\",\n", + " \n", + " }\n", + " },\n", + " }\n", + " ],\n", + " \"filter\": query_filter,\n", + " }\n", + " },\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response_text_keyword = client.search(index='redbox-data-integration-chunk-current', body=text_keyword_final_query)\n", + "response_text_keyword" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Semantic query" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "knn_final_query = {\"size\": ai_settings.rag_k,\n", + " #\"min_score\": 1.9,\n", + " \"query\": {\n", + " \"bool\": {\n", + " \"must\": [\n", + " {\n", + " \"knn\": {\n", + " \"vector_field\": {\n", + " \"vector\": query_vector,\n", + " \"k\": ai_settings.rag_num_candidates,\n", + " \"boost\": ai_settings.knn_boost,\n", + " \n", + " }\n", + " }\n", + " },\n", + " ],\n", + " \"filter\": query_filter,\n", + " }\n", + " },\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response_knn = client.search(index='redbox-data-integration-chunk-current', body=knn_final_query)\n", + "response_knn" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response_knn[\"hits\"][\"hits\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Recommendations" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Keyword search based on BM25 does not remove stop words. This lead to inflated scores returning irrelevant results. Analyzer function in Opensearch should be used to remove stopwords. However, when applying analyzer only on query, it still returns irrelevant chunks. This could be due to the fact that we should also remove stop words from indexed documents. However, removing stop words would impact semantic search. Therefore, adding a new field for 'Text' attribute is required for keyword search where STOP analyzer is performed\n", + "\n", + "- Even when score is 0, Keyword search returns the chunks. We should set min_score for keyword search to a low value to filter out irrelevant chunks. When the query is irrelevant, semantic search does not return any chunks. Perhaps, there is in-built cutoff threeshold for the relevance score in Opensearch KNN but not in BM25. This need to be verified.\n", + "\n", + "- Relevance scores from BM25 are added to relevance scores from Semantic seach (cosine similarity). Scores from BM25 can be as high as 66 while score from Opensearch scaled cosine similarity is between 0 and 2.\n", + "Thereore, the impact of keyword is greater than semantic search. It doesn'\\t make sense to add both scores. In addition, there are scaling factors (Boost parameters) used as a multiplier to each score, to add more weight to to Semantic search and keyword search for the titles of the documents. The impact of such boosting scores need to be investigated. Further research on the best approach to implement hybrid search is required.\n", + "\n", + "- In the short term, we can remove keyword search and keep semantic search. This would address the issues with irrelevant queries, increasing the recall and therefore accuracy of the search. However, semantic search does not handle well acronyms. A long-term solution using hybrid search is required.\n", + "\n", + "- When implementing the short term solution, integration testing should be done to verify that the RAG and gadget/agent works when no chunks are returned:\n", + " 1) Ensuring code handles the edge case where no chunks returned.\n", + " 2) Ensuring LLM do not hallucinate if there are no chunks returned\n", + " 2) Ensuring agent/gadget select other tools (for example gov.uk) to attempt answering the question" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## References" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"The bool query takes a more-matches-is-better approach, so the score from each matching must or should clause will be added together to provide the final _score for each document.\"\n", + "https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-bool-query.html#:~:text=The%20bool%20query%20takes%20a,final%20_score%20for%20each%20document.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Stop analyzers\n", + "https://opensearch.org/docs/2.0/opensearch/query-dsl/text-analyzers/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"The search term is analyzed by the same analyzer that was used for the specific document field at the time it was indexed. This means that your search term goes through the same analysis process as the document’s field.\"\n", + "https://opensearch.org/docs/latest/query-dsl/term-vs-full-text/\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/read_local_opensearch.ipynb b/notebooks/read_local_opensearch.ipynb similarity index 100% rename from read_local_opensearch.ipynb rename to notebooks/read_local_opensearch.ipynb diff --git a/redbox-core/redbox/graph/root.py b/redbox-core/redbox/graph/root.py index 5aee48e6..d632ffb5 100644 --- a/redbox-core/redbox/graph/root.py +++ b/redbox-core/redbox/graph/root.py @@ -5,6 +5,7 @@ from langgraph.graph import END, START, StateGraph from langgraph.graph.graph import CompiledGraph from langgraph.prebuilt import ToolNode +from langgraph.pregel import RetryPolicy from redbox.chains.components import get_structured_response_with_citations_parser from redbox.chains.runnables import build_self_route_output_parser @@ -37,7 +38,132 @@ from redbox.models.chat import ChatRoute, ErrorRoute from redbox.models.graph import ROUTABLE_KEYWORDS, RedboxActivityEvent from redbox.transform import structure_documents_by_file_name, structure_documents_by_group_and_indices -from langgraph.pregel import RetryPolicy + + +def get_summarise_graph(all_chunks_retriever): + builder = StateGraph(RedboxState) + builder.add_node("choose_route_based_on_request_token", empty_process) + builder.add_node("set_route_to_summarise_large_doc", build_set_route_pattern(ChatRoute.chat_with_docs_map_reduce)) + builder.add_node("set_route_to_summarise_doc", build_set_route_pattern(ChatRoute.chat_with_docs)) + builder.add_node("pass_user_prompt_to_LLM_message", build_passthrough_pattern()) + builder.add_node("clear_documents", clear_documents_process) + + builder.add_node("document_has_multiple_chunks", empty_process) + builder.add_node("any_summarised_docs_bigger_than_context_window", empty_process) + builder.add_node("any_document_bigger_than_context_window", empty_process) + + builder.add_node("sending_chunks_to_summarise", empty_process) + builder.add_node("sending_summarised_chunks_checking_exceed_context", empty_process) + builder.add_node("sending_summarised_chunks", empty_process) + + builder.add_node( + "summarise_summarised_chunks", + build_merge_pattern(prompt_set=PromptSet.ChatwithDocsMapReduce), + retry=RetryPolicy(max_attempts=3), + ) + + builder.add_node( + "files_too_large_error", + build_error_pattern( + text="These documents are too large to work with.", + route_name=ErrorRoute.files_too_large, + ), + ) + + builder.add_node( + "retrieve_all_chunks", + build_retrieve_pattern( + retriever=all_chunks_retriever, + structure_func=structure_documents_by_file_name, + final_source_chain=True, + ), + ) + builder.add_node( + "summarise_each_chunk_in_document", + build_merge_pattern(prompt_set=PromptSet.ChatwithDocsMapReduce), + retry=RetryPolicy(max_attempts=3), + ) + + # edges + builder.add_edge(START, "choose_route_based_on_request_token") + builder.add_conditional_edges( + "choose_route_based_on_request_token", + build_total_tokens_request_handler_conditional(PromptSet.ChatwithDocsMapReduce), + { + "max_exceeded": "files_too_large_error", + "context_exceeded": "set_route_to_summarise_large_doc", + "pass": "set_route_to_summarise_doc", + }, + ) + builder.add_edge("set_route_to_summarise_large_doc", "pass_user_prompt_to_LLM_message") + builder.add_edge("set_route_to_summarise_doc", "pass_user_prompt_to_LLM_message") + builder.add_edge("pass_user_prompt_to_LLM_message", "retrieve_all_chunks") + builder.add_conditional_edges( + "retrieve_all_chunks", + lambda s: s.route_name, + { + ChatRoute.chat_with_docs: "summarise_document", + ChatRoute.chat_with_docs_map_reduce: "sending_chunks_to_summarise", + }, + ) + + # summarise process + builder.add_node( + "summarise_document", + build_stuff_pattern( + prompt_set=PromptSet.ChatwithDocs, + final_response_chain=True, + ), + retry=RetryPolicy(max_attempts=3), + ) + builder.add_edge("summarise_document", "clear_documents") + + # summarise large documents process + builder.add_conditional_edges( + "sending_chunks_to_summarise", + build_document_chunk_send("summarise_each_chunk_in_document"), + path_map=["summarise_each_chunk_in_document"], + ) + builder.add_edge("summarise_each_chunk_in_document", "document_has_multiple_chunks") + builder.add_conditional_edges( + "document_has_multiple_chunks", + multiple_docs_in_group_conditional, + { + True: "sending_summarised_chunks_checking_exceed_context", + False: "any_summarised_docs_bigger_than_context_window", + }, + ) + builder.add_conditional_edges( + "sending_summarised_chunks_checking_exceed_context", + build_document_group_send("any_document_bigger_than_context_window"), + path_map=["any_document_bigger_than_context_window"], + ) + builder.add_conditional_edges( + "any_document_bigger_than_context_window", + build_documents_bigger_than_context_conditional(PromptSet.ChatwithDocsMapReduce), + { + True: "files_too_large_error", + False: "sending_summarised_chunks", + }, + ) + builder.add_conditional_edges( + "sending_summarised_chunks", + build_document_group_send("summarise_summarised_chunks"), + path_map=["summarise_summarised_chunks"], + ) + builder.add_edge("summarise_summarised_chunks", "any_summarised_docs_bigger_than_context_window") + builder.add_conditional_edges( + "any_summarised_docs_bigger_than_context_window", + build_documents_bigger_than_context_conditional(PromptSet.ChatwithDocs), + { + True: "files_too_large_error", + False: "summarise_document", + }, + ) + builder.add_edge("summarise_document", "clear_documents") + builder.add_edge("clear_documents", END) + builder.add_edge("files_too_large_error", END) + return builder.compile() def get_self_route_graph(retriever: VectorStoreRetriever, prompt_set: PromptSet, debug: bool = False): @@ -434,6 +560,7 @@ def get_root_graph( builder.add_node("p_chat_with_documents", cwd_subgraph) builder.add_node("p_retrieve_metadata", metadata_subgraph) builder.add_node("p_new_route", new_route) + builder.add_node("p_summarise", get_summarise_graph(all_chunks_retriever)) # Log builder.add_node( @@ -464,6 +591,7 @@ def get_root_graph( ChatRoute.search: "p_search", ChatRoute.gadget: "p_search_agentic", ChatRoute.newroute: "p_new_route", + ChatRoute.summarise: "p_summarise", "DEFAULT": "d_docs_selected", }, ) diff --git a/redbox-core/redbox/models/chat.py b/redbox-core/redbox/models/chat.py index 71b79a8e..c560863a 100644 --- a/redbox-core/redbox/models/chat.py +++ b/redbox-core/redbox/models/chat.py @@ -8,6 +8,7 @@ class ChatRoute(StrEnum): chat_with_docs = "summarise" chat_with_docs_map_reduce = "chat/documents/large" newroute = "newroute" + summarise = "summarise" class ErrorRoute(StrEnum): diff --git a/redbox-core/redbox/models/graph.py b/redbox-core/redbox/models/graph.py index ae37a74f..fdf29dbd 100644 --- a/redbox-core/redbox/models/graph.py +++ b/redbox-core/redbox/models/graph.py @@ -24,4 +24,5 @@ class RedboxActivityEvent(BaseModel): ChatRoute.search: "Search for an answer to the question in the document", ChatRoute.gadget: "Let Redbox go-go-gadget to answer to the question using the documents", ChatRoute.newroute: "New amazing route", + ChatRoute.summarise: "Summarise documents", } diff --git a/redbox-core/redbox/retriever/queries.py b/redbox-core/redbox/retriever/queries.py index 68600965..e5d67fab 100644 --- a/redbox-core/redbox/retriever/queries.py +++ b/redbox-core/redbox/retriever/queries.py @@ -132,39 +132,7 @@ def build_document_query( "size": ai_settings.rag_k, "query": { "bool": { - "should": [ - { - "match": { - "text": { - "query": query, - "boost": ai_settings.match_boost, - } - }, - }, - { - "match": { - "metadata.name": { - "query": query, - "boost": ai_settings.match_name_boost, - } - } - }, - { - "match": { - "metadata.description": { - "query": query, - "boost": ai_settings.match_description_boost, - } - } - }, - { - "match": { - "metadata.keywords": { - "query": query, - "boost": ai_settings.match_keywords_boost, - } - } - }, + "must": [ { "knn": { "vector_field": { diff --git a/redbox-core/tests/graph/test_app.py b/redbox-core/tests/graph/test_app.py index 84ba43c8..d25794b7 100644 --- a/redbox-core/tests/graph/test_app.py +++ b/redbox-core/tests/graph/test_app.py @@ -604,6 +604,6 @@ def test_get_available_keywords(env: Settings): env=env, debug=LANGGRAPH_DEBUG, ) - keywords = {ChatRoute.search, ChatRoute.newroute, ChatRoute.gadget} + keywords = {ChatRoute.search, ChatRoute.newroute, ChatRoute.gadget, ChatRoute.summarise} assert keywords == set(app.get_available_keywords().keys())