wip: more migration to opensearch

uktrade · Feb 7, 2025 · d0b4554 · d0b4554
1 parent c44e222
commit d0b4554
Show file tree

Hide file tree

Showing 21 changed files with 68 additions and 127 deletions.
diff --git a/django_app/.vscode/launch.json b/django_app/.vscode/launch.json
@@ -15,7 +15,6 @@
                 "MINIO_HOST": "localhost",
                 "POSTGRES_HOST": "localhost",
                 "UNSTRUCTURED_HOST": "localhost",
-                "ELASTIC__HOST": "localhost"
             }
         },
 
@@ -33,7 +32,6 @@
                 "MINIO_HOST": "localhost",
                 "POSTGRES_HOST": "localhost",
                 "UNSTRUCTURED_HOST": "localhost",
-                "ELASTIC__HOST": "localhost"
             }
         },
     ]

diff --git a/django_app/.vscode/tasks.json b/django_app/.vscode/tasks.json
@@ -16,7 +16,6 @@
                     "MINIO_HOST": "localhost",
                     "POSTGRES_HOST": "localhost",
                     "UNSTRUCTURED_HOST": "localhost",
-                    "ELASTIC__HOST": "localhost",
                 }
             },
             "presentation": {
@@ -34,7 +33,6 @@
                     "MINIO_HOST": "localhost",
                     "POSTGRES_HOST": "localhost",
                     "UNSTRUCTURED_HOST": "localhost",
-                    "ELASTIC__HOST": "localhost",
                 }
             },
             "presentation": {
@@ -52,7 +50,6 @@
                     "MINIO_HOST": "localhost",
                     "POSTGRES_HOST": "localhost",
                     "UNSTRUCTURED_HOST": "localhost",
-                    "ELASTIC__HOST": "localhost",
                 }
             },
             "presentation": {
@@ -70,7 +67,6 @@
                     "MINIO_HOST": "localhost",
                     "POSTGRES_HOST": "localhost",
                     "UNSTRUCTURED_HOST": "localhost",
-                    "ELASTIC__HOST": "localhost",
                 }
             },
             "presentation": {
@@ -88,7 +84,6 @@
                     "MINIO_HOST": "localhost",
                     "POSTGRES_HOST": "localhost",
                     "UNSTRUCTURED_HOST": "localhost",
-                    "ELASTIC__HOST": "localhost",
                 }
             },
             "presentation": {
@@ -106,7 +101,6 @@
                     "MINIO_HOST": "localhost",
                     "POSTGRES_HOST": "localhost",
                     "UNSTRUCTURED_HOST": "localhost",
-                    "ELASTIC__HOST": "localhost",
                 }
             },
             "presentation": {

diff --git a/docker-compose.yml b/docker-compose.yml
@@ -120,7 +120,7 @@ services:
       start_period: 30s
 
   opensearch:
-    image: opensearchproject/opensearch:2.17.0
+    image: opensearchproject/opensearch:2.18.0
     environment:
       - discovery.type=single-node
       - OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m

diff --git a/docs/DEVELOPER_SETUP.md b/docs/DEVELOPER_SETUP.md
@@ -198,16 +198,16 @@ elasticdump \
   --type=data
 ```
 
-### Loading data to Elasticsearch
+### Loading data to Opensearch
 
 If you've been provided with a dump from the vector store, add it to [data/elastic-dumps/](../data/elastic-dumps/). The below assumes the existence of `redbox-data-chunk.json` in that directory.
 
 Consider dumping your existing indices if you don't want to have to reembed data you're working on.
 
-Start the Elasticsearch service.
+Start the Opensearch service.
 
 ```console
-docker compose up -d elasticsearch
+docker compose up -d opensearch
 ```
 
 Load data from your JSONs, or your own file.

diff --git a/docs/architecture/index.md b/docs/architecture/index.md
@@ -34,7 +34,7 @@ The Retrieval Augmented Generation (RAG) architecture grounds our Large Language
 | Core API | ECS | App Service | Docker | FastAPI AI Interaction and DB Intermediary |
 | Worker | ECS | App Service | Docker | Queue fed file ingester and embedder               |
 | Database | RDS/Postgres | Postgres | Postgres | Chat history & user data          |
-| Vector Database | ElasticCloud | ElasticCloud | Elasticsearch | RAG Database                               |
+| Vector Database | ElasticCloud | ElasticCloud | Opensearch | RAG Database                               |
 | Container Registry | ECR | ACR | Harbor | Storage for app containers                 |
 | Embedding API | Azure OpenAI Service | Azure OpenAI Service | Huggingface Containers | Embedding for docs into VectorDB           |
 | LLM API | Azure OpenAI Service | Azure OpenAI Service | Huggingface Containers | Chat model                                 |
diff --git a/docs/architecture/transactions_and_schema.md b/docs/architecture/transactions_and_schema.md
@@ -16,9 +16,9 @@ sequenceDiagram
     Django->>S3: file key, content
     Django->>Core: file key
     Core->>Workers: file key
-    Core->>Elastic: file key
+    Core->>Opensearch: file key
     S3->>Workers: file content
-    Workers->>Elastic: chunk key, content
+    Workers->>Opensearch: chunk key, content
 ```
 
 ### Chat APIs
@@ -44,7 +44,7 @@ title: Transaction sequence - POST /chat/rag
 
 sequenceDiagram
     Django->> Core: ChatHistory.messages[], File[].uuid
-    Elastic->>Core: File[].Chunk[].embeddings
+    Opensearch->>Core: File[].Chunk[].embeddings
     Core->>LLM API: ChatHistory.messages[].embeddings, File[].Chunk[].embeddings
 
 ```
@@ -101,13 +101,13 @@ erDiagram
     ChatHistory }|--o{ FileRecord: "ChatHistory.files_retrieved"
 ```
 
-### Elastic Schema
+### Opensearch Schema
 
 Keeping things simple is the primary ethos here. We are storing the UUID of the parent file in the chunk. This allows us to easily query for all chunks of a file. We are also storing the text of the chunk, the metadata of the chunk, and the embedding of the chunk. The embedding is a float array that is generated by the embedding API.
 
 ```mermaid
 ---
-title: Elastic schema
+title: Opensearch schema
 ---
 
 erDiagram

diff --git a/docs/code_reference/models/settings.md b/docs/code_reference/models/settings.md
@@ -4,14 +4,6 @@ Redbox used the `pydantic_settings` library to manage settings. This library all
 
 ::: redbox.models.settings.Settings
 
-# Elasticsearch Settings
+# OpenSearch Settings
 
-Depending on the deployment scenarios we have two different ways to configure Elasticsearch: `ElasticLocalSettings` and `ElasticCloudSettings`.
-
-## `ElasticLocalSettings`
-
-::: redbox.models.settings.ElasticLocalSettings
-
-## `ElasticCloudSettings`
-
-::: redbox.models.settings.ElasticCloudSettings
+We configure OpenSearch via `OpenSearchSettings` in `redbox-core/redbox/models/settings.py`.
diff --git a/docs/installation/local.md b/docs/installation/local.md
@@ -24,7 +24,7 @@ As the project deploys, you should eventually see the following message:
 ```
 [+] Running 8/8
  ✔ Network redbox_redbox-app-network  Created                                                                       0.0s 
- ✔ Container redbox-elasticsearch-1   Healthy                                                                      22.7s 
+ ✔ Container redbox-opensearch-1      Healthy                                                                      22.7s 
  ✔ Container redbox-redis-1           Healthy                                                                      22.7s 
  ✔ Container redbox-minio-1           Healthy                                                                      22.7s 
  ✔ Container redbox-db-1              Healthy                                                                      22.7s 
@@ -35,11 +35,11 @@ As the project deploys, you should eventually see the following message:
 
 Redbox utilises health checks to ensure that the services are running correctly.
 
-!!! info "Elastic and Minio failure"
-    If you see that the Elasticsearch or MinIO containers are unhealthy, this may be due to a permission issue with the directory they're mounted to. You can fix this by running the following command:
+!!! info "Opensearch and Minio failure"
+    If you see that the Opensearch or MinIO containers are unhealthy, this may be due to a permission issue with the directory they're mounted to. You can fix this by running the following command:
 
     ```bash
-    chmod -R 777 ./data/elastic/
+    chmod -R 777 ./data/opensearch/
     chmod -R 777 ./data/objectstore/
     ```
 

diff --git a/redbox-core/poetry.lock b/redbox-core/poetry.lock
diff --git a/redbox-core/pyproject.toml b/redbox-core/pyproject.toml
@@ -13,7 +13,6 @@ readme = "../README.md"
 [tool.poetry.dependencies]
 python = ">=3.12,<3.13"
 pydantic = "^2.7.1"
-elasticsearch = "^8.15.0"
 langchain-community = ">0.2.12"
 langchain = "^0.3.4"
 langchain_openai = ">0.1.21"

diff --git a/redbox-core/redbox/chains/components.py b/redbox-core/redbox/chains/components.py
@@ -11,7 +11,6 @@
 from langchain_core.runnables import Runnable
 from langchain_core.utils import convert_to_secret_str
 
-# from langchain_elasticsearch import ElasticsearchRetriever
 from langchain_openai.embeddings import AzureOpenAIEmbeddings, OpenAIEmbeddings
 
 
@@ -98,7 +97,7 @@ def get_all_chunks_retriever(env: Settings) -> OpenSearchRetriever:
 
 
 def get_parameterised_retriever(env: Settings, embeddings: Embeddings | None = None):
-    """Creates an Elasticsearch retriever runnable.
+    """Creates an Opensearch retriever runnable.
 
     Runnable takes input of a dict keyed to question, file_uuids and user_uuid.
 

diff --git a/redbox-core/redbox/graph/nodes/tools.py b/redbox-core/redbox/graph/nodes/tools.py
@@ -3,7 +3,6 @@
 import numpy as np
 import requests
 import tiktoken
-from elasticsearch import Elasticsearch
 from opensearchpy import OpenSearch
 from langchain_community.utilities import WikipediaAPIWrapper
 from langchain_core.documents import Document
@@ -30,7 +29,7 @@
 
 
 def build_search_documents_tool(
-    es_client: Union[Elasticsearch, OpenSearch],
+    es_client: OpenSearch,
     index_name: str,
     embedding_model: Embeddings,
     embedding_field_name: str,

diff --git a/redbox-core/redbox/loader/ingester.py b/redbox-core/redbox/loader/ingester.py
@@ -25,33 +25,20 @@
 
 
 def get_elasticsearch_store(es, es_index_name: str):
-    # return ElasticsearchStore(
-    #     index_name=es_index_name,
-    #     embedding=get_embeddings(env),
-    #     es_connection=es,
-    #     query_field="text",
-    #     vector_query_field=env.embedding_document_field_name,
-    # )
     return OpenSearchVectorSearch(
         index_name=es_index_name,
-        opensearch_url=env.elastic.collection_endpoint,
+        opensearch_url=env.opensearch.collection_endpoint,
         embedding_function=get_embeddings(env),
         query_field="text",
         vector_query_field=env.embedding_document_field_name,
     )
 
 
 def get_elasticsearch_store_without_embeddings(es, es_index_name: str):
-    # return ElasticsearchStore(
-    #     index_name=es_index_name,
-    #     es_connection=es,
-    #     query_field="text",
-    #     strategy=BM25Strategy(),
-    # )
 
     return OpenSearchVectorSearch(
         index_name=es_index_name,
-        opensearch_url=env.elastic.collection_endpoint,
+        opensearch_url=env.opensearch.collection_endpoint,
         embedding_function=FakeEmbeddings(size=env.embedding_backend_vector_size),
     )
 

diff --git a/redbox-core/redbox/models/chain.py b/redbox-core/redbox/models/chain.py
@@ -60,7 +60,7 @@ class AISettings(BaseModel):
     chat_map_question_prompt: str = prompts.CHAT_MAP_QUESTION_PROMPT
     reduce_system_prompt: str = prompts.REDUCE_SYSTEM_PROMPT
 
-    # Elasticsearch RAG and boost values
+    # Opensearch RAG and boost values
     rag_k: int = 30
     rag_num_candidates: int = 10
     rag_gauss_scale_size: int = 3