From d0b45544e18960969ec37b083e3ff4e7ec837109 Mon Sep 17 00:00:00 2001 From: Natasha Boyse Date: Fri, 7 Feb 2025 14:53:26 +0000 Subject: [PATCH] wip: more migration to opensearch --- django_app/.vscode/launch.json | 2 - django_app/.vscode/tasks.json | 6 -- docker-compose.yml | 2 +- docs/DEVELOPER_SETUP.md | 6 +- docs/architecture/index.md | 2 +- docs/architecture/transactions_and_schema.md | 10 ++-- docs/code_reference/models/settings.md | 12 +--- docs/installation/local.md | 8 +-- redbox-core/poetry.lock | 10 ++-- redbox-core/pyproject.toml | 1 - redbox-core/redbox/chains/components.py | 3 +- redbox-core/redbox/graph/nodes/tools.py | 3 +- redbox-core/redbox/loader/ingester.py | 17 +----- redbox-core/redbox/models/chain.py | 2 +- redbox-core/redbox/models/settings.py | 62 ++++++-------------- redbox-core/redbox/retriever/queries.py | 10 ++-- redbox-core/redbox/retriever/retrievers.py | 10 ++-- redbox-core/tests/conftest.py | 14 ++--- redbox-core/tests/test_ingest.py | 9 +-- redbox-core/tests/test_tools.py | 4 +- redbox-core/tests/test_transform.py | 2 +- 21 files changed, 68 insertions(+), 127 deletions(-) diff --git a/django_app/.vscode/launch.json b/django_app/.vscode/launch.json index b7540695c..998b7089e 100644 --- a/django_app/.vscode/launch.json +++ b/django_app/.vscode/launch.json @@ -15,7 +15,6 @@ "MINIO_HOST": "localhost", "POSTGRES_HOST": "localhost", "UNSTRUCTURED_HOST": "localhost", - "ELASTIC__HOST": "localhost" } }, @@ -33,7 +32,6 @@ "MINIO_HOST": "localhost", "POSTGRES_HOST": "localhost", "UNSTRUCTURED_HOST": "localhost", - "ELASTIC__HOST": "localhost" } }, ] diff --git a/django_app/.vscode/tasks.json b/django_app/.vscode/tasks.json index 432fdc576..3da8dce25 100644 --- a/django_app/.vscode/tasks.json +++ b/django_app/.vscode/tasks.json @@ -16,7 +16,6 @@ "MINIO_HOST": "localhost", "POSTGRES_HOST": "localhost", "UNSTRUCTURED_HOST": "localhost", - "ELASTIC__HOST": "localhost", } }, "presentation": { @@ -34,7 +33,6 @@ "MINIO_HOST": "localhost", "POSTGRES_HOST": "localhost", "UNSTRUCTURED_HOST": "localhost", - "ELASTIC__HOST": "localhost", } }, "presentation": { @@ -52,7 +50,6 @@ "MINIO_HOST": "localhost", "POSTGRES_HOST": "localhost", "UNSTRUCTURED_HOST": "localhost", - "ELASTIC__HOST": "localhost", } }, "presentation": { @@ -70,7 +67,6 @@ "MINIO_HOST": "localhost", "POSTGRES_HOST": "localhost", "UNSTRUCTURED_HOST": "localhost", - "ELASTIC__HOST": "localhost", } }, "presentation": { @@ -88,7 +84,6 @@ "MINIO_HOST": "localhost", "POSTGRES_HOST": "localhost", "UNSTRUCTURED_HOST": "localhost", - "ELASTIC__HOST": "localhost", } }, "presentation": { @@ -106,7 +101,6 @@ "MINIO_HOST": "localhost", "POSTGRES_HOST": "localhost", "UNSTRUCTURED_HOST": "localhost", - "ELASTIC__HOST": "localhost", } }, "presentation": { diff --git a/docker-compose.yml b/docker-compose.yml index 5af21d2bc..eb1751937 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -120,7 +120,7 @@ services: start_period: 30s opensearch: - image: opensearchproject/opensearch:2.17.0 + image: opensearchproject/opensearch:2.18.0 environment: - discovery.type=single-node - OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m diff --git a/docs/DEVELOPER_SETUP.md b/docs/DEVELOPER_SETUP.md index 492745524..417841c50 100644 --- a/docs/DEVELOPER_SETUP.md +++ b/docs/DEVELOPER_SETUP.md @@ -198,16 +198,16 @@ elasticdump \ --type=data ``` -### Loading data to Elasticsearch +### Loading data to Opensearch If you've been provided with a dump from the vector store, add it to [data/elastic-dumps/](../data/elastic-dumps/). 
The below assumes the existence of `redbox-data-chunk.json` in that directory. Consider dumping your existing indices if you don't want to have to re-embed data you're working on.
 
-Start the Elasticsearch service.
+Start the Opensearch service.
 
 ```console
-docker compose up -d elasticsearch
+docker compose up -d opensearch
 ```
 
 Load data from your JSONs, or your own file.
diff --git a/docs/architecture/index.md b/docs/architecture/index.md
index 9426fe4e3..d6b60ad06 100644
--- a/docs/architecture/index.md
+++ b/docs/architecture/index.md
@@ -34,7 +34,7 @@ The Retrieval Augmented Generation (RAG) architecture grounds our Large Language
 | Core API | ECS | App Service | Docker | FastAPI AI Interaction and DB Intermediary |
 | Worker | ECS | App Service | Docker | Queue fed file ingester and embedder |
 | Database | RDS/Postgres | Postgres | Postgres | Chat history & user data |
-| Vector Database | ElasticCloud | ElasticCloud | Elasticsearch | RAG Database |
+| Vector Database | ElasticCloud | ElasticCloud | Opensearch | RAG Database |
 | Container Registry | ECR | ACR | Harbor | Storage for app containers |
 | Embedding API | Azure OpenAI Service | Azure OpenAI Service | Huggingface Containers | Embedding for docs into VectorDB |
 | LLM API | Azure OpenAI Service | Azure OpenAI Service | Huggingface Containers | Chat model |
diff --git a/docs/architecture/transactions_and_schema.md b/docs/architecture/transactions_and_schema.md
index a62ca79b9..0e3c6ca72 100644
--- a/docs/architecture/transactions_and_schema.md
+++ b/docs/architecture/transactions_and_schema.md
@@ -16,9 +16,9 @@ sequenceDiagram
     Django->>S3: file key, content
     Django->>Core: file key
     Core->>Workers: file key
-    Core->>Elastic: file key
+    Core->>Opensearch: file key
     S3->>Workers: file content
-    Workers->>Elastic: chunk key, content
+    Workers->>Opensearch: chunk key, content
 ```
 
 ### Chat APIs
@@ -44,7 +44,7 @@ title: Transaction sequence - POST /chat/rag
 
 sequenceDiagram
     Django->> Core: ChatHistory.messages[], File[].uuid
-    Elastic->>Core: File[].Chunk[].embeddings
+    Opensearch->>Core: File[].Chunk[].embeddings
     Core->>LLM API: ChatHistory.messages[].embeddings, File[].Chunk[].embeddings
 ```
 
@@ -101,13 +101,13 @@ erDiagram
     ChatHistory }|--o{ FileRecord: "ChatHistory.files_retrieved"
 ```
 
-### Elastic Schema
+### Opensearch Schema
 
 Keeping things simple is the primary ethos here. We are storing the UUID of the parent file in the chunk. This allows us to easily query for all chunks of a file. We are also storing the text of the chunk, the metadata of the chunk, and the embedding of the chunk. The embedding is a float array that is generated by the embedding API.
 
 ```mermaid
 ---
-title: Elastic schema
+title: Opensearch schema
 ---
 
 erDiagram
diff --git a/docs/code_reference/models/settings.md b/docs/code_reference/models/settings.md
index 93b007c35..3d280dff6 100644
--- a/docs/code_reference/models/settings.md
+++ b/docs/code_reference/models/settings.md
@@ -4,14 +4,6 @@ Redbox used the `pydantic_settings` library to manage settings. This library all
 
 ::: redbox.models.settings.Settings
 
-# Elasticsearch Settings
+# OpenSearch Settings
 
-Depending on the deployment scenarios we have two different ways to configure Elasticsearch: `ElasticLocalSettings` and `ElasticCloudSettings`.
-
-## `ElasticLocalSettings`
-
-::: redbox.models.settings.ElasticLocalSettings
-
-## `ElasticCloudSettings`
-
-::: redbox.models.settings.ElasticCloudSettings
\ No newline at end of file
+We configure OpenSearch via the `OpenSearchSettings` model in `redbox-core/redbox/models/settings.py`.
\ No newline at end of file
diff --git a/docs/installation/local.md b/docs/installation/local.md
index 44d7c080f..fae4eaa36 100644
--- a/docs/installation/local.md
+++ b/docs/installation/local.md
@@ -24,7 +24,7 @@ As the project deploys, you should eventually see the following message:
 ```
 [+] Running 8/8
 ✔ Network redbox_redbox-app-network Created 0.0s
- ✔ Container redbox-elasticsearch-1 Healthy 22.7s
+ ✔ Container redbox-opensearch-1 Healthy 22.7s
 ✔ Container redbox-redis-1 Healthy 22.7s
 ✔ Container redbox-minio-1 Healthy 22.7s
 ✔ Container redbox-db-1 Healthy 22.7s
@@ -35,11 +35,11 @@ As the project deploys, you should eventually see the following message:
 
 Redbox utilises health checks to ensure that the services are running correctly.
 
-!!! info "Elastic and Minio failure"
-    If you see that the Elasticsearch or MinIO containers are unhealthy, this may be due to a permission issue with the directory they're mounted to. You can fix this by running the following command:
+!!! info "Opensearch and MinIO failure"
+    If you see that the Opensearch or MinIO containers are unhealthy, this may be due to a permission issue with the directory they're mounted to. You can fix this by running the following command:
 
     ```bash
-    chmod -R 777 ./data/elastic/
+    chmod -R 777 ./data/opensearch/
     chmod -R 777 ./data/objectstore/
     ```
 
diff --git a/redbox-core/poetry.lock b/redbox-core/poetry.lock
index da35d2463..81789b5fc 100644
--- a/redbox-core/poetry.lock
+++ b/redbox-core/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
[[package]] name = "aiohappyeyeballs" @@ -1212,13 +1212,13 @@ develop = ["aiohttp", "furo", "httpcore (<1.0.6)", "httpx", "opentelemetry-api", [[package]] name = "elasticsearch" -version = "8.16.0" +version = "8.17.1" description = "Python client for Elasticsearch" optional = false python-versions = ">=3.8" files = [ - {file = "elasticsearch-8.16.0-py3-none-any.whl", hash = "sha256:83d9fe09e8e95880559da43e44976c1e11cc63fe96bc0c0592f3d64f371772bf"}, - {file = "elasticsearch-8.16.0.tar.gz", hash = "sha256:d2aaa92f44ebea3c4147389aeba038c0b42a017f8c52ff35b1e7ebc34c49adb7"}, + {file = "elasticsearch-8.17.1-py3-none-any.whl", hash = "sha256:f1de0a075f12cc0fa377668eb4fb2ce02185c060ebb50cf2c3889242f9a5130e"}, + {file = "elasticsearch-8.17.1.tar.gz", hash = "sha256:057ab44cae8b3acffbf826a31678e46eafc38f26fcffa91015352d973299cdf0"}, ] [package.dependencies] @@ -4809,4 +4809,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = ">=3.12,<3.13" -content-hash = "e068974df41e04dce020573cf08519942e6e6d34070798358b6f549aa9d3dd9f" +content-hash = "e2362b961c8e1d9df7fe2ad77e3556dc5c23dc8882f42f6c3180708a520a3125" diff --git a/redbox-core/pyproject.toml b/redbox-core/pyproject.toml index ef7c4c0fb..70b59aec6 100644 --- a/redbox-core/pyproject.toml +++ b/redbox-core/pyproject.toml @@ -13,7 +13,6 @@ readme = "../README.md" [tool.poetry.dependencies] python = ">=3.12,<3.13" pydantic = "^2.7.1" -elasticsearch = "^8.15.0" langchain-community = ">0.2.12" langchain = "^0.3.4" langchain_openai = ">0.1.21" diff --git a/redbox-core/redbox/chains/components.py b/redbox-core/redbox/chains/components.py index 328cb92d8..435d3efe4 100644 --- a/redbox-core/redbox/chains/components.py +++ b/redbox-core/redbox/chains/components.py @@ -11,7 +11,6 @@ from langchain_core.runnables import Runnable from langchain_core.utils import convert_to_secret_str -# from langchain_elasticsearch import ElasticsearchRetriever from langchain_openai.embeddings import AzureOpenAIEmbeddings, OpenAIEmbeddings @@ -98,7 +97,7 @@ def get_all_chunks_retriever(env: Settings) -> OpenSearchRetriever: def get_parameterised_retriever(env: Settings, embeddings: Embeddings | None = None): - """Creates an Elasticsearch retriever runnable. + """Creates an Opensearch retriever runnable. Runnable takes input of a dict keyed to question, file_uuids and user_uuid. 
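As an aside for reviewers, here is a minimal sketch of how the OpenSearch-backed retriever factories touched above are typically wired together. It is illustrative only: the constructor arguments mirror the test fixtures that appear later in this patch (`redbox-core/tests/conftest.py`), `FakeEmbeddings` stands in for the real embedding model, and the dimension value is a placeholder.

```python
# Illustrative sketch, not part of the patch; argument names follow the
# conftest.py fixtures shown later in this diff and may not match the factories exactly.
from langchain_core.embeddings.fake import FakeEmbeddings

from redbox.models.settings import Settings
from redbox.retriever.retrievers import ParameterisedElasticsearchRetriever

env = Settings()
client = env.opensearch_client()  # opensearchpy.OpenSearch built from OpenSearchSettings

retriever = ParameterisedElasticsearchRetriever(
    es_client=client,
    index_name=env.opensearch_alias,            # the "<root>-chunk-current" alias
    embedding_model=FakeEmbeddings(size=3072),  # placeholder model and dimension
    embedding_field_name=env.embedding_document_field_name,
)
# The retriever is then invoked with a RedboxState carrying the question,
# selected files and AI settings (see build_document_query in queries.py below).
```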
diff --git a/redbox-core/redbox/graph/nodes/tools.py b/redbox-core/redbox/graph/nodes/tools.py index e7679ae19..6001e3bc0 100644 --- a/redbox-core/redbox/graph/nodes/tools.py +++ b/redbox-core/redbox/graph/nodes/tools.py @@ -3,7 +3,6 @@ import numpy as np import requests import tiktoken -from elasticsearch import Elasticsearch from opensearchpy import OpenSearch from langchain_community.utilities import WikipediaAPIWrapper from langchain_core.documents import Document @@ -30,7 +29,7 @@ def build_search_documents_tool( - es_client: Union[Elasticsearch, OpenSearch], + es_client: OpenSearch, index_name: str, embedding_model: Embeddings, embedding_field_name: str, diff --git a/redbox-core/redbox/loader/ingester.py b/redbox-core/redbox/loader/ingester.py index 1164e57db..3c13b87a0 100644 --- a/redbox-core/redbox/loader/ingester.py +++ b/redbox-core/redbox/loader/ingester.py @@ -25,16 +25,9 @@ def get_elasticsearch_store(es, es_index_name: str): - # return ElasticsearchStore( - # index_name=es_index_name, - # embedding=get_embeddings(env), - # es_connection=es, - # query_field="text", - # vector_query_field=env.embedding_document_field_name, - # ) return OpenSearchVectorSearch( index_name=es_index_name, - opensearch_url=env.elastic.collection_endpoint, + opensearch_url=env.opensearch.collection_endpoint, embedding_function=get_embeddings(env), query_field="text", vector_query_field=env.embedding_document_field_name, @@ -42,16 +35,10 @@ def get_elasticsearch_store(es, es_index_name: str): def get_elasticsearch_store_without_embeddings(es, es_index_name: str): - # return ElasticsearchStore( - # index_name=es_index_name, - # es_connection=es, - # query_field="text", - # strategy=BM25Strategy(), - # ) return OpenSearchVectorSearch( index_name=es_index_name, - opensearch_url=env.elastic.collection_endpoint, + opensearch_url=env.opensearch.collection_endpoint, embedding_function=FakeEmbeddings(size=env.embedding_backend_vector_size), ) diff --git a/redbox-core/redbox/models/chain.py b/redbox-core/redbox/models/chain.py index 876001e4e..bbd6b84cf 100644 --- a/redbox-core/redbox/models/chain.py +++ b/redbox-core/redbox/models/chain.py @@ -60,7 +60,7 @@ class AISettings(BaseModel): chat_map_question_prompt: str = prompts.CHAT_MAP_QUESTION_PROMPT reduce_system_prompt: str = prompts.REDUCE_SYSTEM_PROMPT - # Elasticsearch RAG and boost values + # Opensearch RAG and boost values rag_k: int = 30 rag_num_candidates: int = 10 rag_gauss_scale_size: int = 3 diff --git a/redbox-core/redbox/models/settings.py b/redbox-core/redbox/models/settings.py index 6622d5689..aef0f29d2 100644 --- a/redbox-core/redbox/models/settings.py +++ b/redbox-core/redbox/models/settings.py @@ -7,9 +7,7 @@ import boto3 import environ from dotenv import load_dotenv -from elasticsearch import Elasticsearch from langchain.globals import set_debug -from openai import max_retries from opensearchpy import OpenSearch, RequestsHttpConnection from pydantic import AnyUrl, BaseModel from pydantic_settings import BaseSettings, SettingsConfigDict @@ -42,30 +40,6 @@ class OpenSearchSettings(BaseModel): collection_endpoint__port_local: Optional[str] = "9200" # locally, the port number is 9200 -class ElasticLocalSettings(BaseModel): - """settings required for a local/ec2 instance of elastic""" - - model_config = SettingsConfigDict(frozen=True) - - host: str = "elasticsearch" - port: int = 9200 - scheme: str = "http" - user: str = "elastic" - version: str = "8.11.0" - password: str = "redboxpass" - subscription_level: str = "basic" - - -class 
ElasticCloudSettings(BaseModel): - """settings required for elastic-cloud""" - - model_config = SettingsConfigDict(frozen=True) - - api_key: str - cloud_id: str - subscription_level: str = "basic" - - class ChatLLMBackend(BaseModel): name: str = "gpt-4o" provider: str = "azure_openai" @@ -197,12 +171,12 @@ class Settings(BaseSettings): } @property - def elastic_chat_mesage_index(self): - return self.elastic_root_index + "-chat-mesage-log" + def opensearch_chat_mesage_index(self): + return self.opensearch_root_index + "-chat-mesage-log" @property - def elastic_alias(self): - return self.elastic_root_index + "-chunk-current" + def opensearch_alias(self): + return self.opensearch_root_index + "-chunk-current" # @lru_cache(1) #removing cache because pydantic object (index mapping) is not hashable def opensearch_client(self) -> OpenSearch: @@ -210,11 +184,11 @@ def opensearch_client(self) -> OpenSearch: client = OpenSearch( hosts=[ { - "host": self.elastic.collection_endpoint__host, - "port": self.elastic.collection_endpoint__port_local, + "host": self.opensearch.collection_endpoint__host, + "port": self.opensearch.collection_endpoint__port_local, } ], - http_auth=(self.elastic.collection_endpoint__username, self.elastic.collection_endpoint__password), + http_auth=(self.opensearch.collection_endpoint__username, self.opensearch.collection_endpoint__password), use_ssl=False, connection_class=RequestsHttpConnection, ) @@ -222,9 +196,9 @@ def opensearch_client(self) -> OpenSearch: else: client = OpenSearch( hosts=[ - {"host": self.elastic.collection_endpoint__host, "port": self.elastic.collection_endpoint__port} + {"host": self.opensearch.collection_endpoint__host, "port": self.opensearch.collection_endpoint__port} ], - http_auth=(self.elastic.collection_endpoint__username, self.elastic.collection_endpoint__password), + http_auth=(self.opensearch.collection_endpoint__username, self.opensearch.collection_endpoint__password), use_ssl=True, verify_certs=True, connection_class=RequestsHttpConnection, @@ -233,10 +207,10 @@ def opensearch_client(self) -> OpenSearch: timeout=120, ) - if not client.indices.exists_alias(name=self.elastic_alias): - chunk_index = f"{self.elastic_root_index}-chunk" + if not client.indices.exists_alias(name=self.opensearch_alias): + chunk_index = f"{self.opensearch_root_index}-chunk" # client.options(ignore_status=[400]).indices.create(index=chunk_index) - # client.indices.put_alias(index=chunk_index, name=self.elastic_alias) + # client.indices.put_alias(index=chunk_index, name=self.opensearch_alias) try: client.indices.create( index=chunk_index, body=self.index_mapping, ignore=400 @@ -245,18 +219,18 @@ def opensearch_client(self) -> OpenSearch: logger.error(f"Failed to create index {chunk_index}: {e}") try: - client.indices.put_alias(index=chunk_index, name=f"{self.elastic_root_index}-chunk-current") + client.indices.put_alias(index=chunk_index, name=f"{self.opensearch_root_index}-chunk-current") except Exception as e: - logger.error(f"Failed to set alias {self.elastic_root_index}-chunk-current: {e}") + logger.error(f"Failed to set alias {self.opensearch_root_index}-chunk-current: {e}") - if not client.indices.exists(index=self.elastic_chat_mesage_index): + if not client.indices.exists(index=self.opensearch_chat_mesage_index): try: client.indices.create( - index=self.elastic_chat_mesage_index, ignore=400 + index=self.opensearch_chat_mesage_index, ignore=400 ) # 400 is ignored to avoid index-already-exists errors except Exception as e: - logger.error(f"Failed to create index 
{self.elastic_chat_mesage_index}: {e}") - # client.indices.create(index=self.elastic_chat_mesage_index) + logger.error(f"Failed to create index {self.opensearch_chat_mesage_index}: {e}") + # client.indices.create(index=self.opensearch_chat_mesage_index) return client diff --git a/redbox-core/redbox/retriever/queries.py b/redbox-core/redbox/retriever/queries.py index 66c44e0e3..9acb80a78 100644 --- a/redbox-core/redbox/retriever/queries.py +++ b/redbox-core/redbox/retriever/queries.py @@ -10,12 +10,12 @@ def build_file_filter(file_names: list[str]) -> dict[str, Any]: - """Creates an Elasticsearch filter for file names.""" + """Creates an Opensearch filter for file names.""" return {"terms": {"metadata.uri.keyword": file_names}} def build_resolution_filter(chunk_resolution: ChunkResolution) -> dict[str, Any]: - """Creates an Elasticsearch filter for chunk resolutions.""" + """Creates an Opensearch filter for chunk resolutions.""" return {"term": {"metadata.chunk_resolution.keyword": str(chunk_resolution.normal)}} #add normal to fix error @@ -56,7 +56,7 @@ def get_all( state: RedboxState, ) -> dict[str, Any]: """ - Returns a parameterised elastic query that will return everything it matches. + Returns a parameterised opensearch query that will return everything it matches. As it's used in summarisation, it excludes embeddings. """ @@ -97,7 +97,7 @@ def build_document_query( selected_files: list[str] | None = None, chunk_resolution: ChunkResolution | None = None, ) -> dict[str, Any]: - """Builds a an Elasticsearch query that will return documents when called. + """Builds a an Opensearch query that will return documents when called. Searches the document: * Text, as a keyword and similarity @@ -165,7 +165,7 @@ def build_document_query( def scale_score(score: float, old_min: float, old_max: float, new_min=1.1, new_max: float = 2.0): - """Rescales an Elasticsearch score. + """Rescales an Opensearch score. Intended to turn the score into a multiplier to weight a Gauss function. 
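To make the query-builder changes above easier to follow, the sketch below restates the file filter exactly as it appears in this hunk and adds an assumed linear implementation of `scale_score`; the patch only touches its docstring, so the real body may differ.

```python
# Orientation sketch only. build_file_filter is copied from the hunk above;
# scale_score is an assumed linear rescale, not the repository's implementation.
from typing import Any


def build_file_filter(file_names: list[str]) -> dict[str, Any]:
    # OpenSearch "terms" filter over the stored file URIs
    return {"terms": {"metadata.uri.keyword": file_names}}


def scale_score(score: float, old_min: float, old_max: float, new_min: float = 1.1, new_max: float = 2.0) -> float:
    # Assumed: map [old_min, old_max] linearly onto [new_min, new_max] so the
    # result can act as the Gauss-function multiplier described above.
    return new_min + (score - old_min) * (new_max - new_min) / (old_max - old_min)


print(scale_score(0.5, 0.0, 1.0))  # a mid-range score becomes roughly a 1.55x multiplier
```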
diff --git a/redbox-core/redbox/retriever/retrievers.py b/redbox-core/redbox/retriever/retrievers.py index 1fa7d6f02..b7c393e7a 100644 --- a/redbox-core/redbox/retriever/retrievers.py +++ b/redbox-core/redbox/retriever/retrievers.py @@ -4,7 +4,6 @@ from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Union, cast import opensearchpy -from elasticsearch import Elasticsearch # from elasticsearch.helpers import scan from opensearchpy.helpers import scan @@ -14,7 +13,6 @@ from langchain_core.documents import Document from langchain_core.embeddings.embeddings import Embeddings from langchain_core.retrievers import BaseRetriever -from langchain_elasticsearch.retrievers import ElasticsearchRetriever import os from redbox.models.chain import RedboxState @@ -123,7 +121,7 @@ def hit_to_doc(hit: dict[str, Any]) -> Document: def query_to_documents( - es_client: Union[Elasticsearch, OpenSearch], index_name: str, query: dict[str, Any] + es_client: OpenSearch, index_name: str, query: dict[str, Any] ) -> list[Document]: """Runs an Elasticsearch query and returns Documents.""" logger.info("query to opensearch: from query_to_documents") @@ -171,7 +169,7 @@ def _filter_by_elbow(docs: list[Document]) -> list[Document]: class ParameterisedElasticsearchRetriever(BaseRetriever): """A modified ElasticsearchRetriever that allows configuration from RedboxState.""" - es_client: Union[Elasticsearch, OpenSearch] + es_client: OpenSearch index_name: str | Sequence[str] embedding_model: Embeddings embedding_field_name: str = "embedding" @@ -224,7 +222,7 @@ class AllElasticsearchRetriever(OpenSearchRetriever): chunk_resolution: ChunkResolution = ChunkResolution.largest - def __init__(self, es_client: Union[Elasticsearch, OpenSearch], **kwargs: Any) -> None: + def __init__(self, es_client: OpenSearch, **kwargs: Any) -> None: # Hack to pass validation before overwrite # Partly necessary due to how .with_config() interacts with a retriever kwargs["es_client"] = es_client @@ -255,7 +253,7 @@ class MetadataRetriever(OpenSearchRetriever): chunk_resolution: ChunkResolution = ChunkResolution.largest - def __init__(self, es_client: Union[Elasticsearch, OpenSearch], **kwargs: Any) -> None: + def __init__(self, es_client: OpenSearch, **kwargs: Any) -> None: # Hack to pass validation before overwrite # Partly necessary due to how .with_config() interacts with a retriever kwargs["body_func"] = get_metadata diff --git a/redbox-core/tests/conftest.py b/redbox-core/tests/conftest.py index e53b6c809..070152c76 100644 --- a/redbox-core/tests/conftest.py +++ b/redbox-core/tests/conftest.py @@ -5,9 +5,9 @@ import tiktoken from _pytest.fixtures import FixtureRequest from botocore.exceptions import ClientError -from elasticsearch import Elasticsearch from langchain_core.embeddings.fake import FakeEmbeddings from langchain_elasticsearch import ElasticsearchStore +from opensearchpy import OpenSearch from tiktoken.core import Encoding from redbox.models.settings import Settings @@ -63,17 +63,17 @@ def embedding_model(embedding_model_dim: int) -> FakeEmbeddings: @pytest.fixture(scope="session") def es_index(env: Settings) -> str: - return f"{env.elastic_root_index}-chunk" + return f"{env.opensearch_root_index}-chunk" @pytest.fixture(scope="session") -def es_client(env: Settings) -> Elasticsearch: +def es_client(env: Settings) -> OpenSearch: return env.opensearch_client() @pytest.fixture(scope="session") def es_vector_store( - es_client: Elasticsearch, es_index: str, embedding_model: FakeEmbeddings, env: Settings + es_client: 
OpenSearch, es_index: str, embedding_model: FakeEmbeddings, env: Settings ) -> ElasticsearchStore: return ElasticsearchStore( index_name=es_index, @@ -94,7 +94,7 @@ def create_index(env: Settings, es_index: str) -> Generator[None, None, None]: @pytest.fixture(scope="session") -def all_chunks_retriever(es_client: Elasticsearch, es_index: str) -> AllElasticsearchRetriever: +def all_chunks_retriever(es_client: OpenSearch, es_index: str) -> AllElasticsearchRetriever: return AllElasticsearchRetriever( es_client=es_client, index_name=es_index, @@ -103,7 +103,7 @@ def all_chunks_retriever(es_client: Elasticsearch, es_index: str) -> AllElastics @pytest.fixture(scope="session") def parameterised_retriever( - env: Settings, es_client: Elasticsearch, es_index: str, embedding_model: FakeEmbeddings + env: Settings, es_client: OpenSearch, es_index: str, embedding_model: FakeEmbeddings ) -> ParameterisedElasticsearchRetriever: return ParameterisedElasticsearchRetriever( es_client=es_client, @@ -114,7 +114,7 @@ def parameterised_retriever( @pytest.fixture(scope="session") -def metadata_retriever(es_client: Elasticsearch, es_index: str) -> MetadataRetriever: +def metadata_retriever(es_client: OpenSearch, es_index: str) -> MetadataRetriever: return MetadataRetriever(es_client=es_client, index_name=es_index) diff --git a/redbox-core/tests/test_ingest.py b/redbox-core/tests/test_ingest.py index 29907b718..50b34da50 100644 --- a/redbox-core/tests/test_ingest.py +++ b/redbox-core/tests/test_ingest.py @@ -5,7 +5,8 @@ import pytest from _pytest.monkeypatch import MonkeyPatch -from elasticsearch import Elasticsearch +from opensearchpy import OpenSearch + from elasticsearch.helpers import scan from langchain_core.embeddings.fake import FakeEmbeddings from langchain_core.language_models.fake_chat_models import GenericFakeChatModel @@ -199,7 +200,7 @@ def test_ingest_from_loader( resolution: ChunkResolution, has_embeddings: bool, monkeypatch: MonkeyPatch, - es_client: Elasticsearch, + es_client: OpenSearch, es_vector_store: ElasticsearchStore, es_index: str, s3_client: S3Client, @@ -308,7 +309,7 @@ def get_metadata(chunk: dict) -> dict: def test_ingest_file( mock_post: MagicMock, mock_llm: MagicMock, - es_client: Elasticsearch, + es_client: OpenSearch, s3_client: S3Client, monkeypatch: MonkeyPatch, env: Settings, @@ -322,7 +323,7 @@ def test_ingest_file( When I call ingest_file I Expect to see this file to be: 1. chunked - 2. written to Elasticsearch + 2. 
written to OpenSearch """ # Mock call to Unstructured mock_response = mock_post.return_value diff --git a/redbox-core/tests/test_tools.py b/redbox-core/tests/test_tools.py index 18be2968b..84e0fd05e 100644 --- a/redbox-core/tests/test_tools.py +++ b/redbox-core/tests/test_tools.py @@ -2,7 +2,7 @@ from uuid import uuid4 import pytest -from elasticsearch import Elasticsearch +from opensearchpy import OpenSearch from langchain_core.embeddings.fake import FakeEmbeddings from langchain_core.messages import AIMessage from langgraph.prebuilt import ToolNode @@ -24,7 +24,7 @@ def test_search_documents_tool( chain_params: dict, stored_file_parameterised: RedboxChatTestCase, - es_client: Elasticsearch, + es_client: OpenSearch, es_index: str, embedding_model: FakeEmbeddings, env: Settings, diff --git a/redbox-core/tests/test_transform.py b/redbox-core/tests/test_transform.py index 03912f423..2c1107c6f 100644 --- a/redbox-core/tests/test_transform.py +++ b/redbox-core/tests/test_transform.py @@ -124,7 +124,7 @@ ) def test_combine_documents(a: Document, b: Document, combined: Document): """ - Test that documents as pulled by the Elasticsearch retriever get properly mapped to source documents + Test that documents as pulled by the Opensearch retriever get properly mapped to source documents """ test_combined = combine_documents(a, b)
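For anyone verifying the migration locally, here is a minimal, hedged sketch of the client-and-alias bootstrap pattern that the `opensearch_client()` changes in `redbox/models/settings.py` converge on. The host, credentials and root index name below are placeholders for illustration, not values taken from this patch.

```python
# Illustrative local bootstrap for the single-node OpenSearch started by the
# docker-compose "opensearch" service. Host, credentials and index names are assumptions.
from opensearchpy import OpenSearch, RequestsHttpConnection

client = OpenSearch(
    hosts=[{"host": "localhost", "port": 9200}],
    http_auth=("admin", "admin"),  # placeholder credentials
    use_ssl=False,
    connection_class=RequestsHttpConnection,
)

root_index = "redbox-data"  # hypothetical opensearch_root_index value
chunk_index = f"{root_index}-chunk"
alias = f"{root_index}-chunk-current"

if not client.indices.exists_alias(name=alias):
    # ignore=400 mirrors the patch: index-already-exists errors are tolerated
    client.indices.create(index=chunk_index, ignore=400)
    client.indices.put_alias(index=chunk_index, name=alias)

print(client.info()["version"].get("distribution"))  # expected: "opensearch"
```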