From 9178c6b5fafd5c8b173fd7948bca23a224107fc3 Mon Sep 17 00:00:00 2001 From: jayasankar Date: Fri, 8 Dec 2023 23:18:41 +0530 Subject: [PATCH 1/7] api changes made to Show more information about source documents --- app/core/llm_framework/openai_vanilla.py | 5 +-- app/core/vectordb/postgres4langchain.py | 14 ++++++--- app/routers.py | 4 +-- app/schema.py | 40 ++++++++++++++++++++---- 4 files changed, 49 insertions(+), 14 deletions(-) diff --git a/app/core/llm_framework/openai_vanilla.py b/app/core/llm_framework/openai_vanilla.py index 05d500c..a03fbfb 100644 --- a/app/core/llm_framework/openai_vanilla.py +++ b/app/core/llm_framework/openai_vanilla.py @@ -23,8 +23,9 @@ def get_context(source_documents): len(source_document.page_content) + len(context) > 11000 ): # FIXME: use tiktoken library to count tokens break - context += "{source:" + source_document.metadata.get("source", "") - context += ", text: " + source_document.page_content + "}" + "," + if source_document.metadata.get("source", "") is not None: + context += "{source:" + source_document.metadata.get("source", "") + context += ", text: " + source_document.page_content + "}" + "," context += "]" + "\n" return context diff --git a/app/core/vectordb/postgres4langchain.py b/app/core/vectordb/postgres4langchain.py index 206d371..416b2e1 100644 --- a/app/core/vectordb/postgres4langchain.py +++ b/app/core/vectordb/postgres4langchain.py @@ -140,6 +140,7 @@ def add_to_collection(self, docs: List[schema.Document], **kwargs) -> None: cur.execute( "SELECT 1 FROM embeddings WHERE source_id = %s", (doc.docId,)) doc_id_already_exists = cur.fetchone() + links= ",".join([str(item) for item in doc.links]) if not doc_id_already_exists: data_list.append( [ @@ -147,7 +148,7 @@ def add_to_collection(self, docs: List[schema.Document], **kwargs) -> None: doc.text, doc.label, doc.media, - str(doc.links), + links, doc.embedding, ] ) @@ -169,7 +170,7 @@ def add_to_collection(self, docs: List[schema.Document], **kwargs) -> None: doc.text, doc.label, doc.media, - str(doc.links), + links, doc.embedding, doc.docId, ), @@ -216,7 +217,7 @@ def _get_relevant_documents( cur = self.db_conn.cursor() cur.execute( """ - SELECT source_id, document + SELECT label, media, links, source_id, document FROM embeddings WHERE label = ANY(%s) AND embedding <=> %s < %s @@ -252,7 +253,12 @@ def _get_relevant_documents( ) ] return [ - LangchainDocument(page_content=doc[1], metadata={"source": doc[0]}) + + LangchainDocument(page_content=doc[1], metadata={"label": doc[0], + "media": doc[1], + 'link':doc[2], + 'source_id':doc[3], + 'document':doc[4]}) for doc in records ] diff --git a/app/routers.py b/app/routers.py index 63fe576..ea048a3 100644 --- a/app/routers.py +++ b/app/routers.py @@ -292,7 +292,7 @@ async def websocket_chat_endpoint( "Human: {0}\nBot:{1}\nSources:{2}\n\n".format( question, bot_response['answer'], - [item.metadata['source'] + [item.metadata for item in bot_response['source_documents']] ) ) @@ -310,7 +310,7 @@ async def websocket_chat_endpoint( message=bot_response["answer"], type=schema.ChatResponseType.ANSWER, sources=[ - item.metadata["source"] + item.metadata for item in bot_response["source_documents"] ], media=[], diff --git a/app/schema.py b/app/schema.py index d02918f..2e6c026 100644 --- a/app/schema.py +++ b/app/schema.py @@ -200,19 +200,47 @@ class ChatResponseType(str, Enum): ANSWER = "answer" ERROR = "error" +class SourceDocument(BaseModel): + """Source field of Chat response from server to UI or user app""" + label: str = Field( + "open-access", + examples=["paratext user manual or bible or door-43-users"], + desc="The common tag for all sentences under a set. " + + "Used for specifying access rules and filtering during querying", + ) + media:str = Field( + None, + desc="Additional media links, like images, videos etc " + + "to be used in output to make the chat interface multimodel", + ) + link: str = Field( + None, + desc="The links to fetch the actual resource. " + + "To be used by end user like a search result", + ) + source_id: str = Field( + None, + examples=["NIV Bible Mat 1:1-20"], + desc="Unique for a sentence. Used by the LLM to specify which document " + + "it answers from. Better to combine the source tag and a serial number.", + ) + document: str = Field( + None, desc="The sentence which is used for question answering" + ) + class BotResponse(BaseModel): """Chat response from server to UI or user app""" message: str = Field(..., examples=["Good Morning to you too!"]) sender: SenderType = Field(..., examples=["You or BOT"]) - sources: List[str] | None = Field( + sources: List[SourceDocument] | None = Field( None, - examples=[[ - "https://www.biblegateway.com/passage/?search=Genesis+1%3A1&version=NIV", - "https://git.door43.org/Door43-Catalog/en_tw/src/branch/master/" - + "bible/other/creation.md", - ]], + examples=[{'source': 'ESV-Bible', + 'media': {"https://www.youtube.com/watch?v=teu7BCZTgDs"}, + 'link': 'https://www.esv.org/Genesis+1/,https://www.esv.org/Genesis+1/', + 'source_id': 'ESV-gen1:1', + 'document': 'In the beginning, God created the heavens and the earth.'}], ) media: List[AnyUrl] | None = Field( None, examples=[["https://www.youtube.com/watch?v=teu7BCZTgDs"]] From 14896d7a1cf4fc8020adde9eb62a0aeb1bea0e44 Mon Sep 17 00:00:00 2001 From: jayasankar Date: Sun, 10 Dec 2023 16:15:04 +0530 Subject: [PATCH 2/7] lynting issue resolved --- app/core/vectordb/postgres4langchain.py | 1 - 1 file changed, 1 deletion(-) diff --git a/app/core/vectordb/postgres4langchain.py b/app/core/vectordb/postgres4langchain.py index 416b2e1..242171f 100644 --- a/app/core/vectordb/postgres4langchain.py +++ b/app/core/vectordb/postgres4langchain.py @@ -253,7 +253,6 @@ def _get_relevant_documents( ) ] return [ - LangchainDocument(page_content=doc[1], metadata={"label": doc[0], "media": doc[1], 'link':doc[2], From 43cfd053da2404bf518b05eff52a4623a0041fd3 Mon Sep 17 00:00:00 2001 From: "Kavitha.Raju" Date: Mon, 11 Dec 2023 13:54:04 +0530 Subject: [PATCH 3/7] handle no OPENAI key case in tests --- app/core/llm_framework/openai_langchain.py | 2 +- app/core/pipeline/__init__.py | 13 ++++++++----- app/routers.py | 8 ++++++-- app/schema.py | 3 +++ app/tests/test_chat_on_websocket.py | 4 +++- app/tests/test_dataupload.py | 3 +++ 6 files changed, 24 insertions(+), 9 deletions(-) diff --git a/app/core/llm_framework/openai_langchain.py b/app/core/llm_framework/openai_langchain.py index 1bc0a46..872d1ac 100644 --- a/app/core/llm_framework/openai_langchain.py +++ b/app/core/llm_framework/openai_langchain.py @@ -33,7 +33,7 @@ class LangchainOpenAI(LLMFrameworkInterface): def __init__( self, # pylint: disable=super-init-not-called # FIXME : Ideal to be able to mock the __init__ from tests - key: str = os.getenv("OPENAI_API_KEY", "dummy-for-test"), + key: str = os.getenv("OPENAI_API_KEY"), model_name: str = "gpt-3.5-turbo", vectordb: VectordbInterface = Chroma(), max_tokens_limit: int = int( diff --git a/app/core/pipeline/__init__.py b/app/core/pipeline/__init__.py index 86b00f9..0dbac3a 100644 --- a/app/core/pipeline/__init__.py +++ b/app/core/pipeline/__init__.py @@ -126,8 +126,10 @@ def __init__( file_processor: FileProcessorInterface = LangchainLoader, embedding: EmbeddingInterface = SentenceTransformerEmbedding(), vectordb: VectordbInterface = Chroma(), - llm_framework: LLMFrameworkInterface = LangchainOpenAI(), + llm_framework: LLMFrameworkInterface = LangchainOpenAI, + llm_api_key: str | None = None, transcription_framework: AudioTranscriptionInterface = WhisperAudioTranscription, + transcription_api_key: str | None = None, ) -> None: """Instantiate with default tech stack""" super().__init__(file_processor, embedding, vectordb) @@ -138,7 +140,8 @@ def __init__( self.embedding = embedding self.vectordb = vectordb self.llm_framework = llm_framework - self.transcription_framework = transcription_framework() + self.transcription_framework = transcription_framework(key=transcription_api_key) + self.llm_framework = llm_framework(key=llm_api_key) def set_llm_framework( self, @@ -159,7 +162,7 @@ def set_llm_framework( path=vectordb.db_path, collection_name=vectordb.collection_name, ) - self.llm_framework = LangchainOpenAI(vectordb=vectordb) + self.llm_framework = LangchainOpenAI(vectordb=vectordb, api_key=api_key) elif choice == schema.LLMFrameworkType.VANILLA: if isinstance(vectordb, Chroma): vectordb = ChromaLC( @@ -168,7 +171,7 @@ def set_llm_framework( path=vectordb.db_path, collection_name=vectordb.collection_name, ) - self.llm_framework = OpenAIVanilla(vectordb=vectordb) + self.llm_framework = OpenAIVanilla(vectordb=vectordb, key=api_key) def set_transcription_framework( self, @@ -181,4 +184,4 @@ def set_transcription_framework( self.transcription_framework.api_key = api_key self.transcription_framework.model_name = model_name if choice == schema.AudioTranscriptionType.WHISPER: - self.transcription_framework = WhisperAudioTranscription() + self.transcription_framework = WhisperAudioTranscription(key=api_key) diff --git a/app/routers.py b/app/routers.py index 63fe576..e245519 100644 --- a/app/routers.py +++ b/app/routers.py @@ -228,7 +228,10 @@ async def websocket_chat_endpoint( if token: log.info("User, connecting with token, %s", token) await websocket.accept() - chat_stack = ConversationPipeline(user="XXX", labels=labels) + chat_stack = ConversationPipeline(user="XXX", + labels=labels, + transcription_api_key=settings.transcriptionApiKey, + llm_api_key=settings.llmApiKey) vectordb_args = compose_vector_db_args( settings.vectordbType, @@ -246,7 +249,8 @@ async def websocket_chat_endpoint( chat_stack.set_llm_framework( settings.llmFrameworkType, vectordb=chat_stack.vectordb, **llm_args ) - chat_stack.set_transcription_framework(settings.transcriptionFrameworkType) + chat_stack.set_transcription_framework(settings.transcriptionFrameworkType, + api_key=settings.transcriptionApiKey) # Not implemented using custom embeddings diff --git a/app/schema.py b/app/schema.py index d02918f..c56a228 100644 --- a/app/schema.py +++ b/app/schema.py @@ -179,6 +179,9 @@ class ChatPipelineSelector(BaseModel): AudioTranscriptionType.WHISPER, desc="The framework through which audio transcription is handled", ) + transcriptionApiKey: str | None = Field( + None, desc="If using a cloud service, like OpenAI, the key obtained from them" + ) # class UserPrompt(BaseModel): # not using this as we recieve string from websocket diff --git a/app/tests/test_chat_on_websocket.py b/app/tests/test_chat_on_websocket.py index c0adf14..c8bf574 100644 --- a/app/tests/test_chat_on_websocket.py +++ b/app/tests/test_chat_on_websocket.py @@ -18,6 +18,8 @@ "collectionName": "adotdcollection_test", "labels": ["NIV bible", "ESV-Bible", "translationwords", "open-access"], "token": admin_token, + "transcriptionApiKey":"dummy-key-for-openai", + "llmApiKey":"dummy-key-for-openai", } @@ -33,7 +35,7 @@ def assert_positive_bot_response(resp_json): assert resp_json["sender"] in ["Bot", "You"] -def test_chat_websocket_connection(mocker, fresh_db): +def test_chat_websocket_connection(mocker, fresh_db, monkeypatch): """Check if websocket is connecting to and is bot responding""" mocker.patch("app.routers.Supabase.check_token", return_value={"user_id": "1111"}) diff --git a/app/tests/test_dataupload.py b/app/tests/test_dataupload.py index 44d184e..342f6e7 100644 --- a/app/tests/test_dataupload.py +++ b/app/tests/test_dataupload.py @@ -61,6 +61,7 @@ def test_data_upload_processed_sentences(mocker, vectordb, fresh_db): "dbPath": fresh_db["dbPath"], "collectionName": fresh_db["collectionName"], "embeddingType": schema.EmbeddingType.HUGGINGFACE_DEFAULT.value, + "llmApiKey":"dummy-value", }, json=SENT_DATA, ) @@ -101,6 +102,7 @@ def test_data_upload_markdown(mocker, vectordb, chunker, fresh_db): "dbPath": fresh_db["dbPath"], "collectionName": fresh_db["collectionName"], "token": ADMIN_TOKEN, + "llmApiKey":"dummy-value", } # json={"vectordb_config": fresh_db} ) @@ -132,6 +134,7 @@ def test_data_upload_csv(mocker, vectordb, fresh_db): "dbPath": fresh_db["dbPath"], "collectionName": fresh_db["collectionName"], "token": ADMIN_TOKEN, + "llmApiKey":"dummy-value", }, json={"vectordb_config": fresh_db}, ) From fc0b6e237d140a75b20539900dd57d72565f4791 Mon Sep 17 00:00:00 2001 From: "Kavitha.Raju" Date: Mon, 11 Dec 2023 14:04:32 +0530 Subject: [PATCH 4/7] Fix linting issues --- app/core/pipeline/__init__.py | 2 +- app/tests/test_chat_on_websocket.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/app/core/pipeline/__init__.py b/app/core/pipeline/__init__.py index 0dbac3a..5d53015 100644 --- a/app/core/pipeline/__init__.py +++ b/app/core/pipeline/__init__.py @@ -162,7 +162,7 @@ def set_llm_framework( path=vectordb.db_path, collection_name=vectordb.collection_name, ) - self.llm_framework = LangchainOpenAI(vectordb=vectordb, api_key=api_key) + self.llm_framework = LangchainOpenAI(vectordb=vectordb, key=api_key) elif choice == schema.LLMFrameworkType.VANILLA: if isinstance(vectordb, Chroma): vectordb = ChromaLC( diff --git a/app/tests/test_chat_on_websocket.py b/app/tests/test_chat_on_websocket.py index c8bf574..1efe735 100644 --- a/app/tests/test_chat_on_websocket.py +++ b/app/tests/test_chat_on_websocket.py @@ -35,7 +35,7 @@ def assert_positive_bot_response(resp_json): assert resp_json["sender"] in ["Bot", "You"] -def test_chat_websocket_connection(mocker, fresh_db, monkeypatch): +def test_chat_websocket_connection(mocker, fresh_db): """Check if websocket is connecting to and is bot responding""" mocker.patch("app.routers.Supabase.check_token", return_value={"user_id": "1111"}) From fd2e90a3681df10262afae4ae046400a9810200f Mon Sep 17 00:00:00 2001 From: Mark Date: Mon, 22 Jan 2024 14:35:39 -0800 Subject: [PATCH 5/7] Add AQuA docs --- app/core/vectordb/postgres4langchain.py | 44 ++++++++++++------------- app/templates/chat-demo-postgres.html | 12 +++++++ 2 files changed, 33 insertions(+), 23 deletions(-) diff --git a/app/core/vectordb/postgres4langchain.py b/app/core/vectordb/postgres4langchain.py index 242171f..7ff6fb9 100644 --- a/app/core/vectordb/postgres4langchain.py +++ b/app/core/vectordb/postgres4langchain.py @@ -141,6 +141,7 @@ def add_to_collection(self, docs: List[schema.Document], **kwargs) -> None: "SELECT 1 FROM embeddings WHERE source_id = %s", (doc.docId,)) doc_id_already_exists = cur.fetchone() links= ",".join([str(item) for item in doc.links]) + doc.text = doc.text.replace('\0', '').replace('\x00', '').replace('\n', ' ').replace('\r', ' ').replace('\t', ' ') if not doc_id_already_exists: data_list.append( [ @@ -176,31 +177,28 @@ def add_to_collection(self, docs: List[schema.Document], **kwargs) -> None: ), ) cur.close() - try: - cur = self.db_conn.cursor() - execute_values( - cur, - "INSERT INTO embeddings (source_id, document, label, media, links, embedding" - ") VALUES %s", - data_list, - ) - self.db_conn.commit() + cur = self.db_conn.cursor() + execute_values( + cur, + "INSERT INTO embeddings (source_id, document, label, media, links, embedding" + ") VALUES %s", + data_list, + ) + self.db_conn.commit() - # create index - cur.execute("SELECT COUNT(*) as cnt FROM embeddings;") - num_records = cur.fetchone()[0] - num_lists = num_records / 1000 - num_lists = max(10, num_lists, math.sqrt(num_records)) - # use the cosine distance measure, which is what we'll later use for querying - cur.execute( - "CREATE INDEX ON embeddings USING ivfflat (embedding vector_cosine_ops) " - + f"WITH (lists = {num_lists});" - ) - self.db_conn.commit() + # create index + cur.execute("SELECT COUNT(*) as cnt FROM embeddings;") + num_records = cur.fetchone()[0] + num_lists = num_records / 1000 + num_lists = max(10, num_lists, math.sqrt(num_records)) + # use the cosine distance measure, which is what we'll later use for querying + cur.execute( + "CREATE INDEX ON embeddings USING ivfflat (embedding vector_cosine_ops) " + + f"WITH (lists = {num_lists});" + ) + self.db_conn.commit() - cur.close() - except Exception as exe: - raise PostgresException("While adding data: " + str(exe)) from exe + cur.close() def _get_relevant_documents( self, query: list, run_manager: CallbackManagerForRetrieverRun| None = None, **kwargs diff --git a/app/templates/chat-demo-postgres.html b/app/templates/chat-demo-postgres.html index 0afa2d1..d500a42 100644 --- a/app/templates/chat-demo-postgres.html +++ b/app/templates/chat-demo-postgres.html @@ -67,6 +67,18 @@

assistant.bible

Faith and Farming +
+ + +
Date: Mon, 22 Jan 2024 14:43:32 -0800 Subject: [PATCH 6/7] Add AQuA docs javascript --- app/templates/chat-demo-postgres.html | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/app/templates/chat-demo-postgres.html b/app/templates/chat-demo-postgres.html index d500a42..e9764a3 100644 --- a/app/templates/chat-demo-postgres.html +++ b/app/templates/chat-demo-postgres.html @@ -221,6 +221,7 @@

assistant.bible

assistant.bible---Domain:Faith-and-Farming:' + faithAndFarmingElement.checked + '------Domain:Faith-and-Farming:' + faithAndFarmingElement.checked + '------Domain:AQuA-Docs:' + aquaDocsElement.checked + '--- Date: Mon, 22 Jan 2024 21:24:43 -0800 Subject: [PATCH 7/7] Replace null tokens and new lines in imported text --- app/core/vectordb/postgres4langchain.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/app/core/vectordb/postgres4langchain.py b/app/core/vectordb/postgres4langchain.py index 7ff6fb9..8dcb886 100644 --- a/app/core/vectordb/postgres4langchain.py +++ b/app/core/vectordb/postgres4langchain.py @@ -136,6 +136,12 @@ def add_to_collection(self, docs: List[schema.Document], **kwargs) -> None: """Loads the document object as per chroma DB formats into the collection""" data_list = [] for doc in docs: + doc.text = (doc.text + .replace("\n", " ") + .replace("\r", " ") + .replace("\t", " ") + .replace('\x00', '') + ) cur = self.db_conn.cursor() cur.execute( "SELECT 1 FROM embeddings WHERE source_id = %s", (doc.docId,))