From 2a776aef81dcb5ab92f39c11d3866475721b3081 Mon Sep 17 00:00:00 2001
From: aquintero
Date: Mon, 22 Jan 2024 16:43:42 -0700
Subject: [PATCH 1/5] going back before breaking changes by handling the no-OPENAI-key case in tests

---
 app/core/llm_framework/openai_langchain.py |  2 +-
 app/core/llm_framework/openai_vanilla.py   |  5 +-
 app/core/pipeline/__init__.py              | 13 +++--
 app/core/vectordb/postgres4langchain.py    | 57 ++++++++++++----------
 app/routers.py                             | 12 +++--
 app/schema.py                              | 43 +++++++++++++---
 app/templates/chat-demo-postgres.html      | 12 +++++
 app/tests/test_chat_on_websocket.py        |  2 +
 app/tests/test_dataupload.py               |  3 ++
 9 files changed, 104 insertions(+), 45 deletions(-)

diff --git a/app/core/llm_framework/openai_langchain.py b/app/core/llm_framework/openai_langchain.py
index 1bc0a46..872d1ac 100644
--- a/app/core/llm_framework/openai_langchain.py
+++ b/app/core/llm_framework/openai_langchain.py
@@ -33,7 +33,7 @@ class LangchainOpenAI(LLMFrameworkInterface):
     def __init__(
         self,  # pylint: disable=super-init-not-called
         # FIXME : Ideal to be able to mock the __init__ from tests
-        key: str = os.getenv("OPENAI_API_KEY", "dummy-for-test"),
+        key: str = os.getenv("OPENAI_API_KEY"),
         model_name: str = "gpt-3.5-turbo",
         vectordb: VectordbInterface = Chroma(),
         max_tokens_limit: int = int(
diff --git a/app/core/llm_framework/openai_vanilla.py b/app/core/llm_framework/openai_vanilla.py
index 96609cd..dfb4c99 100644
--- a/app/core/llm_framework/openai_vanilla.py
+++ b/app/core/llm_framework/openai_vanilla.py
@@ -24,8 +24,9 @@ def get_context(source_documents):
             len(source_document.page_content) + len(context) > 11000
         ):  # FIXME: use tiktoken library to count tokens
             break
-        context += "{source:" + source_document.metadata.get("source", "")
-        context += ", text: " + source_document.page_content + "}" + ","
+        if source_document.metadata.get("source", "") is not None:
+            context += "{source:" + source_document.metadata.get("source", "")
+            context += ", text: " + source_document.page_content + "}" + ","
     context += "]" + "\n"
     return context
diff --git a/app/core/pipeline/__init__.py b/app/core/pipeline/__init__.py
index 86b00f9..5d53015 100644
--- a/app/core/pipeline/__init__.py
+++ b/app/core/pipeline/__init__.py
@@ -126,8 +126,10 @@ def __init__(
         file_processor: FileProcessorInterface = LangchainLoader,
         embedding: EmbeddingInterface = SentenceTransformerEmbedding(),
         vectordb: VectordbInterface = Chroma(),
-        llm_framework: LLMFrameworkInterface = LangchainOpenAI(),
+        llm_framework: LLMFrameworkInterface = LangchainOpenAI,
+        llm_api_key: str | None = None,
         transcription_framework: AudioTranscriptionInterface = WhisperAudioTranscription,
+        transcription_api_key: str | None = None,
     ) -> None:
         """Instantiate with default tech stack"""
         super().__init__(file_processor, embedding, vectordb)
@@ -138,7 +140,8 @@ def __init__(
         self.embedding = embedding
         self.vectordb = vectordb
         self.llm_framework = llm_framework
-        self.transcription_framework = transcription_framework()
+        self.transcription_framework = transcription_framework(key=transcription_api_key)
+        self.llm_framework = llm_framework(key=llm_api_key)

     def set_llm_framework(
         self,
@@ -159,7 +162,7 @@ def set_llm_framework(
                     path=vectordb.db_path,
                     collection_name=vectordb.collection_name,
                 )
-            self.llm_framework = LangchainOpenAI(vectordb=vectordb)
+            self.llm_framework = LangchainOpenAI(vectordb=vectordb, key=api_key)
         elif choice == schema.LLMFrameworkType.VANILLA:
             if isinstance(vectordb, Chroma):
                 vectordb = ChromaLC(
                    path=vectordb.db_path,
                    collection_name=vectordb.collection_name,
                )
-            self.llm_framework = OpenAIVanilla(vectordb=vectordb)
+            self.llm_framework = OpenAIVanilla(vectordb=vectordb, key=api_key)

     def set_transcription_framework(
         self,
@@ -181,4 +184,4 @@ def set_transcription_framework(
         self.transcription_framework.api_key = api_key
         self.transcription_framework.model_name = model_name
         if choice == schema.AudioTranscriptionType.WHISPER:
-            self.transcription_framework = WhisperAudioTranscription()
+            self.transcription_framework = WhisperAudioTranscription(key=api_key)
diff --git a/app/core/vectordb/postgres4langchain.py b/app/core/vectordb/postgres4langchain.py
index 206d371..7ff6fb9 100644
--- a/app/core/vectordb/postgres4langchain.py
+++ b/app/core/vectordb/postgres4langchain.py
@@ -140,6 +140,8 @@ def add_to_collection(self, docs: List[schema.Document], **kwargs) -> None:
             cur.execute(
                 "SELECT 1 FROM embeddings WHERE source_id = %s", (doc.docId,))
             doc_id_already_exists = cur.fetchone()
+            links = ",".join([str(item) for item in doc.links])
+            doc.text = doc.text.replace('\x00', '').replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
             if not doc_id_already_exists:
                 data_list.append(
                     [
                         doc.docId,
                         doc.text,
                         doc.label,
                         doc.media,
-                        str(doc.links),
+                        links,
                         doc.embedding,
                     ]
                 )
@@ -169,37 +171,34 @@ def add_to_collection(self, docs: List[schema.Document], **kwargs) -> None:
                         doc.text,
                         doc.label,
                         doc.media,
-                        str(doc.links),
+                        links,
                         doc.embedding,
                         doc.docId,
                     ),
                 )
                 cur.close()
-        try:
-            cur = self.db_conn.cursor()
-            execute_values(
-                cur,
-                "INSERT INTO embeddings (source_id, document, label, media, links, embedding"
-                ") VALUES %s",
-                data_list,
-            )
-            self.db_conn.commit()
+        cur = self.db_conn.cursor()
+        execute_values(
+            cur,
+            "INSERT INTO embeddings (source_id, document, label, media, links, embedding"
+            ") VALUES %s",
+            data_list,
+        )
+        self.db_conn.commit()
-            # create index
-            cur.execute("SELECT COUNT(*) as cnt FROM embeddings;")
-            num_records = cur.fetchone()[0]
-            num_lists = num_records / 1000
-            num_lists = max(10, num_lists, math.sqrt(num_records))
-            # use the cosine distance measure, which is what we'll later use for querying
-            cur.execute(
-                "CREATE INDEX ON embeddings USING ivfflat (embedding vector_cosine_ops) "
-                + f"WITH (lists = {num_lists});"
-            )
-            self.db_conn.commit()
+        # create index
+        cur.execute("SELECT COUNT(*) as cnt FROM embeddings;")
+        num_records = cur.fetchone()[0]
+        num_lists = num_records / 1000
+        num_lists = max(10, num_lists, math.sqrt(num_records))
+        # use the cosine distance measure, which is what we'll later use for querying
+        cur.execute(
+            "CREATE INDEX ON embeddings USING ivfflat (embedding vector_cosine_ops) "
+            + f"WITH (lists = {num_lists});"
+        )
+        self.db_conn.commit()
-            cur.close()
-        except Exception as exe:
-            raise PostgresException("While adding data: " + str(exe)) from exe
+        cur.close()

     def _get_relevant_documents(
         self, query: list, run_manager: CallbackManagerForRetrieverRun | None = None, **kwargs
@@ -216,7 +215,7 @@ def _get_relevant_documents(
         cur = self.db_conn.cursor()
         cur.execute(
             """
-            SELECT source_id, document
+            SELECT label, media, links, source_id, document
             FROM embeddings
             WHERE label = ANY(%s)
             AND embedding <=> %s < %s
@@ -252,7 +251,11 @@ def _get_relevant_documents(
             )
         ]
         return [
-            LangchainDocument(page_content=doc[1], metadata={"source": doc[0]})
+            LangchainDocument(page_content=doc[4], metadata={"label": doc[0],
+                                                             "media": doc[1],
+                                                             "link": doc[2],
+                                                             "source_id": doc[3],
+                                                             "document": doc[4]})
             for doc in records
         ]
diff --git a/app/routers.py b/app/routers.py
index 63fe576..baa91ac 100644
--- a/app/routers.py
+++ b/app/routers.py
@@ -228,7 +228,10 @@ async def websocket_chat_endpoint(
     if token:
         log.info("User, connecting with token, %s", token)
     await websocket.accept()
-    chat_stack = ConversationPipeline(user="XXX", labels=labels)
+    chat_stack = ConversationPipeline(user="XXX",
+                                      labels=labels,
+                                      transcription_api_key=settings.transcriptionApiKey,
+                                      llm_api_key=settings.llmApiKey)

     vectordb_args = compose_vector_db_args(
         settings.vectordbType,
@@ -246,7 +249,8 @@ async def websocket_chat_endpoint(
     chat_stack.set_llm_framework(
         settings.llmFrameworkType, vectordb=chat_stack.vectordb, **llm_args
     )
-    chat_stack.set_transcription_framework(settings.transcriptionFrameworkType)
+    chat_stack.set_transcription_framework(settings.transcriptionFrameworkType,
+                                           api_key=settings.transcriptionApiKey)

     # Not implemented using custom embeddings
@@ -292,7 +296,7 @@ async def websocket_chat_endpoint(
                     "Human: {0}\nBot:{1}\nSources:{2}\n\n".format(
                         question,
                         bot_response['answer'],
-                        [item.metadata['source']
+                        [item.metadata
                             for item in bot_response['source_documents']]
                     )
                 )
@@ -310,7 +314,7 @@ async def websocket_chat_endpoint(
                     message=bot_response["answer"],
                     type=schema.ChatResponseType.ANSWER,
                     sources=[
-                        item.metadata["source"]
+                        item.metadata
                         for item in bot_response["source_documents"]
                     ],
                     media=[],
diff --git a/app/schema.py b/app/schema.py
index d02918f..aa9a9c0 100644
--- a/app/schema.py
+++ b/app/schema.py
@@ -179,6 +179,9 @@ class ChatPipelineSelector(BaseModel):
         AudioTranscriptionType.WHISPER,
         desc="The framework through which audio transcription is handled",
     )
+    transcriptionApiKey: str | None = Field(
+        None, desc="If using a cloud service, like OpenAI, the key obtained from them"
+    )

 # class UserPrompt(BaseModel): # not using this as we receive string from websocket

 class ChatResponseType(str, Enum):
     ANSWER = "answer"
     ERROR = "error"

+class SourceDocument(BaseModel):
+    """Source field of Chat response from server to UI or user app"""
+    label: str = Field(
+        "open-access",
+        examples=["paratext user manual or bible or door-43-users"],
+        desc="The common tag for all sentences under a set. "
+        + "Used for specifying access rules and filtering during querying",
+    )
+    media: str | None = Field(
+        None,
+        desc="Additional media links, like images, videos etc "
+        + "to be used in output to make the chat interface multimodal",
+    )
+    link: str | None = Field(
+        None,
+        desc="The links to fetch the actual resource. "
+        + "To be used by end user like a search result",
+    )
+    source_id: str | None = Field(
+        None,
+        examples=["NIV Bible Mat 1:1-20"],
+        desc="Unique for a sentence. Used by the LLM to specify which document "
+        + "it answers from. 
Better to combine the source tag and a serial number.",
+    )
+    document: str | None = Field(
+        None, desc="The sentence which is used for question answering"
+    )

 class BotResponse(BaseModel):
     """Chat response from server to UI or user app"""

     message: str = Field(..., examples=["Good Morning to you too!"])
     sender: SenderType = Field(..., examples=["You or BOT"])
-    sources: List[str] | None = Field(
+    sources: List[SourceDocument] | None = Field(
         None,
-        examples=[[
-            "https://www.biblegateway.com/passage/?search=Genesis+1%3A1&version=NIV",
-            "https://git.door43.org/Door43-Catalog/en_tw/src/branch/master/"
-            + "bible/other/creation.md",
-        ]],
+        examples=[[{'label': 'ESV-Bible',
+                    'media': 'https://www.youtube.com/watch?v=teu7BCZTgDs',
+                    'link': 'https://www.esv.org/Genesis+1/,https://www.esv.org/Genesis+1/',
+                    'source_id': 'ESV-gen1:1',
+                    'document': 'In the beginning, God created the heavens and the earth.'}]],
     )
     media: List[AnyUrl] | None = Field(
         None, examples=[["https://www.youtube.com/watch?v=teu7BCZTgDs"]]
diff --git a/app/templates/chat-demo-postgres.html b/app/templates/chat-demo-postgres.html
index 0afa2d1..d500a42 100644
--- a/app/templates/chat-demo-postgres.html
+++ b/app/templates/chat-demo-postgres.html
@@ -67,6 +67,18 @@

assistant.bible

Faith and Farming +
+ + +
Date: Mon, 22 Jan 2024 14:35:39 -0800
Subject: [PATCH 2/5] Add AQuA docs

From 3779eaf10ea85c2613d160d6f951e0cb78d0e22d Mon Sep 17 00:00:00 2001
From: Mark
Date: Mon, 22 Jan 2024 21:23:54 -0800
Subject: [PATCH 3/5] Switch to latest gpt-3.5-turbo, increase context length, and ask it to be more verbose

---
 app/core/llm_framework/openai_vanilla.py |  7 ++++---
 app/templates/chat-demo-postgres.html    | 21 +++++++++++++++++++++
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/app/core/llm_framework/openai_vanilla.py b/app/core/llm_framework/openai_vanilla.py
index 05d500c..39790b4 100644
--- a/app/core/llm_framework/openai_vanilla.py
+++ b/app/core/llm_framework/openai_vanilla.py
@@ -20,7 +20,7 @@ def get_context(source_documents):
     # ** This will need to be adjusted, based on what the returned results look like **
     for _, source_document in enumerate(source_documents):
         if (
-            len(source_document.page_content) + len(context) > 11000
+            len(source_document.page_content) + len(context) > 44000
         ):  # FIXME: use tiktoken library to count tokens
             break
         context += "{source:" + source_document.metadata.get("source", "")
@@ -34,7 +34,8 @@ def get_pre_prompt(context):
     """Constructs a pre-prompt for the conversation, including the context"""
     chat_prefix = "The following is a conversation with an AI assistant for "
     chat_prefix += "Bible translators. The assistant is"
-    chat_prefix += " helpful, creative, clever, very friendly and follows instructions carefully.\n"
+    chat_prefix += " verbose, helpful, creative, clever, very friendly and follows instructions carefully,"
+    chat_prefix += " giving as much information as possible.\n"
     prompt = (
         chat_prefix
         + "Read the paragraph below and answer the question, using only the information"
@@ -75,7 +76,7 @@ class OpenAIVanilla(LLMFrameworkInterface):  # pylint: disable=too-few-public-me
     def __init__(
         self,  # pylint: disable=super-init-not-called
         key: str = os.getenv("OPENAI_API_KEY"),
-        model_name: str = "gpt-3.5-turbo",
+        model_name: str = "gpt-3.5-turbo-1106",
         vectordb: VectordbInterface = None,  # What should this be by default?
     ) -> None:
         """Sets the API key and initializes library objects if any"""
diff --git a/app/templates/chat-demo-postgres.html b/app/templates/chat-demo-postgres.html
index 0afa2d1..e9764a3 100644
--- a/app/templates/chat-demo-postgres.html
+++ b/app/templates/chat-demo-postgres.html
@@ -67,6 +67,18 @@

assistant.bible

Faith and Farming
+
+ + +
assistant.bibleassistant.bible---Domain:Faith-and-Farming:' + faithAndFarmingElement.checked + '------Domain:Faith-and-Farming:' + faithAndFarmingElement.checked + '------Domain:AQuA-Docs:' + aquaDocsElement.checked + '---
Date: Mon, 22 Jan 2024 21:24:43 -0800
Subject: [PATCH 4/5] Replace null characters and newlines in imported text

---
 app/core/vectordb/postgres4langchain.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/app/core/vectordb/postgres4langchain.py b/app/core/vectordb/postgres4langchain.py
index 206d371..bd40315 100644
--- a/app/core/vectordb/postgres4langchain.py
+++ b/app/core/vectordb/postgres4langchain.py
@@ -136,6 +136,12 @@ def add_to_collection(self, docs: List[schema.Document], **kwargs) -> None:
         """Loads the document object as per chroma DB formats into the collection"""
         data_list = []
         for doc in docs:
+            doc.text = (doc.text
+                        .replace("\n", " ")
+                        .replace("\r", " ")
+                        .replace("\t", " ")
+                        .replace("\x00", "")
+                        )
             cur = self.db_conn.cursor()
             cur.execute(
                 "SELECT 1 FROM embeddings WHERE source_id = %s", (doc.docId,))

From 2f4e7b1ec335469f008160632a6e5a7231a19929 Mon Sep 17 00:00:00 2001
From: "dev.assitant.bible"
Date: Wed, 24 Jan 2024 20:15:57 +0000
Subject: [PATCH 5/5] updating files for cert renewal

---
 deployment/docker-compose.yml |  6 +++---
 deployment/nginx/nginx.conf   | 21 +++++++++++++++++++++
 2 files changed, 24 insertions(+), 3 deletions(-)
 create mode 100644 deployment/nginx/nginx.conf

diff --git a/deployment/docker-compose.yml b/deployment/docker-compose.yml
index 22320c7..417de98 100644
--- a/deployment/docker-compose.yml
+++ b/deployment/docker-compose.yml
@@ -76,7 +76,7 @@ services:
       - chatbot
     environment:
       - CHAT_DOMAIN=${DOMAIN:-"localhost"}
-      - PROD_DOMAIN=${DOMAIN2:-assistant.bible}
+      - PROD_DOMAIN=${DOMAIN2:-dev.assistant.bible}
     volumes:
       - ./nginx/nginx.conf.template:/etc/nginx/templates/default.conf.template:ro
       # - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
@@ -93,7 +93,7 @@ services:
       - ./certbot/conf/:/etc/letsencrypt/:rw
     networks:
       - chatbot-network
-
+    # command: certonly --webroot --webroot-path=/var/www/certbot --email alejandro_quintero@sil.org --agree-tos --no-eff-email --staging -d dev.assistant.bible
   ofelia-scheduler:
     image: mcuadros/ofelia:v0.3.7
     depends_on:
@@ -131,4 +131,4 @@ volumes:
   logs-vol:
   chroma-db:
   postgres-db-vol:
-  postgres-db-backup:
\ No newline at end of file
+  postgres-db-backup:
diff --git a/deployment/nginx/nginx.conf b/deployment/nginx/nginx.conf
new file mode 100644
index 0000000..0ab4fc3
--- /dev/null
+++ b/deployment/nginx/nginx.conf
@@ -0,0 +1,21 @@
+events {
+    worker_connections 1024;
+    # other events directives can be placed here
+}
+http {
+    server {
+        listen 80;
+        listen [::]:80;
+
+        server_name dev.assistant.bible www.dev.assistant.bible;
+        server_tokens off;
+
+        location /.well-known/acme-challenge/ {
+            root /var/www/certbot;
+        }
+
+        location / {
+            return 301 https://dev.assistant.bible$request_uri;
+        }
+    }
+}
\ No newline at end of file
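
A note on PATCH 4: the cleanup it applies inline on doc.text can be read as a standalone helper. Below is a minimal sketch; the function name and the assert are illustrative additions, only the replace chain itself comes from the diff. Postgres text columns cannot store NUL (0x00) bytes, which is presumably why the patch strips them, while raw newlines, carriage returns, and tabs are flattened to plain spaces before the text is embedded and inserted.

    def sanitize(text: str) -> str:
        """Mirror of the cleanup chain PATCH 4 applies to doc.text."""
        return (
            text.replace("\n", " ")
            .replace("\r", " ")
            .replace("\t", " ")
            .replace("\x00", "")
        )

    assert sanitize("In the beginning,\x00God\ncreated") == "In the beginning,God created"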