From 2a776aef81dcb5ab92f39c11d3866475721b3081 Mon Sep 17 00:00:00 2001
From: aquintero
Date: Mon, 22 Jan 2024 16:43:42 -0700
Subject: [PATCH 1/5] going back before breaking changes by handling the no-OPENAI-key case in tests

---
 app/core/llm_framework/openai_langchain.py |  2 +-
 app/core/llm_framework/openai_vanilla.py   |  5 +-
 app/core/pipeline/__init__.py              | 13 +++--
 app/core/vectordb/postgres4langchain.py    | 57 ++++++++++++----------
 app/routers.py                             | 12 +++--
 app/schema.py                              | 43 +++++++++++++---
 app/templates/chat-demo-postgres.html      | 12 +++++
 app/tests/test_chat_on_websocket.py        |  2 +
 app/tests/test_dataupload.py               |  3 ++
 9 files changed, 104 insertions(+), 45 deletions(-)

diff --git a/app/core/llm_framework/openai_langchain.py b/app/core/llm_framework/openai_langchain.py
index 1bc0a46..872d1ac 100644
--- a/app/core/llm_framework/openai_langchain.py
+++ b/app/core/llm_framework/openai_langchain.py
@@ -33,7 +33,7 @@ class LangchainOpenAI(LLMFrameworkInterface):
     def __init__(
         self,  # pylint: disable=super-init-not-called
         # FIXME : Ideal to be able to mock the __init__ from tests
-        key: str = os.getenv("OPENAI_API_KEY", "dummy-for-test"),
+        key: str = os.getenv("OPENAI_API_KEY"),
         model_name: str = "gpt-3.5-turbo",
         vectordb: VectordbInterface = Chroma(),
         max_tokens_limit: int = int(
diff --git a/app/core/llm_framework/openai_vanilla.py b/app/core/llm_framework/openai_vanilla.py
index 96609cd..dfb4c99 100644
--- a/app/core/llm_framework/openai_vanilla.py
+++ b/app/core/llm_framework/openai_vanilla.py
@@ -24,8 +24,9 @@ def get_context(source_documents):
             len(source_document.page_content) + len(context) > 11000
         ):  # FIXME: use tiktoken library to count tokens
             break
-        context += "{source:" + source_document.metadata.get("source", "")
-        context += ", text: " + source_document.page_content + "}" + ","
+        if source_document.metadata.get("source", "") is not None:
+            context += "{source:" + source_document.metadata.get("source", "")
+            context += ", text: " + source_document.page_content + "}" + ","
     context += "]" + "\n"
     return context
diff --git a/app/core/pipeline/__init__.py b/app/core/pipeline/__init__.py
index 86b00f9..5d53015 100644
--- a/app/core/pipeline/__init__.py
+++ b/app/core/pipeline/__init__.py
@@ -126,8 +126,10 @@ def __init__(
         file_processor: FileProcessorInterface = LangchainLoader,
         embedding: EmbeddingInterface = SentenceTransformerEmbedding(),
         vectordb: VectordbInterface = Chroma(),
-        llm_framework: LLMFrameworkInterface = LangchainOpenAI(),
+        llm_framework: LLMFrameworkInterface = LangchainOpenAI,
+        llm_api_key: str | None = None,
         transcription_framework: AudioTranscriptionInterface = WhisperAudioTranscription,
+        transcription_api_key: str | None = None,
     ) -> None:
         """Instantiate with default tech stack"""
         super().__init__(file_processor, embedding, vectordb)
@@ -138,7 +140,8 @@ def __init__(
         self.embedding = embedding
         self.vectordb = vectordb
         self.llm_framework = llm_framework
-        self.transcription_framework = transcription_framework()
+        self.transcription_framework = transcription_framework(key=transcription_api_key)
+        self.llm_framework = llm_framework(key=llm_api_key)

     def set_llm_framework(
         self,
@@ -159,7 +162,7 @@ def set_llm_framework(
                     path=vectordb.db_path,
                     collection_name=vectordb.collection_name,
                 )
-            self.llm_framework = LangchainOpenAI(vectordb=vectordb)
+            self.llm_framework = LangchainOpenAI(vectordb=vectordb, key=api_key)
         elif choice == schema.LLMFrameworkType.VANILLA:
             if isinstance(vectordb, Chroma):
                 vectordb = ChromaLC(
                    path=vectordb.db_path,
                    collection_name=vectordb.collection_name,
                )
-            self.llm_framework = OpenAIVanilla(vectordb=vectordb)
+            self.llm_framework = OpenAIVanilla(vectordb=vectordb, key=api_key)

     def set_transcription_framework(
         self,
@@ -181,4 +184,4 @@ def set_transcription_framework(
         self.transcription_framework.api_key = api_key
         self.transcription_framework.model_name = model_name
         if choice == schema.AudioTranscriptionType.WHISPER:
-            self.transcription_framework = WhisperAudioTranscription()
+            self.transcription_framework = WhisperAudioTranscription(key=api_key)
diff --git a/app/core/vectordb/postgres4langchain.py b/app/core/vectordb/postgres4langchain.py
index 206d371..7ff6fb9 100644
--- a/app/core/vectordb/postgres4langchain.py
+++ b/app/core/vectordb/postgres4langchain.py
@@ -140,6 +140,8 @@ def add_to_collection(self, docs: List[schema.Document], **kwargs) -> None:
             cur.execute(
                 "SELECT 1 FROM embeddings WHERE source_id = %s", (doc.docId,))
             doc_id_already_exists = cur.fetchone()
+            links = ",".join([str(item) for item in doc.links])
+            doc.text = doc.text.replace('\x00', '').replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
             if not doc_id_already_exists:
                 data_list.append(
                     [
                         doc.docId,
                         doc.text,
                         doc.label,
                         doc.media,
-                        str(doc.links),
+                        links,
                         doc.embedding,
                     ]
                 )
@@ -169,37 +171,34 @@ def add_to_collection(self, docs: List[schema.Document], **kwargs) -> None:
                         doc.text,
                         doc.label,
                         doc.media,
-                        str(doc.links),
+                        links,
                         doc.embedding,
                         doc.docId,
                     ),
                 )
                 cur.close()
-        try:
-            cur = self.db_conn.cursor()
-            execute_values(
-                cur,
-                "INSERT INTO embeddings (source_id, document, label, media, links, embedding"
-                ") VALUES %s",
-                data_list,
-            )
-            self.db_conn.commit()
+        cur = self.db_conn.cursor()
+        execute_values(
+            cur,
+            "INSERT INTO embeddings (source_id, document, label, media, links, embedding"
+            ") VALUES %s",
+            data_list,
+        )
+        self.db_conn.commit()
-            # create index
-            cur.execute("SELECT COUNT(*) as cnt FROM embeddings;")
-            num_records = cur.fetchone()[0]
-            num_lists = num_records / 1000
-            num_lists = max(10, num_lists, math.sqrt(num_records))
-            # use the cosine distance measure, which is what we'll later use for querying
-            cur.execute(
-                "CREATE INDEX ON embeddings USING ivfflat (embedding vector_cosine_ops) "
-                + f"WITH (lists = {num_lists});"
-            )
-            self.db_conn.commit()
+        # create index
+        cur.execute("SELECT COUNT(*) as cnt FROM embeddings;")
+        num_records = cur.fetchone()[0]
+        num_lists = num_records / 1000
+        num_lists = max(10, num_lists, math.sqrt(num_records))
+        # use the cosine distance measure, which is what we'll later use for querying
+        cur.execute(
+            "CREATE INDEX ON embeddings USING ivfflat (embedding vector_cosine_ops) "
+            + f"WITH (lists = {num_lists});"
+        )
+        self.db_conn.commit()
-            cur.close()
-        except Exception as exe:
-            raise PostgresException("While adding data: " + str(exe)) from exe
+        cur.close()

     def _get_relevant_documents(
         self, query: list, run_manager: CallbackManagerForRetrieverRun | None = None, **kwargs
@@ -216,7 +215,7 @@ def _get_relevant_documents(
         cur = self.db_conn.cursor()
         cur.execute(
             """
-            SELECT source_id, document
+            SELECT label, media, links, source_id, document
             FROM embeddings
             WHERE label = ANY(%s)
             AND embedding <=> %s < %s
@@ -252,7 +251,11 @@ def _get_relevant_documents(
             )
         ]
         return [
-            LangchainDocument(page_content=doc[1], metadata={"source": doc[0]})
+            LangchainDocument(page_content=doc[4], metadata={"label": doc[0],
+                                                             "media": doc[1],
+                                                             "link": doc[2],
+                                                             "source_id": doc[3],
+                                                             "document": doc[4]})
             for doc in records
         ]
diff --git a/app/routers.py b/app/routers.py
index 63fe576..baa91ac 100644
--- a/app/routers.py
+++ b/app/routers.py
@@ -228,7 +228,10 @@ async def websocket_chat_endpoint(
     if token:
         log.info("User, connecting with token, %s", token)
     await websocket.accept()
-    chat_stack = ConversationPipeline(user="XXX", labels=labels)
+    chat_stack = ConversationPipeline(user="XXX",
+                                      labels=labels,
+                                      transcription_api_key=settings.transcriptionApiKey,
+                                      llm_api_key=settings.llmApiKey)

     vectordb_args = compose_vector_db_args(
         settings.vectordbType,
@@ -246,7 +249,8 @@ async def websocket_chat_endpoint(
     chat_stack.set_llm_framework(
         settings.llmFrameworkType, vectordb=chat_stack.vectordb, **llm_args
     )
-    chat_stack.set_transcription_framework(settings.transcriptionFrameworkType)
+    chat_stack.set_transcription_framework(settings.transcriptionFrameworkType,
+                                           api_key=settings.transcriptionApiKey)

     # Not implemented using custom embeddings
@@ -292,7 +296,7 @@ async def websocket_chat_endpoint(
                     "Human: {0}\nBot:{1}\nSources:{2}\n\n".format(
                         question,
                         bot_response['answer'],
-                        [item.metadata['source']
+                        [item.metadata
                             for item in bot_response['source_documents']]
                     )
                 )
@@ -310,7 +314,7 @@ async def websocket_chat_endpoint(
                     message=bot_response["answer"],
                     type=schema.ChatResponseType.ANSWER,
                     sources=[
-                        item.metadata["source"]
+                        item.metadata
                         for item in bot_response["source_documents"]
                     ],
                     media=[],
diff --git a/app/schema.py b/app/schema.py
index d02918f..aa9a9c0 100644
--- a/app/schema.py
+++ b/app/schema.py
@@ -179,6 +179,9 @@ class ChatPipelineSelector(BaseModel):
         AudioTranscriptionType.WHISPER,
         desc="The framework through which audio transcription is handled",
     )
+    transcriptionApiKey: str | None = Field(
+        None, desc="If using a cloud service, like OpenAI, the key obtained from them"
+    )

 # class UserPrompt(BaseModel): # not using this as we receive string from websocket

 class ChatResponseType(str, Enum):
     ANSWER = "answer"
     ERROR = "error"

+class SourceDocument(BaseModel):
+    """Source field of Chat response from server to UI or user app"""
+    label: str = Field(
+        "open-access",
+        examples=["paratext user manual or bible or door-43-users"],
+        desc="The common tag for all sentences under a set. "
+        + "Used for specifying access rules and filtering during querying",
+    )
+    media: str | None = Field(
+        None,
+        desc="Additional media links, like images, videos etc "
+        + "to be used in output to make the chat interface multimodal",
+    )
+    link: str | None = Field(
+        None,
+        desc="The links to fetch the actual resource. "
+        + "To be used by end user like a search result",
+    )
+    source_id: str | None = Field(
+        None,
+        examples=["NIV Bible Mat 1:1-20"],
+        desc="Unique for a sentence. Used by the LLM to specify which document "
+        + "it answers from. 
Better to combine the source tag and a serial number.",
+    )
+    document: str | None = Field(
+        None, desc="The sentence which is used for question answering"
+    )

 class BotResponse(BaseModel):
     """Chat response from server to UI or user app"""

     message: str = Field(..., examples=["Good Morning to you too!"])
     sender: SenderType = Field(..., examples=["You or BOT"])
-    sources: List[str] | None = Field(
+    sources: List[SourceDocument] | None = Field(
         None,
-        examples=[[
-            "https://www.biblegateway.com/passage/?search=Genesis+1%3A1&version=NIV",
-            "https://git.door43.org/Door43-Catalog/en_tw/src/branch/master/"
-            + "bible/other/creation.md",
-        ]],
+        examples=[[{'label': 'ESV-Bible',
+                    'media': 'https://www.youtube.com/watch?v=teu7BCZTgDs',
+                    'link': 'https://www.esv.org/Genesis+1/,https://www.esv.org/Genesis+1/',
+                    'source_id': 'ESV-gen1:1',
+                    'document': 'In the beginning, God created the heavens and the earth.'}]],
     )
     media: List[AnyUrl] | None = Field(
         None, examples=[["https://www.youtube.com/watch?v=teu7BCZTgDs"]]
diff --git a/app/templates/chat-demo-postgres.html b/app/templates/chat-demo-postgres.html
index 0afa2d1..d500a42 100644
--- a/app/templates/chat-demo-postgres.html
+++ b/app/templates/chat-demo-postgres.html
@@ -67,6 +67,18 @@

assistant.bible

Faith and Farming +
+ + +
Date: Mon, 22 Jan 2024 14:35:39 -0800
Subject: [PATCH 2/5] Add AQuA docs

From 3779eaf10ea85c2613d160d6f951e0cb78d0e22d Mon Sep 17 00:00:00 2001
From: Mark
Date: Mon, 22 Jan 2024 21:23:54 -0800
Subject: [PATCH 3/5] Switch to latest gpt-3.5-turbo, increase context length, and ask it to be more verbose

---
 app/core/llm_framework/openai_vanilla.py |  7 ++++---
 app/templates/chat-demo-postgres.html    | 21 +++++++++++++++++++++
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/app/core/llm_framework/openai_vanilla.py b/app/core/llm_framework/openai_vanilla.py
index 05d500c..39790b4 100644
--- a/app/core/llm_framework/openai_vanilla.py
+++ b/app/core/llm_framework/openai_vanilla.py
@@ -20,7 +20,7 @@ def get_context(source_documents):
     # ** This will need to be adjusted, based on what the returned results look like **
     for _, source_document in enumerate(source_documents):
         if (
-            len(source_document.page_content) + len(context) > 11000
+            len(source_document.page_content) + len(context) > 44000
         ):  # FIXME: use tiktoken library to count tokens
             break
         context += "{source:" + source_document.metadata.get("source", "")
@@ -34,7 +34,8 @@ def get_pre_prompt(context):
     """Constructs a pre-prompt for the conversation, including the context"""
     chat_prefix = "The following is a conversation with an AI assistant for "
     chat_prefix += "Bible translators. The assistant is"
-    chat_prefix += " helpful, creative, clever, very friendly and follows instructions carefully.\n"
+    chat_prefix += " verbose, helpful, creative, clever, very friendly and follows instructions carefully,"
+    chat_prefix += " giving as much information as possible.\n"
     prompt = (
         chat_prefix
         + "Read the paragraph below and answer the question, using only the information"
@@ -75,7 +76,7 @@ class OpenAIVanilla(LLMFrameworkInterface):  # pylint: disable=too-few-public-me
     def __init__(
         self,  # pylint: disable=super-init-not-called
         key: str = os.getenv("OPENAI_API_KEY"),
-        model_name: str = "gpt-3.5-turbo",
+        model_name: str = "gpt-3.5-turbo-1106",
         vectordb: VectordbInterface = None,  # What should this be by default?
     ) -> None:
         """Sets the API key and initializes library objects if any"""
diff --git a/app/templates/chat-demo-postgres.html b/app/templates/chat-demo-postgres.html
index 0afa2d1..e9764a3 100644
--- a/app/templates/chat-demo-postgres.html
+++ b/app/templates/chat-demo-postgres.html
@@ -67,6 +67,18 @@

assistant.bible

Faith and Farming
+
+ + +
assistant.bibleassistant.bible---Domain:Faith-and-Farming:' + faithAndFarmingElement.checked + '------Domain:Faith-and-Farming:' + faithAndFarmingElement.checked + '------Domain:AQuA-Docs:' + aquaDocsElement.checked + '---
Date: Mon, 22 Jan 2024 21:24:43 -0800
Subject: [PATCH 4/5] Replace null characters and newlines in imported text

---
 app/core/vectordb/postgres4langchain.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/app/core/vectordb/postgres4langchain.py b/app/core/vectordb/postgres4langchain.py
index 206d371..bd40315 100644
--- a/app/core/vectordb/postgres4langchain.py
+++ b/app/core/vectordb/postgres4langchain.py
@@ -136,6 +136,12 @@ def add_to_collection(self, docs: List[schema.Document], **kwargs) -> None:
         """Loads the document object as per chroma DB formats into the collection"""
         data_list = []
         for doc in docs:
+            doc.text = (doc.text
+                        .replace("\n", " ")
+                        .replace("\r", " ")
+                        .replace("\t", " ")
+                        .replace("\x00", "")
+                        )
             cur = self.db_conn.cursor()
             cur.execute(
                 "SELECT 1 FROM embeddings WHERE source_id = %s", (doc.docId,))

From 2f4e7b1ec335469f008160632a6e5a7231a19929 Mon Sep 17 00:00:00 2001
From: "dev.assitant.bible"
Date: Wed, 24 Jan 2024 20:15:57 +0000
Subject: [PATCH 5/5] updating files for cert renewal

---
 deployment/docker-compose.yml |  6 +++---
 deployment/nginx/nginx.conf   | 21 +++++++++++++++++++++
 2 files changed, 24 insertions(+), 3 deletions(-)
 create mode 100644 deployment/nginx/nginx.conf

diff --git a/deployment/docker-compose.yml b/deployment/docker-compose.yml
index 22320c7..417de98 100644
--- a/deployment/docker-compose.yml
+++ b/deployment/docker-compose.yml
@@ -76,7 +76,7 @@ services:
       - chatbot
     environment:
       - CHAT_DOMAIN=${DOMAIN:-"localhost"}
-      - PROD_DOMAIN=${DOMAIN2:-assistant.bible}
+      - PROD_DOMAIN=${DOMAIN2:-dev.assistant.bible}
     volumes:
       - ./nginx/nginx.conf.template:/etc/nginx/templates/default.conf.template:ro
       # - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
@@ -93,7 +93,7 @@ services:
       - ./certbot/conf/:/etc/letsencrypt/:rw
     networks:
       - chatbot-network
-
+    # command: certonly --webroot --webroot-path=/var/www/certbot --email alejandro_quintero@sil.org --agree-tos --no-eff-email --staging -d dev.assistant.bible
   ofelia-scheduler:
     image: mcuadros/ofelia:v0.3.7
     depends_on:
@@ -131,4 +131,4 @@ volumes:
   logs-vol:
   chroma-db:
   postgres-db-vol:
-  postgres-db-backup:
\ No newline at end of file
+  postgres-db-backup:
diff --git a/deployment/nginx/nginx.conf b/deployment/nginx/nginx.conf
new file mode 100644
index 0000000..0ab4fc3
--- /dev/null
+++ b/deployment/nginx/nginx.conf
@@ -0,0 +1,21 @@
+events {
+    worker_connections 1024;
+    # other events directives can be placed here
+}
+http {
+    server {
+        listen 80;
+        listen [::]:80;
+
+        server_name dev.assistant.bible www.dev.assistant.bible;
+        server_tokens off;
+
+        location /.well-known/acme-challenge/ {
+            root /var/www/certbot;
+        }
+
+        location / {
+            return 301 https://dev.assistant.bible$request_uri;
+        }
+    }
+}
\ No newline at end of file
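
A note on PATCH 4: the cleanup it applies inline on doc.text can be read as a standalone helper. Below is a minimal sketch; the function name and the assert are illustrative additions, only the replace chain itself comes from the diff. Postgres text columns cannot store NUL (0x00) bytes, which is presumably why the patch strips them, while raw newlines, carriage returns, and tabs are flattened to plain spaces before the text is embedded and inserted.

    def sanitize(text: str) -> str:
        """Mirror of the cleanup chain PATCH 4 applies to doc.text."""
        return (
            text.replace("\n", " ")
            .replace("\r", " ")
            .replace("\t", " ")
            .replace("\x00", "")
        )

    assert sanitize("In the beginning,\x00God\ncreated") == "In the beginning,God created"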