BibleNLP · kavitharaju · Dec 11, 2023 · Dec 8, 2023 · Dec 8, 2023 · Dec 10, 2023
diff --git a/app/core/llm_framework/openai_vanilla.py b/app/core/llm_framework/openai_vanilla.py
@@ -23,8 +23,9 @@ def get_context(source_documents):
             len(source_document.page_content) + len(context) > 11000
         ):  # FIXME: use tiktoken library to count tokens
             break
-        context += "{source:" + source_document.metadata.get("source", "")
-        context += ", text: " + source_document.page_content + "}" + ","
+        meta = source_document.metadata  # id key is "source_id"; "source" kept as fallback
+        context += "{source:" + str(meta.get("source_id") or meta.get("source") or "")
+        context += ", text: " + source_document.page_content + "}" + ","
     context += "]" + "\n"
 
     return context

diff --git a/app/core/vectordb/postgres4langchain.py b/app/core/vectordb/postgres4langchain.py
@@ -140,14 +140,15 @@ def add_to_collection(self, docs: List[schema.Document], **kwargs) -> None:
             cur.execute(
                 "SELECT 1 FROM embeddings WHERE source_id = %s", (doc.docId,))
             doc_id_already_exists = cur.fetchone()
+            links = ",".join(str(item) for item in (doc.links or []))  # tolerate missing links
             if not doc_id_already_exists:
                 data_list.append(
                     [
                         doc.docId,
                         doc.text,
                         doc.label,
                         doc.media,
-                        str(doc.links),
+                        links,
                         doc.embedding,
                     ]
                 )
@@ -169,7 +170,7 @@ def add_to_collection(self, docs: List[schema.Document], **kwargs) -> None:
                         doc.text,
                         doc.label,
                         doc.media,
-                        str(doc.links),
+                        links,
                         doc.embedding,
                         doc.docId,
                     ),
@@ -216,7 +217,7 @@ def _get_relevant_documents(
             cur = self.db_conn.cursor()
             cur.execute(
                 """
-                SELECT source_id, document 
+                SELECT label, media, links, source_id, document 
                 FROM embeddings 
                 WHERE label = ANY(%s) 
                 AND embedding <=> %s < %s 
@@ -252,7 +253,11 @@ def _get_relevant_documents(
                 )
             ]
         return [
-            LangchainDocument(page_content=doc[1], metadata={"source": doc[0]})
+            LangchainDocument(
+                page_content=doc[4],  # document text is the 5th selected column
+                metadata={"label": doc[0], "media": doc[1], "link": doc[2],
+                          "source_id": doc[3], "document": doc[4]},
+            )
             for doc in records
         ]
 

diff --git a/app/routers.py b/app/routers.py
@@ -292,7 +292,7 @@ async def websocket_chat_endpoint(
                     "Human: {0}\nBot:{1}\nSources:{2}\n\n".format(
                         question,
                         bot_response['answer'],
-                        [item.metadata['source']
+                        [item.metadata
                             for item in bot_response['source_documents']]
                     )
                 )
@@ -310,7 +310,7 @@ async def websocket_chat_endpoint(
                     message=bot_response["answer"],
                     type=schema.ChatResponseType.ANSWER,
                     sources=[
-                        item.metadata["source"]
+                        item.metadata
                         for item in bot_response["source_documents"]
                     ],
                     media=[],

diff --git a/app/schema.py b/app/schema.py
@@ -200,19 +200,47 @@ class ChatResponseType(str, Enum):
     ANSWER = "answer"
     ERROR = "error"
 
+class SourceDocument(BaseModel):
+    """One entry of a chat response's sources; every field but label may be None."""
+    label: str = Field(
+        "open-access",
+        examples=["paratext user manual or bible or door-43-users"],
+        desc="The common tag for all sentences under a set. "
+        + "Used for specifying access rules and filtering during querying",
+    )
+    media: str | None = Field(
+        None,
+        desc="Additional media links, like images, videos etc "
+        + "to be used in output to make the chat interface multimodel",
+    )
+    link: str | None = Field(
+        None,
+        desc="The links to fetch the actual resource. "
+        + "To be used by end user like a search result",
+    )
+    source_id: str | None = Field(
+        None,
+        examples=["NIV Bible Mat 1:1-20"],
+        desc="Unique for a sentence. Used by the LLM to specify which document "
+        + "it answers from. Better to combine the source tag and a serial number.",
+    )
+    document: str | None = Field(
+        None, desc="The sentence which is used for question answering"
+    )
+
 
 class BotResponse(BaseModel):
     """Chat response from server to UI or user app"""
 
     message: str = Field(..., examples=["Good Morning to you too!"])
     sender: SenderType = Field(..., examples=["You or BOT"])
-    sources: List[str] | None = Field(
+    sources: List[SourceDocument] | None = Field(
         None,
-        examples=[[
-            "https://www.biblegateway.com/passage/?search=Genesis+1%3A1&version=NIV",
-            "https://git.door43.org/Door43-Catalog/en_tw/src/branch/master/"
-            + "bible/other/creation.md",
-        ]],
+        examples=[{'source': 'ESV-Bible',
+                   'media': {"https://www.youtube.com/watch?v=teu7BCZTgDs"}, 
+                   'link': 'https://www.esv.org/Genesis+1/,https://www.esv.org/Genesis+1/',
+                   'source_id': 'ESV-gen1:1',
+                   'document': 'In the beginning, God created the heavens and the earth.'}],
     )
     media: List[AnyUrl] | None = Field(
         None, examples=[["https://www.youtube.com/watch?v=teu7BCZTgDs"]]