Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Show more information about source documents #122

Merged
merged 3 commits into from
Dec 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions app/core/llm_framework/openai_vanilla.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@ def get_context(source_documents):
len(source_document.page_content) + len(context) > 11000
): # FIXME: use tiktoken library to count tokens
break
context += "{source:" + source_document.metadata.get("source", "")
context += ", text: " + source_document.page_content + "}" + ","
if source_document.metadata.get("source", "") is not None:
context += "{source:" + source_document.metadata.get("source", "")
context += ", text: " + source_document.page_content + "}" + ","
context += "]" + "\n"

return context
Expand Down
13 changes: 9 additions & 4 deletions app/core/vectordb/postgres4langchain.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,14 +140,15 @@ def add_to_collection(self, docs: List[schema.Document], **kwargs) -> None:
cur.execute(
"SELECT 1 FROM embeddings WHERE source_id = %s", (doc.docId,))
doc_id_already_exists = cur.fetchone()
links= ",".join([str(item) for item in doc.links])
if not doc_id_already_exists:
data_list.append(
[
doc.docId,
doc.text,
doc.label,
doc.media,
str(doc.links),
links,
doc.embedding,
]
)
Expand All @@ -169,7 +170,7 @@ def add_to_collection(self, docs: List[schema.Document], **kwargs) -> None:
doc.text,
doc.label,
doc.media,
str(doc.links),
links,
doc.embedding,
doc.docId,
),
Expand Down Expand Up @@ -216,7 +217,7 @@ def _get_relevant_documents(
cur = self.db_conn.cursor()
cur.execute(
"""
SELECT source_id, document
SELECT label, media, links, source_id, document
FROM embeddings
WHERE label = ANY(%s)
AND embedding <=> %s < %s
Expand Down Expand Up @@ -252,7 +253,11 @@ def _get_relevant_documents(
)
]
return [
LangchainDocument(page_content=doc[1], metadata={"source": doc[0]})
LangchainDocument(page_content=doc[1], metadata={"label": doc[0],
"media": doc[1],
'link':doc[2],
'source_id':doc[3],
'document':doc[4]})
for doc in records
]

Expand Down
4 changes: 2 additions & 2 deletions app/routers.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,7 @@ async def websocket_chat_endpoint(
"Human: {0}\nBot:{1}\nSources:{2}\n\n".format(
question,
bot_response['answer'],
[item.metadata['source']
[item.metadata
for item in bot_response['source_documents']]
)
)
Expand All @@ -310,7 +310,7 @@ async def websocket_chat_endpoint(
message=bot_response["answer"],
type=schema.ChatResponseType.ANSWER,
sources=[
item.metadata["source"]
item.metadata
for item in bot_response["source_documents"]
],
media=[],
Expand Down
40 changes: 34 additions & 6 deletions app/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,19 +200,47 @@ class ChatResponseType(str, Enum):
ANSWER = "answer"
ERROR = "error"

class SourceDocument(BaseModel):
    """Source field of Chat response from server to UI or user app.

    Describes a single source document attached to a chat answer. Every
    field except ``label`` defaults to ``None``, so those fields are typed
    ``str | None`` to let an explicit ``None`` value pass validation.
    """

    # NOTE(review): `desc` is kept as written; pydantic's documented keyword
    # for field descriptions is `description` -- confirm which one the rest
    # of the schema module relies on before renaming.

    # Tag shared by every sentence of one document set.
    label: str = Field(
        "open-access",
        examples=["paratext user manual or bible or door-43-users"],
        desc="The common tag for all sentences under a set. "
        + "Used for specifying access rules and filtering during querying",
    )
    # Extra media links associated with the source; may be absent.
    media: str | None = Field(
        None,
        desc="Additional media links, like images, videos etc "
        + "to be used in output to make the chat interface multimodel",
    )
    # Link(s) pointing at the original resource; may be absent.
    link: str | None = Field(
        None,
        desc="The links to fetch the actual resource. "
        + "To be used by end user like a search result",
    )
    # Identifier of the sentence the answer was drawn from; may be absent.
    source_id: str | None = Field(
        None,
        examples=["NIV Bible Mat 1:1-20"],
        desc="Unique for a sentence. Used by the LLM to specify which document "
        + "it answers from. Better to combine the source tag and a serial number.",
    )
    # Text of the sentence itself; may be absent.
    document: str | None = Field(
        None, desc="The sentence which is used for question answering"
    )


class BotResponse(BaseModel):
"""Chat response from server to UI or user app"""

message: str = Field(..., examples=["Good Morning to you too!"])
sender: SenderType = Field(..., examples=["You or BOT"])
sources: List[str] | None = Field(
sources: List[SourceDocument] | None = Field(
None,
examples=[[
"https://www.biblegateway.com/passage/?search=Genesis+1%3A1&version=NIV",
"https://git.door43.org/Door43-Catalog/en_tw/src/branch/master/"
+ "bible/other/creation.md",
]],
examples=[{'source': 'ESV-Bible',
'media': {"https://www.youtube.com/watch?v=teu7BCZTgDs"},
'link': 'https://www.esv.org/Genesis+1/,https://www.esv.org/Genesis+1/',
'source_id': 'ESV-gen1:1',
'document': 'In the beginning, God created the heavens and the earth.'}],
)
media: List[AnyUrl] | None = Field(
None, examples=[["https://www.youtube.com/watch?v=teu7BCZTgDs"]]
Expand Down
Loading