Commit 343288f: 0 parents, 19 changed files with 8,834 additions and 0 deletions.
README.md (new file, +35 lines):
# Powering your products with ChatGPT and your own data

The Chatbot Kickstarter is a starter repo to get you used to building a basic Chatbot using the ChatGPT API and your own knowledge base. The flow you're taken through was originally presented with [these slides](https://drive.google.com/file/d/1dB-RQhZC_Q1iAsHkNNdkqtxxXqYODFYy/view?usp=share_link), which may be useful to refer to.

This repo contains one notebook and two basic Streamlit apps:
- `powering_your_products_with_chatgpt_and_your_data.ipynb`: A notebook containing a step-by-step process for tokenising, chunking and embedding your data in a vector database, and building simple Q&A and Chatbot functionality on top.
- `search.py`: A Streamlit app providing simple Q&A via a search bar to query your knowledge base.
- `chat.py`: A Streamlit app providing a simple Chatbot via a search bar to query your knowledge base.

To run either app, please follow the instructions in the respective README.md files in the subdirectories.

## How it works

The notebook is the best place to start, and is broadly laid out as follows:
- **Lay the foundations:**
  - Set up the vector database to accept vectors and data
  - Load the dataset, chunk the data up for embedding, and store it in the vector database
- **Make it a product:**
  - Add a retrieval step where users provide queries and we return the most relevant entries
  - Summarise search results with GPT-3
  - Test out this basic Q&A app in Streamlit
- **Build your moat:**
  - Create an Assistant class to manage context and interact with our bot
  - Use the Chatbot to answer questions using semantic search context
  - Test out this basic Chatbot app in Streamlit

Once you've run the notebook and tried the two Streamlit apps, you should be in a position to strip out any useful snippets and start your own Q&A or Chat application.
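The chunking step in "Lay the foundations" can be sketched roughly as below. This is a simplified word-based stand-in: the notebook itself tokenises the text first, and its helper names may differ.

```python
def chunk_text(text: str, chunk_size: int = 300) -> list[str]:
    """Split text into chunks of roughly chunk_size words for embedding.

    A word-based approximation of the notebook's token-based chunking;
    the real pipeline measures chunk_size in tokens, not words.
    """
    words = text.split()
    return [
        " ".join(words[i:i + chunk_size])
        for i in range(0, len(words), chunk_size)
    ]
```

Each resulting chunk would then be embedded and stored in the vector database alongside its source metadata.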
## Limitations

- This app uses Redis as a vector database, but there are many other options highlighted in `../examples/vector_databases` depending on your need.
- This is a simple starting point; if you hit issues deploying your use case, you may need to tune (non-exhaustive list):
  - The prompt and model parameters, so that it answers accurately
  - Your search, to return more relevant results
  - Your chunking/embedding approach, to store the most relevant content effectively for retrieval
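As a reference point for tuning the search step: vector retrieval against Redis typically goes through RediSearch's KNN query syntax. The helper below is hypothetical; the repo's `database.py` may construct its query differently.

```python
def build_knn_query(top_k: int = 2, vector_field: str = "content_vector") -> str:
    # RediSearch KNN syntax: return the top_k nearest neighbours to the
    # query embedding bound to the $vec_param parameter, exposing the
    # distance as vector_score
    return f"*=>[KNN {top_k} @{vector_field} $vec_param AS vector_score]"
```

The string would be passed to a RediSearch `Query`, with the query embedding supplied as bytes via `query_params={"vec_param": ...}`; raising `top_k` is one way to trade precision for recall.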
chat.py (new file, +83 lines):
```python
import streamlit as st
from streamlit_chat import message

from database import get_redis_connection
from chatbot import RetrievalAssistant, Message

# Initialise Redis connection
redis_client = get_redis_connection()

# System prompt requiring a Question and Year to be extracted from the user
system_prompt = '''
You are a helpful Formula 1 knowledge base assistant. You need to capture a Question and Year from each customer.
The Question is their query on Formula 1, and the Year is the year of the applicable Formula 1 season.
Think about this step by step:
- The user will ask a Question
- You will ask them for the Year if their question didn't include a Year
- Once you have the Year, say "searching for answers".
Example:
User: I'd like to know the cost cap for a power unit
Assistant: Certainly, what year would you like this for?
User: 2023 please.
Assistant: Searching for answers.
'''

### CHATBOT APP

st.set_page_config(
    page_title="Streamlit Chat - Demo",
    page_icon=":robot:"
)

st.title('Formula 1 Chatbot')
st.subheader("Help us help you learn about Formula 1")

if 'generated' not in st.session_state:
    st.session_state['generated'] = []

if 'past' not in st.session_state:
    st.session_state['past'] = []

def query(question):
    response = st.session_state['chat'].ask_assistant(question)
    return response

prompt = st.text_input("What do you want to know: ", "", key="input")

if st.button('Submit', key='generationSubmit'):

    # Initialisation: seed a new conversation with the system prompt
    if 'chat' not in st.session_state:
        st.session_state['chat'] = RetrievalAssistant()
        messages = []
        system_message = Message('system', system_prompt)
        messages.append(system_message.message())
    else:
        messages = []

    user_message = Message('user', prompt)
    messages.append(user_message.message())

    response = query(messages)

    # Debugging step to print the whole response
    # st.write(response)

    st.session_state.past.append(prompt)
    st.session_state.generated.append(response['content'])

if st.session_state['generated']:
    for i in range(len(st.session_state['generated']) - 1, -1, -1):
        message(st.session_state["generated"][i], key=str(i))
        message(st.session_state['past'][i], is_user=True, key=str(i) + '_user')
```
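`chat.py` leans on the `Message` helper imported from `chatbot.py`. As a standalone sketch, all it does is wrap a role/content pair into the dict shape the Chat Completions API expects:

```python
# Minimal standalone version of the Message helper used by chat.py
class Message:
    def __init__(self, role, content):
        self.role = role
        self.content = content

    def message(self):
        # The dict format expected in a ChatCompletion messages list
        return {"role": self.role, "content": self.content}
```

For example, `Message('user', 'hello').message()` produces `{"role": "user", "content": "hello"}`, ready to append to a conversation history.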
chatbot.py (new file, +84 lines):
```python
import openai
from termcolor import colored
import streamlit as st

from database import get_redis_connection, get_redis_results

from config import CHAT_MODEL, COMPLETIONS_MODEL, INDEX_NAME

redis_client = get_redis_connection()

# A basic class to create a message as a dict for chat
class Message:

    def __init__(self, role, content):
        self.role = role
        self.content = content

    def message(self):
        return {"role": self.role, "content": self.content}

# New Assistant class to add a vector database call to its responses
class RetrievalAssistant:

    def __init__(self):
        self.conversation_history = []

    def _get_assistant_response(self, prompt):
        try:
            completion = openai.ChatCompletion.create(
                model=CHAT_MODEL,
                messages=prompt,
                temperature=0.1
            )

            response_message = Message(
                completion['choices'][0]['message']['role'],
                completion['choices'][0]['message']['content']
            )
            return response_message.message()

        except Exception as e:
            # Return a message dict so callers can still index ['content']
            return Message('assistant', f'Request failed with exception {e}').message()

    # The function to retrieve Redis search results
    def _get_search_results(self, prompt):
        latest_question = prompt
        search_content = get_redis_results(redis_client, latest_question, INDEX_NAME)['result'][0]
        return search_content

    def ask_assistant(self, next_user_prompt):
        self.conversation_history.extend(next_user_prompt)
        assistant_response = self._get_assistant_response(self.conversation_history)

        # Answer normally unless the trigger sequence "searching for answers" is used
        if 'searching for answers' in assistant_response['content'].lower():
            question_extract = openai.Completion.create(
                model=COMPLETIONS_MODEL,
                prompt=f"Extract the user's latest question and the year for that question from this conversation: {self.conversation_history}. Extract it as a sentence stating the Question and Year"
            )
            search_result = self._get_search_results(question_extract['choices'][0]['text'])

            # We insert an extra system prompt here to give fresh context to the Chatbot on how to use the Redis results.
            # In this instance we add it to the conversation history, but in production it may be better to hide it.
            self.conversation_history.insert(-1, {"role": 'system', "content": f"Answer the user's question using this content: {search_result}. If you cannot answer the question, say 'Sorry, I don't know the answer to this one'"})

            assistant_response = self._get_assistant_response(self.conversation_history)

        self.conversation_history.append(assistant_response)
        return assistant_response

    def pretty_print_conversation_history(self, colorize_assistant_replies=True):
        for entry in self.conversation_history:
            if entry['role'] == 'system':
                continue
            prefix = entry['role']
            content = entry['content']
            output = colored(prefix + ':\n' + content, 'green') if colorize_assistant_replies and entry['role'] == 'assistant' else prefix + ':\n' + content
            print(output)
```
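The trigger flow in `ask_assistant` can be illustrated in isolation. The sketch below uses a canned search result as a stand-in for the real `get_redis_results` call:

```python
# Standalone illustration of ask_assistant's trigger logic: when the model
# says "searching for answers", retrieval context is spliced in just before
# the assistant's reply, mirroring conversation_history.insert(-1, ...)
conversation = [
    {"role": "user", "content": "I'd like to know the cost cap for a power unit in 2023"},
    {"role": "assistant", "content": "Searching for answers."},
]

if 'searching for answers' in conversation[-1]['content'].lower():
    # Hypothetical stand-in for the Redis search result
    search_result = "The 2023 power unit cost cap is ..."
    conversation.insert(-1, {
        "role": "system",
        "content": f"Answer the user's question using this content: {search_result}"
    })
```

After the insert, the conversation reads user, then system context, then the assistant turn; in the real class a second `ChatCompletion` call then answers using that context.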
config.py (new file, +8 lines):
```python
import os

COMPLETIONS_MODEL = "text-davinci-003"
EMBEDDINGS_MODEL = "text-embedding-ada-002"
CHAT_MODEL = 'gpt-3.5-turbo'
TEXT_EMBEDDING_CHUNK_SIZE = 300
VECTOR_FIELD_NAME = 'content_vector'
PREFIX = "moodfitdocs"
INDEX_NAME = "moodfit-index"
# Never commit a real API key; read it from the environment instead
# (the original commit hard-coded a key here, which should be revoked)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
```
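Rather than hard-coding `OPENAI_API_KEY` in `config.py` (a key committed to a repo should be treated as compromised and rotated), a small startup check can pull it from the environment and fail fast. This helper is hypothetical, not part of the repo:

```python
import os

def require_api_key() -> str:
    """Read OPENAI_API_KEY from the environment, failing fast if absent.

    Raising at startup beats letting the first OpenAI call error out
    mid-conversation in the Streamlit apps.
    """
    key = os.environ.get("OPENAI_API_KEY", "")
    if not key:
        raise RuntimeError(
            "OPENAI_API_KEY is not set; export it before running the apps"
        )
    return key
```

You would export the key in your shell (`export OPENAI_API_KEY=...`) before launching Streamlit.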
Binary file added (+972 KB): data/FIA Practice Directions - Competitor's Staff Registration System.pdf