Commit d47bf3a
Merge branch 'main' into workflow
nsavinda authored Jun 13, 2024
2 parents a8320f5 + 7652bfb commit d47bf3a
Showing 9 changed files with 226 additions and 70 deletions.
17 changes: 17 additions & 0 deletions README.md
@@ -0,0 +1,17 @@
# Testify - AI Assistant

## Setup
1. Clone the repository
2. Install the required packages using `pip install -r requirements.txt`

```bash
pip install -r requirements.txt
```

3. Run the app using `uvicorn app.main:app --reload --port 7401`

```bash
uvicorn app.main:app --reload --port 7401
```

4. Open the interactive API documentation at `http://localhost:7401/docs`
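
Once the server is running, the two routes added in this commit can be exercised from the command line. A minimal sketch (the `/api/v1` prefix comes from `app/__init__.py`; `exam-123` and `sample.pdf` are placeholder values):

```bash
# Upload a PDF for an exam (multipart form field "file", query parameter "examid")
curl -X POST "http://localhost:7401/api/v1/upload-pdf/?examid=exam-123" \
  -F "file=@sample.pdf;type=application/pdf"

# Generate a question from a piece of text for the same exam
curl -G "http://localhost:7401/api/v1/generate-question/" \
  --data-urlencode "text=Photosynthesis converts light energy into chemical energy." \
  --data-urlencode "examid=exam-123"
```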
11 changes: 4 additions & 7 deletions app/__init__.py
@@ -1,14 +1,11 @@
 # app/__init__.py
+from .main import app
 
-from fastapi import FastAPI
 from .routers.upload import router as upload_router
 from .routers.questionGenerate import router as questionGenerate_router
 
 
-from .main import app
-
+# Include routers with appropriate API version prefix
 app.include_router(upload_router, prefix="/api/v1")
 app.include_router(questionGenerate_router, prefix="/api/v1")
-
-
-# app/routers/upload.py
46 changes: 46 additions & 0 deletions app/data/questionPrompts.py
@@ -0,0 +1,46 @@
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.output_parsers import JsonOutputParser


# Define a Pydantic model for a standard question and answer format.
class QuestionParser(BaseModel):
    question: str = Field(description="The question generated from the text.")
    answer: str = Field(description="The answer to the generated question.")

# Define a Pydantic model for multiple-choice questions.
class Answer(BaseModel):
    char: str = Field(description="The character representing the answer, e.g., 'A', 'B', 'C', 'D'.")
    text: str = Field(description="The text of the answer.")

class MultipleChoiceQuestionParser(BaseModel):
    question: str = Field(description="The multiple choice question generated from the text.")
    options: list[Answer] = Field(description="The options for the multiple choice question; should be a list of Answer objects.")
    answer: str = Field(description="The character representing the correct answer, e.g., 'A', 'B', 'C', 'D'.")

# Function to generate a prompt and corresponding parser for creating multiple-choice questions.
def mcq_prompt(options: int) -> tuple[str, JsonOutputParser]:
    """
    Generates a prompt for creating multiple-choice questions along with a JSON output parser.
    Args:
        options (int): The number of options for the multiple-choice question.
    Returns:
        tuple[str, JsonOutputParser]: A tuple containing the prompt and the JSON output parser.
    """
    prompt_text = f"Generate a multiple choice question with {options} options and indicate the correct answer."
    parser = JsonOutputParser(pydantic_object=MultipleChoiceQuestionParser)
    return (prompt_text, parser)

# Function to generate a prompt and corresponding parser for creating essay-type questions.
def essay_prompt() -> tuple[str, JsonOutputParser]:
    """
    Generates a prompt for creating essay questions along with a JSON output parser.
    Returns:
        tuple[str, JsonOutputParser]: A tuple containing the prompt and the JSON output parser.
    """
    prompt_text = "Generate an essay question."
    parser = JsonOutputParser(pydantic_object=QuestionParser)
    return (prompt_text, parser)
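
As a rough usage sketch (not part of the diff): the returned parser both supplies formatting instructions for the LLM prompt and parses the model's JSON reply into a plain dict. Assuming the module is importable as `app.data.questionPrompts`:

```python
from app.data.questionPrompts import mcq_prompt

prompt_text, parser = mcq_prompt(4)
print(prompt_text)  # "Generate a multiple choice question with 4 options and indicate the correct answer."

# Instructions describing the expected JSON schema, derived from MultipleChoiceQuestionParser:
print(parser.get_format_instructions())

# Parsing a raw model reply (hypothetical JSON) yields a dict, not a Pydantic object:
raw = '{"question": "What is 2 + 2?", "options": [{"char": "A", "text": "3"}, {"char": "B", "text": "4"}], "answer": "B"}'
print(parser.parse(raw)["answer"])  # "B"
```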
12 changes: 7 additions & 5 deletions app/main.py
@@ -1,12 +1,14 @@
 from fastapi import FastAPI
 
-# Initialize the FastAPI app with a custom title
+# Create an instance of the FastAPI application with a custom title
 app = FastAPI(title="Testify AI")
 
-@app.get("/", response_model=dict)
+@app.get("/api/assistant", response_model=dict)
 async def read_root() -> dict:
     """
-    Root GET endpoint to return a simple greeting.
-    Returns a JSON object with a greeting message.
+    Root GET endpoint that provides a simple greeting message.
+    Returns:
+        dict: A dictionary containing a greeting message.
     """
-    return {"Hello": "World"}
+    return {"message": "Welcome to the Testify AI Assistant!"}
18 changes: 10 additions & 8 deletions app/routers/questionGenerate.py
@@ -1,14 +1,16 @@
-from fastapi import APIRouter, Query, HTTPException
-from typing import List
-
+from fastapi import APIRouter, HTTPException, Query
 from ..services.prompt import prompt
 
 
 router = APIRouter()
 
-@router.get("/generate-question/", response_model=str)
+@router.get("/generate-question/", response_model=dict)
 async def generate_question(text: str = Query(..., description="The text to generate a question for"),
-                            examid: str = Query(..., description="The ID of the exam related to the text")) -> str:
+                            examid: str = Query(..., description="The ID of the exam related to the text")) -> dict:
     """Endpoint to generate a question for a given text using OpenAI's model."""
 
-    return prompt(text, examid)
+    try:
+        # Assuming the 'prompt' function is synchronous; if it is async, use 'await prompt(text, examid)'
+        question_response = prompt(text, examid)
+        return question_response
+    except Exception as e:
+        # Catching a broad exception is not best practice; adjust according to specific exceptions expected from 'prompt'
+        raise HTTPException(status_code=500, detail=f"An error occurred while generating the question: {str(e)}")
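
If `prompt` stays synchronous (it calls blocking LangChain and Pinecone clients), running it on FastAPI's threadpool keeps the event loop responsive. A minimal sketch of that variant, assuming no other changes to the route:

```python
from fastapi import APIRouter, HTTPException, Query
from fastapi.concurrency import run_in_threadpool

from ..services.prompt import prompt

router = APIRouter()

@router.get("/generate-question/", response_model=dict)
async def generate_question(
    text: str = Query(..., description="The text to generate a question for"),
    examid: str = Query(..., description="The ID of the exam related to the text"),
) -> dict:
    try:
        # Run the blocking call on the threadpool so the event loop is not blocked
        return await run_in_threadpool(prompt, text, examid)
    except ValueError as e:
        # e.g. select_prompt raises ValueError for an unknown question type
        raise HTTPException(status_code=400, detail=str(e))
```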
14 changes: 8 additions & 6 deletions app/routers/upload.py
@@ -6,15 +6,17 @@
 router = APIRouter()
 
 @router.post("/upload-pdf/", status_code=201)
-async def upload_pdf(file: UploadFile = File(...), examid:str = Query(..., description="The ID of the exam related to the uploaded PDF") ) -> dict:
+async def upload_pdf(file: UploadFile = File(...), examid: str = Query(..., description="The ID of the exam related to the uploaded PDF")) -> dict:
     """Endpoint to upload a PDF and upsert its contents into a Pinecone vector store."""
 
     if file.content_type != 'application/pdf':
         raise HTTPException(status_code=415, detail="Unsupported file type. Please upload a PDF.")
 
-    # Call the upsert function from the imported service
-    upsert(file, examid)
+    # Assuming 'upsert' is synchronous; if it is an async function, this call should be awaited
+    success = upsert(file, examid)
+
+    if not success:
+        raise HTTPException(status_code=500, detail="Failed to process the PDF file.")
 
-    # return {"filename": file.filename}
-    Response(status_code=201)
+    # Return a message directly; a bare 'Response(status_code=201)' is redundant with status_code=201 in the decorator
+    return {"message": "PDF uploaded successfully."}
21 changes: 9 additions & 12 deletions app/services/pinecone_upsert.py
@@ -1,19 +1,15 @@
-from typing import Any, BinaryIO
+from typing import BinaryIO
 import os
 import dotenv
 import pdfplumber
 from langchain_openai import OpenAIEmbeddings
 from langchain_pinecone import PineconeVectorStore
 from pinecone import Pinecone, ServerlessSpec
 
 dotenv.load_dotenv()
 
 pinecone = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
 
-def generate_embeddings_from_pdf(pdf_file: BinaryIO) -> str:
-    """Generates embeddings from a PDF file using OpenAI's model."""
-
-    print(pdf_file.filename)
+def generate_text_from_pdf(pdf_file: BinaryIO) -> str:
+    """Generates and returns text extracted from a PDF file."""
+    print(f"Processing file: {pdf_file.filename}")
     full_text = ""
     with pdfplumber.open(pdf_file.file) as pdf:
         for page in pdf.pages:
@@ -23,13 +19,11 @@ def generate_embeddings_from_pdf(pdf_file: BinaryIO) -> str:
 
     if not full_text.strip():
         raise ValueError("No text found in the PDF.")
-
     return full_text
 
 def upsert(pdf_file: BinaryIO, examid: str) -> str:
-    """Upserts PDF text into a Pinecone vector store and returns the extracted text."""
-
-    text = generate_embeddings_from_pdf(pdf_file)
+    """Extracts text from a PDF file, generates embeddings, and upserts them into a Pinecone vector store."""
+    text = generate_text_from_pdf(pdf_file)
 
     embeddings = OpenAIEmbeddings(
         model="text-embedding-3-large",
@@ -44,4 +38,7 @@
         index_name="abc"
     )
 
    return text
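
One detail worth flagging: the function reads `pdf_file.filename` and `pdf_file.file`, which are attributes of FastAPI's `UploadFile`, not of a plain `BinaryIO`. A sketch of a more accurate signature (the loop body is assumed, since the middle of the hunk is collapsed above):

```python
from fastapi import UploadFile
import pdfplumber

def generate_text_from_pdf(pdf_file: UploadFile) -> str:
    """Extract and return the text of an uploaded PDF."""
    print(f"Processing file: {pdf_file.filename}")  # UploadFile exposes .filename
    full_text = ""
    # UploadFile.file is the underlying file object that pdfplumber can read
    with pdfplumber.open(pdf_file.file) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()  # assumed body; the original lines are collapsed in the diff
            if page_text:
                full_text += page_text + "\n"
    if not full_text.strip():
        raise ValueError("No text found in the PDF.")
    return full_text
```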
76 changes: 44 additions & 32 deletions app/services/prompt.py
@@ -1,26 +1,43 @@
-from typing import Any, BinaryIO
 import os
 import dotenv
-import pdfplumber
-from langchain_openai import OpenAIEmbeddings
+import json
+from langchain_openai import OpenAIEmbeddings, ChatOpenAI
 from langchain_pinecone import PineconeVectorStore
-from pinecone import Pinecone, ServerlessSpec
-
-from langchain.chat_models import ChatOpenAI
-from langchain.chains import RetrievalQA
-
+from pinecone import Pinecone
+from langchain_core.pydantic_v1 import BaseModel, Field
+from langchain_core.output_parsers import JsonOutputParser
+from langchain.prompts import PromptTemplate
+
+from ..data.questionPrompts import mcq_prompt, essay_prompt
 
 dotenv.load_dotenv()
 
 pinecone = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
+
+class QuestionParser(BaseModel):
+    question: str = Field(description="The question generated from the text.")
+    answer: str = Field(description="The answer to the generated question.")
+
+class MultipleChoiceQuestionParser(BaseModel):
+    question: str = Field(description="The multiple choice question generated from the text.")
+    options: list[str] = Field(description="The options for the multiple choice question.")
+    answer: int = Field(description="The index of the correct answer in the options list.")
+
+def format_docs(docs):
+    """Helper function to format document content."""
+    return "\n\n".join([d.page_content for d in docs])
+
+def select_prompt(question_type: str) -> tuple[str, JsonOutputParser]:
+    """Selects the appropriate prompt and parser based on the question type."""
+    if question_type == "mcq":
+        return mcq_prompt(4)  # Returns a tuple (prompt, parser)
+    elif question_type == "essay":
+        return essay_prompt()  # Returns a tuple (prompt, parser)
+    else:
+        raise ValueError("Invalid question type. Please select 'mcq' or 'essay'.")
+
+def prompt(text: str, examid: str, question_type: str = "mcq") -> dict:
+    """Generates a question based on the provided text and exam ID."""
+    question, parser = select_prompt(question_type)
 
-def prompt(text: str, examid: str) -> str:
-    """Upserts PDF text into a Pinecone vector store and returns the extracted text."""
-
     embed = OpenAIEmbeddings(
         model="text-embedding-3-large",
         api_key=os.getenv('OPENAI_API_KEY'),
@@ -31,34 +48,29 @@ def prompt(text: str, examid: str) -> str:
         namespace=examid,
         index_name="abc",
         embedding=embed
-
     )
 
-    vectorstore.similarity_search(
-        text,
-        # top_k=5
-    )
+    docs = vectorstore.similarity_search(text)  # Returns the documents most relevant to 'text'
 
     llm = ChatOpenAI(
-        model="gpt-3.5-turbo",
-        api_key=os.getenv('OPENAI_API_KEY')
+        model="gpt-4o",
+        api_key=os.getenv('OPENAI_API_KEY'),
+        model_kwargs={"response_format": {"type": "json_object"}}
    )
 
-    qa = RetrievalQA.from_chain_type(
-        llm=llm,
-        chain_type="stuff",
-        retriever=vectorstore.as_retriever()
-    )
+    prompt_template = PromptTemplate(
+        template="Generate one question, {question} about {query} from {document}. Output is only json format.",
+        input_variables=["query", "document", "question"],
+        partial_variables={"format_instructions": parser.get_format_instructions()},
+    )
 
-    print(qa.invoke(text))
-    return "Question generated successfully."
+    chain = prompt_template | llm | parser
 
+    formatted_docs = format_docs(docs)
+    result = chain.invoke({"query": text, "document": formatted_docs, "question": question})
 
-    return result
+    return json.dumps(result)  # Converting the result to a JSON string for consistency
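
Two things stand out in the new chain. First, `partial_variables={"format_instructions": ...}` has no effect because the template string never references `{format_instructions}`, so the parser's schema never reaches the model. Second, `JsonOutputParser` already yields a Python dict, so `json.dumps(result)` hands the router a `str` even though the endpoint declares `response_model=dict`. A minimal self-contained sketch of the same `prompt | llm | parser` pattern with both points addressed (model name and topic are placeholders):

```python
import os
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser

llm = ChatOpenAI(
    model="gpt-4o",
    api_key=os.getenv("OPENAI_API_KEY"),
    model_kwargs={"response_format": {"type": "json_object"}},
)

prompt_template = PromptTemplate(
    # {format_instructions} is referenced here, so the parser's instructions reach the model
    template="Generate one question about {query}.\n{format_instructions}",
    input_variables=["query"],
    partial_variables={"format_instructions": JsonOutputParser().get_format_instructions()},
)

chain = prompt_template | llm | JsonOutputParser()

result = chain.invoke({"query": "photosynthesis"})
print(type(result))  # <class 'dict'>; already a dict, no json.dumps needed
```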

81 changes: 81 additions & 0 deletions requirements.txt
@@ -0,0 +1,81 @@
aiohttp==3.9.5
aiosignal==1.3.1
annotated-types==0.7.0
anyio==4.4.0
attrs==23.2.0
beautifulsoup4==4.12.3
certifi==2024.6.2
cffi==1.16.0
charset-normalizer==3.3.2
click==8.1.7
cryptography==42.0.8
dataclasses-json==0.6.6
distro==1.9.0
dnspython==2.6.1
email_validator==2.1.1
fastapi==0.111.0
fastapi-cli==0.0.4
frozenlist==1.4.1
google==3.0.0
greenlet==3.0.3
h11==0.14.0
httpcore==1.0.5
httptools==0.6.1
httpx==0.27.0
idna==3.7
Jinja2==3.1.4
jsonpatch==1.33
jsonpointer==2.4
langchain==0.2.3
langchain-community==0.2.4
langchain-core==0.2.5
langchain-openai==0.1.8
langchain-pinecone==0.1.1
langchain-text-splitters==0.2.1
langsmith==0.1.75
markdown-it-py==3.0.0
MarkupSafe==2.1.5
marshmallow==3.21.3
mdurl==0.1.2
multidict==6.0.5
mypy-extensions==1.0.0
numpy==1.26.4
openai==1.32.0
orjson==3.10.3
packaging==23.2
pdfminer.six==20231228
pdfplumber==0.11.0
pillow==10.3.0
pinecone-client==3.2.2
pinecone-plugin-interface==0.0.7
protobuf==5.27.1
protobuf-to-dict==0.1.0
pycparser==2.22
pydantic==2.7.3
pydantic_core==2.18.4
Pygments==2.18.0
pypdfium2==4.30.0
python-dotenv==1.0.1
python-multipart==0.0.9
PyYAML==6.0.1
regex==2024.5.15
requests==2.32.3
rich==13.7.1
shellingham==1.5.4
sniffio==1.3.1
soupsieve==2.5
SQLAlchemy==2.0.30
starlette==0.37.2
tenacity==8.3.0
tiktoken==0.7.0
tqdm==4.66.4
typer==0.12.3
typing-inspect==0.9.0
typing_extensions==4.12.1
ujson==5.10.0
urllib3==2.2.1
uvicorn==0.30.1
uvloop==0.19.0
watchfiles==0.22.0
websockets==12.0
yarl==1.9.4
