Commit d47bf3a
Merge branch 'main' into workflow
nsavinda authored Jun 13, 2024
2 parents a8320f5 + 7652bfb commit d47bf3a
Showing 9 changed files with 226 additions and 70 deletions.
17 changes: 17 additions & 0 deletions README.md
@@ -0,0 +1,17 @@
# Testify - AI Assistant

## Setup
1. Clone the repository
2. Install the required packages using `pip install -r requirements.txt`

```bash
pip install -r requirements.txt
```

3. Run the app using `uvicorn app.main:app --reload --port 7401`

```bash
uvicorn app.main:app --reload --port 7401
```

4. Open the interactive API documentation at `http://localhost:7401/docs`
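
Once the server is running, the two routes added in this commit can be exercised from the command line. A minimal sketch (the `/api/v1` prefix comes from `app/__init__.py`; `exam-123` and `sample.pdf` are placeholder values):

```bash
# Upload a PDF for an exam (multipart form field "file", query parameter "examid")
curl -X POST "http://localhost:7401/api/v1/upload-pdf/?examid=exam-123" \
  -F "file=@sample.pdf;type=application/pdf"

# Generate a question from a piece of text for the same exam
curl -G "http://localhost:7401/api/v1/generate-question/" \
  --data-urlencode "text=Photosynthesis converts light energy into chemical energy." \
  --data-urlencode "examid=exam-123"
```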
11 changes: 4 additions & 7 deletions app/__init__.py
@@ -1,14 +1,11 @@
 # app/__init__.py
+from .main import app
 
-from fastapi import FastAPI
 from .routers.upload import router as upload_router
 from .routers.questionGenerate import router as questionGenerate_router
 
 
-from .main import app
-
+# Include routers with appropriate API version prefix
 app.include_router(upload_router, prefix="/api/v1")
 app.include_router(questionGenerate_router, prefix="/api/v1")
-
-
-# app/routers/upload.py
46 changes: 46 additions & 0 deletions app/data/questionPrompts.py
@@ -0,0 +1,46 @@
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.output_parsers import JsonOutputParser


# Define a Pydantic model for a standard question and answer format.
class QuestionParser(BaseModel):
    question: str = Field(description="The question generated from the text.")
    answer: str = Field(description="The answer to the generated question.")

# Define a Pydantic model for multiple-choice questions.
class Answer(BaseModel):
    char: str = Field(description="The character representing the answer, e.g., 'A', 'B', 'C', 'D'.")
    text: str = Field(description="The text of the answer.")

class MultipleChoiceQuestionParser(BaseModel):
    question: str = Field(description="The multiple choice question generated from the text.")
    options: list[Answer] = Field(description="The options for the multiple choice question; should be a list of Answer objects.")
    answer: str = Field(description="The character representing the correct answer, e.g., 'A', 'B', 'C', 'D'.")

# Function to generate a prompt and corresponding parser for creating multiple-choice questions.
def mcq_prompt(options: int) -> tuple[str, JsonOutputParser]:
    """
    Generates a prompt for creating multiple-choice questions along with a JSON output parser.
    Args:
        options (int): The number of options for the multiple-choice question.
    Returns:
        tuple[str, JsonOutputParser]: A tuple containing the prompt and the JSON output parser.
    """
    prompt_text = f"Generate a multiple choice question with {options} options and indicate the correct answer."
    parser = JsonOutputParser(pydantic_object=MultipleChoiceQuestionParser)
    return (prompt_text, parser)

# Function to generate a prompt and corresponding parser for creating essay-type questions.
def essay_prompt() -> tuple[str, JsonOutputParser]:
    """
    Generates a prompt for creating essay questions along with a JSON output parser.
    Returns:
        tuple[str, JsonOutputParser]: A tuple containing the prompt and the JSON output parser.
    """
    prompt_text = "Generate an essay question."
    parser = JsonOutputParser(pydantic_object=QuestionParser)
    return (prompt_text, parser)
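
As a rough usage sketch (not part of the diff): the returned parser both supplies formatting instructions for the LLM prompt and parses the model's JSON reply into a plain dict. Assuming the module is importable as `app.data.questionPrompts`:

```python
from app.data.questionPrompts import mcq_prompt

prompt_text, parser = mcq_prompt(4)
print(prompt_text)  # "Generate a multiple choice question with 4 options and indicate the correct answer."

# Instructions describing the expected JSON schema, derived from MultipleChoiceQuestionParser:
print(parser.get_format_instructions())

# Parsing a raw model reply (hypothetical JSON) yields a dict, not a Pydantic object:
raw = '{"question": "What is 2 + 2?", "options": [{"char": "A", "text": "3"}, {"char": "B", "text": "4"}], "answer": "B"}'
print(parser.parse(raw)["answer"])  # "B"
```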
12 changes: 7 additions & 5 deletions app/main.py
@@ -1,12 +1,14 @@
 from fastapi import FastAPI
 
-# Initialize the FastAPI app with a custom title
+# Create an instance of the FastAPI application with a custom title
 app = FastAPI(title="Testify AI")
 
-@app.get("/", response_model=dict)
+@app.get("/api/assistant", response_model=dict)
 async def read_root() -> dict:
     """
-    Root GET endpoint to return a simple greeting.
-    Returns a JSON object with a greeting message.
+    Root GET endpoint that provides a simple greeting message.
+    Returns:
+        dict: A dictionary containing a greeting message.
     """
-    return {"Hello": "World"}
+    return {"message": "Welcome to the Testify AI Assistant!"}
18 changes: 10 additions & 8 deletions app/routers/questionGenerate.py
@@ -1,14 +1,16 @@
-from fastapi import APIRouter, Query, HTTPException
-from typing import List
-
+from fastapi import APIRouter, HTTPException, Query
 from ..services.prompt import prompt
 
 
 router = APIRouter()
 
-@router.get("/generate-question/", response_model=str)
+@router.get("/generate-question/", response_model=dict)
 async def generate_question(text: str = Query(..., description="The text to generate a question for"),
-                            examid: str = Query(..., description="The ID of the exam related to the text")) -> str:
+                            examid: str = Query(..., description="The ID of the exam related to the text")) -> dict:
     """Endpoint to generate a question for a given text using OpenAI's model."""
 
-    return prompt(text, examid)
+    try:
+        # Assuming the 'prompt' function is synchronous; if it is async, use 'await prompt(text, examid)'
+        question_response = prompt(text, examid)
+        return question_response
+    except Exception as e:
+        # Catching a broad exception is not best practice; adjust according to specific exceptions expected from 'prompt'
+        raise HTTPException(status_code=500, detail=f"An error occurred while generating the question: {str(e)}")
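
If `prompt` stays synchronous (it calls blocking LangChain and Pinecone clients), running it on FastAPI's threadpool keeps the event loop responsive. A minimal sketch of that variant, assuming no other changes to the route:

```python
from fastapi import APIRouter, HTTPException, Query
from fastapi.concurrency import run_in_threadpool

from ..services.prompt import prompt

router = APIRouter()

@router.get("/generate-question/", response_model=dict)
async def generate_question(
    text: str = Query(..., description="The text to generate a question for"),
    examid: str = Query(..., description="The ID of the exam related to the text"),
) -> dict:
    try:
        # Run the blocking call on the threadpool so the event loop is not blocked
        return await run_in_threadpool(prompt, text, examid)
    except ValueError as e:
        # e.g. select_prompt raises ValueError for an unknown question type
        raise HTTPException(status_code=400, detail=str(e))
```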
14 changes: 8 additions & 6 deletions app/routers/upload.py
@@ -6,15 +6,17 @@
 router = APIRouter()
 
 @router.post("/upload-pdf/", status_code=201)
-async def upload_pdf(file: UploadFile = File(...), examid:str = Query(..., description="The ID of the exam related to the uploaded PDF") ) -> dict:
+async def upload_pdf(file: UploadFile = File(...), examid: str = Query(..., description="The ID of the exam related to the uploaded PDF")) -> dict:
     """Endpoint to upload a PDF and upsert its contents into a Pinecone vector store."""
 
     if file.content_type != 'application/pdf':
         raise HTTPException(status_code=415, detail="Unsupported file type. Please upload a PDF.")
 
-    # Call the upsert function from the imported service
-    upsert(file, examid)
+    # Assuming 'upsert' is synchronous; if it is an async function, this call should be awaited
+    success = upsert(file, examid)
+
+    if not success:
+        raise HTTPException(status_code=500, detail="Failed to process the PDF file.")
 
-    # return {"filename": file.filename}
-    Response(status_code=201)
+    # Return a message directly; a bare 'Response(status_code=201)' is redundant with status_code=201 in the decorator
+    return {"message": "PDF uploaded successfully."}
21 changes: 9 additions & 12 deletions app/services/pinecone_upsert.py
@@ -1,19 +1,15 @@
-from typing import Any, BinaryIO
+from typing import BinaryIO
 import os
 import dotenv
 import pdfplumber
 from langchain_openai import OpenAIEmbeddings
 from langchain_pinecone import PineconeVectorStore
 from pinecone import Pinecone, ServerlessSpec
 
 dotenv.load_dotenv()
 
 pinecone = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
 
-def generate_embeddings_from_pdf(pdf_file: BinaryIO) -> str:
-    """Generates embeddings from a PDF file using OpenAI's model."""
-
-    print(pdf_file.filename)
+def generate_text_from_pdf(pdf_file: BinaryIO) -> str:
+    """Generates and returns text extracted from a PDF file."""
+    print(f"Processing file: {pdf_file.filename}")
     full_text = ""
     with pdfplumber.open(pdf_file.file) as pdf:
         for page in pdf.pages:
@@ -23,13 +19,11 @@ def generate_embeddings_from_pdf(pdf_file: BinaryIO) -> str:
 
     if not full_text.strip():
         raise ValueError("No text found in the PDF.")
-
     return full_text
 
 def upsert(pdf_file: BinaryIO, examid: str) -> str:
-    """Upserts PDF text into a Pinecone vector store and returns the extracted text."""
-
-    text = generate_embeddings_from_pdf(pdf_file)
+    """Extracts text from a PDF file, generates embeddings, and upserts them into a Pinecone vector store."""
+    text = generate_text_from_pdf(pdf_file)
 
     embeddings = OpenAIEmbeddings(
         model="text-embedding-3-large",
@@ -44,4 +38,7 @@
         index_name="abc"
     )
 
    return text
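
One detail worth flagging: the function reads `pdf_file.filename` and `pdf_file.file`, which are attributes of FastAPI's `UploadFile`, not of a plain `BinaryIO`. A sketch of a more accurate signature (the loop body is assumed, since the middle of the hunk is collapsed above):

```python
from fastapi import UploadFile
import pdfplumber

def generate_text_from_pdf(pdf_file: UploadFile) -> str:
    """Extract and return the text of an uploaded PDF."""
    print(f"Processing file: {pdf_file.filename}")  # UploadFile exposes .filename
    full_text = ""
    # UploadFile.file is the underlying file object that pdfplumber can read
    with pdfplumber.open(pdf_file.file) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()  # assumed body; the original lines are collapsed in the diff
            if page_text:
                full_text += page_text + "\n"
    if not full_text.strip():
        raise ValueError("No text found in the PDF.")
    return full_text
```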
76 changes: 44 additions & 32 deletions app/services/prompt.py
@@ -1,26 +1,43 @@
-from typing import Any, BinaryIO
 import os
 import dotenv
-import pdfplumber
-from langchain_openai import OpenAIEmbeddings
+import json
+from langchain_openai import OpenAIEmbeddings, ChatOpenAI
 from langchain_pinecone import PineconeVectorStore
-from pinecone import Pinecone, ServerlessSpec
-
-from langchain.chat_models import ChatOpenAI
-from langchain.chains import RetrievalQA
-
+from pinecone import Pinecone
+from langchain_core.pydantic_v1 import BaseModel, Field
+from langchain_core.output_parsers import JsonOutputParser
+from langchain.prompts import PromptTemplate
+
+from ..data.questionPrompts import mcq_prompt, essay_prompt
 
 dotenv.load_dotenv()
 
 pinecone = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
+
+class QuestionParser(BaseModel):
+    question: str = Field(description="The question generated from the text.")
+    answer: str = Field(description="The answer to the generated question.")
+
+class MultipleChoiceQuestionParser(BaseModel):
+    question: str = Field(description="The multiple choice question generated from the text.")
+    options: list[str] = Field(description="The options for the multiple choice question.")
+    answer: int = Field(description="The index of the correct answer in the options list.")
+
+def format_docs(docs):
+    """Helper function to format document content."""
+    return "\n\n".join([d.page_content for d in docs])
+
+def select_prompt(question_type: str) -> tuple[str, JsonOutputParser]:
+    """Selects the appropriate prompt and parser based on the question type."""
+    if question_type == "mcq":
+        return mcq_prompt(4)  # Returns a tuple (prompt, parser)
+    elif question_type == "essay":
+        return essay_prompt()  # Returns a tuple (prompt, parser)
+    else:
+        raise ValueError("Invalid question type. Please select 'mcq' or 'essay'.")
+
+def prompt(text: str, examid: str, question_type: str = "mcq") -> dict:
+    """Generates a question based on the provided text and exam ID."""
+    question, parser = select_prompt(question_type)
 
-def prompt(text: str, examid: str) -> str:
-    """Upserts PDF text into a Pinecone vector store and returns the extracted text."""
-
     embed = OpenAIEmbeddings(
         model="text-embedding-3-large",
         api_key=os.getenv('OPENAI_API_KEY'),
@@ -31,34 +48,29 @@ def prompt(text: str, examid: str) -> str:
         namespace=examid,
         index_name="abc",
         embedding=embed
-
     )
 
-    vectorstore.similarity_search(
-        text,
-        # top_k=5
-    )
+    docs = vectorstore.similarity_search(text)  # Returns the documents most relevant to 'text'
 
     llm = ChatOpenAI(
-        model="gpt-3.5-turbo",
-        api_key=os.getenv('OPENAI_API_KEY')
+        model="gpt-4o",
+        api_key=os.getenv('OPENAI_API_KEY'),
+        model_kwargs={"response_format": {"type": "json_object"}}
    )
 
-    qa = RetrievalQA.from_chain_type(
-        llm=llm,
-        chain_type="stuff",
-        retriever=vectorstore.as_retriever()
-    )
+    prompt_template = PromptTemplate(
+        template="Generate one question, {question} about {query} from {document}. Output is only json format.",
+        input_variables=["query", "document", "question"],
+        partial_variables={"format_instructions": parser.get_format_instructions()},
+    )
 
-    print(qa.invoke(text))
-    return "Question generated successfully."
+    chain = prompt_template | llm | parser
 
+    formatted_docs = format_docs(docs)
+    result = chain.invoke({"query": text, "document": formatted_docs, "question": question})
 
-    return result
+    return json.dumps(result)  # Converting the result to a JSON string for consistency
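
Two things stand out in the new chain. First, `partial_variables={"format_instructions": ...}` has no effect because the template string never references `{format_instructions}`, so the parser's schema never reaches the model. Second, `JsonOutputParser` already yields a Python dict, so `json.dumps(result)` hands the router a `str` even though the endpoint declares `response_model=dict`. A minimal self-contained sketch of the same `prompt | llm | parser` pattern with both points addressed (model name and topic are placeholders):

```python
import os
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser

llm = ChatOpenAI(
    model="gpt-4o",
    api_key=os.getenv("OPENAI_API_KEY"),
    model_kwargs={"response_format": {"type": "json_object"}},
)

prompt_template = PromptTemplate(
    # {format_instructions} is referenced here, so the parser's instructions reach the model
    template="Generate one question about {query}.\n{format_instructions}",
    input_variables=["query"],
    partial_variables={"format_instructions": JsonOutputParser().get_format_instructions()},
)

chain = prompt_template | llm | JsonOutputParser()

result = chain.invoke({"query": "photosynthesis"})
print(type(result))  # <class 'dict'>; already a dict, no json.dumps needed
```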

81 changes: 81 additions & 0 deletions requirements.txt
@@ -0,0 +1,81 @@
aiohttp==3.9.5
aiosignal==1.3.1
annotated-types==0.7.0
anyio==4.4.0
attrs==23.2.0
beautifulsoup4==4.12.3
certifi==2024.6.2
cffi==1.16.0
charset-normalizer==3.3.2
click==8.1.7
cryptography==42.0.8
dataclasses-json==0.6.6
distro==1.9.0
dnspython==2.6.1
email_validator==2.1.1
fastapi==0.111.0
fastapi-cli==0.0.4
frozenlist==1.4.1
google==3.0.0
greenlet==3.0.3
h11==0.14.0
httpcore==1.0.5
httptools==0.6.1
httpx==0.27.0
idna==3.7
Jinja2==3.1.4
jsonpatch==1.33
jsonpointer==2.4
langchain==0.2.3
langchain-community==0.2.4
langchain-core==0.2.5
langchain-openai==0.1.8
langchain-pinecone==0.1.1
langchain-text-splitters==0.2.1
langsmith==0.1.75
markdown-it-py==3.0.0
MarkupSafe==2.1.5
marshmallow==3.21.3
mdurl==0.1.2
multidict==6.0.5
mypy-extensions==1.0.0
numpy==1.26.4
openai==1.32.0
orjson==3.10.3
packaging==23.2
pdfminer.six==20231228
pdfplumber==0.11.0
pillow==10.3.0
pinecone-client==3.2.2
pinecone-plugin-interface==0.0.7
protobuf==5.27.1
protobuf-to-dict==0.1.0
pycparser==2.22
pydantic==2.7.3
pydantic_core==2.18.4
Pygments==2.18.0
pypdfium2==4.30.0
python-dotenv==1.0.1
python-multipart==0.0.9
PyYAML==6.0.1
regex==2024.5.15
requests==2.32.3
rich==13.7.1
shellingham==1.5.4
sniffio==1.3.1
soupsieve==2.5
SQLAlchemy==2.0.30
starlette==0.37.2
tenacity==8.3.0
tiktoken==0.7.0
tqdm==4.66.4
typer==0.12.3
typing-inspect==0.9.0
typing_extensions==4.12.1
ujson==5.10.0
urllib3==2.2.1
uvicorn==0.30.1
uvloop==0.19.0
watchfiles==0.22.0
websockets==12.0
yarl==1.9.4
