From 7652bfb6fd8737e755eeff2e3ad876e84593db9e Mon Sep 17 00:00:00 2001
From: Nirmal Savinda
Date: Thu, 13 Jun 2024 11:02:03 +0530
Subject: [PATCH] Dev (#7)

* feat: question template added
* feat: MCQ generation
* docs: readme
* ci: Create pylint.yml
* Update pylint.yml
* refactor: remove unused imports
* refactor: lint fixes
* fix: response JSON format
---
 .github/workflows/pylint.yml    | 24 ++++++++++
 README.md                       | 17 +++++++
 app/__init__.py                 | 11 ++---
 app/data/questionPrompts.py     | 46 +++++++++++++++++++
 app/main.py                     | 12 +++--
 app/routers/questionGenerate.py | 18 ++++----
 app/routers/upload.py           | 14 +++---
 app/services/pinecone_upsert.py | 21 ++++-----
 app/services/prompt.py          | 76 ++++++++++++++++------------
 requirements.txt                | 81 +++++++++++++++++++++++++++++++++
 10 files changed, 250 insertions(+), 70 deletions(-)
 create mode 100644 .github/workflows/pylint.yml
 create mode 100644 README.md
 create mode 100644 app/data/questionPrompts.py
 create mode 100644 requirements.txt

diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
new file mode 100644
index 0000000..5312a4c
--- /dev/null
+++ b/.github/workflows/pylint.yml
@@ -0,0 +1,24 @@
+name: Pylint
+
+on: [push]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.12"]
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v3
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pip install pylint
+      - name: Analysing the code with pylint
+        run: |
+          pylint $(git ls-files '*.py')
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..d16a063
--- /dev/null
+++ b/README.md
@@ -0,0 +1,17 @@
+# Testify - AI Assistant
+
+## Setup
+
+1. Clone the repository.
+2. Install the required packages:
+
+```bash
+pip install -r requirements.txt
+```
+
+3. Run the app with Uvicorn:
+
+```bash
+uvicorn app.main:app --reload --port 7401
+```
+
+4. Open the API documentation at `http://localhost:7401/docs`.
\ No newline at end of file
diff --git a/app/__init__.py b/app/__init__.py
index acd8b3c..de26b39 100644
--- a/app/__init__.py
+++ b/app/__init__.py
@@ -1,14 +1,10 @@
 # app/__init__.py
-from .main import app
-
 from .routers.upload import router as upload_router
 from .routers.questionGenerate import router as questionGenerate_router
+from .main import app
+
+# Include routers under the API version prefix
 app.include_router(upload_router, prefix="/api/v1")
 app.include_router(questionGenerate_router, prefix="/api/v1")
-
-
-# app/routers/upload.py
-
-
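For reviewers: a quick way to sanity-check the router wiring above is FastAPI's test client. A minimal smoke-test sketch, not part of this patch, assuming the package layout shown in this diff:

```python
# Hypothetical smoke test for the router wiring (not included in this PR).
from fastapi.testclient import TestClient

from app import app  # importing the package runs app/__init__.py and mounts the routers

client = TestClient(app)

def test_routes_are_mounted():
    # Both routers should be registered under the /api/v1 prefix.
    paths = {route.path for route in app.routes}
    assert "/api/v1/upload-pdf/" in paths
    assert "/api/v1/generate-question/" in paths
```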
diff --git a/app/data/questionPrompts.py b/app/data/questionPrompts.py
new file mode 100644
index 0000000..a5e47df
--- /dev/null
+++ b/app/data/questionPrompts.py
@@ -0,0 +1,46 @@
+from langchain_core.pydantic_v1 import BaseModel, Field
+from langchain_core.output_parsers import JsonOutputParser
+
+
+# Pydantic model for a standard question-and-answer format.
+class QuestionParser(BaseModel):
+    question: str = Field(description="The question generated from the text.")
+    answer: str = Field(description="The answer to the generated question.")
+
+# Pydantic models for multiple-choice questions.
+class Answer(BaseModel):
+    char: str = Field(description="The character representing the answer, e.g., 'A', 'B', 'C', 'D'.")
+    text: str = Field(description="The text of the answer.")
+
+class MultipleChoiceQuestionParser(BaseModel):
+    question: str = Field(description="The multiple choice question generated from the text.")
+    options: list[Answer] = Field(description="The options for the multiple choice question, as a list of Answer objects.")
+    answer: str = Field(description="The character representing the correct answer, e.g., 'A', 'B', 'C', 'D'.")
+
+# Generate a prompt and corresponding parser for multiple-choice questions.
+def mcq_prompt(options: int) -> tuple[str, JsonOutputParser]:
+    """
+    Generates a prompt for creating multiple-choice questions along with a JSON output parser.
+
+    Args:
+        options (int): The number of options for the multiple-choice question.
+
+    Returns:
+        tuple[str, JsonOutputParser]: A tuple containing the prompt and the JSON output parser.
+    """
+    prompt_text = f"Generate a multiple choice question with {options} options and indicate the correct answer."
+    parser = JsonOutputParser(pydantic_object=MultipleChoiceQuestionParser)
+    return (prompt_text, parser)
+
+# Generate a prompt and corresponding parser for essay-type questions.
+def essay_prompt() -> tuple[str, JsonOutputParser]:
+    """
+    Generates a prompt for creating essay questions along with a JSON output parser.
+
+    Returns:
+        tuple[str, JsonOutputParser]: A tuple containing the prompt and the JSON output parser.
+    """
+    prompt_text = "Generate an essay question."
+    parser = JsonOutputParser(pydantic_object=QuestionParser)
+    return (prompt_text, parser)
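These helpers return a `(prompt, parser)` pair, so a caller can feed the parser's format instructions straight into a prompt template. A hypothetical usage sketch, not part of this patch:

```python
# Hypothetical caller of the prompt helpers above.
from app.data.questionPrompts import mcq_prompt

prompt_text, parser = mcq_prompt(4)
print(prompt_text)
# "Generate a multiple choice question with 4 options and indicate the correct answer."

# The parser exposes JSON-schema instructions that can be appended to an LLM prompt.
print(parser.get_format_instructions())
```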
""" - return {"Hello": "World"} + return {"message": "Welcome to the Testify AI Assistant!"} diff --git a/app/routers/questionGenerate.py b/app/routers/questionGenerate.py index 0a2dc9d..4c67563 100644 --- a/app/routers/questionGenerate.py +++ b/app/routers/questionGenerate.py @@ -1,14 +1,16 @@ -from fastapi import APIRouter, Query, HTTPException -from typing import List - +from fastapi import APIRouter, HTTPException, Query from ..services.prompt import prompt - router = APIRouter() -@router.get("/generate-question/", response_model=str) +@router.get("/generate-question/", response_model=dict) async def generate_question(text: str = Query(..., description="The text to generate a question for"), - examid: str = Query(..., description="The ID of the exam related to the text")) -> str: + examid: str = Query(..., description="The ID of the exam related to the text")) -> dict: """Endpoint to generate a question for a given text using OpenAI's model.""" - - return prompt(text, examid) \ No newline at end of file + try: + # Assuming 'prompt' function is synchronous; if it's async, use 'await prompt(text, examid)' + question_response = prompt(text, examid) + return question_response + except Exception as e: + # Catching a broad exception is not best practice; adjust according to specific exceptions expected from 'prompt' + raise HTTPException(status_code=500, detail=f"An error occurred while generating the question: {str(e)}") diff --git a/app/routers/upload.py b/app/routers/upload.py index c2a1bb8..3c14fdc 100644 --- a/app/routers/upload.py +++ b/app/routers/upload.py @@ -6,15 +6,17 @@ router = APIRouter() @router.post("/upload-pdf/", status_code=201) -async def upload_pdf(file: UploadFile = File(...), examid:str = Query(..., description="The ID of the exam related to the uploaded PDF") ) -> dict: +async def upload_pdf(file: UploadFile = File(...), examid: str = Query(..., description="The ID of the exam related to the uploaded PDF")) -> dict: """Endpoint to upload a PDF and upsert its contents into a Pinecone vector store.""" - if file.content_type != 'application/pdf': raise HTTPException(status_code=415, detail="Unsupported file type. 
diff --git a/app/routers/upload.py b/app/routers/upload.py
index c2a1bb8..3c14fdc 100644
--- a/app/routers/upload.py
+++ b/app/routers/upload.py
@@ -6,15 +6,17 @@
 router = APIRouter()
 
 @router.post("/upload-pdf/", status_code=201)
-async def upload_pdf(file: UploadFile = File(...), examid:str = Query(..., description="The ID of the exam related to the uploaded PDF") ) -> dict:
+async def upload_pdf(file: UploadFile = File(...), examid: str = Query(..., description="The ID of the exam related to the uploaded PDF")) -> dict:
     """Endpoint to upload a PDF and upsert its contents into a Pinecone vector store."""
-
     if file.content_type != 'application/pdf':
         raise HTTPException(status_code=415, detail="Unsupported file type. Please upload a PDF.")
 
-    # Call the upsert function from the imported service
-    upsert(file, examid)
+    # 'upsert' is synchronous and returns the extracted text; an empty result means the PDF could not be processed
+    success = upsert(file, examid)
+
+    if not success:
+        raise HTTPException(status_code=500, detail="Failed to process the PDF file.")
 
-    # return {"filename": file.filename}
-    Response(status_code=201)
+    # status_code=201 in the decorator already sets the response status, so no explicit Response object is needed
     return {"message": "PDF uploaded successfully."}
diff --git a/app/services/pinecone_upsert.py b/app/services/pinecone_upsert.py
index 7c351a8..f3edfd2 100644
--- a/app/services/pinecone_upsert.py
+++ b/app/services/pinecone_upsert.py
@@ -1,19 +1,15 @@
-from typing import Any, BinaryIO
+from typing import BinaryIO
 import os
 import dotenv
 import pdfplumber
 from langchain_openai import OpenAIEmbeddings
 from langchain_pinecone import PineconeVectorStore
-from pinecone import Pinecone, ServerlessSpec
 
 dotenv.load_dotenv()
 
-pinecone = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
-
-def generate_embeddings_from_pdf(pdf_file: BinaryIO) -> str:
-    """Generates embeddings from a PDF file using OpenAI's model."""
-
-    print(pdf_file.filename)
+def generate_text_from_pdf(pdf_file: BinaryIO) -> str:
+    """Extracts and returns the text content of a PDF file."""
+    print(f"Processing file: {pdf_file.filename}")
     full_text = ""
     with pdfplumber.open(pdf_file.file) as pdf:
         for page in pdf.pages:
@@ -23,13 +19,11 @@ def generate_text_from_pdf(pdf_file: BinaryIO) -> str:
 
     if not full_text.strip():
         raise ValueError("No text found in the PDF.")
-
     return full_text
 
 def upsert(pdf_file: BinaryIO, examid: str) -> str:
-    """Upserts PDF text into a Pinecone vector store and returns the extracted text."""
-
-    text = generate_embeddings_from_pdf(pdf_file)
+    """Extracts text from a PDF file, generates embeddings, and upserts them into a Pinecone vector store."""
+    text = generate_text_from_pdf(pdf_file)
 
     embeddings = OpenAIEmbeddings(
         model="text-embedding-3-large",
@@ -44,4 +38,4 @@ def upsert(pdf_file: BinaryIO, examid: str) -> str:
         index_name="abc"
     )
 
     return text
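End to end, the upload flow accepts a multipart PDF and indexes it under the exam's Pinecone namespace. A hypothetical client-side sketch, not part of this patch; the file name and exam ID are placeholders:

```python
# Hypothetical multipart upload against a local dev server.
import httpx

with open("lecture-notes.pdf", "rb") as f:
    resp = httpx.post(
        "http://localhost:7401/api/v1/upload-pdf/",
        params={"examid": "bio-101"},
        files={"file": ("lecture-notes.pdf", f, "application/pdf")},
        timeout=120.0,  # extraction plus embedding can take a while for large PDFs
    )
print(resp.status_code, resp.json())  # 201 {"message": "PDF uploaded successfully."}
```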
diff --git a/app/services/prompt.py b/app/services/prompt.py
index 0d5c2a6..45c18ac 100644
--- a/app/services/prompt.py
+++ b/app/services/prompt.py
@@ -1,26 +1,29 @@
-from typing import Any, BinaryIO
 import os
 import dotenv
-import pdfplumber
-from langchain_openai import OpenAIEmbeddings
+from langchain_openai import OpenAIEmbeddings, ChatOpenAI
 from langchain_pinecone import PineconeVectorStore
-from pinecone import Pinecone, ServerlessSpec
-
-from langchain.chat_models import ChatOpenAI
-from langchain.chains import RetrievalQA
-
+from langchain_core.output_parsers import JsonOutputParser
+from langchain.prompts import PromptTemplate
+from ..data.questionPrompts import mcq_prompt, essay_prompt
 
 dotenv.load_dotenv()
 
-pinecone = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
+def format_docs(docs):
+    """Join the retrieved documents' contents into a single string."""
+    return "\n\n".join([d.page_content for d in docs])
 
+def select_prompt(question_type: str) -> tuple[str, JsonOutputParser]:
+    """Selects the appropriate (prompt, parser) pair based on the question type."""
+    if question_type == "mcq":
+        return mcq_prompt(4)
+    if question_type == "essay":
+        return essay_prompt()
+    raise ValueError("Invalid question type. Please select 'mcq' or 'essay'.")
 
-def prompt(text: str, examid: str) -> str:
-    """Upserts PDF text into a Pinecone vector store and returns the extracted text."""
+def prompt(text: str, examid: str, question_type: str = "mcq") -> dict:
+    """Generates a question based on the provided text and exam ID."""
+    question, parser = select_prompt(question_type)
 
     embed = OpenAIEmbeddings(
         model="text-embedding-3-large",
         api_key=os.getenv('OPENAI_API_KEY'),
@@ -31,34 +34,27 @@ def prompt(text: str, examid: str) -> str:
         namespace=examid,
         index_name="abc",
         embedding=embed
     )
 
-    vectorstore.similarity_search(
-        text,
-        # top_k=5
-    )
+    # Retrieve the documents most relevant to the input text
+    docs = vectorstore.similarity_search(text)
 
     llm = ChatOpenAI(
-        model="gpt-3.5-turbo",
-        api_key=os.getenv('OPENAI_API_KEY')
+        model="gpt-4o",
+        api_key=os.getenv('OPENAI_API_KEY'),
+        model_kwargs={"response_format": {"type": "json_object"}}
     )
 
-    qa = RetrievalQA.from_chain_type(
-        llm=llm,
-        chain_type="stuff",
-        retriever=vectorstore.as_retriever()
+    prompt_template = PromptTemplate(
+        template="Generate one question, {question} about {query} from {document}. Respond only in JSON. {format_instructions}",
+        input_variables=["query", "document", "question"],
+        partial_variables={"format_instructions": parser.get_format_instructions()},
     )
 
-    print(qa.invoke(text))
-    return "Question generated successfully."
-
-
-
-
-
-
-
+    chain = prompt_template | llm | parser
+    formatted_docs = format_docs(docs)
+    result = chain.invoke({"query": text, "document": formatted_docs, "question": question})
+    return result
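Because `prompt` is a plain synchronous function, it can also be exercised directly, without the HTTP layer. A hypothetical call, assuming OPENAI_API_KEY and PINECONE_API_KEY are set in the environment and the "abc" index has been populated for the exam:

```python
# Hypothetical direct call to the service layer.
from app.services.prompt import prompt

result = prompt("cell division", examid="bio-101", question_type="mcq")
# JsonOutputParser returns a plain dict parsed from the model's JSON output.
print(result["question"])
print(result["options"])
print(result["answer"])
```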
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..bdee9c7
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,81 @@
+aiohttp==3.9.5
+aiosignal==1.3.1
+annotated-types==0.7.0
+anyio==4.4.0
+attrs==23.2.0
+beautifulsoup4==4.12.3
+certifi==2024.6.2
+cffi==1.16.0
+charset-normalizer==3.3.2
+click==8.1.7
+cryptography==42.0.8
+dataclasses-json==0.6.6
+distro==1.9.0
+dnspython==2.6.1
+email_validator==2.1.1
+fastapi==0.111.0
+fastapi-cli==0.0.4
+frozenlist==1.4.1
+google==3.0.0
+greenlet==3.0.3
+h11==0.14.0
+httpcore==1.0.5
+httptools==0.6.1
+httpx==0.27.0
+idna==3.7
+Jinja2==3.1.4
+jsonpatch==1.33
+jsonpointer==2.4
+langchain==0.2.3
+langchain-community==0.2.4
+langchain-core==0.2.5
+langchain-openai==0.1.8
+langchain-pinecone==0.1.1
+langchain-text-splitters==0.2.1
+langsmith==0.1.75
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+marshmallow==3.21.3
+mdurl==0.1.2
+multidict==6.0.5
+mypy-extensions==1.0.0
+numpy==1.26.4
+openai==1.32.0
+orjson==3.10.3
+packaging==23.2
+pdfminer.six==20231228
+pdfplumber==0.11.0
+pillow==10.3.0
+pinecone-client==3.2.2
+pinecone-plugin-interface==0.0.7
+protobuf==5.27.1
+protobuf-to-dict==0.1.0
+pycparser==2.22
+pydantic==2.7.3
+pydantic_core==2.18.4
+Pygments==2.18.0
+pypdfium2==4.30.0
+python-dotenv==1.0.1
+python-multipart==0.0.9
+PyYAML==6.0.1
+regex==2024.5.15
+requests==2.32.3
+rich==13.7.1
+shellingham==1.5.4
+sniffio==1.3.1
+soupsieve==2.5
+SQLAlchemy==2.0.30
+starlette==0.37.2
+tenacity==8.3.0
+tiktoken==0.7.0
+tqdm==4.66.4
+typer==0.12.3
+typing-inspect==0.9.0
+typing_extensions==4.12.1
+ujson==5.10.0
+urllib3==2.2.1
+uvicorn==0.30.1
+uvloop==0.19.0
+watchfiles==0.22.0
+websockets==12.0
+yarl==1.9.4
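The Pylint workflow above can be reproduced locally before pushing. A hypothetical sketch using pylint's Python entry point; note that pylint itself is installed by the workflow rather than pinned in requirements.txt:

```python
# Hypothetical local lint run mirroring the CI step.
from pylint.lint import Run

# Lint the application package; exit=False keeps the interpreter alive after the run.
Run(["app"], exit=False)
```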