Skip to content

Commit

Permalink
chore: migrate models to SQLAlchemy 2.0 style and remove unused code
Browse files Browse the repository at this point in the history
  • Loading branch information
tuantran0910 committed Dec 2, 2024
1 parent 65cc9d2 commit ca44a20
Show file tree
Hide file tree
Showing 9 changed files with 144 additions and 143 deletions.
30 changes: 18 additions & 12 deletions chatbot-core/backend/app/models/connector.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
from datetime import datetime
from enum import Enum
from pydantic import BaseModel, Field
from sqlalchemy import Column, DateTime, Integer, String, JSON
from typing import Optional, List, Dict
from sqlalchemy import DateTime, String
from sqlalchemy import Enum as SQLAlchemyEnum
from sqlalchemy.dialects.mssql import UNIQUEIDENTIFIER
from sqlalchemy.orm import mapped_column, Mapped
from typing import Optional, List
from uuid import uuid4, UUID

from app.models import Base

Expand All @@ -15,13 +19,15 @@ class DocumentSource(str, Enum):
class Connector(Base):
__tablename__ = "connectors"

id = Column(Integer, primary_key=True, index=True, autoincrement=True)
name = Column(String)
source = Column(String)
connector_specific_config = Column(JSON)
created_at = Column(DateTime, default=datetime.now)
updated_at = Column(DateTime, default=datetime.now, onupdate=datetime.now)
deleted_at = Column(DateTime, default=None, nullable=True)
id: Mapped[UNIQUEIDENTIFIER] = mapped_column(
UNIQUEIDENTIFIER(as_uuid=True), primary_key=True, index=True, default=uuid4
)
name: Mapped[str] = mapped_column(String(255), nullable=False)
source: Mapped[DocumentSource] = mapped_column(SQLAlchemyEnum(DocumentSource, native_enum=False), nullable=False)
connector_specific_config: Mapped[str] = mapped_column(String, nullable=True)
created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=datetime.now)
updated_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=datetime.now, onupdate=datetime.now)
deleted_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True, default=None)


class ConnectorRequest(BaseModel):
Expand All @@ -34,13 +40,13 @@ class Config:


class ConnectorResponse(BaseModel):
id: int = Field(..., description="Connector ID")
id: UUID = Field(..., description="Connector ID")
name: str = Field(..., description="Connector name")
source: DocumentSource = Field(..., description="Document source")
connector_specific_config: Optional[Dict] = Field(..., description="Connector specific configuration")
connector_specific_config: Optional[str] = Field(None, description="Connector specific configuration")
created_at: datetime = Field(..., description="Created at timestamp")
updated_at: datetime = Field(..., description="Updated at timestamp")
deleted_at: Optional[datetime] = Field(..., description="Deleted at timestamp")
deleted_at: Optional[datetime] = Field(None, description="Deleted at timestamp")

class Config:
from_attributes = True
Expand Down
66 changes: 31 additions & 35 deletions chatbot-core/backend/app/models/document.py
Original file line number Diff line number Diff line change
@@ -1,56 +1,52 @@
from datetime import datetime
from pydantic import BaseModel, Field
from sqlalchemy import Column, DateTime, ForeignKey, Integer, String
from sqlalchemy.orm import relationship
from typing import List
from sqlalchemy import DateTime, ForeignKey, String
from sqlalchemy.dialects.mssql import UNIQUEIDENTIFIER
from sqlalchemy.orm import relationship, mapped_column, Mapped
from typing import List, Optional
from uuid import uuid4

from app.models import Base


class DocumentMetadata(Base):
__tablename__ = "document_metadata"

id = Column(Integer, primary_key=True, index=True, autoincrement=True)
name = Column(String, index=True)
description = Column(String)
object_url = Column(String)
created_at = Column(DateTime, default=datetime.now)
updated_at = Column(DateTime, default=datetime.now, onupdate=datetime.now)
deleted_at = Column(DateTime, default=None, nullable=True)

tags = relationship(
"DocumentMetadataTags",
back_populates="document_metadata",
cascade="all, delete-orphan",
id: Mapped[UNIQUEIDENTIFIER] = mapped_column(
UNIQUEIDENTIFIER(as_uuid=True), primary_key=True, index=True, default=uuid4
)
name: Mapped[str] = mapped_column(String(255), nullable=False)
description: Mapped[str] = mapped_column(String, nullable=True)
document_url: Mapped[str] = mapped_column(String, nullable=False)
created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=datetime.now)
updated_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=datetime.now, onupdate=datetime.now)
deleted_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True, default=None)

tags: Mapped[List["DocumentMetadataTags"]] = relationship(
"DocumentMetadataTags", back_populates="document_metadata", lazy="joined"
)


class DocumentMetadataTags(Base):
__tablename__ = "document_metadata_tags"

id = Column(Integer, primary_key=True, index=True, autoincrement=True)
name = Column(String, index=True)
document_metadata_id = Column(Integer, ForeignKey("document_metadata.id"))
created_at = Column(DateTime, default=datetime.now)
updated_at = Column(DateTime, default=datetime.now, onupdate=datetime.now)
deleted_at = Column(DateTime, default=None, nullable=True)

document_metadata = relationship("DocumentMetadata", back_populates="tags")


class DocumentMetadataRequest(BaseModel):
name: str = Field(..., description="Document name")
description: str = Field(..., description="Document description")
tags: List[str] = Field(default_factory=list, description="List of tags")
id: Mapped[UNIQUEIDENTIFIER] = mapped_column(
UNIQUEIDENTIFIER(as_uuid=True), primary_key=True, index=True, default=uuid4
)
name: Mapped[str] = mapped_column(String(255), nullable=False)
document_metadata_id: Mapped[UNIQUEIDENTIFIER] = mapped_column(
UNIQUEIDENTIFIER(as_uuid=True), ForeignKey("document_metadata.id"), nullable=False
)
created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=datetime.now)
updated_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), default=datetime.now, onupdate=datetime.now)
deleted_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True, default=None)

class Config:
from_attributes = True
document_metadata: Mapped[DocumentMetadata] = relationship("DocumentMetadata", back_populates="tags")


class DocumentMetadataResponse(BaseModel):
name: str = Field(..., description="Document name")
description: str = Field(default="", description="Document description")
tags: List[str] = Field(default_factory=list, description="List of tags")
class DocumentUploadResponse(BaseModel):
document_url: str = Field(..., description="Object URL")

class Config:
from_attributes = True
arbitrary_types_allowed = True
2 changes: 1 addition & 1 deletion chatbot-core/backend/app/routers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ async def home():
BackendAPIResponse()
.set_data(
{
"org": "Tinh Hoa Solutions",
"organization": "Tinh Hoa Solutions",
"description": "API for LLM-based Application Chatbot",
}
)
Expand Down
38 changes: 24 additions & 14 deletions chatbot-core/backend/app/routers/v1/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,21 @@

from app.databases.mssql import get_db_session
from app.models.api import APIResponse
from app.models.document import DocumentUploadResponse
from app.models.connector import ConnectorRequest, ConnectorResponse
from app.models.document import DocumentMetadataResponse
from app.services.connector import ConnectorService
from app.services.document import DocumentService
from app.settings import Constants
from app.utils.api_response import BackendAPIResponse
from app.utils.logger import LoggerFactory

logger = LoggerFactory().get_logger(__name__)
router = APIRouter(prefix="/connectors", tags=["connectors", "files"])
router = APIRouter(prefix="/connectors", tags=["connectors", "documents"])


@router.post("/files/upload", response_model=APIResponse, status_code=status.HTTP_201_CREATED)
@router.post("/documents/upload", response_model=APIResponse, status_code=status.HTTP_201_CREATED)
def upload_documents(
files: Annotated[List[UploadFile], File(description="One or multiple documents")],
documents: Annotated[List[UploadFile], File(description="One or multiple documents")],
db_session: Session = Depends(get_db_session),
) -> None:
"""
Expand All @@ -29,13 +29,14 @@ def upload_documents(
db_session (Session, optional): Database session. Defaults to relational database engine.
"""
# Upload documents to object storage and trigger indexing pipeline
err = DocumentService(db_session=db_session).upload_documents(files=files)
document_urls, err = DocumentService(db_session=db_session).upload_documents(documents=documents)
if err:
status_code, detail = err.kind
raise HTTPException(status_code=status_code, detail=detail)

# Parse response
data = [DocumentMetadataResponse(name=file.filename) for file in files]
data = [DocumentUploadResponse(document_url=document_url) for document_url in document_urls]

return BackendAPIResponse().set_message(message=Constants.API_SUCCESS).set_data(data=data).respond()


Expand All @@ -60,12 +61,12 @@ def get_connectors(db_session: Session = Depends(get_db_session)) -> None:


@router.get("/{connector_id}", response_model=APIResponse, status_code=status.HTTP_200_OK)
def get_connector(connector_id: int, db_session: Session = Depends(get_db_session)) -> None:
def get_connector(connector_id: str, db_session: Session = Depends(get_db_session)) -> None:
"""
Get connector by id.
Args:
connector_id (int): Connector id
connector_id (str): Connector id
db_session (Session, optional): Database session. Defaults to relational database engine.
"""
# Get connector by id
Expand All @@ -75,7 +76,10 @@ def get_connector(connector_id: int, db_session: Session = Depends(get_db_sessio
raise HTTPException(status_code=status_code, detail=detail)

# Parse response
data = ConnectorResponse.model_validate(connector)
if connector:
data = ConnectorResponse.model_validate(connector)
else:
data = None

return BackendAPIResponse().set_message(message=Constants.API_SUCCESS).set_data(data=data).respond()

Expand All @@ -95,12 +99,15 @@ def create_connector(connector_request: ConnectorRequest, db_session: Session =
status_code, detail = err.kind
raise HTTPException(status_code=status_code, detail=detail)

return BackendAPIResponse().set_message(message=Constants.API_SUCCESS).respond()
# Parse response
data = connector_request.model_dump(exclude_unset=True)

return BackendAPIResponse().set_message(message=Constants.API_SUCCESS).set_data(data=data).respond()


@router.patch("/{connector_id}", response_model=APIResponse, status_code=status.HTTP_200_OK)
def update_connector(
connector_id: int, connector_request: ConnectorRequest, db_session: Session = Depends(get_db_session)
connector_id: str, connector_request: ConnectorRequest, db_session: Session = Depends(get_db_session)
) -> None:
"""
Update connector by connector_id.
Expand All @@ -118,16 +125,19 @@ def update_connector(
status_code, detail = err.kind
raise HTTPException(status_code=status_code, detail=detail)

return BackendAPIResponse().set_message(message=Constants.API_SUCCESS).respond()
# Parse response
data = connector_request.model_dump(exclude_unset=True)

return BackendAPIResponse().set_message(message=Constants.API_SUCCESS).set_data(data=data).respond()


@router.delete("/{connector_id}", status_code=status.HTTP_204_NO_CONTENT)
def delete_connector(connector_id: int, db_session: Session = Depends(get_db_session)) -> None:
def delete_connector(connector_id: str, db_session: Session = Depends(get_db_session)) -> None:
"""
Delete connector by connector_id.
Args:
connector_id (int): Connector id
connector_id (str): Connector id
db_session (Session, optional): Database session. Defaults to relational database engine.
"""
# Delete connector by id
Expand Down
6 changes: 3 additions & 3 deletions chatbot-core/backend/app/settings/secrets.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@ class Secrets:
MSSQL_HOST = os.getenv("MSSQL_HOST", "127.0.0.1")
MSSQL_USER = os.getenv("MSSQL_USER", "SA")
MSSQL_SA_PASSWORD = os.getenv("MSSQL_SA_PASSWORD", "P&ssword123")
MSSQL_DB = os.getenv("MSSQL_DB", "ezhr_chatbot")
MSSQL_DB = os.getenv("MSSQL_DB", "chatbot_core")

# Minio Credentials
MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT", "127.0.0.1:9000")
MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY", "minioadmin")
MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY", "minioadmin")
MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY", "S3User")
MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY", "P&ssword123")

# Qdrant Credentials
QDRANT_HOST = os.getenv("QDRANT_HOST", "localhost")
Expand Down
47 changes: 32 additions & 15 deletions chatbot-core/backend/app/utils/indexing.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,25 @@
from fastapi import File, UploadFile
from llama_index.core import Settings
from llama_index.core.storage.storage_context import StorageContext
from llama_index.core.extractors import KeywordExtractor, QuestionsAnsweredExtractor
from llama_index.core.ingestion import IngestionCache, IngestionPipeline
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.core.schema import BaseNode
from llama_index.vector_stores.qdrant import QdrantVectorStore
from typing import Annotated, Any, List

from app.databases.qdrant import get_vector_db_client
from app.databases.redis import get_cache_store_client
from app.databases.qdrant import get_vector_db_connector
from app.databases.redis import get_cache_connector
from app.settings import Constants
from app.utils.pdf_reader import parse_pdf


def get_transformations() -> List[Any]:
"""
Get the transformation components for the ingestion pipeline
Returns:
List[Any]: List of LlamaIndex transformation components
"""
# Define node postprocessor methods
extractors = [
Expand All @@ -36,24 +40,37 @@ def get_transformations() -> List[Any]:


def index_document_to_vector_db(
file: Annotated[UploadFile, File(description="PDF file")],
document: Annotated[UploadFile, File(description="PDF file")],
) -> None:
"""
Index a PDF document into the vector database.
Args:
document (UploadFile): PDF file
"""
# Parse PDF file into LlamaIndex Document objects
documents = parse_pdf(file)
documents = parse_pdf(document=document)

# Define vector store
vector_db_client = get_vector_db_client()
vector_store = QdrantVectorStore(
client=vector_db_client,
collection_name=Constants.LLM_QDRANT_COLLECTION,
)
# Initialize the vector store for the ingestion pipeline
with get_vector_db_connector() as vector_db_connector:
# Create a collection in the vector database
vector_db_connector.create_collection(
collection_name=Constants.LLM_QDRANT_COLLECTION,
)

vector_db_client = vector_db_connector.client
vector_store = QdrantVectorStore(
client=vector_db_client,
collection_name=Constants.LLM_QDRANT_COLLECTION,
)

# Initialize the cache store for the ingestion pipeline
cache = get_cache_store_client()
ingest_cache = IngestionCache(
cache=cache,
collection=Constants.LLM_REDIS_CACHE_COLLECTION,
)
with get_cache_connector() as cache_connector:
cache_store = cache_connector.get_cache_store()
ingest_cache = IngestionCache(
cache=cache_store,
collection=Constants.LLM_REDIS_CACHE_COLLECTION,
)

# Define transformation components (chunking + node postprocessors)
transformations = get_transformations()
Expand Down
6 changes: 3 additions & 3 deletions chatbot-core/backend/app/utils/pdf_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,20 @@


def parse_pdf(
file: Annotated[UploadFile, File(description="PDF file")],
document: Annotated[UploadFile, File(description="PDF file")],
) -> List[Document] | None:
"""
Parse a PDF file into Llamaindex Document objects.
Args:
file (UploadFile): PDF file to parse.
document (UploadFile): PDF file to parse.
Returns:
List[Document]: List of Llamaindex Document objects.
"""
try:
documents = []
with pdfplumber.open(file.file) as pdf:
with pdfplumber.open(document.file) as pdf:
for page in pdf.pages:
documents.append(Document(text=page.extract_text()))
except Exception as e:
Expand Down
3 changes: 1 addition & 2 deletions chatbot-core/backend/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,8 @@ dependencies = [
"pdfplumber>=0.11.4",
"qdrant-client>=1.12.1",
"minio>=7.2.12",
"sqlalchemy>=1.4.54",
"pymssql>=2.3.2",
"pyodbc>=5.2.0",
"sqlalchemy>=2.0.36",
]

[project.optional-dependencies]
Expand Down
Loading

0 comments on commit ca44a20

Please sign in to comment.