-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(pipeline): Add foundational ZenML-based pipeline stack with Local MongoDB Atlas integration and related updates
This commit introduces a modular stack for building RAG pipelines. Implements ingestion and retrieval steps using ZenML. Integrates with Local MongoDB Atlas for document storage and querying. Adds Docker support for seamless development and deployment. Reorders config.py attributes alphabetically based on env-var names. Ensures comments in config.py align with .env file descriptions for better maintainability. Updates .gitignore to include relevant entries for the project. Updates pyproject.toml with required dependencies and configuration adjustments.
- Loading branch information
1 parent
91025b4
commit b166038
Showing
14 changed files
with
4,035 additions
and
61 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
# Git-related files and directories | ||
.git | ||
|
||
# Python bytecode files | ||
__pycache__/ | ||
*.pyc | ||
*.pyo | ||
*.pyd | ||
|
||
# Log files | ||
*.log | ||
|
||
# Test directories and files | ||
tests/ | ||
|
||
# Virtual environments (the image builds its own /app/.venv, so a host venv must | ||
# not be copied over it by `COPY . .`) | ||
.venv/ | ||
*.venv/ | ||
venv/ | ||
|
||
# Build and distribution files | ||
build/ | ||
dist/ | ||
*.tar.gz | ||
|
||
# macOS-related files | ||
.DS_Store | ||
|
||
# Local environment file with real credentials: keep it out of the build context so | ||
# `COPY . .` cannot bake secrets into an image layer. Supply the values at runtime | ||
# instead (e.g. `docker run --env-file .env` or a compose `env_file:` entry). | ||
.env | ||
|
||
# Temporary or large data files | ||
large_data/ | ||
temp_files/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,78 @@ | ||
OPENAI_API_KEY= | ||
# .env | ||
# Template of the environment variables read by the project. Values in <angle | ||
# brackets> are placeholders: replace them locally and never commit real credentials. | ||
|
||
# AWS Configuration | ||
# AWS region for cloud services | ||
AWS_REGION="eu-central-1" | ||
# AWS access key for authentication | ||
AWS_ACCESS_KEY="<aws_access_key>" | ||
# AWS secret key for authentication | ||
AWS_SECRET_KEY="<aws_secret_key>" | ||
# ARN for AWS cross-account access role | ||
AWS_CROSS_ACCOUNT_ROLE_ARN="<aws_cross_account_role_arn>" | ||
# Name of the S3 bucket for storing application data | ||
AWS_S3_BUCKET_NAME="notion-second-brain-data" | ||
|
||
# CometML Configuration | ||
# API key for CometML integration | ||
COMET_API_KEY="<comet_api_key>" | ||
# CometML project name for tracking experiments | ||
COMET_PROJECT_NAME="<comet_project_name>" | ||
|
||
# Data Fetching Limits | ||
# Maximum number of documents to fetch from the database | ||
MAX_FETCH_LIMIT=50 | ||
|
||
# Default Genre for Querying | ||
# Genre value used by default when querying | ||
DEFAULT_GENRE="Western" | ||
|
||
# Docker and Network Configuration | ||
# Flag to indicate if the application is running inside a Docker container | ||
IS_RUNNING_IN_DOCKER=True | ||
# Docker network for inter-container communication | ||
DOCKER_NETWORK_NAME="zenml_network" | ||
|
||
# Enable Configurations | ||
# Boolean flags use the same True/False spelling throughout this file. | ||
# Flag to enable ingestion from MongoDB Atlas | ||
ENABLE_MONGODB_ATLAS_INGESTION=False | ||
# Flag to enable offline mode (disables online ingestion) | ||
ENABLE_OFFLINE_MODE=True | ||
# Enable or disable structured logging | ||
ENABLE_STRUCTURED_LOGGING=False | ||
|
||
# GROQ Configuration | ||
# API key for accessing GROQ services | ||
GROQ_API_KEY="<groq_api_key>" | ||
|
||
# Hugging Face Configuration | ||
# Token for Hugging Face API | ||
HUGGINGFACE_ACCESS_TOKEN="<huggingface_access_token>" | ||
|
||
# Local Data Files | ||
# Path to the local JSON file for offline processing | ||
LOCAL_JSON_FILE_PATH="data/sample_data_set.json" | ||
|
||
# MongoDB Configuration | ||
# Connection URI for local MongoDB instance | ||
MONGODB_OFFLINE_URI="mongodb://mongodb-atlas-local:27017" | ||
# Name of the offline database | ||
MONGODB_OFFLINE_DATABASE="rag_pipeline" | ||
# Name of the collection in the offline database | ||
MONGODB_OFFLINE_COLLECTION="offline_documents" | ||
|
||
# MongoDB Online Configuration | ||
# MongoDB Atlas URI (every <...> segment, including the cluster host, is a placeholder) | ||
MONGODB_ONLINE_URI="mongodb+srv://<user>:<password>@<cluster>.<cluster_id>.mongodb.net/<database_name>?retryWrites=true&w=majority" | ||
# Name of the online database | ||
MONGODB_ONLINE_DATABASE="sample_mflix" | ||
# Name of the collection in the online database | ||
MONGODB_ONLINE_COLLECTION="movies" | ||
|
||
# Notion API Configuration | ||
# Secret key for accessing Notion API | ||
NOTION_SECRET_KEY="<notion_secret_key>" | ||
|
||
# OpenAI API Configuration | ||
# API key for accessing OpenAI services | ||
OPENAI_API_KEY="<openai_api_key>" | ||
# Model identifier for OpenAI | ||
OPENAI_MODEL_IDENTIFIER="openai-gpt-4o-mini" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
3.12 | ||
3.12.8 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
# Base Python image, pinned to the patch release declared in .python-version (3.12.8) | ||
# and to an explicit Debian release so rebuilds do not drift with a floating tag. | ||
FROM python:3.12.8-slim-bookworm | ||
|
||
# Suppress interactive apt/debconf prompts during the build only. Declared as ARG | ||
# (visible to the RUN steps of this stage) so it is not baked into the environment | ||
# of running containers. | ||
ARG DEBIAN_FRONTEND=noninteractive | ||
# Terminal type for interactive sessions inside the container | ||
ENV TERM=xterm | ||
|
||
# Make RUN steps fail when any stage of a pipeline fails (e.g. the download in | ||
# `curl ... | gpg --dearmor`), instead of only checking the last command. | ||
SHELL ["/bin/bash", "-o", "pipefail", "-c"] | ||
|
||
# Install system dependencies, register the MongoDB apt repository (signed with the | ||
# dearmored server-6.0 key), install mongosh, and drop the apt lists in the same layer. | ||
# NOTE(review): the repository line targets the Debian `buster` suite; confirm it | ||
# matches the Debian release of the base image. | ||
RUN apt-get update && apt-get install -y --no-install-recommends \ | ||
    apt-utils \ | ||
    build-essential \ | ||
    curl \ | ||
    dnsutils \ | ||
    gcc \ | ||
    gnupg \ | ||
    iputils-ping \ | ||
    libffi-dev \ | ||
    libssl-dev \ | ||
    nano \ | ||
    && curl -fsSL https://pgp.mongodb.com/server-6.0.asc | gpg --dearmor -o /usr/share/keyrings/mongodb-archive-keyring.gpg \ | ||
    && echo "deb [signed-by=/usr/share/keyrings/mongodb-archive-keyring.gpg] https://repo.mongodb.org/apt/debian buster/mongodb-org/6.0 main" | tee /etc/apt/sources.list.d/mongodb-org-6.0.list \ | ||
    && apt-get update && apt-get install -y --no-install-recommends \ | ||
    mongodb-mongosh \ | ||
    && rm -rf /var/lib/apt/lists/* | ||
|
||
# Set the working directory inside the container; the relative destinations of the | ||
# COPY instructions below resolve against it. | ||
WORKDIR /app | ||
|
||
# Suppress pip warnings about root user (pip is run as root in the install step below) | ||
ENV PIP_ROOT_USER_ACTION=ignore | ||
|
||
# Copy only the dependency manifests first, so the dependency-install layer stays | ||
# cached until pyproject.toml or uv.lock change. | ||
COPY pyproject.toml uv.lock ./ | ||
|
||
# Create the virtual environment, install pip/uv into it, then install the project | ||
# dependencies exactly as pinned in the copied uv.lock: | ||
#   --frozen    use the lockfile as-is instead of re-resolving during the build | ||
#   --no-cache  do not leave uv's download cache in the image layer | ||
RUN python3 -m venv /app/.venv && \ | ||
    /app/.venv/bin/pip install --no-cache-dir --upgrade pip uv && \ | ||
    /app/.venv/bin/uv sync --frozen --no-cache --python /app/.venv/bin/python | ||
|
||
# Copy the project files (the build context) into the working directory. Done after | ||
# the dependency install so source edits do not invalidate that layer. | ||
COPY . . | ||
|
||
# Runtime environment, set in a single instruction: | ||
#   PYTHONPATH       - project root, so its modules are importable | ||
#   PATH             - virtual environment executables take precedence | ||
#   PYTHONUNBUFFERED - Python output is not buffered | ||
ENV PYTHONPATH=/app \ | ||
    PATH="/app/.venv/bin:$PATH" \ | ||
    PYTHONUNBUFFERED=1 | ||
|
||
# Expose the application port (documentation only; publishing still requires `-p`). | ||
# NOTE(review): nothing in this Dockerfile starts a server on 8000 — confirm the port. | ||
EXPOSE 8000 | ||
|
||
# Default command to allow debugging: `tail -f /dev/null` blocks forever, keeping the | ||
# container running without starting the application so it can be entered with | ||
# `docker exec`. | ||
CMD ["tail", "-f", "/dev/null"] |
Oops, something went wrong.