Skip to content

Commit

Permalink
feat(pipeline): Add foundational ZenML-based pipeline stack with Loca…
Browse files Browse the repository at this point in the history
…l MongoDB Atlas integration and related updates

This commit introduces a modular stack for building RAG pipelines.

Implements ingestion and retrieval steps using ZenML.

Integrates with Local MongoDB Atlas for document storage and querying.

Adds Docker support for seamless development and deployment.

Reorders config.py attributes alphabetically based on env-var names.

Ensures comments in config.py align with .env file descriptions for better maintainability.

Updates .gitignore to include relevant entries for the project.

Updates pyproject.toml with required dependencies and configuration adjustments.
  • Loading branch information
Cre4T3Tiv3 committed Jan 15, 2025
1 parent 91025b4 commit b166038
Show file tree
Hide file tree
Showing 14 changed files with 4,035 additions and 61 deletions.
31 changes: 31 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Git-related files and directories
.git

# Secrets and local environment files
# The Dockerfile runs `COPY . .`, so anything not excluded here ends up in an
# image layer. Provide real values at runtime instead (e.g. `docker run
# --env-file .env` or a compose `env_file:` entry). The placeholder-only
# template is kept in the context.
.env
.env.*
!.env.example

# Python bytecode files
# Patterns are matched relative to the context root, so the `**/` prefix is
# required to also exclude matches inside nested packages.
**/__pycache__
**/*.pyc
**/*.pyo
**/*.pyd

# Log files
**/*.log

# Test directories and files
tests/

# Virtual environments
.venv/
*.venv/
venv/

# Build and distribution files
build/
dist/
*.tar.gz

# macOS-related files
**/.DS_Store

# Temporary or large data files
large_data/
temp_files/
79 changes: 78 additions & 1 deletion .env.example
Original file line number Diff line number Diff line change
@@ -1 +1,78 @@
OPENAI_API_KEY=
# .env.example
# Template of the environment variables used by the project.
# Copy this file to `.env` and replace every <placeholder> with a real value.
# Never commit the resulting `.env`: it contains credentials.

# AWS Configuration
# AWS region for cloud services
AWS_REGION="eu-central-1"
# AWS access key for authentication
AWS_ACCESS_KEY="<aws_access_key>"
# AWS secret key for authentication
AWS_SECRET_KEY="<aws_secret_key>"
# ARN for AWS cross-account access role
AWS_CROSS_ACCOUNT_ROLE_ARN="<aws_cross_account_role_arn>"
# Name of the S3 bucket for storing application data
AWS_S3_BUCKET_NAME="notion-second-brain-data"

# CometML Configuration
# API key for CometML integration
COMET_API_KEY="<comet_api_key>"
# CometML project name for tracking experiments
COMET_PROJECT_NAME="<comet_project_name>"

# Data Fetching Limits
# Maximum number of documents to fetch from the database
MAX_FETCH_LIMIT=50

# Default Genre for Querying
# Presumably the genre used when a query does not specify one; verify against config.py
DEFAULT_GENRE="Western"

# Docker and Network Configuration
# Flag to indicate if the application is running inside a Docker container
IS_RUNNING_IN_DOCKER=True
# Docker network for inter-container communication
DOCKER_NETWORK_NAME="zenml_network"

# Enable Configurations
# Flag to enable ingestion from MongoDB Atlas
ENABLE_MONGODB_ATLAS_INGESTION=False
# Flag to enable offline mode (disables online ingestion)
ENABLE_OFFLINE_MODE=True
# Enable or disable structured logging
# NOTE(review): lowercase `false` here vs. `True`/`False` on the other flags;
# confirm the settings loader parses both spellings the same way.
ENABLE_STRUCTURED_LOGGING=false

# GROQ Configuration
# API key for accessing GROQ services
GROQ_API_KEY="<groq_api_key>"

# Hugging Face Configuration
# Token for Hugging Face API
HUGGINGFACE_ACCESS_TOKEN="<huggingface_access_token>"

# Local Data Files
# Path to the local JSON file for offline processing
LOCAL_JSON_FILE_PATH="data/sample_data_set.json"

# MongoDB Offline (local) Configuration
# Connection URI for the local MongoDB instance
# NOTE(review): the host `mongodb-atlas-local` looks like a container/service
# name on the Docker network; confirm it matches the compose setup.
MONGODB_OFFLINE_URI="mongodb://mongodb-atlas-local:27017"
# Name of the offline database
MONGODB_OFFLINE_DATABASE="rag_pipeline"
# Name of the collection in the offline database
MONGODB_OFFLINE_COLLECTION="offline_documents"

# MongoDB Online (Atlas) Configuration
# MongoDB Atlas connection URI; replace <user>, <password>, <cluster> and <database_name>
MONGODB_ONLINE_URI="mongodb+srv://<user>:<password>@<cluster>.ggtk1.mongodb.net/<database_name>?retryWrites=true&w=majority"
# Name of the online database
MONGODB_ONLINE_DATABASE="sample_mflix"
# Name of the collection in the online database
MONGODB_ONLINE_COLLECTION="movies"

# Notion API Configuration
# Secret key for accessing Notion API
NOTION_SECRET_KEY="<notion_secret_key>"

# OpenAI API Configuration
# API key for accessing OpenAI services
OPENAI_API_KEY="<openai_api_key>"
# Model identifier for OpenAI
# NOTE(review): the value carries an `openai-` prefix; confirm the consumer
# expects this form rather than the bare model name.
OPENAI_MODEL_IDENTIFIER="openai-gpt-4o-mini"
20 changes: 7 additions & 13 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
# ruff
.ruff_cache
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down Expand Up @@ -27,6 +25,7 @@ share/python-wheels/
.installed.cfg
*.egg
MANIFEST
model_cache/

# PyInstaller
# Usually these files are written by a python script from a template
Expand All @@ -37,6 +36,7 @@ MANIFEST
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
*.txt

# Unit test / coverage reports
htmlcov/
Expand Down Expand Up @@ -100,7 +100,7 @@ ipython_config.py
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock
uv.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
Expand Down Expand Up @@ -137,6 +137,7 @@ venv/
ENV/
env.bak/
venv.bak/
.vscode/

# Spyder project settings
.spyderproject
Expand Down Expand Up @@ -169,16 +170,9 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# VSCode
.vscode/

# PyPI configuration file
.pypirc

# MacOs
.DS_Store

# Data
faiss/
langflow_poc/
data/
# Logs
logs/
log/
2 changes: 1 addition & 1 deletion .python-version
Original file line number Diff line number Diff line change
@@ -1 +1 @@
3.12
3.12.8
57 changes: 57 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Base Python image
# Pinned to the patch release recorded in .python-version and to the Debian
# "bookworm" variant, so the floating `3.12-slim` tag cannot silently move the
# image onto a different Debian release underneath the apt setup below.
FROM python:3.12.8-slim-bookworm

# Make a RUN step fail when ANY command of a pipeline fails, not only the last
# one. Needed for the `curl | gpg` key import below: without it a failed
# download would be masked by gpg's exit status.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Build-time only: suppress interactive apt prompts without baking the setting
# into the environment of the running container.
ARG DEBIAN_FRONTEND=noninteractive

# Terminal type for interactive sessions inside the container
ENV TERM=xterm

# Update and install system dependencies, then add the MongoDB apt repository
# and install mongosh. Everything happens in one layer so the apt lists removed
# at the end never reach the image.
# `gcc` is not listed separately because build-essential already depends on it.
# NOTE(review): the MongoDB repository line targets Debian "buster" / 6.0 while
# the base image is bookworm. It is kept as in the original; confirm whether a
# bookworm repository should be used instead.
RUN apt-get update && apt-get install -y --no-install-recommends \
        apt-utils \
        build-essential \
        curl \
        dnsutils \
        gnupg \
        iputils-ping \
        libffi-dev \
        libssl-dev \
        nano \
    && curl -fsSL https://pgp.mongodb.com/server-6.0.asc \
        | gpg --dearmor -o /usr/share/keyrings/mongodb-archive-keyring.gpg \
    && echo "deb [signed-by=/usr/share/keyrings/mongodb-archive-keyring.gpg] https://repo.mongodb.org/apt/debian buster/mongodb-org/6.0 main" \
        > /etc/apt/sources.list.d/mongodb-org-6.0.list \
    && apt-get update && apt-get install -y --no-install-recommends \
        mongodb-mongosh \
    && rm -rf /var/lib/apt/lists/*

# Set the working directory inside the container
WORKDIR /app

# Suppress pip warnings about root user
ENV PIP_ROOT_USER_ACTION=ignore

# Copy only the dependency manifests first so the (slow) dependency layer is
# reused from cache until one of these two files changes.
# NOTE(review): uv.lock must be present in the build context; if it is not
# checked in, generate it with `uv lock` before building.
COPY pyproject.toml uv.lock ./

# Create virtual environment and install dependencies
#   --frozen   install exactly what uv.lock records instead of re-resolving
#   --no-cache keep uv's download cache out of the image layer
RUN python3 -m venv /app/.venv \
    && /app/.venv/bin/pip install --no-cache-dir --upgrade pip uv \
    && /app/.venv/bin/uv sync --frozen --no-cache --python /app/.venv/bin/python

# Copy the project files into the container
COPY . .

# Runtime environment:
#   PYTHONPATH        project root, so top-level packages are importable
#   PATH              put the virtual environment's executables first
#   PYTHONUNBUFFERED  ensure Python output is not buffered
ENV PYTHONPATH=/app \
    PATH="/app/.venv/bin:$PATH" \
    PYTHONUNBUFFERED=1

# Expose the application port (documentation only; does not publish the port)
EXPOSE 8000

# Default command keeps the container alive so it can be entered for debugging
# NOTE(review): the container runs as root (no USER instruction); acceptable for
# a development image, revisit before using this image in production.
CMD ["tail", "-f", "/dev/null"]
Loading

0 comments on commit b166038

Please sign in to comment.