Merge pull request #3 from decodingml/feat/crawling
feat: Implement crawling
iusztinpaul authored Jan 22, 2025
2 parents 7afda18 + 0435f2e commit 7b4333f
Showing 63 changed files with 2,311 additions and 1,078 deletions.
31 changes: 22 additions & 9 deletions apps/second-brain-offline/Makefile
@@ -10,7 +10,8 @@ export PYTHONPATH = .
# --- Default Values ---

CHECK_DIRS := .
LOCAL_DATA_PATH := data
NOTION_LOCAL_DATA_PATH := data/notion
CRAWLED_LOCAL_DATA_PATH := data/crawled


# --- Utilities ---
@@ -42,16 +43,25 @@ local-infrastructure-stop: local-docker-infrastructure-stop local-zenml-server-s

# --- AWS ---

s3-upload: # Upload a local folder to S3
@echo "Uploading to S3 bucket: $(AWS_S3_BUCKET_NAME)"
uv run python -m tools.use_s3 upload $(LOCAL_DATA_PATH) $(AWS_S3_BUCKET_NAME) --s3-prefix $(AWS_S3_PREFIX)
s3-upload-raw-dataset: # Upload raw Notion dataset from local folder to S3
@echo "Uploading raw Notion dataset to S3 bucket: $(AWS_S3_BUCKET_NAME)/second_brain_course/notion"
uv run python -m tools.use_s3 upload $(NOTION_LOCAL_DATA_PATH) $(AWS_S3_BUCKET_NAME) --s3-prefix second_brain_course/notion

s3-download: # Download from S3 to local folder using AWS
@echo "Downloading from S3 bucket: $(AWS_S3_BUCKET_NAME)"
@echo "######### TRYING S3 at ..... | $(AWS_S3_BUCKET_NAME) $(AWS_S3_PREFIX)/data.zip $(LOCAL_DATA_PATH)"
uv run python -m tools.use_s3 download $(AWS_S3_BUCKET_NAME) $(AWS_S3_PREFIX)/data.zip $(LOCAL_DATA_PATH)
s3-download-raw-dataset: # Download raw Notion dataset from S3 to local folder
@echo "Downloading raw Notion dataset from S3 bucket: $(AWS_S3_BUCKET_NAME)/second_brain_course/notion/notion.zip"
uv run python -m tools.use_s3 download $(AWS_S3_BUCKET_NAME) second_brain_course/notion/notion.zip $(NOTION_LOCAL_DATA_PATH)

download-raw-dataset: s3-download
s3-upload-crawled-dataset: # Upload processed crawled dataset from local folder to S3
@echo "Uploading crawled dataset to S3 bucket: $(AWS_S3_BUCKET_NAME)/second_brain_course/crawled"
uv run python -m tools.use_s3 upload $(CRAWLED_LOCAL_DATA_PATH) $(AWS_S3_BUCKET_NAME) --s3-prefix second_brain_course/crawled

s3-download-crawled-dataset: # Download processed crawled dataset from S3 to local folder
@echo "Downloading crawled dataset from S3 bucket: $(AWS_S3_BUCKET_NAME)/second_brain_course/crawled/crawled.zip"
uv run python -m tools.use_s3 download $(AWS_S3_BUCKET_NAME) second_brain_course/crawled/crawled.zip $(CRAWLED_LOCAL_DATA_PATH)

download-raw-dataset: s3-download-raw-dataset

download-crawled-dataset: s3-download-crawled-dataset

# --- Pipelines ---

@@ -61,6 +71,9 @@ collect-notion-data-pipeline:
etl-pipeline:
uv run python -m tools.run --run-etl-pipeline --no-cache

etl-precomputed-pipeline:
uv run python -m tools.run --run-etl-precomputed-pipeline --no-cache

generate-dataset-pipeline:
uv run python -m tools.run --run-generate-dataset-pipeline --no-cache

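The new `s3-upload-*`/`s3-download-*` targets shell out to `tools.use_s3`, which is not part of this diff. Below is a minimal sketch of what such a helper could look like with `boto3`, assuming the datasets are zipped before upload and unpacked after download (the function names and archive handling are assumptions, not the repository's actual implementation):

```python
# Hypothetical sketch of tools/use_s3.py; the real module is not shown in this diff.
import shutil
from pathlib import Path

import boto3


def upload(local_dir: str, bucket: str, s3_prefix: str) -> None:
    # Zip the local folder and upload the archive under the given S3 prefix.
    archive = shutil.make_archive(Path(local_dir).name, "zip", root_dir=local_dir)
    boto3.client("s3").upload_file(archive, bucket, f"{s3_prefix}/{Path(archive).name}")


def download(bucket: str, s3_key: str, local_dir: str) -> None:
    # Download a zipped dataset from S3 and extract it into the local folder.
    Path(local_dir).mkdir(parents=True, exist_ok=True)
    archive = Path(local_dir) / Path(s3_key).name
    boto3.client("s3").download_file(bucket, s3_key, str(archive))
    shutil.unpack_archive(str(archive), extract_dir=local_dir)
```

The Makefile then drives it with calls such as `uv run python -m tools.use_s3 download $(AWS_S3_BUCKET_NAME) second_brain_course/notion/notion.zip $(NOTION_LOCAL_DATA_PATH)`.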
59 changes: 50 additions & 9 deletions apps/second-brain-offline/README.md
@@ -12,26 +12,56 @@ To set it up and run

```bash
uv venv .venv-offline
. ./.venv-offline/bin/activate
. ./.venv-offline/bin/activate # or source ./.venv-offline/bin/activate
uv pip install -e .
```

Set up `Crawl4AI` for crawling:
```bash
# Run post-installation setup
uv pip install -U "crawl4ai==0.4.247" # We have to upgrade crawl4ai to support these CLI commands (we couldn't add it to pyproject.toml due to ZenML version incompatibility with Pydantic).
crawl4ai-setup

# Verify your installation
crawl4ai-doctor
```

> [!IMPORTANT]
> Crawling can often fail, both during installation and while running the crawling logic, so you can skip the crawling step and use our pre-computed dataset instead. More on this in the [Running the ML pipelines / Lessons](#running-the-ml-pipelines--lessons) section.
After running the doctor command, you should see something like this:
```console
[INIT].... → Running Crawl4AI health check...
[INIT].... → Crawl4AI 0.4.247
[TEST].... ℹ Testing crawling capabilities...
[EXPORT].. ℹ Exporting PDF and taking screenshot took 0.84s
[FETCH]... ↓ https://crawl4ai.com... | Status: True | Time: 3.91s
[SCRAPE].. ◆ Processed https://crawl4ai.com... | Time: 11ms
[COMPLETE] ● https://crawl4ai.com... | Status: True | Total: 3.92s
[COMPLETE] ● ✅ Crawling test passed!
```
[More on installing Crawl4AI](https://docs.crawl4ai.com/core/installation/)
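
For context, the smallest useful Crawl4AI call (a sketch adapted from the public Crawl4AI docs, not code from this repository) looks like this:

```python
# Minimal Crawl4AI example, adapted from the Crawl4AI docs (not course code).
import asyncio

from crawl4ai import AsyncWebCrawler


async def main() -> None:
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://crawl4ai.com")
        # Crawl4AI returns the page as clean markdown, which is what the ETL
        # pipeline later ingests into MongoDB.
        print(result.markdown[:500])


asyncio.run(main())
```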


## Infrastructure

```bash
make local-infrastructure-up
```

## Running the ML pipelines / Lessons

## Run ZenML pipelines

### Notion (optional)
## Optional - Collect custom Notion data
```bash
make collect-notion-data-pipeline
```

### Populate MongoDB vector index
### Lesson 1

NO CODE


### Lesson 2: Populate MongoDB NoSQL and vector database

```bash
make download-raw-dataset
@@ -42,27 +72,38 @@ make compute-rag-vector-index-pipeline
# Validate using test: make test-rag-vector-index-pipeline
```

## Formatting
Or if you have issues with crawling, you can use our pre-computed dataset to populate MongoDB:
```bash
make download-crawled-dataset
make etl-precomputed-pipeline
make compute-rag-vector-index-pipeline
```

## Utility commands

### Formatting

```
make format-check
make format-fix
```

## Linting
### Linting

```bash
make lint-check
make lint-fix
```

## Tests
### Tests

```bash
make test
```

## Notion
## Others

### Notion

1. Go to [https://www.notion.so/profile](https://www.notion.so/profile).
2. Create an integration following [this tutorial](https://developers.notion.com/docs/authorization).
2 changes: 1 addition & 1 deletion apps/second-brain-offline/configs/collect_notion_data.yaml
@@ -8,5 +8,5 @@ parameters:
- be6505f5e7544b66a75fe0d444aba1b2
- f54dbddcaa4c43c7ae17935716761536
- 31fcaab5a9404d41b922897d32b901b3
output_dir: data/
data_dir: data/
to_s3: true
@@ -1,6 +1,6 @@
parameters:
extract_collection_name: raw_data
load_collection_name: rag_data
extract_collection_name: raw
load_collection_name: rag
processing_batch_size: 256
processing_max_workers: 2
fetch_limit: 100
6 changes: 4 additions & 2 deletions apps/second-brain-offline/configs/etl.yaml
@@ -1,3 +1,5 @@
parameters:
data_directory: data/
load_collection_name: raw_data
data_dir: data/
load_collection_name: raw
to_s3: true
max_workers: 16
3 changes: 3 additions & 0 deletions apps/second-brain-offline/configs/etl_precomputed.yaml
@@ -0,0 +1,3 @@
parameters:
data_dir: data/
load_collection_name: raw
9 changes: 8 additions & 1 deletion apps/second-brain-offline/pipelines/__init__.py
@@ -1,6 +1,13 @@
from .collect_notion_data import collect_notion_data
from .compute_rag_vector_index import compute_rag_vector_index
from .etl import etl
from .etl_precomputed import etl_precomputed
from .generate_dataset import generate_dataset

__all__ = ["collect_notion_data", "etl", "generate_dataset", "compute_rag_vector_index"]
__all__ = [
"collect_notion_data",
"etl",
"etl_precomputed",
"generate_dataset",
"compute_rag_vector_index",
]
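
The Makefile targets reach these pipelines through `tools/run.py`, which is not included in this diff. A sketch of how such a runner could map the `--run-*-pipeline` flags onto the exports above, assuming a `click` CLI and ZenML's `with_options` (the flag names come from the Makefile; everything else is an assumption):

```python
# Hypothetical sketch of tools/run.py; the actual runner is not shown in this diff.
import click

from pipelines import etl, etl_precomputed


@click.command()
@click.option("--run-etl-pipeline", is_flag=True, default=False)
@click.option("--run-etl-precomputed-pipeline", is_flag=True, default=False)
@click.option("--no-cache", is_flag=True, default=False)
def main(run_etl_pipeline: bool, run_etl_precomputed_pipeline: bool, no_cache: bool) -> None:
    if run_etl_pipeline:
        # ZenML picks up the `parameters:` block of the YAML config at run time.
        etl.with_options(config_path="configs/etl.yaml", enable_cache=not no_cache)()
    if run_etl_precomputed_pipeline:
        etl_precomputed.with_options(
            config_path="configs/etl_precomputed.yaml", enable_cache=not no_cache
        )()


if __name__ == "__main__":
    main()
```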
31 changes: 18 additions & 13 deletions apps/second-brain-offline/pipelines/collect_notion_data.py
@@ -4,29 +4,34 @@
from zenml import pipeline

from steps.collect_notion_data import (
extract_notion_pages,
extract_notion_pages_metadata,
save_notion_pages,
extract_notion_documents,
extract_notion_documents_metadata,
)
from steps.infrastructure import upload_to_s3
from steps.infrastructure import save_documents_to_disk, upload_to_s3


@pipeline
def collect_notion_data(
database_ids: list[str], output_dir: Path, to_s3: bool = False
database_ids: list[str], data_dir: Path, to_s3: bool = False
) -> None:
notion_data_dir = data_dir / "notion"
notion_data_dir.mkdir(parents=True, exist_ok=True)

invocation_ids = []
for database_id in database_ids:
for index, database_id in enumerate(database_ids):
logger.info(f"Collecting pages from database '{database_id}'")
pages_metadata = extract_notion_pages_metadata(database_id=database_id)
pages_data = extract_notion_pages(pages_metadata=pages_metadata)
documents_metadata = extract_notion_documents_metadata(database_id=database_id)
documents_data = extract_notion_documents(documents_metadata=documents_metadata)

result = save_notion_pages(
database_id,
pages=pages_data,
output_dir=output_dir,
result = save_documents_to_disk(
documents=documents_data,
output_dir=notion_data_dir / f"database_{index}",
)
invocation_ids.append(result.invocation_id)

if to_s3:
upload_to_s3(folder_path=output_dir, after=invocation_ids)
upload_to_s3(
folder_path=notion_data_dir,
s3_prefix="second_brain_course/notion",
after=invocation_ids,
)
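
`save_documents_to_disk` replaces the Notion-specific `save_notion_pages` step with a generic one that this pipeline and the ETL pipeline can share. Its implementation is not part of this diff; a plausible sketch, assuming the documents are Pydantic models serialized to one JSON file each (the naming scheme and model interface are assumptions):

```python
# Hypothetical sketch of a save_documents_to_disk-style step; not the repository's actual code.
from pathlib import Path

from pydantic import BaseModel
from zenml import step


@step
def save_documents_to_disk(documents: list[BaseModel], output_dir: Path) -> None:
    output_dir.mkdir(parents=True, exist_ok=True)
    for index, document in enumerate(documents):
        # One JSON file per document; the real step may derive file names differently.
        (output_dir / f"{index}.json").write_text(document.model_dump_json())
```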
@@ -8,8 +8,8 @@

@pipeline
def compute_rag_vector_index(
extract_collection_name: str = "raw_data",
load_collection_name: str = "vector_index",
extract_collection_name: str = "raw",
load_collection_name: str = "rag",
processing_batch_size: int = 256,
processing_max_workers: int = 10,
fetch_limit: int = 100,
34 changes: 29 additions & 5 deletions apps/second-brain-offline/pipelines/etl.py
@@ -1,13 +1,37 @@
from pathlib import Path

from loguru import logger
from zenml import pipeline

from steps.etl import crawl, read_pages_from_disk
from steps.etl import crawl
from steps.infrastructure import (
ingest_to_mongodb,
read_documents_from_disk,
save_documents_to_disk,
upload_to_s3,
)


@pipeline
def etl(data_directory: str, load_collection_name: str) -> None:
pages = read_pages_from_disk(data_directory=data_directory)
documents = crawl(pages=pages)
ingest_to_mongodb(documents=documents, collection_name=load_collection_name)
def etl(
data_dir: Path,
load_collection_name: str,
to_s3: bool = False,
max_workers: int = 10,
) -> None:
notion_data_dir = data_dir / "notion"
logger.info(f"Reading notion data from {notion_data_dir}")
crawled_data_dir = data_dir / "crawled"
logger.info(f"Saving crawled data to {crawled_data_dir}")

documents = read_documents_from_disk(data_directory=notion_data_dir, nesting_level=1)
augmented_documents = crawl(documents=documents, max_workers=max_workers)

save_documents_to_disk(documents=augmented_documents, output_dir=crawled_data_dir)
if to_s3:
upload_to_s3(
folder_path=crawled_data_dir,
s3_prefix="second_brain_course/crawled",
after="save_documents_to_disk",
)
ingest_to_mongodb(models=augmented_documents, collection_name=load_collection_name)
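
Note the `nesting_level` argument to `read_documents_from_disk`: the Notion export is stored one folder deep per database (`database_0/`, `database_1/`, ...), while the crawled dataset is flat. The step itself is not shown in this diff; a rough sketch of what that parameter could mean (the directory layout and JSON serialization are assumptions):

```python
# Hypothetical sketch of read_documents_from_disk; the real step is not shown in this diff.
import json
from pathlib import Path

from zenml import step


@step
def read_documents_from_disk(data_directory: Path, nesting_level: int = 0) -> list[dict]:
    # nesting_level=0 reads *.json directly from data_directory;
    # nesting_level=1 descends one folder deeper first (e.g. data/notion/database_0/*.json).
    pattern = "/".join(["*"] * nesting_level + ["*.json"])
    return [json.loads(path.read_text()) for path in sorted(Path(data_directory).glob(pattern))]
```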
17 changes: 17 additions & 0 deletions apps/second-brain-offline/pipelines/etl_precomputed.py
@@ -0,0 +1,17 @@
from pathlib import Path

from zenml import pipeline

from steps.infrastructure import ingest_to_mongodb, read_documents_from_disk


@pipeline
def etl_precomputed(
data_dir: Path,
load_collection_name: str,
) -> None:
crawled_data_dir = data_dir / "crawled"
documents = read_documents_from_disk(
data_directory=crawled_data_dir, nesting_level=0
)
ingest_to_mongodb(models=documents, collection_name=load_collection_name)
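
Both ETL variants end in `ingest_to_mongodb`, whose keyword argument changed from `documents=` to `models=`. The step is not shown in this diff; a minimal pymongo-based sketch, assuming Pydantic models and a connection string from the environment (the env var and database name are assumptions):

```python
# Hypothetical sketch of ingest_to_mongodb; not the repository's actual implementation.
import os

from pydantic import BaseModel
from pymongo import MongoClient
from zenml import step


@step
def ingest_to_mongodb(models: list[BaseModel], collection_name: str) -> None:
    client = MongoClient(os.environ["MONGODB_URI"])  # assumed environment variable name
    collection = client["second_brain"][collection_name]  # assumed database name
    collection.delete_many({})  # assumption: start from a clean collection on every run
    collection.insert_many([model.model_dump() for model in models])
```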
7 changes: 4 additions & 3 deletions apps/second-brain-offline/pyproject.toml
@@ -1,7 +1,7 @@
[project]
name = "second_brain_course"
name = "second-brain-offline-course"
version = "0.1.0"
description = "Self-paced course on production LLMs and RAG by teaching you how to build an AI assistant on top of your second brain."
description = "Self-paced course on production LLMs and RAG by teaching you how to build an AI assistant on top of your Notion second brain."
readme = "README.md"
authors = [
{name = "Paul Iusztin", email = "[email protected]"},
@@ -15,12 +15,13 @@ dependencies = [
"pydantic>=2.8.2",
"pydantic-settings>=2.7.0",
"pymongo>=4.4.0",
"zenml[server]>=0.72.0",
"zenml[server]>=0.73.0",
"boto3>=1.36.0",
"langchain>=0.3.14",
"langchain-mongodb>=0.4.0",
"langchain-openai>=0.3.0",
"ipykernel>=6.29.5",
"crawl4ai>=0.3.745",
]

[dependency-groups]
@@ -0,0 +1,3 @@
from .crawl4ai import Crawl4AICrawler

__all__ = ["Crawl4AICrawler"]