Merge pull request #3 from decodingml/feat/crawling
feat: Implement crawling
iusztinpaul authored Jan 22, 2025
2 parents 7afda18 + 0435f2e commit 7b4333f
Showing 63 changed files with 2,311 additions and 1,078 deletions.
31 changes: 22 additions & 9 deletions apps/second-brain-offline/Makefile
@@ -10,7 +10,8 @@ export PYTHONPATH = .
# --- Default Values ---

CHECK_DIRS := .
LOCAL_DATA_PATH := data
NOTION_LOCAL_DATA_PATH := data/notion
CRAWLED_LOCAL_DATA_PATH := data/crawled


# --- Utilities ---
@@ -42,16 +43,25 @@ local-infrastructure-stop: local-docker-infrastructure-stop local-zenml-server-s

# --- AWS ---

s3-upload: # Upload a local folder to S3
@echo "Uploading to S3 bucket: $(AWS_S3_BUCKET_NAME)"
uv run python -m tools.use_s3 upload $(LOCAL_DATA_PATH) $(AWS_S3_BUCKET_NAME) --s3-prefix $(AWS_S3_PREFIX)
s3-upload-raw-dataset: # Upload raw Notion dataset from local folder to S3
@echo "Uploading raw Notion dataset to S3 bucket: $(AWS_S3_BUCKET_NAME)/second_brain_course/notion"
uv run python -m tools.use_s3 upload $(NOTION_LOCAL_DATA_PATH) $(AWS_S3_BUCKET_NAME) --s3-prefix second_brain_course/notion

s3-download: # Download from S3 to local folder using AWS
@echo "Downloading from S3 bucket: $(AWS_S3_BUCKET_NAME)"
@echo "######### TRYING S3 at ..... | $(AWS_S3_BUCKET_NAME) $(AWS_S3_PREFIX)/data.zip $(LOCAL_DATA_PATH)"
uv run python -m tools.use_s3 download $(AWS_S3_BUCKET_NAME) $(AWS_S3_PREFIX)/data.zip $(LOCAL_DATA_PATH)
s3-download-raw-dataset: # Download raw Notion dataset from S3 to local folder
@echo "Downloading raw Notion dataset from S3 bucket: $(AWS_S3_BUCKET_NAME)/second_brain_course/notion/notion.zip"
uv run python -m tools.use_s3 download $(AWS_S3_BUCKET_NAME) second_brain_course/notion/notion.zip $(NOTION_LOCAL_DATA_PATH)

download-raw-dataset: s3-download
s3-upload-crawled-dataset: # Upload processed crawled dataset from local folder to S3
@echo "Uploading crawled dataset to S3 bucket: $(AWS_S3_BUCKET_NAME)/second_brain_course/crawled"
uv run python -m tools.use_s3 upload $(CRAWLED_LOCAL_DATA_PATH) $(AWS_S3_BUCKET_NAME) --s3-prefix second_brain_course/crawled

s3-download-crawled-dataset: # Download processed crawled dataset from S3 to local folder
@echo "Downloading crawled dataset from S3 bucket: $(AWS_S3_BUCKET_NAME)/second_brain_course/crawled/crawled.zip"
uv run python -m tools.use_s3 download $(AWS_S3_BUCKET_NAME) second_brain_course/crawled/crawled.zip $(CRAWLED_LOCAL_DATA_PATH)

download-raw-dataset: s3-download-raw-dataset

download-crawled-dataset: s3-download-crawled-dataset

# --- Pipelines ---

@@ -61,6 +71,9 @@ collect-notion-data-pipeline:
etl-pipeline:
uv run python -m tools.run --run-etl-pipeline --no-cache

etl-precomputed-pipeline:
uv run python -m tools.run --run-etl-precomputed-pipeline --no-cache

generate-dataset-pipeline:
uv run python -m tools.run --run-generate-dataset-pipeline --no-cache

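The new `s3-upload-*`/`s3-download-*` targets shell out to `tools.use_s3`, which is not part of this diff. Below is a minimal sketch of what such a helper could look like with `boto3`, assuming the datasets are zipped before upload and unpacked after download (the function names and archive handling are assumptions, not the repository's actual implementation):

```python
# Hypothetical sketch of tools/use_s3.py; the real module is not shown in this diff.
import shutil
from pathlib import Path

import boto3


def upload(local_dir: str, bucket: str, s3_prefix: str) -> None:
    # Zip the local folder and upload the archive under the given S3 prefix.
    archive = shutil.make_archive(Path(local_dir).name, "zip", root_dir=local_dir)
    boto3.client("s3").upload_file(archive, bucket, f"{s3_prefix}/{Path(archive).name}")


def download(bucket: str, s3_key: str, local_dir: str) -> None:
    # Download a zipped dataset from S3 and extract it into the local folder.
    Path(local_dir).mkdir(parents=True, exist_ok=True)
    archive = Path(local_dir) / Path(s3_key).name
    boto3.client("s3").download_file(bucket, s3_key, str(archive))
    shutil.unpack_archive(str(archive), extract_dir=local_dir)
```

The Makefile then drives it with calls such as `uv run python -m tools.use_s3 download $(AWS_S3_BUCKET_NAME) second_brain_course/notion/notion.zip $(NOTION_LOCAL_DATA_PATH)`.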
59 changes: 50 additions & 9 deletions apps/second-brain-offline/README.md
@@ -12,26 +12,56 @@ To set it up and run

```bash
uv venv .venv-offline
. ./.venv-offline/bin/activate
. ./.venv-offline/bin/activate # or source ./.venv-offline/bin/activate
uv pip install -e .
```

Set up `Crawl4AI` for crawling:
```bash
# Run post-installation setup
uv pip install -U "crawl4ai==0.4.247" # We have to upgrade crawl4ai to support these CLI commands (we couldn't add it to pyproject.toml due to ZenML version incompatibility with Pydantic).
crawl4ai-setup

# Verify your installation
crawl4ai-doctor
```

> [!IMPORTANT]
> Crawling can often fail, both during installation and while running the crawling logic, so you can skip the crawling step and use our pre-computed dataset instead. More on this in the [Running the ML pipelines / Lessons](#running-the-ml-pipelines--lessons) section.
After running the doctor command, you should see something like this:
```console
[INIT].... → Running Crawl4AI health check...
[INIT].... → Crawl4AI 0.4.247
[TEST].... ℹ Testing crawling capabilities...
[EXPORT].. ℹ Exporting PDF and taking screenshot took 0.84s
[FETCH]... ↓ https://crawl4ai.com... | Status: True | Time: 3.91s
[SCRAPE].. ◆ Processed https://crawl4ai.com... | Time: 11ms
[COMPLETE] ● https://crawl4ai.com... | Status: True | Total: 3.92s
[COMPLETE] ● ✅ Crawling test passed!
```
[More on installing Crawl4AI](https://docs.crawl4ai.com/core/installation/)
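
For context, the smallest useful Crawl4AI call (a sketch adapted from the public Crawl4AI docs, not code from this repository) looks like this:

```python
# Minimal Crawl4AI example, adapted from the Crawl4AI docs (not course code).
import asyncio

from crawl4ai import AsyncWebCrawler


async def main() -> None:
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://crawl4ai.com")
        # Crawl4AI returns the page as clean markdown, which is what the ETL
        # pipeline later ingests into MongoDB.
        print(result.markdown[:500])


asyncio.run(main())
```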


## Infrastructure

```bash
make local-infrastructure-up
```

## Running the ML pipelines / Lessons

## Run ZenML pipelines

### Notion (optional)
## Optional - Collect custom Notion data
```bash
make collect-notion-data-pipeline
```

### Populate MongoDB vector index
### Lesson 1

NO CODE


### Lesson 2: Populate MongoDB NoSQL and vector database

```bash
make download-raw-dataset
@@ -42,27 +72,38 @@ make compute-rag-vector-index-pipeline
# Validate using test: make test-rag-vector-index-pipeline
```

## Formatting
Or if you have issues with crawling, you can use our pre-computed dataset to populate MongoDB:
```bash
make download-crawled-dataset
make etl-precomputed-pipeline
make compute-rag-vector-index-pipeline
```

## Utility commands

### Formatting

```
make format-check
make format-fix
```

## Linting
### Linting

```bash
make lint-check
make lint-fix
```

## Tests
### Tests

```bash
make test
```

## Notion
## Others

### Notion

1. Go to [https://www.notion.so/profile](https://www.notion.so/profile).
2. Create an integration following [this tutorial](https://developers.notion.com/docs/authorization).
2 changes: 1 addition & 1 deletion apps/second-brain-offline/configs/collect_notion_data.yaml
@@ -8,5 +8,5 @@ parameters:
- be6505f5e7544b66a75fe0d444aba1b2
- f54dbddcaa4c43c7ae17935716761536
- 31fcaab5a9404d41b922897d32b901b3
output_dir: data/
data_dir: data/
to_s3: true
@@ -1,6 +1,6 @@
parameters:
extract_collection_name: raw_data
load_collection_name: rag_data
extract_collection_name: raw
load_collection_name: rag
processing_batch_size: 256
processing_max_workers: 2
fetch_limit: 100
6 changes: 4 additions & 2 deletions apps/second-brain-offline/configs/etl.yaml
@@ -1,3 +1,5 @@
parameters:
data_directory: data/
load_collection_name: raw_data
data_dir: data/
load_collection_name: raw
to_s3: true
max_workers: 16
3 changes: 3 additions & 0 deletions apps/second-brain-offline/configs/etl_precomputed.yaml
@@ -0,0 +1,3 @@
parameters:
data_dir: data/
load_collection_name: raw
9 changes: 8 additions & 1 deletion apps/second-brain-offline/pipelines/__init__.py
@@ -1,6 +1,13 @@
from .collect_notion_data import collect_notion_data
from .compute_rag_vector_index import compute_rag_vector_index
from .etl import etl
from .etl_precomputed import etl_precomputed
from .generate_dataset import generate_dataset

__all__ = ["collect_notion_data", "etl", "generate_dataset", "compute_rag_vector_index"]
__all__ = [
"collect_notion_data",
"etl",
"etl_precomputed",
"generate_dataset",
"compute_rag_vector_index",
]
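
The Makefile targets reach these pipelines through `tools/run.py`, which is not included in this diff. A sketch of how such a runner could map the `--run-*-pipeline` flags onto the exports above, assuming a `click` CLI and ZenML's `with_options` (the flag names come from the Makefile; everything else is an assumption):

```python
# Hypothetical sketch of tools/run.py; the actual runner is not shown in this diff.
import click

from pipelines import etl, etl_precomputed


@click.command()
@click.option("--run-etl-pipeline", is_flag=True, default=False)
@click.option("--run-etl-precomputed-pipeline", is_flag=True, default=False)
@click.option("--no-cache", is_flag=True, default=False)
def main(run_etl_pipeline: bool, run_etl_precomputed_pipeline: bool, no_cache: bool) -> None:
    if run_etl_pipeline:
        # ZenML picks up the `parameters:` block of the YAML config at run time.
        etl.with_options(config_path="configs/etl.yaml", enable_cache=not no_cache)()
    if run_etl_precomputed_pipeline:
        etl_precomputed.with_options(
            config_path="configs/etl_precomputed.yaml", enable_cache=not no_cache
        )()


if __name__ == "__main__":
    main()
```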
31 changes: 18 additions & 13 deletions apps/second-brain-offline/pipelines/collect_notion_data.py
@@ -4,29 +4,34 @@
from zenml import pipeline

from steps.collect_notion_data import (
extract_notion_pages,
extract_notion_pages_metadata,
save_notion_pages,
extract_notion_documents,
extract_notion_documents_metadata,
)
from steps.infrastructure import upload_to_s3
from steps.infrastructure import save_documents_to_disk, upload_to_s3


@pipeline
def collect_notion_data(
database_ids: list[str], output_dir: Path, to_s3: bool = False
database_ids: list[str], data_dir: Path, to_s3: bool = False
) -> None:
notion_data_dir = data_dir / "notion"
notion_data_dir.mkdir(parents=True, exist_ok=True)

invocation_ids = []
for database_id in database_ids:
for index, database_id in enumerate(database_ids):
logger.info(f"Collecting pages from database '{database_id}'")
pages_metadata = extract_notion_pages_metadata(database_id=database_id)
pages_data = extract_notion_pages(pages_metadata=pages_metadata)
documents_metadata = extract_notion_documents_metadata(database_id=database_id)
documents_data = extract_notion_documents(documents_metadata=documents_metadata)

result = save_notion_pages(
database_id,
pages=pages_data,
output_dir=output_dir,
result = save_documents_to_disk(
documents=documents_data,
output_dir=notion_data_dir / f"database_{index}",
)
invocation_ids.append(result.invocation_id)

if to_s3:
upload_to_s3(folder_path=output_dir, after=invocation_ids)
upload_to_s3(
folder_path=notion_data_dir,
s3_prefix="second_brain_course/notion",
after=invocation_ids,
)
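
`save_documents_to_disk` replaces the Notion-specific `save_notion_pages` step with a generic one that this pipeline and the ETL pipeline can share. Its implementation is not part of this diff; a plausible sketch, assuming the documents are Pydantic models serialized to one JSON file each (the naming scheme and model interface are assumptions):

```python
# Hypothetical sketch of a save_documents_to_disk-style step; not the repository's actual code.
from pathlib import Path

from pydantic import BaseModel
from zenml import step


@step
def save_documents_to_disk(documents: list[BaseModel], output_dir: Path) -> None:
    output_dir.mkdir(parents=True, exist_ok=True)
    for index, document in enumerate(documents):
        # One JSON file per document; the real step may derive file names differently.
        (output_dir / f"{index}.json").write_text(document.model_dump_json())
```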
@@ -8,8 +8,8 @@

@pipeline
def compute_rag_vector_index(
extract_collection_name: str = "raw_data",
load_collection_name: str = "vector_index",
extract_collection_name: str = "raw",
load_collection_name: str = "rag",
processing_batch_size: int = 256,
processing_max_workers: int = 10,
fetch_limit: int = 100,
34 changes: 29 additions & 5 deletions apps/second-brain-offline/pipelines/etl.py
@@ -1,13 +1,37 @@
from pathlib import Path

from loguru import logger
from zenml import pipeline

from steps.etl import crawl, read_pages_from_disk
from steps.etl import crawl
from steps.infrastructure import (
ingest_to_mongodb,
read_documents_from_disk,
save_documents_to_disk,
upload_to_s3,
)


@pipeline
def etl(data_directory: str, load_collection_name: str) -> None:
pages = read_pages_from_disk(data_directory=data_directory)
documents = crawl(pages=pages)
ingest_to_mongodb(documents=documents, collection_name=load_collection_name)
def etl(
data_dir: Path,
load_collection_name: str,
to_s3: bool = False,
max_workers: int = 10,
) -> None:
notion_data_dir = data_dir / "notion"
logger.info(f"Reading notion data from {notion_data_dir}")
crawled_data_dir = data_dir / "crawled"
logger.info(f"Saving crawled data to {crawled_data_dir}")

documents = read_documents_from_disk(data_directory=notion_data_dir, nesting_level=1)
augmented_documents = crawl(documents=documents, max_workers=max_workers)

save_documents_to_disk(documents=augmented_documents, output_dir=crawled_data_dir)
if to_s3:
upload_to_s3(
folder_path=crawled_data_dir,
s3_prefix="second_brain_course/crawled",
after="save_documents_to_disk",
)
ingest_to_mongodb(models=augmented_documents, collection_name=load_collection_name)
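
Note the `nesting_level` argument to `read_documents_from_disk`: the Notion export is stored one folder deep per database (`database_0/`, `database_1/`, ...), while the crawled dataset is flat. The step itself is not shown in this diff; a rough sketch of what that parameter could mean (the directory layout and JSON serialization are assumptions):

```python
# Hypothetical sketch of read_documents_from_disk; the real step is not shown in this diff.
import json
from pathlib import Path

from zenml import step


@step
def read_documents_from_disk(data_directory: Path, nesting_level: int = 0) -> list[dict]:
    # nesting_level=0 reads *.json directly from data_directory;
    # nesting_level=1 descends one folder deeper first (e.g. data/notion/database_0/*.json).
    pattern = "/".join(["*"] * nesting_level + ["*.json"])
    return [json.loads(path.read_text()) for path in sorted(Path(data_directory).glob(pattern))]
```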
17 changes: 17 additions & 0 deletions apps/second-brain-offline/pipelines/etl_precomputed.py
@@ -0,0 +1,17 @@
from pathlib import Path

from zenml import pipeline

from steps.infrastructure import ingest_to_mongodb, read_documents_from_disk


@pipeline
def etl_precomputed(
data_dir: Path,
load_collection_name: str,
) -> None:
crawled_data_dir = data_dir / "crawled"
documents = read_documents_from_disk(
data_directory=crawled_data_dir, nesting_level=0
)
ingest_to_mongodb(models=documents, collection_name=load_collection_name)
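
Both ETL variants end in `ingest_to_mongodb`, whose keyword argument changed from `documents=` to `models=`. The step is not shown in this diff; a minimal pymongo-based sketch, assuming Pydantic models and a connection string from the environment (the env var and database name are assumptions):

```python
# Hypothetical sketch of ingest_to_mongodb; not the repository's actual implementation.
import os

from pydantic import BaseModel
from pymongo import MongoClient
from zenml import step


@step
def ingest_to_mongodb(models: list[BaseModel], collection_name: str) -> None:
    client = MongoClient(os.environ["MONGODB_URI"])  # assumed environment variable name
    collection = client["second_brain"][collection_name]  # assumed database name
    collection.delete_many({})  # assumption: start from a clean collection on every run
    collection.insert_many([model.model_dump() for model in models])
```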
7 changes: 4 additions & 3 deletions apps/second-brain-offline/pyproject.toml
@@ -1,7 +1,7 @@
[project]
name = "second_brain_course"
name = "second-brain-offline-course"
version = "0.1.0"
description = "Self-paced course on production LLMs and RAG by teaching you how to build an AI assistant on top of your second brain."
description = "Self-paced course on production LLMs and RAG by teaching you how to build an AI assistant on top of your Notion second brain."
readme = "README.md"
authors = [
{name = "Paul Iusztin", email = "[email protected]"},
@@ -15,12 +15,13 @@ dependencies = [
"pydantic>=2.8.2",
"pydantic-settings>=2.7.0",
"pymongo>=4.4.0",
"zenml[server]>=0.72.0",
"zenml[server]>=0.73.0",
"boto3>=1.36.0",
"langchain>=0.3.14",
"langchain-mongodb>=0.4.0",
"langchain-openai>=0.3.0",
"ipykernel>=6.29.5",
"crawl4ai>=0.3.745",
]

[dependency-groups]
@@ -0,0 +1,3 @@
from .crawl4ai import Crawl4AICrawler

__all__ = ["Crawl4AICrawler"]