-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #3 from decodingml/feat/crawling
feat: Implement crawling
- Loading branch information
Showing
63 changed files
with
2,311 additions
and
1,078 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
4 changes: 2 additions & 2 deletions
4
apps/second-brain-offline/configs/compute_rag_vector_index.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,5 @@ | ||
parameters: | ||
data_directory: data/ | ||
load_collection_name: raw_data | ||
data_dir: data/ | ||
load_collection_name: raw | ||
to_s3: true | ||
max_workers: 16 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
parameters: | ||
data_dir: data/ | ||
load_collection_name: raw |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,13 @@ | ||
from .collect_notion_data import collect_notion_data | ||
from .compute_rag_vector_index import compute_rag_vector_index | ||
from .etl import etl | ||
from .etl_precomputed import etl_precomputed | ||
from .generate_dataset import generate_dataset | ||
|
||
__all__ = ["collect_notion_data", "etl", "generate_dataset", "compute_rag_vector_index"] | ||
__all__ = [ | ||
"collect_notion_data", | ||
"etl", | ||
"etl_precomputed", | ||
"generate_dataset", | ||
"compute_rag_vector_index", | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,37 @@ | ||
from pathlib import Path | ||
|
||
from loguru import logger | ||
from zenml import pipeline | ||
|
||
from steps.etl import crawl, read_pages_from_disk | ||
from steps.etl import crawl | ||
from steps.infrastructure import ( | ||
ingest_to_mongodb, | ||
read_documents_from_disk, | ||
save_documents_to_disk, | ||
upload_to_s3, | ||
) | ||
|
||
|
||
@pipeline | ||
def etl(data_directory: str, load_collection_name: str) -> None: | ||
pages = read_pages_from_disk(data_directory=data_directory) | ||
documents = crawl(pages=pages) | ||
ingest_to_mongodb(documents=documents, collection_name=load_collection_name) | ||
def etl( | ||
data_dir: Path, | ||
load_collection_name: str, | ||
to_s3: bool = False, | ||
max_workers: int = 10, | ||
) -> None: | ||
notion_data_dir = data_dir / "notion" | ||
logger.info(f"Reading notion data from {notion_data_dir}") | ||
crawled_data_dir = data_dir / "crawled" | ||
logger.info(f"Saving crawled data to {crawled_data_dir}") | ||
|
||
documents = read_documents_from_disk(data_directory=notion_data_dir, nesting_level=1) | ||
augmented_documents = crawl(documents=documents, max_workers=max_workers) | ||
|
||
save_documents_to_disk(documents=augmented_documents, output_dir=crawled_data_dir) | ||
if to_s3: | ||
upload_to_s3( | ||
folder_path=crawled_data_dir, | ||
s3_prefix="second_brain_course/crawled", | ||
after="save_documents_to_disk", | ||
) | ||
ingest_to_mongodb(models=augmented_documents, collection_name=load_collection_name) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
from pathlib import Path | ||
|
||
from zenml import pipeline | ||
|
||
from steps.infrastructure import ingest_to_mongodb, read_documents_from_disk | ||
|
||
|
||
@pipeline | ||
def etl_precomputed( | ||
data_dir: Path, | ||
load_collection_name: str, | ||
) -> None: | ||
crawled_data_dir = data_dir / "crawled" | ||
documents = read_documents_from_disk( | ||
data_directory=crawled_data_dir, nesting_level=0 | ||
) | ||
ingest_to_mongodb(models=documents, collection_name=load_collection_name) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
[project] | ||
name = "second_brain_course" | ||
name = "second-brain-offline-course" | ||
version = "0.1.0" | ||
description = "Self-paced course on production LLMs and RAG by teaching you how to build an AI assistant on top of your second brain." | ||
description = "Self-paced course on production LLMs and RAG by teaching you how to build an AI assistant on top of your Notion second brain." | ||
readme = "README.md" | ||
authors = [ | ||
{name = "Paul Iusztin", email = "[email protected]"}, | ||
|
@@ -15,12 +15,13 @@ dependencies = [ | |
"pydantic>=2.8.2", | ||
"pydantic-settings>=2.7.0", | ||
"pymongo>=4.4.0", | ||
"zenml[server]>=0.72.0", | ||
"zenml[server]>=0.73.0", | ||
"boto3>=1.36.0", | ||
"langchain>=0.3.14", | ||
"langchain-mongodb>=0.4.0", | ||
"langchain-openai>=0.3.0", | ||
"ipykernel>=6.29.5", | ||
"crawl4ai>=0.3.745", | ||
] | ||
|
||
[dependency-groups] | ||
|
3 changes: 3 additions & 0 deletions
3
apps/second-brain-offline/src/second_brain_offline/application/crawlers/__init__.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
from .crawl4ai import Crawl4AICrawler | ||
|
||
__all__ = ["Crawl4AICrawler"] |
Oops, something went wrong.