Skip to content

Commit

Permalink
feat: Improve collecting data from Notion
Browse files Browse the repository at this point in the history
  • Loading branch information
iusztinpaul committed Jan 3, 2025
1 parent 7be4827 commit 9830b87
Show file tree
Hide file tree
Showing 11 changed files with 129 additions and 64 deletions.
Binary file added .DS_Store
Binary file not shown.
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
export PYTHONPATH = .
check_dirs := src

help:
@grep -E '^[a-zA-Z0-9 -]+:.*#' Makefile | sort | while read -r l; do printf "\033[1;32m$$(echo $$l | cut -f 1 -d':')\033[00m:$$(echo $$l | cut -f 2- -d'#')\n"; done

# --- Infrastructure ---

local-docker-infrastructure-up:
Expand Down
8 changes: 7 additions & 1 deletion configs/collect_notion.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
parameters:
database_ids:
- 7ec04ecd2d00406284674b1a614f1ec7
# - 7ec04ecd2d00406284674b1a614f1ec7  # NOTE(review): duplicate of the id above — remove, or replace with the intended second database id
# - 140bc009043f80e1b14bd568381b93a8
# - 6d8dd6ae29c442b1baea9c610dd1e92d
# - 20438dae7f8842838593af3411754867
# - e9b2f80fb10e474ba95c7c219c9c6462
# - be6505f5e7544b66a75fe0d444aba1b2
# - f54dbddcaa4c43c7ae17935716761536
# - 31fcaab5a9404d41b922897d32b901b3
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ requires-python = ">=3.12"
dependencies = [
"click>=8.1.3",
"loguru>=0.7.3",
"pydantic>=2.8.2",
"pydantic-settings>=2.7.0",
"zenml[server]>=0.71.0",
]
Expand Down
3 changes: 3 additions & 0 deletions src/second_brain/entities/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .page import Page

__all__ = ["Page"]
18 changes: 18 additions & 0 deletions src/second_brain/entities/page.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import json
from pathlib import Path

from pydantic import BaseModel


class Page(BaseModel):
    """A collected Notion page: extracted text plus parsing metadata."""

    # Concatenated plain-text content extracted from the page's blocks.
    content: str
    # Metadata gathered while parsing (e.g. collected URLs).
    metadata: dict

    def write(self, file_path: Path) -> None:
        """Serialize this page to *file_path* as pretty-printed UTF-8 JSON."""

        payload = {"content": self.content, "metadata": self.metadata}
        serialized = json.dumps(payload, indent=4, ensure_ascii=False)
        file_path.write_text(serialized, encoding="utf-8")
16 changes: 16 additions & 0 deletions src/second_brain/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
def merge_dicts(dict1: dict, dict2: dict) -> dict:
    """Recursively merge two dictionaries with list handling.

    Values from *dict2* win on conflict, except that nested dicts are merged
    recursively and two lists are concatenated. *dict1* is not mutated
    (top-level copy; nested values may still be shared with the inputs).
    """
    merged = dict(dict1)

    for key, incoming in dict2.items():
        if key not in merged:
            merged[key] = incoming
            continue

        existing = merged[key]
        if isinstance(existing, dict) and isinstance(incoming, dict):
            # Both sides are mappings: descend and merge them key by key.
            merged[key] = merge_dicts(existing, incoming)
        elif isinstance(existing, list) and isinstance(incoming, list):
            # Both sides are lists: keep every element, dict1's items first.
            merged[key] = existing + incoming
        else:
            # Type mismatch or scalar: the incoming value replaces the old one.
            merged[key] = incoming

    return merged
113 changes: 56 additions & 57 deletions steps/collect_notion/extract_notion_pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,27 @@
from typing_extensions import Annotated
from zenml import step

from second_brain import settings
from second_brain import settings, utils
from second_brain.entities import Page


@step
def extract_notion_pages(
    page_ids: list[str],
) -> Annotated[dict[str, Page], "pages"]:
    """Download and parse every Notion page in *page_ids*.

    Args:
        page_ids: Notion page identifiers to fetch.

    Returns:
        Mapping of page id -> Page entity (flattened text content plus the
        metadata collected while parsing the page's blocks).
    """

    # NOTE(review): this span contained interleaved pre-/post-diff lines
    # (duplicate return annotations and loop bodies); reconstructed the
    # post-commit version so the function is valid Python.
    page_contents: dict[str, Page] = {}
    for page_id in page_ids:
        # A page's top-level content is exposed as its child blocks.
        blocks = _retrieve_child_blocks(page_id)
        content, metadata = _parse_blocks(blocks)
        page_contents[page_id] = Page(content=content, metadata=metadata)

    return page_contents


def _retrieve_page_content(page_id: str, depth: int = 0) -> str:
blocks_url = f"https://api.notion.com/v1/blocks/{page_id}/children?page_size=100"
def _retrieve_child_blocks(block_id: str, page_size: int = 100) -> list[dict]:
blocks_url = (
f"https://api.notion.com/v1/blocks/{block_id}/children?page_size={page_size}"
)
headers = {
"Authorization": f"Bearer {settings.NOTION_SECRET_KEY}",
"Notion-Version": "2022-06-28",
Expand All @@ -28,79 +32,91 @@ def _retrieve_page_content(page_id: str, depth: int = 0) -> str:
blocks_response = requests.get(blocks_url, headers=headers, timeout=10)
blocks_response.raise_for_status()
blocks_data = blocks_response.json()
return parse_blocks(blocks_data.get("results", []), depth)

return blocks_data.get("results", [])
except requests.exceptions.RequestException as e:
error_message = f"Error: Failed to retrieve Notion page content. {e}"
if hasattr(e, "response") and e.response is not None:
error_message += (
f" Status code: {e.response.status_code}, Response: {e.response.text}"
)
return error_message
except Exception as e: # noqa: BLE001
logger.opt(exception=True).debug("Error retrieving Notion page content")
return f"Error: An unexpected error occurred while retrieving Notion page content. {e}"
logger.exception(error_message)

return []
except Exception:
error_message = "Error retrieving Notion page content"
logger.exception(error_message)

return []


def parse_blocks(blocks: list, depth: int = 0) -> str:
def _parse_blocks(blocks: list, depth: int = 0) -> tuple[str, dict]:
content = ""
metadata = {
"urls": [],
}
for block in blocks:
block_type = block.get("type")
block_id = block.get("id")

if block_type == "toggle":
toggle_text = parse_rich_text(block["toggle"].get("rich_text", []))
content += f"▼ {toggle_text}\n\n"

if "has_children" in block and block["has_children"]:
children_blocks = _fetch_block_children(block_id)
children_content = parse_blocks(children_blocks, depth + 1)
content += (
"\n".join(" " + line for line in children_content.split("\n"))
+ "\n\n"
)
elif block_type in {
if block_type in {
"paragraph",
"heading_1",
"heading_2",
"heading_3",
"quote",
}:
text_content = parse_rich_text(block[block_type].get("rich_text", []))
text_content = _parse_rich_text(block[block_type].get("rich_text", []))
content += text_content + "\n\n"
urls = extract_urls(block[block_type].get("rich_text", []))

urls = _extract_urls(block[block_type].get("rich_text", []))
if urls:
content += "\n".join(f"- {url}" for url in urls) + "\n\n"
metadata["urls"].extend(urls)

if "has_children" in block and block["has_children"]:
child_blocks = _retrieve_child_blocks(block_id)
child_content, child_metadata = _parse_blocks(child_blocks, depth + 1)
content += (
"\n".join(" " + line for line in child_content.split("\n"))
+ "\n\n"
)
metadata = utils.merge_dicts(metadata, child_metadata)

elif block_type in {"bulleted_list_item", "numbered_list_item"}:
content += parse_rich_text(block[block_type].get("rich_text", [])) + "\n"
content += _parse_rich_text(block[block_type].get("rich_text", [])) + "\n"
elif block_type == "to_do":
content += parse_rich_text(block["to_do"].get("rich_text", [])) + "\n"
content += _parse_rich_text(block["to_do"].get("rich_text", [])) + "\n"
elif block_type == "code":
content += parse_rich_text(block["code"].get("rich_text", [])) + "\n\n"
content += _parse_rich_text(block["code"].get("rich_text", [])) + "\n\n"
elif block_type == "image":
content += f"[Image: {block['image'].get('external', {}).get('url', 'No URL')}]\n\n"
elif block_type == "divider":
content += "---\n\n"
elif block_type == "child_page" and depth < 3: # Limit recursion depth
elif block_type == "child_page" and depth < 3:
child_id = block.get("id")
child_title = block.get("child_page", {}).get("title", "Untitled")
content += f"\n### {child_title}\n\n"
child_content = _retrieve_page_content(child_id, depth + 1)

child_blocks = _retrieve_child_blocks(child_id)
child_content, child_metadata = _parse_blocks(child_blocks, depth + 1)
content += child_content + "\n\n"
metadata = utils.merge_dicts(metadata, child_metadata)

elif block_type == "child_database":
db_id = block.get("id")
db_title = block.get("child_database", {}).get("title", "Untitled Database")
content += f"\n### Database: {db_title}\n\n"
elif block_type == "link_preview":
url = block.get("link_preview", {}).get("url", "")
content += f"[Link Preview: {url}]\n\n"

return content.strip()
metadata["urls"].append(url)
else:
logger.warning(f"Unknown block type: {block_type}")

return content.strip("\n "), metadata

def parse_rich_text(rich_text: list) -> str:

def _parse_rich_text(rich_text: list) -> str:
return "".join(segment.get("plain_text", "") for segment in rich_text)


def extract_urls(rich_text: list) -> list:
def _extract_urls(rich_text: list) -> list:
"""Extract URLs from rich text blocks."""
urls = []
for text in rich_text:
Expand All @@ -109,20 +125,3 @@ def extract_urls(rich_text: list) -> list:
if "url" in text.get("annotations", {}):
urls.append(text["annotations"]["url"])
return urls


def _fetch_block_children(block_id: str) -> list:
    """Fetch children blocks for a given block ID.

    Calls the Notion "retrieve block children" endpoint for *block_id*.
    Only the first page of results is returned (page_size=100); pagination
    cursors are not followed.

    Args:
        block_id: Identifier of the parent Notion block (or page).

    Returns:
        The list of child block objects, or an empty list if the request fails.
    """
    children_url = f"https://api.notion.com/v1/blocks/{block_id}/children?page_size=100"
    headers = {
        "Authorization": f"Bearer {settings.NOTION_SECRET_KEY}",
        # Notion requires an explicit, pinned API version on every request.
        "Notion-Version": "2022-06-28",
    }
    try:
        children_response = requests.get(children_url, headers=headers, timeout=10)
        children_response.raise_for_status()
        children_data = children_response.json()
        return children_data.get("results", [])
    except requests.exceptions.RequestException as e:
        # Best-effort: log the failure and return no children instead of
        # aborting the whole collection run.
        logger.error(f"Failed to fetch block children: {e}")
        return []
16 changes: 10 additions & 6 deletions steps/collect_notion/save_notion_pages.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,21 @@
import shutil
from pathlib import Path

from zenml import step

from second_brain.entities import Page


@step
def save_notion_pages(
    database_id: str,
    pages: dict[str, Page],
) -> None:
    """Persist extracted Notion pages as JSON files under data/<database_id>/.

    Args:
        database_id: Notion database the pages belong to; used as the
            output folder name.
        pages: Mapping of page id -> Page entity to serialize.
    """

    # NOTE(review): this span contained interleaved pre-/post-diff lines
    # (two `pages:` annotations, both mkdir variants, both write loops);
    # reconstructed the post-commit version so the function is valid Python.
    output_dir = Path("data") / database_id
    # Rebuild the directory from scratch so pages removed upstream
    # don't linger from earlier runs.
    if output_dir.exists():
        shutil.rmtree(output_dir)
    output_dir.mkdir(parents=True)

    for page_id, page in pages.items():
        file_path = output_dir / f"{page_id}.json"
        page.write(file_path)
13 changes: 13 additions & 0 deletions type.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
[
"paragraph",
"heading_3",
"heading_1",
"heading_2",
"child_page",
"image",
"divider",
"code",
"link_preview",
"bulleted_list_item",
"numbered_list_item"
]
2 changes: 2 additions & 0 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 9830b87

Please sign in to comment.