Skip to content

Commit

Permalink
feat: Improve collecting data from Notion
Browse files Browse the repository at this point in the history
  • Loading branch information
iusztinpaul committed Jan 3, 2025
1 parent 7be4827 commit 9830b87
Show file tree
Hide file tree
Showing 11 changed files with 129 additions and 64 deletions.
Binary file added .DS_Store
Binary file not shown.
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
export PYTHONPATH = .
check_dirs := src

help:
@grep -E '^[a-zA-Z0-9 -]+:.*#' Makefile | sort | while read -r l; do printf "\033[1;32m$$(echo $$l | cut -f 1 -d':')\033[00m:$$(echo $$l | cut -f 2- -d'#')\n"; done

# --- Infrastructure ---

local-docker-infrastructure-up:
Expand Down
8 changes: 7 additions & 1 deletion configs/collect_notion.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
parameters:
database_ids:
- 7ec04ecd2d00406284674b1a614f1ec7
# - 7ec04ecd2d00406284674b1a614f1ec7  # NOTE(review): duplicate of the id above — remove, or replace with the intended second database id
# - 140bc009043f80e1b14bd568381b93a8
# - 6d8dd6ae29c442b1baea9c610dd1e92d
# - 20438dae7f8842838593af3411754867
# - e9b2f80fb10e474ba95c7c219c9c6462
# - be6505f5e7544b66a75fe0d444aba1b2
# - f54dbddcaa4c43c7ae17935716761536
# - 31fcaab5a9404d41b922897d32b901b3
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ requires-python = ">=3.12"
dependencies = [
"click>=8.1.3",
"loguru>=0.7.3",
"pydantic>=2.8.2",
"pydantic-settings>=2.7.0",
"zenml[server]>=0.71.0",
]
Expand Down
3 changes: 3 additions & 0 deletions src/second_brain/entities/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .page import Page

__all__ = ["Page"]
18 changes: 18 additions & 0 deletions src/second_brain/entities/page.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import json
from pathlib import Path

from pydantic import BaseModel


class Page(BaseModel):
    """A collected Notion page: extracted text plus parsing metadata."""

    # Concatenated plain-text content extracted from the page's blocks.
    content: str
    # Metadata gathered while parsing (e.g. collected URLs).
    metadata: dict

    def write(self, file_path: Path) -> None:
        """Serialize this page to *file_path* as pretty-printed UTF-8 JSON."""

        payload = {"content": self.content, "metadata": self.metadata}
        serialized = json.dumps(payload, indent=4, ensure_ascii=False)
        file_path.write_text(serialized, encoding="utf-8")
16 changes: 16 additions & 0 deletions src/second_brain/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
def merge_dicts(dict1: dict, dict2: dict) -> dict:
    """Recursively merge two dictionaries with list handling.

    Values from *dict2* win on conflict, except that nested dicts are merged
    recursively and two lists are concatenated. *dict1* is not mutated
    (top-level copy; nested values may still be shared with the inputs).
    """
    merged = dict(dict1)

    for key, incoming in dict2.items():
        if key not in merged:
            merged[key] = incoming
            continue

        existing = merged[key]
        if isinstance(existing, dict) and isinstance(incoming, dict):
            # Both sides are mappings: descend and merge them key by key.
            merged[key] = merge_dicts(existing, incoming)
        elif isinstance(existing, list) and isinstance(incoming, list):
            # Both sides are lists: keep every element, dict1's items first.
            merged[key] = existing + incoming
        else:
            # Type mismatch or scalar: the incoming value replaces the old one.
            merged[key] = incoming

    return merged
113 changes: 56 additions & 57 deletions steps/collect_notion/extract_notion_pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,27 @@
from typing_extensions import Annotated
from zenml import step

from second_brain import settings
from second_brain import settings, utils
from second_brain.entities import Page


@step
def extract_notion_pages(
    page_ids: list[str],
) -> Annotated[dict[str, Page], "pages"]:
    """Download and parse every Notion page in *page_ids*.

    Args:
        page_ids: Notion page identifiers to fetch.

    Returns:
        Mapping of page id -> Page entity (flattened text content plus the
        metadata collected while parsing the page's blocks).
    """

    # NOTE(review): this span contained interleaved pre-/post-diff lines
    # (duplicate return annotations and loop bodies); reconstructed the
    # post-commit version so the function is valid Python.
    page_contents: dict[str, Page] = {}
    for page_id in page_ids:
        # A page's top-level content is exposed as its child blocks.
        blocks = _retrieve_child_blocks(page_id)
        content, metadata = _parse_blocks(blocks)
        page_contents[page_id] = Page(content=content, metadata=metadata)

    return page_contents


def _retrieve_page_content(page_id: str, depth: int = 0) -> str:
blocks_url = f"https://api.notion.com/v1/blocks/{page_id}/children?page_size=100"
def _retrieve_child_blocks(block_id: str, page_size: int = 100) -> list[dict]:
blocks_url = (
f"https://api.notion.com/v1/blocks/{block_id}/children?page_size={page_size}"
)
headers = {
"Authorization": f"Bearer {settings.NOTION_SECRET_KEY}",
"Notion-Version": "2022-06-28",
Expand All @@ -28,79 +32,91 @@ def _retrieve_page_content(page_id: str, depth: int = 0) -> str:
blocks_response = requests.get(blocks_url, headers=headers, timeout=10)
blocks_response.raise_for_status()
blocks_data = blocks_response.json()
return parse_blocks(blocks_data.get("results", []), depth)

return blocks_data.get("results", [])
except requests.exceptions.RequestException as e:
error_message = f"Error: Failed to retrieve Notion page content. {e}"
if hasattr(e, "response") and e.response is not None:
error_message += (
f" Status code: {e.response.status_code}, Response: {e.response.text}"
)
return error_message
except Exception as e: # noqa: BLE001
logger.opt(exception=True).debug("Error retrieving Notion page content")
return f"Error: An unexpected error occurred while retrieving Notion page content. {e}"
logger.exception(error_message)

return []
except Exception:
error_message = "Error retrieving Notion page content"
logger.exception(error_message)

return []


def parse_blocks(blocks: list, depth: int = 0) -> str:
def _parse_blocks(blocks: list, depth: int = 0) -> tuple[str, dict]:
content = ""
metadata = {
"urls": [],
}
for block in blocks:
block_type = block.get("type")
block_id = block.get("id")

if block_type == "toggle":
toggle_text = parse_rich_text(block["toggle"].get("rich_text", []))
content += f"▼ {toggle_text}\n\n"

if "has_children" in block and block["has_children"]:
children_blocks = _fetch_block_children(block_id)
children_content = parse_blocks(children_blocks, depth + 1)
content += (
"\n".join(" " + line for line in children_content.split("\n"))
+ "\n\n"
)
elif block_type in {
if block_type in {
"paragraph",
"heading_1",
"heading_2",
"heading_3",
"quote",
}:
text_content = parse_rich_text(block[block_type].get("rich_text", []))
text_content = _parse_rich_text(block[block_type].get("rich_text", []))
content += text_content + "\n\n"
urls = extract_urls(block[block_type].get("rich_text", []))

urls = _extract_urls(block[block_type].get("rich_text", []))
if urls:
content += "\n".join(f"- {url}" for url in urls) + "\n\n"
metadata["urls"].extend(urls)

if "has_children" in block and block["has_children"]:
child_blocks = _retrieve_child_blocks(block_id)
child_content, child_metadata = _parse_blocks(child_blocks, depth + 1)
content += (
"\n".join(" " + line for line in child_content.split("\n"))
+ "\n\n"
)
metadata = utils.merge_dicts(metadata, child_metadata)

elif block_type in {"bulleted_list_item", "numbered_list_item"}:
content += parse_rich_text(block[block_type].get("rich_text", [])) + "\n"
content += _parse_rich_text(block[block_type].get("rich_text", [])) + "\n"
elif block_type == "to_do":
content += parse_rich_text(block["to_do"].get("rich_text", [])) + "\n"
content += _parse_rich_text(block["to_do"].get("rich_text", [])) + "\n"
elif block_type == "code":
content += parse_rich_text(block["code"].get("rich_text", [])) + "\n\n"
content += _parse_rich_text(block["code"].get("rich_text", [])) + "\n\n"
elif block_type == "image":
content += f"[Image: {block['image'].get('external', {}).get('url', 'No URL')}]\n\n"
elif block_type == "divider":
content += "---\n\n"
elif block_type == "child_page" and depth < 3: # Limit recursion depth
elif block_type == "child_page" and depth < 3:
child_id = block.get("id")
child_title = block.get("child_page", {}).get("title", "Untitled")
content += f"\n### {child_title}\n\n"
child_content = _retrieve_page_content(child_id, depth + 1)

child_blocks = _retrieve_child_blocks(child_id)
child_content, child_metadata = _parse_blocks(child_blocks, depth + 1)
content += child_content + "\n\n"
metadata = utils.merge_dicts(metadata, child_metadata)

elif block_type == "child_database":
db_id = block.get("id")
db_title = block.get("child_database", {}).get("title", "Untitled Database")
content += f"\n### Database: {db_title}\n\n"
elif block_type == "link_preview":
url = block.get("link_preview", {}).get("url", "")
content += f"[Link Preview: {url}]\n\n"

return content.strip()
metadata["urls"].append(url)
else:
logger.warning(f"Unknown block type: {block_type}")

return content.strip("\n "), metadata

def parse_rich_text(rich_text: list) -> str:

def _parse_rich_text(rich_text: list) -> str:
return "".join(segment.get("plain_text", "") for segment in rich_text)


def extract_urls(rich_text: list) -> list:
def _extract_urls(rich_text: list) -> list:
"""Extract URLs from rich text blocks."""
urls = []
for text in rich_text:
Expand All @@ -109,20 +125,3 @@ def extract_urls(rich_text: list) -> list:
if "url" in text.get("annotations", {}):
urls.append(text["annotations"]["url"])
return urls


def _fetch_block_children(block_id: str) -> list:
    """Fetch children blocks for a given block ID.

    Calls the Notion "retrieve block children" endpoint for *block_id*.
    Only the first page of results is returned (page_size=100); pagination
    cursors are not followed.

    Args:
        block_id: Identifier of the parent Notion block (or page).

    Returns:
        The list of child block objects, or an empty list if the request fails.
    """
    children_url = f"https://api.notion.com/v1/blocks/{block_id}/children?page_size=100"
    headers = {
        "Authorization": f"Bearer {settings.NOTION_SECRET_KEY}",
        # Notion requires an explicit, pinned API version on every request.
        "Notion-Version": "2022-06-28",
    }
    try:
        children_response = requests.get(children_url, headers=headers, timeout=10)
        children_response.raise_for_status()
        children_data = children_response.json()
        return children_data.get("results", [])
    except requests.exceptions.RequestException as e:
        # Best-effort: log the failure and return no children instead of
        # aborting the whole collection run.
        logger.error(f"Failed to fetch block children: {e}")
        return []
16 changes: 10 additions & 6 deletions steps/collect_notion/save_notion_pages.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,21 @@
import shutil
from pathlib import Path

from zenml import step

from second_brain.entities import Page


@step
def save_notion_pages(
    database_id: str,
    pages: dict[str, Page],
) -> None:
    """Persist extracted Notion pages as JSON files under data/<database_id>/.

    Args:
        database_id: Notion database the pages belong to; used as the
            output folder name.
        pages: Mapping of page id -> Page entity to serialize.
    """

    # NOTE(review): this span contained interleaved pre-/post-diff lines
    # (two `pages:` annotations, both mkdir variants, both write loops);
    # reconstructed the post-commit version so the function is valid Python.
    output_dir = Path("data") / database_id
    # Rebuild the directory from scratch so pages removed upstream
    # don't linger from earlier runs.
    if output_dir.exists():
        shutil.rmtree(output_dir)
    output_dir.mkdir(parents=True)

    for page_id, page in pages.items():
        file_path = output_dir / f"{page_id}.json"
        page.write(file_path)
13 changes: 13 additions & 0 deletions type.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
[
"paragraph",
"heading_3",
"heading_1",
"heading_2",
"child_page",
"image",
"divider",
"code",
"link_preview",
"bulleted_list_item",
"numbered_list_item"
]
2 changes: 2 additions & 0 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 9830b87

Please sign in to comment.