Skip to content

Commit

Permalink
Merge pull request #10 from phansch/save-json-etc
Browse files Browse the repository at this point in the history
Save json, save files to date dir, refactor
  • Loading branch information
phansch authored Apr 30, 2024
2 parents 295d546 + 14c4956 commit ebde728
Show file tree
Hide file tree
Showing 8 changed files with 123 additions and 49 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# infravelo-py

[![CI](https://github.com/phansch/infravelo-py/actions/workflows/python.yml/badge.svg)](https://github.com/phansch/infravelo-py/actions/workflows/python.yml)

Some tools based on the [infravelo.de](https://infravelo.de) API and website.

* API fetcher
Expand Down
26 changes: 25 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ packages = [
python = "^3.12"
requests = "^2.31.0"
beautifulsoup4 = "^4.12.3"
python-dateutil = "^2.8"

[tool.poetry.group.dev.dependencies]
mypy = "*"
Expand Down
35 changes: 8 additions & 27 deletions sync/core.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
from api_fetcher import core as api_fetcher
from scraper import core as scraper
from enum import Enum
from pathlib import Path
import os
from urllib.parse import urlparse, unquote
import requests
from sync import runner

# Various util methods for filesystem and request things

class ProjectState(Enum):
DONE = "Abgeschlossen"
Expand All @@ -18,10 +17,6 @@ def without_done_projects(project):
def project_slug_from_url(project_url):
    """Extract the project slug, i.e. the second-to-last path segment of *project_url*."""
    segments = project_url.split('/')
    return segments[-2]

def filename_from_url(url: str) -> str:
path = urlparse(url).path
return unquote(Path(path).name)

def root_dir() -> Path:
    """Return the repository root: two levels above this module's file."""
    return Path(__file__).parents[1]

Expand All @@ -30,24 +25,10 @@ def dir_for_project(project) -> Path:

def create_project_dirs(projects):
    """Ensure an on-disk directory exists for each project in *projects*."""
    for project in projects:
        create_dir_if_not_exists(dir_for_project(project))

def create_dir_if_not_exists(path: Path):
    """Create *path*, including any missing parent directories.

    A no-op when the directory is already present.
    """
    path.mkdir(parents=True, exist_ok=True)

def run():
    """Entry point for the sync tool: delegate the whole sync to the runner module.

    The fetching/scraping/downloading logic lives in ``sync.runner``; this
    thin wrapper is kept so existing callers of ``core.run()`` keep working.
    """
    runner.run()
28 changes: 28 additions & 0 deletions sync/document_downloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import requests
from urllib.parse import urlparse, unquote
from dateutil.parser import parse as parsedate
from pathlib import Path
from sync import core

def filename_from_url(url: str) -> str:
    """Return the URL-decoded final path component of *url*.

    Query strings and fragments are ignored; for a URL ending in '/'
    the last non-empty segment is returned.
    """
    parsed_path = urlparse(url).path
    basename = Path(parsed_path).name
    return unquote(basename)

def last_modified_date(headers) -> str:
    """Return the response's 'last-modified' header formatted as YYYY-MM-DD.

    Raises KeyError if the header is absent.
    """
    # Parse the RFC-2822 HTTP date with the standard library instead of
    # dateutil.  This removes the previous global monkey-patch of
    # collections.Callable (a workaround for the obsolete py-dateutil
    # package breaking on Python >= 3.10).
    from email.utils import parsedate_to_datetime
    last_modified = headers["last-modified"]
    return parsedate_to_datetime(last_modified).strftime("%Y-%m-%d")

def download(project: dict, url: str):
    """Download *url* into the project's directory, inside a subdirectory
    named after the document's Last-Modified date.

    Raises requests.HTTPError on a non-2xx response so that error pages
    are never written to disk as if they were documents, and KeyError if
    the response has no last-modified header.
    """
    # TODO: Get the latest document on disk
    response = requests.get(url)
    response.raise_for_status()  # don't persist 404/500 bodies as documents
    # NOTE(review): assumes the server always sends a last-modified header
    # for document URLs — confirm against infravelo.de responses.
    filepath = core.dir_for_project(project) / last_modified_date(response.headers) / filename_from_url(url)
    core.create_dir_if_not_exists(filepath.parent)
    # TODO: Don't create day dir if no changes happened
    # if changed_date > last_date, write new file
    with open(filepath, 'wb') as f:
        f.write(response.content)
    print(f"Downloaded {url} to {filepath}")
32 changes: 32 additions & 0 deletions sync/runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from sync import document_downloader
from sync import core
from api_fetcher import core as api_fetcher
from scraper import core as scraper
from datetime import date
import json

def project_to_json(project: dict):
    """Write *project* as JSON into today's date directory under the project dir."""
    date_dir = date.today().strftime("%Y-%m-%d")
    target = core.dir_for_project(project) / date_dir / "project.json"
    core.create_dir_if_not_exists(target.parent)
    with open(target, "w") as outfile:
        outfile.write(json.dumps(project))

def run():
    """Sync entry point.

    Fetches all projects from the infravelo API, filters out finished
    ones, then scrapes each remaining project page and downloads its
    documents, saving a JSON snapshot of every project along the way.
    """
    all_projects = api_fetcher.all_projects()
    print(f"Downloaded {len(all_projects)} projects from the Infravelo API")

    # Select projects that are not finished yet, to avoid needless scraping
    projects_to_scrape = list(filter(core.without_done_projects, all_projects))
    print(f"About to scrape {len(projects_to_scrape)} projects.")

    core.create_project_dirs(projects_to_scrape)

    for project in projects_to_scrape:
        print(f"Scraping {project['link']}")
        project_to_json(project)
        # TODO: Skip project if already checked today
        # TODO: Download json
        document_urls = scraper.get_document_urls_from_page(project['link'])
        for url in document_urls:
            document_downloader.download(project, url)
23 changes: 23 additions & 0 deletions tests/test_document_downloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from sync import document_downloader as document_downloader

def test_filename_from_url():
    # Table-driven: URL -> expected decoded basename.
    cases = {
        "https://www.example.com./projekt/oderberger-strasse-8/": "oderberger-strasse-8",
        "https://www.example.com./projekt/oderberger-strasse-8/filename.pdf": "filename.pdf",
        "https://www.example.com/projekt/oderberger-strasse-8/filename.pdf?1234": "filename.pdf",
        "https://www.example.com/projekt/oderberger-strasse-8/filename.pdf?1234#abc": "filename.pdf",
    }
    for url, expected in cases.items():
        assert document_downloader.filename_from_url(url) == expected

def test_last_modified_date():
    # A typical HTTP Last-Modified header must come back as YYYY-MM-DD.
    result = document_downloader.last_modified_date(
        {"last-modified": "Tue, 30 Apr 2024 07:38:39 GMT"}
    )
    assert result == "2024-04-30"
25 changes: 4 additions & 21 deletions tests/test_sync.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,16 @@
from sync import core as sync
from sync import core
from pathlib import Path

def test_filter():
    projects = [{'status': 'Abgeschlossen'}, {'status': 'in Bau'}]
    remaining = list(filter(core.without_done_projects, projects))
    assert remaining == [{'status': 'in Bau'}]


def test_project_slug_from_url():
    slug = core.project_slug_from_url("https://www.example.com/projekt/oderberger-strasse-8/")
    assert slug == "oderberger-strasse-8"

def test_filename_from_url():
url = "https://www.example.com./projekt/oderberger-strasse-8/"
result = sync.filename_from_url(url)
assert result == "oderberger-strasse-8"

url = "https://www.example.com./projekt/oderberger-strasse-8/filename.pdf"
result = sync.filename_from_url(url)
assert result == "filename.pdf"

url = "https://www.example.com/projekt/oderberger-strasse-8/filename.pdf?1234"
result = sync.filename_from_url(url)
assert result == "filename.pdf"

url = "https://www.example.com/projekt/oderberger-strasse-8/filename.pdf?1234#abc"
result = sync.filename_from_url(url)
assert result == "filename.pdf"

def test_root_dir():
    # The repo checkout directory is named after the project.
    assert core.root_dir().name == "infravelo-py"

0 comments on commit ebde728

Please sign in to comment.