diff --git a/README.md b/README.md
index 854b501..1d1c3d7 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,7 @@
 # infravelo-py
 
+[![CI](https://github.com/phansch/infravelo-py/actions/workflows/python.yml/badge.svg)](https://github.com/phansch/infravelo-py/actions/workflows/python.yml)
+
 Some tools based on the [infravelo.de](https://infravelo.de) API and website.
 
 * API fetcher
diff --git a/poetry.lock b/poetry.lock
index 2798e86..e8fe432 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -247,6 +247,19 @@ files = [
 dev = ["pre-commit", "tox"]
 testing = ["pytest", "pytest-benchmark"]
 
+[[package]]
+name = "py-dateutil"
+version = "2.2"
+description = "Extensions to the standard Python datetime module"
+optional = false
+python-versions = "*"
+files = [
+    {file = "py-dateutil-2.2.tar.gz", hash = "sha256:7efa2ca17159c590408cb624de9aa10d360f14097cb70dd7559e632f2cf4b048"},
+]
+
+[package.dependencies]
+six = "*"
+
 [[package]]
 name = "pytest"
 version = "8.1.1"
@@ -288,6 +301,17 @@ urllib3 = ">=1.21.1,<3"
 socks = ["PySocks (>=1.5.6,!=1.5.7)"]
 use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
 
+[[package]]
+name = "six"
+version = "1.16.0"
+description = "Python 2 and 3 compatibility utilities"
+optional = false
+python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*"
+files = [
+    {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"},
+    {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
+]
+
 [[package]]
 name = "soupsieve"
 version = "2.5"
@@ -330,4 +354,4 @@ zstd = ["zstandard (>=0.18.0)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.12"
-content-hash = "f96b6643f3e96f7f811d8a4db898494897a83553fa41b63c9f2052de61a284af"
+content-hash = "6566854348befa86354ac8e9b03659ede8ee9ec33fbadef1a4280e661d42afbb"
diff --git a/pyproject.toml b/pyproject.toml
index 8519874..22af92a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,6 +14,7 @@ packages = [
 python = "^3.12"
 requests = "^2.31.0"
 beautifulsoup4 = "^4.12.3"
+py-dateutil = "^2.2"
 
 [tool.poetry.group.dev.dependencies]
 mypy = "*"
diff --git a/sync/core.py b/sync/core.py
index 36dcc6d..0eab2d5 100644
--- a/sync/core.py
+++ b/sync/core.py
@@ -1,10 +1,9 @@
-from api_fetcher import core as api_fetcher
-from scraper import core as scraper
 from enum import Enum
 from pathlib import Path
 import os
-from urllib.parse import urlparse, unquote
-import requests
+from sync import runner
+
+# Various utility helpers for filesystem paths and project filtering
 
 class ProjectState(Enum):
     DONE = "Abgeschlossen"
@@ -18,10 +17,6 @@ def without_done_projects(project):
 def project_slug_from_url(project_url):
     return project_url.split('/')[-2]
 
-def filename_from_url(url: str) -> str:
-    path = urlparse(url).path
-    return unquote(Path(path).name)
-
 def root_dir() -> Path:
     return Path(__file__).parent.parent
 
@@ -30,24 +25,10 @@ def dir_for_project(project) -> Path:
 
 def create_project_dirs(projects):
     for project in projects:
-        dir_for_project(project).mkdir(parents=True,exist_ok=True)
+        create_dir_if_not_exists(dir_for_project(project))
+
+def create_dir_if_not_exists(path: Path):
+    path.mkdir(parents=True, exist_ok=True)
 
 def run():
-    all_projects = api_fetcher.all_projects();
-    print(f"Downloaded {len(all_projects)} projects from the Infravelo API")
-
-    # Select projects that are not finished yet, to avoid needless scraping
-    projects_to_scrape = list(filter(without_done_projects, all_projects))
-    print(f"About to scrape {len(projects_to_scrape)} projects.")
-
-    create_project_dirs(projects_to_scrape)
-
-    for project in projects_to_scrape:
-        print(f"Scraping {project["link"]}")
-        document_urls = scraper.get_document_urls_from_page(project['link'])
-        for url in document_urls:
-            path = dir_for_project(project) / filename_from_url(url)
-            print(f"Downloading {url} to {path}")
-            response = requests.get(url)
-            with open(path, 'wb') as f:
-                f.write(response.content)
+    runner.run()
diff --git a/sync/document_downloader.py b/sync/document_downloader.py
new file mode 100644
index 0000000..3203bed
--- /dev/null
+++ b/sync/document_downloader.py
@@ -0,0 +1,29 @@
+import requests
+from urllib.parse import urlparse, unquote
+from dateutil.parser import parse as parsedate
+from pathlib import Path
+from sync import core
+
+def filename_from_url(url: str) -> str:
+    path = urlparse(url).path
+    return unquote(Path(path).name)
+
+def last_modified_date(headers) -> str:
+    # Workaround: py-dateutil 2.2 still references collections.Callable,
+    # which moved to collections.abc (https://stackoverflow.com/a/70641487/6830113)
+    import collections
+    collections.Callable = collections.abc.Callable
+    last_modified = headers["last-modified"]
+    url_date = parsedate(last_modified)
+    return url_date.strftime("%Y-%m-%d")
+
+def download(project: dict, url: str):
+    # TODO: Get the latest document on disk
+    response = requests.get(url)
+    filepath = core.dir_for_project(project) / last_modified_date(response.headers) / filename_from_url(url)
+    core.create_dir_if_not_exists(filepath.parent)
+    # TODO: Don't create day dir if no changes happened
+    # if changed_date > last_date, write new file
+    with open(filepath, 'wb') as f:
+        f.write(response.content)
+    print(f"Downloaded {url} to {filepath}")
diff --git a/sync/runner.py b/sync/runner.py
new file mode 100644
index 0000000..12ace4b
--- /dev/null
+++ b/sync/runner.py
@@ -0,0 +1,31 @@
+from sync import document_downloader
+from sync import core
+from api_fetcher import core as api_fetcher
+from scraper import core as scraper
+from datetime import date
+import json
+
+def project_to_json(project: dict):
+    filepath = core.dir_for_project(project) / date.today().strftime("%Y-%m-%d") / "project.json"
+    core.create_dir_if_not_exists(filepath.parent)
+    with open(filepath, "w") as outfile:
+        json.dump(project, outfile)
+
+def run():
+    all_projects = api_fetcher.all_projects()
+    print(f"Downloaded {len(all_projects)} projects from the Infravelo API")
+
+    # Select projects that are not finished yet, to avoid needless scraping
+    projects_to_scrape = list(filter(core.without_done_projects, all_projects))
+    print(f"About to scrape {len(projects_to_scrape)} projects.")
+
+    core.create_project_dirs(projects_to_scrape)
+
+    for project in projects_to_scrape:
+        print(f"Scraping {project['link']}")
+        project_to_json(project)
+        # TODO: Skip project if already checked today
+        # TODO: Download json
+        document_urls = scraper.get_document_urls_from_page(project['link'])
+        for url in document_urls:
+            document_downloader.download(project, url)
diff --git a/tests/test_document_downloader.py b/tests/test_document_downloader.py
new file mode 100644
index 0000000..75df477
--- /dev/null
+++ b/tests/test_document_downloader.py
@@ -0,0 +1,23 @@
+from sync import document_downloader
+
+def test_filename_from_url():
+    url = "https://www.example.com./projekt/oderberger-strasse-8/"
+    result = document_downloader.filename_from_url(url)
+    assert result == "oderberger-strasse-8"
+
+    url = "https://www.example.com./projekt/oderberger-strasse-8/filename.pdf"
+    result = document_downloader.filename_from_url(url)
+    assert result == "filename.pdf"
+
+    url = "https://www.example.com/projekt/oderberger-strasse-8/filename.pdf?1234"
+    result = document_downloader.filename_from_url(url)
+    assert result == "filename.pdf"
+
+    url = "https://www.example.com/projekt/oderberger-strasse-8/filename.pdf?1234#abc"
+    result = document_downloader.filename_from_url(url)
+    assert result == "filename.pdf"
+
+def test_last_modified_date():
+    headers = { "last-modified": "Tue, 30 Apr 2024 07:38:39 GMT" }
+    result = document_downloader.last_modified_date(headers)
+    assert result == '2024-04-30'
diff --git a/tests/test_sync.py b/tests/test_sync.py
index 680981d..218bfe5 100644
--- a/tests/test_sync.py
+++ b/tests/test_sync.py
@@ -1,33 +1,16 @@
-from sync import core as sync
+from sync import core
 from pathlib import Path
 
 def test_filter():
     p1 = [{ 'status': 'Abgeschlossen' }, { 'status': 'in Bau'}]
-    result = list(filter(sync.without_done_projects, p1))
+    result = list(filter(core.without_done_projects, p1))
     assert result == [{ 'status': 'in Bau'}]
 
 def test_project_slug_from_url():
     url = "https://www.example.com/projekt/oderberger-strasse-8/"
-    result = sync.project_slug_from_url(url)
+    result = core.project_slug_from_url(url)
     assert result == "oderberger-strasse-8"
 
-def test_filename_from_url():
-    url = "https://www.example.com./projekt/oderberger-strasse-8/"
-    result = sync.filename_from_url(url)
-    assert result == "oderberger-strasse-8"
-
-    url = "https://www.example.com./projekt/oderberger-strasse-8/filename.pdf"
-    result = sync.filename_from_url(url)
-    assert result == "filename.pdf"
-
-    url = "https://www.example.com/projekt/oderberger-strasse-8/filename.pdf?1234"
-    result = sync.filename_from_url(url)
-    assert result == "filename.pdf"
-
-    url = "https://www.example.com/projekt/oderberger-strasse-8/filename.pdf?1234#abc"
-    result = sync.filename_from_url(url)
-    assert result == "filename.pdf"
-
 def test_root_dir():
-    assert sync.root_dir().name == "infravelo-py"
+    assert core.root_dir().name == "infravelo-py"
 
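
Reviewer note on the resulting on-disk layout: `project_to_json()` files `project.json` under the run date (`date.today()`), while `download()` files each document under its server-reported `Last-Modified` date, so the two can land in different day directories. Roughly, with an invented slug and file name (the base directory comes from `dir_for_project()`, which this PR does not change):

```
<dir_for_project(project)>/
├── 2024-05-02/
│   └── project.json    # named after the day the sync ran
└── 2024-04-30/
    └── lageplan.pdf    # named after the document's Last-Modified date
```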
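A possible follow-up test for the new `download()` helper, stubbing out `requests.get` and `core.dir_for_project` so that no network access or real project directory is needed. The project dict and URL are invented for illustration:

```python
from unittest import mock

from sync import document_downloader

def test_download(tmp_path):
    project = {"link": "https://example.com/projekt/example-1/"}
    url = "https://example.com/documents/plan.pdf"

    # Fake response carrying the header that last_modified_date() reads
    fake_response = mock.Mock()
    fake_response.content = b"%PDF-1.4"
    fake_response.headers = {"last-modified": "Tue, 30 Apr 2024 07:38:39 GMT"}

    with mock.patch("sync.document_downloader.requests.get", return_value=fake_response), \
         mock.patch("sync.core.dir_for_project", return_value=tmp_path):
        document_downloader.download(project, url)

    # The file should land in a directory named after the Last-Modified date
    written = tmp_path / "2024-04-30" / "plan.pdf"
    assert written.read_bytes() == b"%PDF-1.4"
```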
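The `collections.Callable` monkeypatch exists only because py-dateutil 2.2 (last released in 2013) predates the removal of `collections.Callable` in Python 3.10. If the dependency is needed solely to parse the `Last-Modified` header, the standard library already handles that RFC 1123 format; a sketch of a drop-in `last_modified_date` without the workaround or the extra dependency:

```python
from email.utils import parsedate_to_datetime

def last_modified_date(headers) -> str:
    # Last-Modified values look like "Tue, 30 Apr 2024 07:38:39 GMT" (RFC 1123),
    # which email.utils.parsedate_to_datetime parses without third-party packages.
    url_date = parsedate_to_datetime(headers["last-modified"])
    return url_date.strftime("%Y-%m-%d")
```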
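For the `# TODO: Skip project if already checked today` in `runner.py`: since `project_to_json()` already writes `project.json` into a directory named after the run date, that file's presence could serve as the marker. A hypothetical helper (name and placement are assumptions, not part of this PR):

```python
from datetime import date

from sync import core

def already_checked_today(project: dict) -> bool:
    # project_to_json() writes project.json under <project dir>/<YYYY-MM-DD>/,
    # so today's copy existing means this project was already synced today.
    marker = core.dir_for_project(project) / date.today().strftime("%Y-%m-%d") / "project.json"
    return marker.exists()
```

The loop in `run()` could then `continue` past such projects before scraping.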