Skip to content

Commit

Permalink
Merge pull request #10 from phansch/save-json-etc
Browse files Browse the repository at this point in the history
Save json, save files to date dir, refactor
  • Loading branch information
phansch authored Apr 30, 2024
2 parents 295d546 + 14c4956 commit ebde728
Show file tree
Hide file tree
Showing 8 changed files with 123 additions and 49 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# infravelo-py

[![CI](https://github.com/phansch/infravelo-py/actions/workflows/python.yml/badge.svg)](https://github.com/phansch/infravelo-py/actions/workflows/python.yml)

Some tools based on the [infravelo.de](https://infravelo.de) API and website.

* API fetcher
Expand Down
26 changes: 25 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ packages = [
python = "^3.12"
requests = "^2.31.0"
beautifulsoup4 = "^4.12.3"
python-dateutil = "^2.8"

[tool.poetry.group.dev.dependencies]
mypy = "*"
Expand Down
35 changes: 8 additions & 27 deletions sync/core.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
from api_fetcher import core as api_fetcher
from scraper import core as scraper
from enum import Enum
from pathlib import Path
import os
from urllib.parse import urlparse, unquote
import requests
from sync import runner

# Various util methods for filesystem and request things

class ProjectState(Enum):
DONE = "Abgeschlossen"
Expand All @@ -18,10 +17,6 @@ def without_done_projects(project):
def project_slug_from_url(project_url):
    """Extract the project slug, i.e. the second-to-last path segment of *project_url*."""
    segments = project_url.split('/')
    return segments[-2]

def filename_from_url(url: str) -> str:
path = urlparse(url).path
return unquote(Path(path).name)

def root_dir() -> Path:
    """Return the repository root: two levels above this module's file."""
    return Path(__file__).parents[1]

Expand All @@ -30,24 +25,10 @@ def dir_for_project(project) -> Path:

def create_project_dirs(projects):
    """Ensure an on-disk directory exists for each project in *projects*."""
    for project in projects:
        create_dir_if_not_exists(dir_for_project(project))

def create_dir_if_not_exists(path: Path):
    """Create *path*, including any missing parent directories.

    A no-op when the directory is already present.
    """
    path.mkdir(parents=True, exist_ok=True)

def run():
    """Entry point for the sync tool: delegate the whole sync to the runner module.

    The fetching/scraping/downloading logic lives in ``sync.runner``; this
    thin wrapper is kept so existing callers of ``core.run()`` keep working.
    """
    runner.run()
28 changes: 28 additions & 0 deletions sync/document_downloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import requests
from urllib.parse import urlparse, unquote
from dateutil.parser import parse as parsedate
from pathlib import Path
from sync import core

def filename_from_url(url: str) -> str:
    """Return the URL-decoded final path component of *url*.

    Query strings and fragments are ignored; for a URL ending in '/'
    the last non-empty segment is returned.
    """
    parsed_path = urlparse(url).path
    basename = Path(parsed_path).name
    return unquote(basename)

def last_modified_date(headers) -> str:
    """Return the response's 'last-modified' header formatted as YYYY-MM-DD.

    Raises KeyError if the header is absent.
    """
    # Parse the RFC-2822 HTTP date with the standard library instead of
    # dateutil.  This removes the previous global monkey-patch of
    # collections.Callable (a workaround for the obsolete py-dateutil
    # package breaking on Python >= 3.10).
    from email.utils import parsedate_to_datetime
    last_modified = headers["last-modified"]
    return parsedate_to_datetime(last_modified).strftime("%Y-%m-%d")

def download(project: dict, url: str):
    """Download *url* into the project's directory, inside a subdirectory
    named after the document's Last-Modified date.

    Raises requests.HTTPError on a non-2xx response so that error pages
    are never written to disk as if they were documents, and KeyError if
    the response has no last-modified header.
    """
    # TODO: Get the latest document on disk
    response = requests.get(url)
    response.raise_for_status()  # don't persist 404/500 bodies as documents
    # NOTE(review): assumes the server always sends a last-modified header
    # for document URLs — confirm against infravelo.de responses.
    filepath = core.dir_for_project(project) / last_modified_date(response.headers) / filename_from_url(url)
    core.create_dir_if_not_exists(filepath.parent)
    # TODO: Don't create day dir if no changes happened
    # if changed_date > last_date, write new file
    with open(filepath, 'wb') as f:
        f.write(response.content)
    print(f"Downloaded {url} to {filepath}")
32 changes: 32 additions & 0 deletions sync/runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from sync import document_downloader
from sync import core
from api_fetcher import core as api_fetcher
from scraper import core as scraper
from datetime import date
import json

def project_to_json(project: dict):
    """Write *project* as JSON into today's date directory under the project dir."""
    date_dir = date.today().strftime("%Y-%m-%d")
    target = core.dir_for_project(project) / date_dir / "project.json"
    core.create_dir_if_not_exists(target.parent)
    with open(target, "w") as outfile:
        outfile.write(json.dumps(project))

def run():
    """Sync entry point.

    Fetches all projects from the infravelo API, filters out finished
    ones, then scrapes each remaining project page and downloads its
    documents, saving a JSON snapshot of every project along the way.
    """
    all_projects = api_fetcher.all_projects()
    print(f"Downloaded {len(all_projects)} projects from the Infravelo API")

    # Select projects that are not finished yet, to avoid needless scraping
    projects_to_scrape = list(filter(core.without_done_projects, all_projects))
    print(f"About to scrape {len(projects_to_scrape)} projects.")

    core.create_project_dirs(projects_to_scrape)

    for project in projects_to_scrape:
        print(f"Scraping {project['link']}")
        project_to_json(project)
        # TODO: Skip project if already checked today
        # TODO: Download json
        document_urls = scraper.get_document_urls_from_page(project['link'])
        for url in document_urls:
            document_downloader.download(project, url)
23 changes: 23 additions & 0 deletions tests/test_document_downloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from sync import document_downloader as document_downloader

def test_filename_from_url():
    # Table-driven: URL -> expected decoded basename.
    cases = {
        "https://www.example.com./projekt/oderberger-strasse-8/": "oderberger-strasse-8",
        "https://www.example.com./projekt/oderberger-strasse-8/filename.pdf": "filename.pdf",
        "https://www.example.com/projekt/oderberger-strasse-8/filename.pdf?1234": "filename.pdf",
        "https://www.example.com/projekt/oderberger-strasse-8/filename.pdf?1234#abc": "filename.pdf",
    }
    for url, expected in cases.items():
        assert document_downloader.filename_from_url(url) == expected

def test_last_modified_date():
    # A typical HTTP Last-Modified header must come back as YYYY-MM-DD.
    result = document_downloader.last_modified_date(
        {"last-modified": "Tue, 30 Apr 2024 07:38:39 GMT"}
    )
    assert result == "2024-04-30"
25 changes: 4 additions & 21 deletions tests/test_sync.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,16 @@
from sync import core as sync
from sync import core
from pathlib import Path

def test_filter():
    projects = [{'status': 'Abgeschlossen'}, {'status': 'in Bau'}]
    remaining = list(filter(core.without_done_projects, projects))
    assert remaining == [{'status': 'in Bau'}]


def test_project_slug_from_url():
    slug = core.project_slug_from_url("https://www.example.com/projekt/oderberger-strasse-8/")
    assert slug == "oderberger-strasse-8"

def test_filename_from_url():
url = "https://www.example.com./projekt/oderberger-strasse-8/"
result = sync.filename_from_url(url)
assert result == "oderberger-strasse-8"

url = "https://www.example.com./projekt/oderberger-strasse-8/filename.pdf"
result = sync.filename_from_url(url)
assert result == "filename.pdf"

url = "https://www.example.com/projekt/oderberger-strasse-8/filename.pdf?1234"
result = sync.filename_from_url(url)
assert result == "filename.pdf"

url = "https://www.example.com/projekt/oderberger-strasse-8/filename.pdf?1234#abc"
result = sync.filename_from_url(url)
assert result == "filename.pdf"

def test_root_dir():
    # The repo checkout directory is named after the project.
    assert core.root_dir().name == "infravelo-py"

0 comments on commit ebde728

Please sign in to comment.