Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Save json, save files to date dir, refactor #10

Merged
merged 3 commits into from
Apr 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# infravelo-py

[![CI](https://github.com/phansch/infravelo-py/actions/workflows/python.yml/badge.svg)](https://github.com/phansch/infravelo-py/actions/workflows/python.yml)

Some tools based on the [infravelo.de](https://infravelo.de) API and website.

* API fetcher
Expand Down
26 changes: 25 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ packages = [
python = "^3.12"
requests = "^2.31.0"
beautifulsoup4 = "^4.12.3"
py-dateutil = "^2.2"

[tool.poetry.group.dev.dependencies]
mypy = "*"
Expand Down
35 changes: 8 additions & 27 deletions sync/core.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
from api_fetcher import core as api_fetcher
from scraper import core as scraper
from enum import Enum
from pathlib import Path
import os
from urllib.parse import urlparse, unquote
import requests
from sync import runner

# Various util methods for filesystem and request things

class ProjectState(Enum):
DONE = "Abgeschlossen"
Expand All @@ -18,10 +17,6 @@ def without_done_projects(project):
def project_slug_from_url(project_url):
    """Return the project slug, i.e. the second-to-last path segment of the URL."""
    segments = project_url.split('/')
    return segments[-2]

def filename_from_url(url: str) -> str:
path = urlparse(url).path
return unquote(Path(path).name)

def root_dir() -> Path:
    """Return the repository root: two directory levels above this module."""
    return Path(__file__).parents[1]

Expand All @@ -30,24 +25,10 @@ def dir_for_project(project) -> Path:

def create_project_dirs(projects):
for project in projects:
dir_for_project(project).mkdir(parents=True,exist_ok=True)
create_dir_if_not_exists(dir_for_project(project))

def create_dir_if_not_exists(path: Path):
    """Create *path* and any missing parent directories; a no-op if it already exists."""
    path.mkdir(parents=True, exist_ok=True)

def run():
all_projects = api_fetcher.all_projects();
print(f"Downloaded {len(all_projects)} projects from the Infravelo API")

# Select projects that are not finished yet, to avoid needless scraping
projects_to_scrape = list(filter(without_done_projects, all_projects))
print(f"About to scrape {len(projects_to_scrape)} projects.")

create_project_dirs(projects_to_scrape)

for project in projects_to_scrape:
print(f"Scraping {project["link"]}")
document_urls = scraper.get_document_urls_from_page(project['link'])
for url in document_urls:
path = dir_for_project(project) / filename_from_url(url)
print(f"Downloading {url} to {path}")
response = requests.get(url)
with open(path, 'wb') as f:
f.write(response.content)
runner.run()
28 changes: 28 additions & 0 deletions sync/document_downloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import requests
from urllib.parse import urlparse, unquote
from dateutil.parser import parse as parsedate
from pathlib import Path
from sync import core

def filename_from_url(url: str) -> str:
    """Return the percent-decoded final path component of *url*.

    Query strings and fragments are ignored; a trailing slash is stripped
    before taking the basename.
    """
    parsed = urlparse(url)
    basename = Path(parsed.path).name
    return unquote(basename)

def last_modified_date(headers) -> str:
    """Return the response's Last-Modified date formatted as YYYY-MM-DD.

    Parameters:
        headers: a mapping of HTTP response headers; must contain a
            "last-modified" key in RFC 1123 format (requests' headers
            object is case-insensitive, so the lowercase key works there).

    Raises:
        KeyError: if no "last-modified" header is present.

    Uses the stdlib RFC 2822/1123 parser instead of dateutil. This also
    removes the previous per-call monkeypatch of ``collections.Callable``
    (a workaround for the outdated py-dateutil package), which mutated
    interpreter-wide state on every invocation.
    """
    from email.utils import parsedate_to_datetime

    last_modified = headers["last-modified"]
    return parsedate_to_datetime(last_modified).strftime("%Y-%m-%d")

def download(project: dict, url: str):
    """Download *url* into the project's directory, inside a subdirectory
    named after the document's Last-Modified date (YYYY-MM-DD).

    Parameters:
        project: a project dict from the infravelo API (as used by
            core.dir_for_project).
        url: the document URL to fetch.

    Raises:
        requests.HTTPError: for non-2xx responses, instead of silently
            writing an HTML error page to disk.
    """
    # TODO: Get the latest document on disk
    response = requests.get(url)
    response.raise_for_status()  # don't persist 404/500 error pages as documents
    filepath = core.dir_for_project(project) / last_modified_date(response.headers) / filename_from_url(url)
    core.create_dir_if_not_exists(filepath.parent)
    # TODO: Don't create day dir if no changes happened
    # if changed_date > last_date, write new file
    with open(filepath, 'wb') as f:
        f.write(response.content)
    # Report success only after the file has actually been written.
    print(f"Downloaded {url} to {filepath}")
32 changes: 32 additions & 0 deletions sync/runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from sync import document_downloader
from sync import core
from api_fetcher import core as api_fetcher
from scraper import core as scraper
from datetime import date
import json

def project_to_json(project: dict):
    """Serialize *project* to project.json inside today's dated directory
    under that project's folder, creating the directory if needed."""
    day_dir = core.dir_for_project(project) / date.today().strftime("%Y-%m-%d")
    core.create_dir_if_not_exists(day_dir)
    with open(day_dir / "project.json", "w") as outfile:
        json.dump(project, outfile)

def run():
    """Fetch all projects from the infravelo API, then scrape and archive
    every project that is not yet finished.

    For each project this writes a dated project.json snapshot and
    downloads all documents linked from the project's page.
    """
    all_projects = api_fetcher.all_projects()
    print(f"Downloaded {len(all_projects)} projects from the Infravelo API")

    # Select projects that are not finished yet, to avoid needless scraping
    projects_to_scrape = list(filter(core.without_done_projects, all_projects))
    print(f"About to scrape {len(projects_to_scrape)} projects.")

    core.create_project_dirs(projects_to_scrape)

    for project in projects_to_scrape:
        # Single quotes inside the f-string keep this valid on Python < 3.12 too.
        print(f"Scraping {project['link']}")
        project_to_json(project)
        # TODO: Skip project if already checked today
        # TODO: Download json
        document_urls = scraper.get_document_urls_from_page(project['link'])
        for url in document_urls:
            document_downloader.download(project, url)
23 changes: 23 additions & 0 deletions tests/test_document_downloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from sync import document_downloader as document_downloader

def test_filename_from_url():
    """filename_from_url returns the decoded basename, ignoring query and fragment."""
    cases = [
        ("https://www.example.com./projekt/oderberger-strasse-8/", "oderberger-strasse-8"),
        ("https://www.example.com./projekt/oderberger-strasse-8/filename.pdf", "filename.pdf"),
        ("https://www.example.com/projekt/oderberger-strasse-8/filename.pdf?1234", "filename.pdf"),
        ("https://www.example.com/projekt/oderberger-strasse-8/filename.pdf?1234#abc", "filename.pdf"),
    ]
    for url, expected in cases:
        assert document_downloader.filename_from_url(url) == expected

def test_last_modified_date():
    """An RFC 1123 Last-Modified header is reduced to a YYYY-MM-DD string."""
    headers = {"last-modified": "Tue, 30 Apr 2024 07:38:39 GMT"}
    assert document_downloader.last_modified_date(headers) == "2024-04-30"
25 changes: 4 additions & 21 deletions tests/test_sync.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,16 @@
from sync import core as sync
from sync import core
from pathlib import Path

def test_filter():
p1 = [{ 'status': 'Abgeschlossen' }, { 'status': 'in Bau'}]
result = list(filter(sync.without_done_projects, p1))
result = list(filter(core.without_done_projects, p1))
assert result == [{ 'status': 'in Bau'}]


def test_project_slug_from_url():
url = "https://www.example.com/projekt/oderberger-strasse-8/"
result = sync.project_slug_from_url(url)
result = core.project_slug_from_url(url)
assert result == "oderberger-strasse-8"

def test_filename_from_url():
url = "https://www.example.com./projekt/oderberger-strasse-8/"
result = sync.filename_from_url(url)
assert result == "oderberger-strasse-8"

url = "https://www.example.com./projekt/oderberger-strasse-8/filename.pdf"
result = sync.filename_from_url(url)
assert result == "filename.pdf"

url = "https://www.example.com/projekt/oderberger-strasse-8/filename.pdf?1234"
result = sync.filename_from_url(url)
assert result == "filename.pdf"

url = "https://www.example.com/projekt/oderberger-strasse-8/filename.pdf?1234#abc"
result = sync.filename_from_url(url)
assert result == "filename.pdf"

def test_root_dir():
assert sync.root_dir().name == "infravelo-py"
assert core.root_dir().name == "infravelo-py"
Loading