-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #10 from phansch/save-json-etc
Save json, save files to date dir, refactor
- Loading branch information
Showing
8 changed files
with
123 additions
and
49 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
import requests | ||
from urllib.parse import urlparse, unquote | ||
from dateutil.parser import parse as parsedate | ||
from pathlib import Path | ||
from sync import core | ||
|
||
def filename_from_url(url: str) -> str:
    """Return the percent-decoded final path segment of *url*.

    The query string and fragment are ignored and a trailing slash is
    dropped, so ``https://host/a/b/?x#y`` yields ``"b"``.

    The path is decoded *before* the last segment is taken. Decoding
    afterwards would let an encoded separator such as ``%2F`` turn into a
    real ``/`` inside the returned name, so a crafted URL could make the
    caller write outside the directory it joins the name onto.
    """
    decoded_path = unquote(urlparse(url).path)
    return Path(decoded_path).name
|
||
def last_modified_date(headers) -> str:
    """Return the ``last-modified`` header value formatted as ``YYYY-MM-DD``.

    ``headers`` is indexed with the lowercase key ``"last-modified"``; with a
    plain dict a missing key raises ``KeyError``.
    """
    # Re-expose collections.abc.Callable under its old name collections.Callable.
    # Presumably required by a dependency that still uses the old alias; see
    # https://stackoverflow.com/a/70641487/6830113
    import collections
    setattr(collections, "Callable", collections.abc.Callable)
    return parsedate(headers["last-modified"]).strftime("%Y-%m-%d")
|
||
def download(project: dict, url: str, timeout: float = 30):
    """Download *url* and store it in the project's dated directory.

    The target path is
    ``<core.dir_for_project(project)>/<Last-Modified date>/<file name from url>``.

    Args:
        project: Project record, passed through to ``core.dir_for_project``.
        url: Address of the document to fetch.
        timeout: Seconds to wait for the server before giving up; without a
            timeout ``requests`` may block indefinitely.

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            that an error page is never stored as if it were the document.
    """
    # TODO: Get the latest document on disk
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    filepath = (
        core.dir_for_project(project)
        / last_modified_date(response.headers)
        / filename_from_url(url)
    )
    core.create_dir_if_not_exists(filepath.parent)
    # TODO: Don't create day dir if no changes happened
    # if changed_date > last_date, write new file
    with open(filepath, 'wb') as f:
        f.write(response.content)
    # Report success only once the file has actually been written.
    print(f"Downloaded {url} to {filepath}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
from sync import document_downloader | ||
from sync import core | ||
from api_fetcher import core as api_fetcher | ||
from scraper import core as scraper | ||
from datetime import date | ||
import json | ||
|
||
def project_to_json(project: dict):
    """Write *project* as JSON to ``<project dir>/<today as YYYY-MM-DD>/project.json``.

    The parent directory is prepared through ``core.create_dir_if_not_exists``
    before the file is opened for writing.
    """
    day_dir = core.dir_for_project(project) / date.today().strftime("%Y-%m-%d")
    target = day_dir / "project.json"
    core.create_dir_if_not_exists(target.parent)
    with open(target, "w") as handle:
        json.dump(project, handle)
|
||
def run():
    """Fetch all projects, then store metadata and documents of unfinished ones.

    Steps:
      1. Load every project via ``api_fetcher.all_projects()``.
      2. Keep only those accepted by ``core.without_done_projects``.
      3. Create the project directories.
      4. Per project: write ``project.json``, collect the document URLs from
         the project page and download each of them.
    """
    all_projects = api_fetcher.all_projects()
    print(f"Downloaded {len(all_projects)} projects from the Infravelo API")

    # Select projects that are not finished yet, to avoid needless scraping
    projects_to_scrape = list(filter(core.without_done_projects, all_projects))
    print(f"About to scrape {len(projects_to_scrape)} projects.")

    core.create_project_dirs(projects_to_scrape)

    for project in projects_to_scrape:
        # Looked up outside the f-string: reusing the enclosing quote character
        # inside a replacement field is a SyntaxError before Python 3.12.
        link = project["link"]
        print(f"Scraping {link}")
        project_to_json(project)
        # TODO: Skip project if already checked today
        # TODO: Download json
        document_urls = scraper.get_document_urls_from_page(link)
        for url in document_urls:
            document_downloader.download(project, url)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
from sync import document_downloader as document_downloader | ||
|
||
def test_filename_from_url():
    """filename_from_url yields the last path segment, ignoring query and fragment."""
    cases = [
        # trailing slash: the directory-like segment is the name
        ("https://www.example.com./projekt/oderberger-strasse-8/", "oderberger-strasse-8"),
        # plain file name
        ("https://www.example.com./projekt/oderberger-strasse-8/filename.pdf", "filename.pdf"),
        # query string is dropped
        ("https://www.example.com/projekt/oderberger-strasse-8/filename.pdf?1234", "filename.pdf"),
        # query string and fragment are dropped
        ("https://www.example.com/projekt/oderberger-strasse-8/filename.pdf?1234#abc", "filename.pdf"),
    ]
    for url, expected in cases:
        assert document_downloader.filename_from_url(url) == expected
|
||
def test_last_modified_date():
    """A Last-Modified header value is reduced to its YYYY-MM-DD date."""
    http_headers = {"last-modified": "Tue, 30 Apr 2024 07:38:39 GMT"}
    assert document_downloader.last_modified_date(http_headers) == '2024-04-30'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,33 +1,16 @@ | ||
from sync import core as sync | ||
from sync import core | ||
from pathlib import Path | ||
|
||
def test_filter():
    """Filtering with without_done_projects drops the 'Abgeschlossen' entry."""
    p1 = [{'status': 'Abgeschlossen'}, {'status': 'in Bau'}]
    # The same call through the legacy `sync` alias was a redundant duplicate
    # (both names refer to sync.core), so only the `core` form is kept.
    result = list(filter(core.without_done_projects, p1))
    assert result == [{'status': 'in Bau'}]
|
||
|
||
def test_project_slug_from_url():
    """project_slug_from_url returns the last path segment of a project URL."""
    url = "https://www.example.com/projekt/oderberger-strasse-8/"
    # The same call through the legacy `sync` alias was a redundant duplicate
    # (both names refer to sync.core), so only the `core` form is kept.
    result = core.project_slug_from_url(url)
    assert result == "oderberger-strasse-8"
|
||
def test_filename_from_url():
    """filename_from_url yields the last path segment, ignoring query and fragment."""
    # NOTE(review): an identical test targets document_downloader.filename_from_url;
    # confirm that sync.core still exposes filename_from_url.
    cases = [
        ("https://www.example.com./projekt/oderberger-strasse-8/", "oderberger-strasse-8"),
        ("https://www.example.com./projekt/oderberger-strasse-8/filename.pdf", "filename.pdf"),
        ("https://www.example.com/projekt/oderberger-strasse-8/filename.pdf?1234", "filename.pdf"),
        ("https://www.example.com/projekt/oderberger-strasse-8/filename.pdf?1234#abc", "filename.pdf"),
    ]
    for url, expected in cases:
        assert sync.filename_from_url(url) == expected
|
||
def test_root_dir():
    """The directory returned by root_dir() is named 'infravelo-py'."""
    # The same assertion through the legacy `sync` alias was a redundant
    # duplicate (both names refer to sync.core), so only the `core` form is kept.
    assert core.root_dir().name == "infravelo-py"