diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
index 73be760..f41ebae 100644
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@@ -16,4 +16,4 @@ jobs:
       - name: Prep mypy
         run: poetry run mypy --install-types
       - name: Run mypy
-        run: poetry run mypy api-fetcher
+        run: poetry run mypy scraper api-fetcher
diff --git a/pyproject.toml b/pyproject.toml
index 72376cd..31c56ac 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -23,3 +23,4 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry.scripts]
 fetcher = "api-fetcher.main:run"
+download = "scraper.core:test"
diff --git a/scraper/core.py b/scraper/core.py
new file mode 100644
index 0000000..7f19cb9
--- /dev/null
+++ b/scraper/core.py
@@ -0,0 +1,30 @@
+from bs4 import BeautifulSoup
+import requests
+
+INFRAVELO_DOMAIN = "https://infravelo.de"
+
+
+def get_document_urls_from_page(page_url: str) -> list[str]:
+    """Collect the document download URLs linked from a page.
+
+    :param page_url: URL of the page to fetch and scan.
+    :returns: One URL per ``a.download--item-download`` link on the page,
+        formed by prefixing the link's ``href`` with ``INFRAVELO_DOMAIN``.
+    :raises requests.RequestException: If the request times out or the
+        server answers with an HTTP error status.
+
+    """
+    # Without a timeout, requests waits indefinitely on a stalled server.
+    page = requests.get(page_url, timeout=30)
+    # Fail loudly rather than scraping an error page and returning [].
+    page.raise_for_status()
+    soup = BeautifulSoup(page.content, "html.parser")
+    download_items = soup.find_all("a", class_="download--item-download")  # a download link
+    urls = [INFRAVELO_DOMAIN + item["href"] for item in download_items]
+    print(urls)
+    return urls
+
+
+def test() -> None:
+    """Entry point of the ``download`` script: scrape one sample project page."""
+    get_document_urls_from_page("https://www.infravelo.de/projekt/otto-braun-strasse/")