Skip to content

Commit

Permalink
Add scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
phansch committed Apr 12, 2024
1 parent ff247d8 commit 746657e
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 1 deletion.
2 changes: 1 addition & 1 deletion .github/workflows/python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@ jobs:
- name: Prep mypy
run: poetry run mypy --install-types
- name: Run mypy
run: poetry run mypy api-fetcher
run: poetry run mypy scraper api-fetcher
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,4 @@ build-backend = "poetry.core.masonry.api"

[tool.poetry.scripts]
fetcher = "api-fetcher.main:run"
download = "scraper.core:test"
22 changes: 22 additions & 0 deletions scraper/core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Base URL that scraped download-link hrefs are combined with to form full URLs.
INFRAVELO_DOMAIN = "https://infravelo.de"

def get_document_urls_from_page(page_url: str) -> list[str]:
    """Collect the download URLs linked from a single page.

    Fetches ``page_url``, finds every ``<a>`` element carrying the
    ``download--item-download`` class and resolves its ``href`` against
    ``INFRAVELO_DOMAIN``. The resulting list is also printed to stdout.

    :param page_url: URL of the page to scan for download links.
    :returns: List of resolved download URLs (empty if none are found).
    :raises requests.RequestException: If the request fails, times out or
        the server answers with an error status.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    page = requests.get(page_url, timeout=30)
    # Fail loudly instead of parsing an error page and returning [].
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    download_items = soup.find_all("a", class_="download--item-download")
    # urljoin leaves absolute hrefs untouched and inserts the missing "/" for
    # relative ones; plain string concatenation mangled both cases.
    # Anchors without an href carry nothing to download and are skipped.
    urls = [
        urljoin(INFRAVELO_DOMAIN, item["href"])
        for item in download_items
        if item.get("href")
    ]
    print(urls)
    return urls


def test() -> None:
    """Run the scraper once against a sample project page.

    The return value is discarded; the URLs found are printed by
    ``get_document_urls_from_page`` itself.
    """
    # The original called ``scrape``, which is not defined in this module.
    get_document_urls_from_page(
        "https://www.infravelo.de/projekt/otto-braun-strasse/"
    )

0 comments on commit 746657e

Please sign in to comment.