Add scraper

phansch · Apr 12, 2024 · 746657e · 746657e
1 parent ff247d8
commit 746657e
Show file tree

Hide file tree

Showing 3 changed files with 24 additions and 1 deletion.
diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
@@ -16,4 +16,4 @@ jobs:
       - name: Prep mypy
         run: poetry run mypy --install-types
       - name: Run mypy
-        run: poetry run mypy api-fetcher
+        run: poetry run mypy scraper api-fetcher
diff --git a/pyproject.toml b/pyproject.toml
@@ -23,3 +23,4 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry.scripts]
 fetcher = "api-fetcher.main:run"
+download = "scraper.core:test"
diff --git a/scraper/core.py b/scraper/core.py
@@ -0,0 +1,22 @@
+from bs4 import BeautifulSoup
+import requests
+
+INFRAVELO_DOMAIN = "https://infravelo.de"  # base URL that scraped download-link hrefs are joined onto
+
+def get_document_urls_from_page(page_url: str) -> list[str]:
+    """Collect the download-link URLs found on a page.
+
+    Fetches ``page_url``, parses the response body as HTML and gathers every
+    ``<a class="download--item-download">`` element that carries an ``href``.
+    The resulting list is also printed to stdout.
+
+    :param page_url: URL of the page to fetch.
+    :returns: One URL per download link, each ``href`` resolved against
+        ``INFRAVELO_DOMAIN``; an empty list when the page has no such links.
+    :raises requests.RequestException: On connection failure, timeout or an
+        HTTP error status.
+    """
+    from urllib.parse import urljoin
+
+    # requests applies no timeout by default; without one a stalled server
+    # would block the caller indefinitely.
+    page = requests.get(page_url, timeout=30)
+    # Fail loudly on 4xx/5xx instead of silently scraping an error page.
+    page.raise_for_status()
+    soup = BeautifulSoup(page.content, "html.parser")
+    # href=True skips anchors without an href, which would raise KeyError below.
+    download_items = soup.find_all("a", class_="download--item-download", href=True)
+    # urljoin leaves already-absolute hrefs intact and inserts the missing
+    # slash for relative ones, unlike plain string concatenation.
+    urls = [urljoin(INFRAVELO_DOMAIN, item["href"]) for item in download_items]
+    print(urls)
+    return urls
+
+
+def test() -> None:
+    """Entry point of the ``download`` script: scrape one sample project page.
+
+    Requests the document URLs of a hard-coded infravelo project page; the
+    scraping function prints the URLs it finds.
+    """
+    # The scraping function in this module is get_document_urls_from_page;
+    # the previously called name `scrape` is not defined and raised NameError.
+    get_document_urls_from_page("https://www.infravelo.de/projekt/otto-braun-strasse/")
Original file line number	Diff line number	Diff line change
Expand Up		@@ -23,3 +23,4 @@ build-backend = "poetry.core.masonry.api"

		[tool.poetry.scripts]
		fetcher = "api-fetcher.main:run"
		download = "scraper.core:test"