-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path01_get_overiew.py
102 lines (83 loc) · 2.97 KB
/
01_get_overiew.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# source venv/bin/activate
# collect links from overview pages
import requests
from bs4 import BeautifulSoup
import csv
import re
import os
import numpy
import pandas as pd
# The script queries Parlamentsspiegel twice: once for each search word
# (sent as the `suchbegriff` parameter) and once for each keyword (sent as
# the `schlagwort` parameter).

# CSV files holding the query words. The first row is treated as a header
# and skipped; the first column of each remaining row is the query word.
SEARCHWORDS = "input/searchwords.csv"
KEYWORDS = "input/keywords.csv"
# CSV file (single "link" column) that receives the collected detail-page links.
# NOTE(review): it lives under input/ -- presumably because a later script
# reads it as its input; confirm.
OUTPUT_FILE = "input/data/links_beratungsstand.csv"
# Search-results page; the query parameters are appended to this URL.
BASE_URL = "https://www.parlamentsspiegel.de/sites/parlamentsspiegel/home/suchergebnisseparlamentsspiegel.html"
def load_querywords(INPUT_FILE, list_querywords):
    """Append the data rows of a CSV file to ``list_querywords``.

    The first row of the file is treated as a header and skipped. Every
    remaining non-empty row is appended as a list of its column values.

    Args:
        INPUT_FILE: Path of the CSV file to read.
        list_querywords: List that receives the rows; it is extended in place.

    Returns:
        The same ``list_querywords`` object that was passed in.

    Raises:
        OSError: If the file cannot be opened.
    """
    # `with` closes the file even if parsing fails; newline="" is what the
    # csv module requires so that quoted line breaks are parsed correctly.
    with open(INPUT_FILE, "r", newline="") as ifile:
        reader = csv.reader(ifile)
        # Skip the header row; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)
        # Blank lines come back as empty lists; callers index row[0], so
        # such rows are dropped here instead of crashing later.
        list_querywords.extend(row for row in reader if row)
    return list_querywords
def save_csv(OUTPUT_FILE, list_links, queryword):
    """Write the links to a one-column CSV file, replacing any existing file.

    The file starts with a ``link`` header row, followed by one row per entry
    of ``list_links``. Every field is quoted.

    Args:
        OUTPUT_FILE: Path of the CSV file to (over)write.
        list_links: Iterable of link strings to store.
        queryword: Unused; kept so existing callers keep working.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    # newline="" stops the text layer from translating the writer's own
    # "\r\n" line endings, which would otherwise produce "\r\r\n" (blank
    # rows) on Windows.
    with open(OUTPUT_FILE, 'w', newline="") as f:
        writer = csv.writer(f, dialect="excel", quoting=csv.QUOTE_ALL)
        writer.writerow(["link"])
        writer.writerows([item] for item in list_links)
def get_overview(BASE_URL, queryword):
    """Download all overview pages for one query word and collect detail links.

    The kind of query is derived from the caller's module-level loop
    variables: if ``queryword`` is the current ``searchword`` it is sent as
    ``suchbegriff``, if it is the current ``keyword`` it is sent as
    ``schlagwort``; otherwise the bare ``BASE_URL`` is requested.

    Every result page is saved under ``input/html/overview/``, the ``href``
    of each ``<a class="beratungsstand">`` is appended to the module-level
    ``beratungslinks`` list, and the whole list is then written to
    ``OUTPUT_FILE`` via ``save_csv``.

    Args:
        BASE_URL: URL of the search-results page.
        queryword: CSV row (list) whose first element is the query word.

    Raises:
        requests.RequestException: If a request fails or times out.
        OSError: If an HTML file or the CSV file cannot be written.
    """
    request_timeout = 30  # seconds; without it a stalled server blocks forever
    overview_dir = "input/html/overview/"

    # Look the loop variables up defensively: depending on which loop is
    # running (or whether a word list was empty), one of them may not exist.
    unset = object()
    module_globals = globals()
    if queryword is module_globals.get("searchword", unset):
        query_param = "suchbegriff"
    elif queryword is module_globals.get("keyword", unset):
        query_param = "schlagwort"
    else:
        query_param = None

    if query_param is None:
        url = BASE_URL
    else:
        # NOTE(review): the word is inserted verbatim; a word containing
        # "&" or "#" would corrupt the query string.
        url = (
            BASE_URL + "?db=psakt&vir=alle&" + query_param + "=" + queryword[0]
            + "&sortierung=dat_desc&verknuepfung=and"
        )

    # get content from url
    first_page = requests.get(url, timeout=request_timeout)
    soup = BeautifulSoup(first_page.content, 'html.parser')

    # The "last" pagination link carries m=<offset of the last page>.
    # Every page has 50 entries, so offsets run 1, 51, 101, ...
    # Each lookup may come back empty (no paging list, no "last" item, no
    # link, no m= parameter); all of those mean "single page".
    paging = soup.find("ul", {"class": "paging"})
    last_item = None if paging is None else paging.find("li", {"class": "last"})
    last_anchor = None if last_item is None else last_item.find("a", href=True)
    last_offset = None
    if last_anchor is not None:
        last_offset = re.search(r"(?<=m=)\d+", last_anchor["href"])

    if last_offset is not None:
        print("pages: " + last_offset[0])
        page_offsets = range(1, int(last_offset[0]) + 1, 50)
    else:
        page_offsets = [1]
        print("entries under 50 ")

    # The bare BASE_URL has no query string yet, so "m" must start one.
    separator = "&" if "?" in url else "?"
    os.makedirs(overview_dir, exist_ok=True)

    # get html pages
    for offset in page_offsets:
        url_paginated = url + separator + "m=" + str(offset)
        page = requests.get(url_paginated, timeout=request_timeout)
        soup = BeautifulSoup(page.content, 'html.parser')

        # save html file; explicit UTF-8 so non-ASCII text does not depend
        # on the platform's default encoding
        html_path = overview_dir + queryword[0] + "_" + str(offset) + ".html"
        with open(html_path, "w", encoding="utf-8") as html_file:
            html_file.write(str(soup))

        # collect all links to the detail pages
        beratungslinks.extend(
            link["href"]
            for link in soup.find_all("a", {"class": "beratungsstand"})
            if link.has_attr('href')
        )

    print("vorm speichern " + queryword[0])
    print(len(beratungslinks))
    save_csv(OUTPUT_FILE, beratungslinks, queryword)
# Shared accumulator: get_overview() appends every detail link it finds here
# and rewrites the output CSV from it after each query word.
beratungslinks = []

# Load both word lists. For a quick trial run, slice a list down to a single
# entry, e.g. searchwords[1:2] or keywords[1:2].
searchwords = load_querywords(SEARCHWORDS, [])
keywords = load_querywords(KEYWORDS, [])

# The loop variables must stay named `searchword` and `keyword`:
# get_overview() compares its argument against them to pick the query type.
for searchword in searchwords:
    print(f"searchword: {searchword[0]}")
    get_overview(BASE_URL, searchword)

for keyword in keywords:
    print(f"keyword: {keyword[0]}")
    get_overview(BASE_URL, keyword)