-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsfa.py
78 lines (63 loc) · 2.29 KB
/
sfa.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from datetime import date, timedelta
from urllib.parse import urljoin
import helpers
from bs4 import BeautifulSoup
# Listing URL for "Final Notice" search results; the page number is appended
# to the trailing ``ppage=`` parameter when pages are requested.
_base_url = "http://www.fca.org.uk/your-fca/list?ttypes=Final+Notice&yyear=&ssearch=&ppage="

# Site root against which result links are resolved.
_site_url = "http://www.fca.org.uk"

# Substrings that, when found in a lower-cased name, mark the entity as a
# company rather than a person.
companies = (
    "ltd",
    "limited",
    "company",
    "services",
    "bank",
    "trading",
    "financial",
)
def get_date(mydate):
    """Convert a date label into a ``year-month-day`` string.

    Args:
        mydate: Either the literal ``"Today"`` or ``"Yesterday"``, or a
            slash-separated date such as ``"31/12/2015"`` (assumed to be
            day/month/year) whose three parts are emitted in reverse order.

    Returns:
        The date as a dash-separated string, e.g. ``"2015-12-31"``.

    Raises:
        IndexError: If ``mydate`` is not one of the relative labels and has
            fewer than three ``/``-separated parts.
    """
    # isoformat() yields a four-digit year, so relative labels come out in
    # the same shape as slash-separated dates that carry a four-digit year
    # (a "%y" format would produce a two-digit year here).
    if mydate == "Today":
        return date.today().isoformat()
    if mydate == "Yesterday":
        return (date.today() - timedelta(days=1)).isoformat()

    parts = mydate.split('/')
    return "{}-{}-{}".format(parts[2], parts[1], parts[0])
def _generate_entities():
    """Yield one entity dict for every result heading on the listing pages.

    Pages are fetched as ``_base_url + str(page)`` starting at page 0.
    Iteration ends at the first page that has no ``resultsSearchBox``
    container or no matching ``h3`` headings inside it.

    Yields:
        dict: ``{"_meta": {"id", "entity_type"}, "fields": [...], "name"}``
        where ``fields`` holds the result URL plus the "Published" and
        "Last Modified" dates, and ``entity_type`` is ``"company"`` or
        ``"person"``.
    """
    page = 0
    while True:
        url = _base_url + str(page)
        page += 1

        doc = BeautifulSoup(helpers.fetch_string(url), "html.parser")
        div = doc.find('div', id="resultsSearchBox")
        # find() returns None when the container is absent; treat that the
        # same as an empty page instead of failing on ``None.find_all``.
        if div is None:
            return

        # NOTE(review): id='' presumably selects headings without an id
        # attribute - confirm against the bs4 version in use.
        all_h3 = div.find_all("h3", id='')
        if not all_h3:
            return

        for h3 in all_h3:
            a = h3.find('a')
            href = urljoin(_site_url, a['href'])
            # Link text is expected to look like "<prefix>: <name>"; only the
            # segment after the first colon is kept.
            name = a.get_text().split(':')[1].strip()

            sub = h3.find_next_sibling('sub')
            spans = sub.find_all('span')
            if spans:
                # First span is read as the published date, second as the
                # modified date.
                published = get_date(spans[0].get_text().strip())
                modified = get_date(spans[1].get_text().strip())
            else:
                # No spans: take fixed 10-character windows from the text.
                # NOTE(review): offset 11 presumably skips a "Published: "
                # prefix - confirm against the live markup.
                sub_text = sub.get_text().strip()
                published = get_date(sub_text[11:21])
                modified = get_date(sub_text[-10:])

            if any(company in name.lower() for company in companies):
                entity_type = "company"
            else:
                entity_type = "person"

            fields = [
                {"tag": "url", "value": href},
                {"tag": "Published", "value": published},
                {"tag": "Last Modified", "value": modified},
            ]

            yield {
                "_meta": {
                    "id": helpers.make_id(name),
                    "entity_type": entity_type,
                },
                "fields": fields,
                "name": name,
            }
def main():
    """Generate every scraped entity and hand each one to ``helpers.check``."""
    entities = _generate_entities()
    for record in entities:
        helpers.check(record)


# Only start the scrape when this file is executed as a script.
if __name__ == "__main__":
    main()