forked from kanedata/find-that-charity
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfetch_data.py
126 lines (106 loc) · 5.53 KB
/
fetch_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
"""
Script for fetching data from charity regulators
"""
import urllib.request
import argparse
import zipfile
import re
import os
from bs4 import BeautifulSoup
import mechanicalsoup
import bcp
# CSV listing charities registered with more than one UK regulator
DUAL_CSV = 'https://raw.githubusercontent.com/drkane/charity-lookups/master/dual-registered-uk-charities.csv'
# Office of the Scottish Charity Regulator: register download endpoint (zip)
OSCR_URL = "https://www.oscr.org.uk/umbraco/Surface/FormsSurface/CharityRegDownload"
# Charity Commission for England and Wales data page (scraped for the extract zip link)
CCEW_URL = "http://data.charitycommission.gov.uk/"
# Charity Commission for Northern Ireland: CSV export of the register
CCNI_URL = "https://www.charitycommissionni.org.uk/umbraco/api/charityApi/ExportSearchResultsToCsv/?pageNumber=1&include=Linked&include=Removed"
# Community-maintained CSV of additional/alternative names for NI charities
CCNI_EXTRA = "https://gist.githubusercontent.com/BobHarper1/2687545c562b47bc755aef2e9e0de537/raw/ac052c33fd14a08dd4c2a0604b54c50bc1ecc0db/ccni_extra"
def add_bool_arg(parser, name, default=False, help=None):
    """Add a mutually exclusive ``--<name>`` / ``--no-<name>`` flag pair.

    Both flags write to the same destination, so ``args.<name>`` is a
    single boolean; *default* applies when neither flag is given.
    """
    group = parser.add_mutually_exclusive_group(required=False)
    group.add_argument('--' + name, dest=name, action='store_true', help=help)
    group.add_argument('--no-' + name, dest=name, action='store_false', help=help)
    parser.set_defaults(**{name: default})


def _fetch_dual(args):
    """Download the dual-registered charities lookup CSV into the data folder."""
    urllib.request.urlretrieve(
        args.dual,
        os.path.join(args.folder, "dual-registered-uk-charities.csv")
    )
    print("[Dual] Dual registered charities fetched")


def _fetch_oscr(args):
    """Download the Scottish charity register zip and extract its single CSV."""
    oscr_out = os.path.join(args.folder, "oscr.zip")
    urllib.request.urlretrieve(args.oscr_url, oscr_out)
    print("[OSCR] ZIP downloaded")
    try:
        with zipfile.ZipFile(oscr_out) as oscrzip:
            files = oscrzip.infolist()
            if len(files) != 1:
                raise ValueError("More than one file in OSCR zip")
            with open(os.path.join(args.folder, "oscr.csv"), "wb") as oscrcsv:
                oscrcsv.write(oscrzip.read(files[0]))
            print("[OSCR] data extracted")
    except zipfile.BadZipFile:
        # best-effort: the endpoint sometimes serves a non-zip response
        print("[OSCR] data could not be found")


def _fetch_ccew(args):
    """Scrape the CCEW data page for the latest register extract zip and
    convert each contained BCP file to CSV under ``<folder>/ccew/``."""
    ccew_out = os.path.join(args.folder, "ccew.zip")
    ccew_folder = os.path.join(args.folder, "ccew")
    # fetch the data page and locate the link to the latest extract zip;
    # close the response promptly once the HTML has been read
    with urllib.request.urlopen(args.ccew_url) as ccew_html:
        if ccew_html.status != 200:
            raise ValueError("[CCEW] Could not find Charity Commission data page. Status %s %s" % (ccew_html.status, ccew_html.reason))
        ccew_soup = BeautifulSoup(ccew_html.read(), 'html.parser')
    zip_regex = re.compile(r"http://apps.charitycommission.gov.uk/data/.*?/RegPlusExtract.*?\.zip")
    ccew_data_url = ccew_soup.find("a", href=zip_regex)["href"]
    print("[CCEW] Using url: %s" % ccew_data_url)
    urllib.request.urlretrieve(ccew_data_url, ccew_out)
    print("[CCEW] ZIP downloaded")
    with zipfile.ZipFile(ccew_out) as ccew_zip:
        if not os.path.isdir(ccew_folder):
            os.makedirs(ccew_folder)
        for ccew_zip_file in ccew_zip.infolist():
            bcp_content = ccew_zip.read(ccew_zip_file)
            # extract files are BCP (SQL Server bulk-copy) format; convert to CSV
            csv_content = bcp.convert(bcp_content.decode("latin1"))
            csv_filename = ccew_zip_file.filename.replace(".bcp", ".csv")
            with open(os.path.join(ccew_folder, csv_filename), "w", encoding="latin1") as ccew_zip_csv:
                # strip NUL bytes, which otherwise break downstream CSV parsing
                ccew_zip_csv.write(csv_content.replace('\x00', ''))
            print("[CCEW] write %s" % csv_filename)


def _fetch_ccni(args):
    """Download the Northern Ireland register CSV and the extra-names CSV."""
    # retrieve ni charity extra names
    urllib.request.urlretrieve(args.ccni_extra, os.path.join(args.folder, "ccni_extra_names.csv"))
    print("[CCNI Extra] Extra Northern Ireland charity names fetched")
    print("[CCNI] Using url: %s" % args.ccni_url)
    urllib.request.urlretrieve(args.ccni_url, os.path.join(args.folder, 'ccni.csv'))
    print("[CCNI] CSV downloaded")


def main():
    """
    Function to fetch data from Charity regulators.

    Parses command-line options, ensures the output folder exists, then
    downloads the dual-registration lookup plus each enabled regulator
    data source (OSCR, CCEW, CCNI) into that folder.
    """
    parser = argparse.ArgumentParser(description='Fetch needed data sources.')
    parser.add_argument('--dual', type=str,
                        default=DUAL_CSV,
                        help='CSV with dual registered charities in')
    parser.add_argument('--oscr-url', type=str,
                        default=OSCR_URL,
                        help="URL of page containing Scottish charity data")
    parser.add_argument('--ccew-url', type=str,
                        default=CCEW_URL,
                        help="URL of page containing Charity Commission data")
    parser.add_argument('--ccni-url', type=str,
                        default=CCNI_URL,
                        help="CSV of Northern Ireland Charity Commission data")
    parser.add_argument('--ccni-extra', type=str,
                        default=CCNI_EXTRA,
                        help='CSV for NI charities with other names')
    # add args to turn on or off the various data sources
    add_bool_arg(parser, 'oscr', default=True, help='Fetch data from Office of the Scottish Charity Regulator')
    add_bool_arg(parser, 'ccew', default=True, help='Fetch data from Charity Commission for England and Wales')
    add_bool_arg(parser, 'ccni', default=True, help='fetch data from Charity Commission for Northern Ireland')
    parser.add_argument('--folder', type=str, default='data',
                        help='Root path of the data folder.')
    args = parser.parse_args()
    # make folder if it's not already there
    if not os.path.exists(args.folder):
        os.makedirs(args.folder)
    # retrieve dual registered charities
    _fetch_dual(args)
    # get oscr data
    if args.oscr:
        _fetch_oscr(args)
    # get charity commission data
    if args.ccew:
        _fetch_ccew(args)
    # @TODO get charity commission register of mergers
    # download Northern Ireland register of charities
    if args.ccni:
        _fetch_ccni(args)
# run the fetch when executed as a script (not on import)
if __name__ == '__main__':
    main()