
Commit

fix errors with imports
drkane committed Nov 22, 2019
1 parent b39b611 commit 546c0a7
Showing 2 changed files with 61 additions and 36 deletions.
data_import/fetch_data.py — 64 changes: 39 additions & 25 deletions
@@ -19,29 +19,39 @@
 CCNI_EXTRA = "https://gist.githubusercontent.com/BobHarper1/2687545c562b47bc755aef2e9e0de537/raw/ac052c33fd14a08dd4c2a0604b54c50bc1ecc0db/ccni_extra"
 
 def main():
+
+
+    def add_bool_arg(parser, name, default=False, help=None):
+        group = parser.add_mutually_exclusive_group(required=False)
+        group.add_argument('--' + name, dest=name, action='store_true', help=help)
+        group.add_argument('--no-' + name, dest=name, action='store_false', help=help)
+        parser.set_defaults(**{name:default})
+
     """
     Function to fetch data from Charity regulators
     """
     parser = argparse.ArgumentParser(description='Fetch needed data sources.')
     parser.add_argument('--dual', type=str,
                         default=DUAL_CSV,
                         help='CSV with dual registered charities in')
-    parser.add_argument('--oscr', type=str,
+    parser.add_argument('--oscr-url', type=str,
                         default=OSCR_URL,
                         help="URL of page containing Scottish charity data")
-    parser.add_argument('--ccew', type=str,
+    parser.add_argument('--ccew-url', type=str,
                         default=CCEW_URL,
                         help="URL of page containing Charity Commission data")
-    parser.add_argument('--ccni', type=str,
+    parser.add_argument('--ccni-url', type=str,
                         default=CCNI_URL,
                         help="CSV of Northern Ireland Charity Commission data")
-    parser.add_argument('--ccni_extra', type=str,
+    parser.add_argument('--ccni-extra', type=str,
                         default=CCNI_EXTRA,
                         help='CSV for NI charities with other names')
-    parser.add_argument('--skip-oscr', action='store_true',
-                        help='Don\'t fetch data from Office of the Scottish Charity Regulator.')
-    parser.add_argument('--skip-ccew', action='store_true',
-                        help='Don\'t fetch data from Charity Commission for England and Wales.')
+
+    # add args to turn on or off the various data sources
+    add_bool_arg(parser, 'oscr', default=True, help='Fetch data from Office of the Scottish Charity Regulator')
+    add_bool_arg(parser, 'ccew', default=True, help='Fetch data from Charity Commission for England and Wales')
+    add_bool_arg(parser, 'ccni', default=True, help='fetch data from Charity Commission for Northern Ireland')
+
     parser.add_argument('--folder', type=str, default='data',
                         help='Root path of the data folder.')
     args = parser.parse_args()
@@ -57,27 +67,26 @@ def main():
     )
     print("[Dual] Dual registered charities fetched")
 
-    # retrieve ni charity extra names
-    urllib.request.urlretrieve(args.ccni_extra, os.path.join(args.folder, "ccni_extra_names.csv"))
-    print("[CCNI Extra] Extra Northern Ireland charity names fetched")
-
     # get oscr data
-    if not args.skip_oscr:
+    if args.oscr:
         oscr_out = os.path.join(args.folder, "oscr.zip")
-        urllib.request.urlretrieve(args.oscr, oscr_out)
+        urllib.request.urlretrieve(args.oscr_url, oscr_out)
         print("[OSCR] ZIP downloaded")
 
-        with zipfile.ZipFile(oscr_out) as oscrzip:
-            files = oscrzip.infolist()
-            if len(files) != 1:
-                raise ValueError("More than one file in OSCR zip")
-            with open(os.path.join(args.folder, "oscr.csv"), "wb") as oscrcsv:
-                oscrcsv.write(oscrzip.read(files[0]))
-            print("[OSCR] data extracted")
+        try:
+            with zipfile.ZipFile(oscr_out) as oscrzip:
+                files = oscrzip.infolist()
+                if len(files) != 1:
+                    raise ValueError("More than one file in OSCR zip")
+                with open(os.path.join(args.folder, "oscr.csv"), "wb") as oscrcsv:
+                    oscrcsv.write(oscrzip.read(files[0]))
+                print("[OSCR] data extracted")
+        except zipfile.BadZipFile:
+            print("[OSCR] data could not be found")
 
     # get charity commission data
-    if not args.skip_ccew:
-        ccew_html = urllib.request.urlopen(args.ccew)
+    if args.ccew:
+        ccew_html = urllib.request.urlopen(args.ccew_url)
         ccew_out = os.path.join(args.folder, "ccew.zip")
         ccew_folder = os.path.join(args.folder, "ccew")
         if ccew_html.status != 200:
@@ -104,8 +113,13 @@ def main():
 
     # download Northern Ireland register of charities
     if args.ccni:
-        print("[CCNI] Using url: %s" % args.ccni)
-        urllib.request.urlretrieve(args.ccni, os.path.join(args.folder, 'ccni.csv'))
+
+        # retrieve ni charity extra names
+        urllib.request.urlretrieve(args.ccni_extra, os.path.join(args.folder, "ccni_extra_names.csv"))
+        print("[CCNI Extra] Extra Northern Ireland charity names fetched")
+
+        print("[CCNI] Using url: %s" % args.ccni_url)
+        urllib.request.urlretrieve(args.ccni_url, os.path.join(args.folder, 'ccni.csv'))
         print("[CCNI] CSV downloaded")
 
 if __name__ == '__main__':
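
For reference, a minimal standalone sketch of the paired --flag / --no-flag pattern that the new add_bool_arg helper sets up in both scripts. The helper body is copied from the diff above; the parser description, the sample flag and the print calls are illustrative only.

import argparse

def add_bool_arg(parser, name, default=False, help=None):
    # register a mutually exclusive --name / --no-name pair that writes to the same dest
    group = parser.add_mutually_exclusive_group(required=False)
    group.add_argument('--' + name, dest=name, action='store_true', help=help)
    group.add_argument('--no-' + name, dest=name, action='store_false', help=help)
    parser.set_defaults(**{name: default})

parser = argparse.ArgumentParser(description='illustrative parser')
add_bool_arg(parser, 'oscr', default=True, help='Fetch data from Office of the Scottish Charity Regulator')

print(parser.parse_args([]).oscr)             # True: the default applies when neither flag is passed
print(parser.parse_args(['--no-oscr']).oscr)  # False: --no-oscr switches the source off

So `python fetch_data.py --no-oscr --no-ccew` would now skip those registers, taking over from the old --skip-oscr / --skip-ccew flags.
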
data_import/import_data.py — 33 changes: 22 additions & 11 deletions
@@ -444,6 +444,10 @@ def import_oscr(chars={},
                 es_type="charity",
                 debug=False):
 
+    if not os.path.exists(datafile):
+        print("Could not find {}".format(datafile))
+        return chars
+
     # go through the Scottish charities
     with open(datafile, encoding="latin1") as a:
         csvreader = csv.DictReader(a)
@@ -568,11 +572,12 @@ def import_ccni(chars={},
                 ccni_extra[row["Charity_number"]].append(n.strip())
 
     # go through the Northern Irish charities
-    with open(datafile, encoding="utf-8") as a:
+    with open(datafile, encoding="ISO-8859-1") as a:
         csvreader = csv.DictReader(a)
         ccount = 0
         cadded = 0
         cupdated = 0
+
         for row in csvreader:
             row = clean_row(row)
 
@@ -812,6 +817,12 @@ def create_outputs(es,
 
 def main():
 
+    def add_bool_arg(parser, name, default=False, help=None):
+        group = parser.add_mutually_exclusive_group(required=False)
+        group.add_argument('--' + name, dest=name, action='store_true', help=help)
+        group.add_argument('--no-' + name, dest=name, action='store_false', help=help)
+        parser.set_defaults(**{name:default})
+
     parser = argparse.ArgumentParser(description='Import charity data into elasticsearch')
 
     parser.add_argument('--folder', type=str, default='data',
@@ -833,12 +844,11 @@ def main():
     parser.add_argument('--es-pc-index', default='postcode', help='index used to store postcode data')
     parser.add_argument('--es-pc-type', default='postcode', help='type used to store postcode data')
 
-    parser.add_argument('--skip-oscr', action='store_true',
-                        help='Don\'t fetch data from Office of the Scottish Charity Regulator.')
-    parser.add_argument('--skip-ccew', action='store_true',
-                        help='Don\'t fetch data from Charity Commission for England and Wales.')
-    parser.add_argument('--skip-output', action='store_true',
-                        help='Don\'t create output files containing the whole dataset.')
+    # add args to turn on or off the various data sources
+    add_bool_arg(parser, 'oscr', default=True, help='Fetch data from Office of the Scottish Charity Regulator')
+    add_bool_arg(parser, 'ccew', default=True, help='Fetch data from Charity Commission for England and Wales')
+    add_bool_arg(parser, 'ccni', default=True, help='fetch data from Charity Commission for Northern Ireland')
+    add_bool_arg(parser, 'output', default=False, help='Create output files containing the whole dataset')
 
     parser.add_argument('--debug', action='store_true', help='Only load first 10000 rows for ccew')
 
Expand Down Expand Up @@ -877,7 +887,7 @@ def main():
}

chars = {}
if not args.skip_ccew:
if args.ccew:
chars = import_extract_charity(chars, datafile=data_files["extract_charity"], es_index=args.es_index, es_type=args.es_type, debug=args.debug)
chars = import_extract_main(chars, datafile=data_files["extract_main"], debug=args.debug)
chars = import_extract_name(chars, datafile=data_files["extract_names"], debug=args.debug)
@@ -886,9 +896,10 @@ def main():
     dual = {}
     if os.path.isfile(data_files["dual_registration"]):
         dual = import_dual_reg(data_files["dual_registration"])
-    if not args.skip_oscr:
+    if args.oscr:
         chars = import_oscr(chars, dual=dual, datafile=data_files["oscr"], es_index=args.es_index, es_type=args.es_type, debug=args.debug)
-    chars = import_ccni(chars, dual=dual, datafile=data_files["ccni"], extra_names=data_files["ccni_extra_names"], es_index=args.es_index, es_type=args.es_type, debug=args.debug)
+    if args.ccni:
+        chars = import_ccni(chars, dual=dual, datafile=data_files["ccni"], extra_names=data_files["ccni_extra_names"], es_index=args.es_index, es_type=args.es_type, debug=args.debug)
     # @TODO include charity commission register of mergers
     chars = clean_chars(chars, pc_es, args.es_pc_index, args.es_pc_type)
 
@@ -901,7 +912,7 @@ def main():
 
     save_to_elasticsearch(chars, es, args.es_index)
 
-    if not args.skip_output:
+    if args.output:
         create_outputs(es, args.folder, args.es_index, args.es_type)
 
 if __name__ == '__main__':
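
And a minimal sketch of the two defensive changes picked up on the import side above: skip a register whose file was never downloaded, and read the CCNI CSV as ISO-8859-1 rather than UTF-8. The helper name and the file path are hypothetical; only the existence guard and the encoding come from the commit.

import csv
import os

def read_register(datafile, encoding="ISO-8859-1"):
    # return the parsed rows, or an empty list if the fetch step skipped this source
    if not os.path.exists(datafile):
        print("Could not find {}".format(datafile))
        return []
    with open(datafile, encoding=encoding) as f:
        return [dict(row) for row in csv.DictReader(f)]

rows = read_register(os.path.join("data", "ccni.csv"))  # hypothetical path
print("{} rows loaded".format(len(rows)))
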
