# bg_json_parser.py
"""
Aggregates and converts smart trap JSON files into interchange files.
This script takes a set of smart trap JSON data files and aggregates
their data into days before writing the data to interchange-format
files. With the --split-years option, the script can also split data
into years and automatically generate the project files needed by the
full BG-Counter Tools pipeline.
For usage information, run with -h.
This script requires at least Python 3.5.
"""
import argparse
import csv
import json
import math
import os
import random
import datetime as dt
from string import Template
import bg_common as com
def parse_args():
"""Parse the command line arguments and return an args namespace."""
parser = argparse.ArgumentParser(description='Parses JSON delivered by the Biogents smart trap'
' API and creates an interchange format file '
'from the data.')
parser.add_argument('files', nargs='+', metavar='file', help='The JSON file(s) to parse.')
parser.add_argument('--preserve-metadata', action='store_true',
help="Don't change the metadata in the database in any way")
parser.add_argument('-c', '--check-locations', action='store_true',
help='Before writing to file, pause to allow the user to check for any '
'errant new locations.')
output_group = parser.add_mutually_exclusive_group()
output_group.add_argument('-o', '--output', help='The name of the output file.')
output_group.add_argument('-y', '--split-years', action='store_true',
help='Split data from different years into separate output files. '
'The files will be named [prefix]_[year].pop.')
args = parser.parse_args()
return args
def parse_json(files, output='interchange.pop', split_years=False, preserve_metadata=False,
check_locations=False):
"""Parse JSON files and create interchange format files from them.
Required arguments:
files -- A list of filenames to parse.
Optional arguments:
output -- The name of the output file. Ignored if split_years is
True.
split_years -- A boolean signalling whether to split data from
different years into separate output files. The files will be
named [prefix]_[year].pop. Also creates project files necessary
for the rest of the BG-Counter Tools pipeline.
preserve_metadata -- A boolean signalling whether to preserve the
metadata within the database, skipping all database update
operations.
check_locations -- A boolean signalling whether to pause before
writing to file for the user to check for any errant new locations.
"""
random.seed()
metadata = {}
out_csv = None
projects = None
try:
if split_years:
out_csv = {}
else:
out_csv = CSVWriter(output)
for filename in files:
with open(filename, 'r') as json_f:
js = json.load(json_f)
collections = {}
capture_count = {}
print("Processing file " + filename)
for trap_wrapper in js['traps']:
trap_id = trap_wrapper['Trap']['id']
captures = trap_wrapper['Capture']
capture_count[trap_id] = len(captures)
if len(captures) != 0:
# Get metadata for this trap
for prefix, trapset in metadata.items():
if trap_id in trapset['traps']:
curr_prefix = prefix
curr_trapset = trapset
break
else:
new_metadata = get_trap_metadata(trap_id=trap_id)
metadata.update(new_metadata)
curr_prefix = list(new_metadata.keys())[0]
curr_trapset = new_metadata[curr_prefix]
collections[curr_prefix] = {}
trap_metadata = {
'locations': curr_trapset['traps'][trap_id],
'obfuscate': curr_trapset['obfuscate']
}
# Process captures.
new_collections = process_captures(captures, trap_metadata)
if trap_id not in collections[curr_prefix]:
collections[curr_prefix][trap_id] = []
collections[curr_prefix][trap_id].extend(new_collections)
# Update master metadata.
curr_trapset['traps'][trap_id] = trap_metadata['locations']
# Warn if a trap is showing no captures.
# We should reasonably expect data from each trap,
# and if we aren't getting any, it might be
# worth looking into.
else:
print('Warning: 0 captures at trap_id: ' + trap_id)
# Allow the user to manually check new locations if
# requested.
if check_locations:
collections = filter_locations(collections)
# Write collections to file.
for prefix, traps in collections.items():
for trap_id, curr_collections in traps.items():
good_captures = 0
for collection in curr_collections:
curr_metadata = {'prefix': prefix, 'ordinals': metadata[prefix]['ordinals']}
date = com.make_date(collection['captures'][0]['timestamp_start'])
year = date.year
# Get the correct output file.
if isinstance(out_csv, dict):
# If the correct output file doesn't exist, make it.
if prefix not in out_csv:
out_csv[prefix] = {}
if year not in out_csv[prefix]:
out_csv[prefix][year] = ProjectFileManager(prefix, year)
curr_csv = out_csv[prefix][year]
else:
curr_csv = out_csv
# If a collection was made, write it to
# file and count its captures as good.
if write_collection(collection, curr_metadata, curr_csv):
good_captures += len(collection['captures'])
metadata[prefix]['ordinals'] = curr_metadata['ordinals']
# If the CSV is for a project, update
# the project's dates.
if isinstance(curr_csv, ProjectFileManager):
curr_csv.update_dates(date)
# Print a summary.
print('Trap {}: Total captures: {} - Good captures: {} ({}%)'
.format(trap_id, capture_count[trap_id], good_captures,
math.floor((good_captures / capture_count[trap_id]) * 100)))
finally:
# Close all output files.
if out_csv:
if type(out_csv) is dict:
projects = []
for prefix, years in out_csv.items():
for year, csv_writer in years.items():
project_info = csv_writer.close()
# Store the info for the valid projects
# that were created.
if project_info:
projects.append(project_info)
else:
out_csv.close()
if not preserve_metadata:
update_metadata(metadata=metadata)
return projects
def process_captures(captures, metadata):
"""Bin captures into days.
Arguments:
captures -- A dict containing the captures to process.
metadata -- A dict containing the metadata for the trap and provider
that the captures originate from.
"""
# The total number of unique captures (some are duplicates).
total_captures = 0
# The captures within a single day.
day_captures = []
# The collections created from this set of captures.
collections = []
# Will hold the timestamp_end of the last capture.
prev_end_timestamp = dt.datetime.min
num_captures = len(captures)
for i in range(num_captures):
capture = captures[i]
# We use this to do some sanity checking.
# The ending timestamp is more consistent than the starting one.
curr_end_timestamp = com.make_datetime(capture['timestamp_end'])
curr_start_timestamp = com.make_datetime(capture['timestamp_start'])
curr_date = curr_start_timestamp.date()
valid_dates = curr_end_timestamp and curr_start_timestamp
# Count a capture if it's not a duplicate
# or it has invalid dates.
if not valid_dates or curr_end_timestamp != prev_end_timestamp:
total_captures += 1
if valid_dates:
# If this end timestamp is later than the previous one,
# store the capture. We ignore the capture if it's
# identical to the previous one or if its timeframe is
# noticeably shorter than the expected 15 minutes (under 12 minutes).
if (curr_end_timestamp > prev_end_timestamp
and curr_end_timestamp - curr_start_timestamp >= dt.timedelta(minutes=12)):
day_captures.append({
'trap_id': capture['trap_id'],
'timestamp_start': capture['timestamp_start'],
'co2_status': capture['co2_status'],
'counter_status': capture['counter_status'],
'medium': capture['medium'],
'trap_latitude': capture['trap_latitude'],
'trap_longitude': capture['trap_longitude'],
})
prev_end_timestamp = curr_end_timestamp
# Else if this timestamp is earlier than the previous one,
# error out. We rely on the captures being delivered in
# forward chronological order.
elif curr_end_timestamp < prev_end_timestamp:
raise ValueError('Capture has earlier ending timestamp than preceding capture. '
'Capture ID: ' + capture['id'])
# If we're at the last capture or the next capture is
# from a different day, end this day.
if (i == num_captures - 1
or com.make_date(captures[i + 1]['timestamp_start']) != curr_date):
trap_id = capture['trap_id']
# Our current assumption is that there are no more
# than 96 unique captures in a day (4 per hour).
# If this changes, we'll need to edit this script.
if len(day_captures) > 96:
raise ValueError('More than 96 captures in a day at trap_id: {} - date: {}'
.format(trap_id, curr_date))
# Try to make a collection from this set of captures.
collection = make_collection(day_captures, metadata['locations'],
metadata['obfuscate'])
if collection:
collections.append(collection)
day_captures = []
return collections
def make_collection(captures, locations, obfuscate):
"""Bin captures into collections based on location.
Takes a set of captures within the same day, bins them based on
location, and returns a collection for whichever location has enough
captures, or None if none does. Adds new locations to the metadata dict if
there are any.
Arguments:
captures -- A dict containing the captures to process.
locations -- A dict containing the locations that have been recorded
previously for the trap that the captures came from.
obfuscate -- A boolean determining whether to obfuscate the new
locations before adding them to the metadata.
"""
# Add a new key that will hold the captures that map
# to each location and a key that will let us know that these
# locations are not new (used later).
for location in locations:
location['captures'] = []
location['new'] = False
# First, loop through the captures to pinpoint any possible
# new locations. We're looping backwards so we can delete captures
# if necessary.
for i in range(len(captures) - 1, -1, -1):
capture = captures[i]
curr_lat = float(capture['trap_latitude'])
curr_lon = float(capture['trap_longitude'])
# If a trap can't get correct GPS data, it will either report
# a coordinate that is exactly 0 or report its location as
# (51.4778, 0.0014), which is in Greenwich near the prime
# meridian. Either way, drop the capture.
if curr_lat == 0 or curr_lon == 0 or (curr_lat == 51.4778 and curr_lon == 0.0014):
del captures[i]
else:
# Determine whether this capture is close to
# any existing locations.
for location in locations:
distance = calculate_distance(curr_lat, curr_lon, location['true_latitude'],
location['true_longitude'])
# 111 meters - arbitrary, but shouldn't be too small.
if distance < 111:
break
# If it's not close to any known location, add
# a new location at its coordinates. This bubbles up
# to the metadata dict as well.
else:
locations.append({
'true_latitude': curr_lat,
'true_longitude': curr_lon,
'captures': [],
'new': True
})
# Next, loop through the captures and assign them
# to the closest locations.
for capture in captures:
curr_lat = float(capture['trap_latitude'])
curr_lon = float(capture['trap_longitude'])
closest_location = None
closest_distance = math.inf
for location in locations:
distance = calculate_distance(curr_lat, curr_lon,
location['true_latitude'], location['true_longitude'])
if distance < closest_distance:
closest_location = location
closest_distance = distance
# Because of the previous loop, each capture will be
# within a reasonable distance of some location.
closest_location['captures'].append(capture)
# This will hold the final collection if there is one.
collection = None
# Once again, loop backwards so we can remove items.
for i in range(len(locations) - 1, -1, -1):
location = locations[i]
num_captures = len(location['captures'])
# Allow at most one cumulative hour of missing data in a day.
if num_captures >= 92:
# If the location is new, average its captures' coordinates
# to get a more accurate lat/lon, then obfuscate
# if necessary.
if location['new']:
lats, lons = [], []
for capture in location['captures']:
lats.append(float(capture['trap_latitude']))
lons.append(float(capture['trap_longitude']))
location['true_latitude'] = round(sum(lats) / len(lats), 6)
location['true_longitude'] = round(sum(lons) / len(lons), 6)
if obfuscate:
new_lat, new_lon = obfuscate_coordinates(location['true_latitude'],
location['true_longitude'], 200, 400)
location['offset_latitude'] = round(new_lat, 6)
location['offset_longitude'] = round(new_lon, 6)
else:
location['offset_latitude'] = location['true_latitude']
location['offset_longitude'] = location['true_longitude']
collection = dict(location)
# If there weren't enough captures for a full collection
# and the location was new, remove it so it doesn't get
# added to the metadata.
elif location['new']:
del locations[i]
# Remove 'captures' and 'new'.
else:
del locations[i]['captures']
del locations[i]['new']
return collection
def write_collection(collection, metadata, out_csv):
"""Write a collection of captures to file.
Take a collection containing a day's worth of captures and
aggregate and write it to file if the counter was on at some point
during the day. Return True if the collection was written
and False if it wasn't.
Arguments:
collection -- A dict containing the collection to write to file.
metadata -- A dict containing the metadata for the trap and provider
that the captures originate from.
out_csv -- A CSV writer object to write the collection to.
"""
captures = collection['captures']
mos_count = 0
# Stores whether the counter was turned on at some point in the day.
counter_on = False
# Stores whether CO2 was turned on at some point in the day.
used_co2 = False
trap_id = captures[0]['trap_id']
date = com.make_date(captures[0]['timestamp_start'])
# Sum all of the mosquitoes captured throughout the day
# and check to see whether counter and CO2 were used.
for capture in captures:
# Mosquito counts are stored in the 'medium' field.
mos_count += int(capture['medium'])
if not used_co2 and capture['co2_status']:
used_co2 = True
if not counter_on and capture['counter_status'] in {'1', True}:
counter_on = True
# Only write the collection if the counter was on.
if counter_on:
if used_co2:
attractant = 'carbon dioxide'
else:
attractant = ''
year = date.year
prefix = metadata['prefix']
# If no ordinal exists for this year, make a new one.
if year not in metadata['ordinals']:
metadata['ordinals'][year] = 0
# Increment the ordinal and store it.
ordinal = metadata['ordinals'][year] = metadata['ordinals'][year] + 1
# The ordinal string must have a leading zero, so we're giving
# it a length that probably won't be exceeded for a year's worth
# of data. If it is exceeded, make sure it has at least one
# leading zero and warn us that the ordinal is getting large.
digits = 8
len_ordinal = len(str(ordinal))
min_digits = len_ordinal + 1
if min_digits > digits:
digits = min_digits
print('Warning: Large ordinal at trap_id: {} - year: {} - ordinal: {}'
.format(trap_id, year, ordinal))
ordinal_string = str(ordinal).zfill(digits)
collection_id = '{}_{}_collection_{}'.format(prefix, year, ordinal_string)
sample_id = '{}_{}_sample_{}'.format(prefix, year, ordinal_string)
# Write the collection to file.
out_csv.writerow([collection_id, sample_id, date, date, trap_id,
'{:.6f}'.format(collection['offset_latitude']),
'{:.6f}'.format(collection['offset_longitude']),
'', 'COLLECT_BGCT', attractant, 1, 1, 'Culicidae', 'SIZE', 'adult',
'unknown sex', mos_count])
return True
else:
# If the counter was never on, print a warning. If this is
# the case for a decent number of days, it might be worth
# looking into.
print('Warning: Counter never on at date: {} - trap_id: {}'.format(date, trap_id))
return False
@com.run_with_connection
def get_trap_metadata(cur, trap_id):
"""Get metadata for the trapset containing the given trap.
trap_id -- A string representing a BG-Counter trap ID.
Note: Omit the 'cur' argument when calling and provide other
arguments as keyword args.
"""
# Get the prefix associated with the trap
# to check if the trap exists in the database.
sql = ('SELECT p.prefix, p.obfuscate FROM traps as t, providers as p '
'WHERE t.prefix = p.prefix AND t.trap_id = %s')
cur.execute(sql, (trap_id,))
row = cur.fetchone()
if not row:
raise ValueError('No database entry for trap ID: ' + trap_id)
prefix = row['prefix']
metadata = {prefix: {'traps': {}, 'ordinals': {}, 'obfuscate': row['obfuscate']}}
# Get the locations associated with the traps.
sql = ('SELECT t.trap_id, true_latitude, true_longitude, offset_latitude, offset_longitude '
'FROM traps as t LEFT OUTER JOIN locations as l ON t.trap_id = l.trap_id '
'WHERE t.prefix = %s')
cur.execute(sql, (prefix,))
rows = cur.fetchall()
for row in rows:
trap_id = row['trap_id']
if trap_id not in metadata[prefix]['traps']:
metadata[prefix]['traps'][trap_id] = []
if row['true_latitude'] and row['true_longitude']:
metadata[prefix]['traps'][trap_id].append({
'true_latitude': row['true_latitude'],
'true_longitude': row['true_longitude'],
'offset_latitude': row['offset_latitude'],
'offset_longitude': row['offset_longitude'],
})
# Get the ordinals associated with the prefix.
sql = 'SELECT year, ordinal FROM ordinals WHERE prefix = %s'
cur.execute(sql, (prefix,))
metadata[prefix]['ordinals'] = {row['year']: row['ordinal'] for row in cur.fetchall()}
return metadata
@com.run_with_connection
def get_provider_metadata(cur, prefix):
"""Get metadata for a particular data provider.
prefix -- A string corresponding to the prefix of the desired
provider.
Note: Omit the 'cur' argument when calling and provide other
arguments as keyword args.
"""
sql = ('SELECT org_name, org_email, org_url, contact_first_name, contact_last_name,'
'contact_email, study_tag, study_tag_number '
'FROM providers WHERE prefix = %s')
cur.execute(sql, (prefix,))
row = cur.fetchone()
return row
@com.run_with_connection
def update_metadata(cur, metadata):
"""Update metadata in the database.
metadata -- A dict containing the metadata to update the database
with. Can contain metadata on multiple providers.
Note: Omit the 'cur' argument when calling and provide other
arguments as keyword args.
"""
for prefix, trapset in metadata.items():
# Update the ordinals associated with the prefix.
sql = ('INSERT INTO ordinals VALUES (%s, %s, %s) '
'ON CONFLICT (prefix, year) DO UPDATE SET ordinal = EXCLUDED.ordinal')
for year, ordinal in trapset['ordinals'].items():
cur.execute(sql, (prefix, year, ordinal))
for trap_id, locations in trapset['traps'].items():
# Check the trap to make sure it still exists
# and has the same prefix.
sql = 'SELECT prefix FROM traps WHERE trap_id = %s'
cur.execute(sql, (trap_id,))
row = cur.fetchone()
if not row:
raise ValueError('Metadata update failed - trap no longer exists: ' + trap_id)
elif row['prefix'] != prefix:
raise ValueError('Metadata update failed - prefix has changed for trap: '
+ trap_id)
# Add new locations if there are any.
sql = 'INSERT INTO locations VALUES (%s, %s, %s, %s, %s) ON CONFLICT DO NOTHING'
for location in locations:
cur.execute(sql, (trap_id, location['true_latitude'], location['true_longitude'],
location['offset_latitude'], location['offset_longitude']))
def calculate_distance(lat1, lon1, lat2, lon2):
"""Get distance in meters between two sets of decimal coordinates.
Note that this code is based on the Haversine formula for spheres,
giving it an error of up to about 0.5%.
"""
# Approximate radius of earth in km.
r = 6373.0
# Convert the parameters to radians.
arguments = (lat1, lon1, lat2, lon2)
lat1, lon1, lat2, lon2 = map(math.radians, arguments)
# Get their deltas.
dlon = lon2 - lon1
dlat = lat2 - lat1
# Calculate distance in meters.
a = math.sin(dlat/2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon/2)**2
distance = r * 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
distance *= 1000
return distance
def obfuscate_coordinates(lat, lon, min_distance, max_distance):
"""Obfuscate a set of GPS coordinates.
Obfuscates a set of GPS coordinates by translating the represented
location in a random direction by a random distance between two
bounds.
Arguments:
lat -- The decimal latitude.
lon -- The decimal longitude.
min_distance -- The minimum distance in meters by which to displace
the point.
max_distance -- The maximum distance in meters by which to displace
the point.
Formula source: http://www.edwilliams.org/avform.htm#LL
"""
# Randomly choose a distance within the parameters.
d_m = random.uniform(min_distance, max_distance)
# Convert this distance to nautical miles (1852 m each), then to radians;
# one nautical mile corresponds to one arc minute of a great circle.
d_nm = d_m / 1852
d_r = (math.pi / (180*60)) * d_nm
# Randomly choose a true course (direction) in radians.
tc = random.uniform(0, 2 * math.pi)
# Convert the lat/lon to radians.
lat = math.radians(lat)
lon = math.radians(lon)
# Calculate new coordinates.
new_lat = math.asin(math.sin(lat)*math.cos(d_r) + math.cos(lat)*math.sin(d_r)*math.cos(tc))
dlon = math.atan2(math.sin(tc)*math.sin(d_r)*math.cos(lat),
math.cos(d_r) - math.sin(lat)*math.sin(new_lat))
new_lon = ((lon-dlon + math.pi) % (2*math.pi)) - math.pi
# Convert new coordinates to degrees.
new_lat = math.degrees(new_lat)
new_lon = math.degrees(new_lon)
return new_lat, new_lon
def filter_locations(collections):
"""Let the user filter out errant new locations.
Checks a set of collections for new locations, writes them to a file
for the user to check, then filters out any collections with
locations that the user removed. The file is formatted so that it
can be directly imported into the GPS plotting app found here:
https://www.darrinward.com/lat-long/.
collections -- A dict containing the set of collections to filter.
"""
gps_filename = 'check_locations.csv'
new_locations = set()
good_locations = set()
good_collections = {}
# Signals whether we've correctly interpreted what locations the
# user filtered out.
finished = False
while not finished:
with open(gps_filename, 'w') as gps_f:
gps_csv = csv.writer(gps_f)
gps_csv.writerow(['latitude', 'longitude', 'name', 'color', 'note'])
# Loop through the collections to find new locations.
for prefix, trapset in collections.items():
for trap_id, curr_collections in trapset.items():
for collection in curr_collections:
lat = collection['true_latitude']
lon = collection['true_longitude']
if collection['new']:
# Give the location a name, write it to
# file, and store it as a new location.
name = 'loc_' + str(len(new_locations))
gps_csv.writerow([
lat, lon, name, '#FF0000', '{}_{}'.format(prefix, trap_id)
])
new_locations.add((lat, lon, name))
else:
# We assume that it has been checked before
# and is good.
good_locations.add((lat, lon))
if new_locations:
# Ask the user to check the new locations.
input("\nNew locations have been dumped to 'check_locations.csv'.\n"
"Delete errant locations from this file, then press Enter to continue.")
with open(gps_filename, newline='') as gps_f:
gps_csv = csv.DictReader(gps_f)
for row in gps_csv:
good_locations.add((float(row['latitude']), float(row['longitude']),
row['name']))
missing = new_locations - good_locations
if missing:
# Print the names of the deleted locations.
missing_names = [loc[2] for loc in missing]
missing_names.sort()
print('You have removed the following location(s): '
+ ', '.join(missing_names) + '.')
else:
print('You have removed no locations.')
# Signals whether the user has given valid input.
good_input = False
while not good_input:
# Ask the user to confirm that the desired locations
# were removed.
answer = input('Is this correct? [y/n] ').lower()
if answer in {'y', 'yes'}:
good_input = True
finished = True
os.remove(gps_filename)
elif answer in {'n', 'no'}:
good_input = True
print('Resetting...')
new_locations = set()
good_locations = set()
else:
print('Please answer yes or no.')
else:
print('No new locations. Continuing.')
finished = True
if new_locations:
# Remove the names from the locations.
good_locations = {(loc[0], loc[1]) for loc in good_locations}
# Keep the collections that have good locations.
for prefix, trapset in collections.items():
good_collections[prefix] = {}
for trap_id, curr_collections in trapset.items():
good_collections[prefix][trap_id] = []
for collection in curr_collections:
location = (collection['true_latitude'], collection['true_longitude'])
if location in good_locations:
good_collections[prefix][trap_id].append(collection)
else:
good_collections = collections
return good_collections
class ProjectFileManager:
"""Handle the formation of all files related to a project.
Public methods:
update_dates
write_config
close
"""
def __init__(self, prefix, year):
"""Initialize the instance.
prefix -- The prefix of the provider that this project's data
comes from.
year -- The year that this project's data was collected.
"""
self.prefix = prefix
self.year = year
csv_filename = '{}_{}_saf.csv'.format(prefix, year)
self.writer = CSVWriter(csv_filename)
self.writerow = self.writer.writerow
self.first_date = dt.date.max
self.last_date = dt.date.min
self.month = None
def update_dates(self, date):
"""Update the first and/or last date, as appropriate.
date -- The date of a capture that is being grouped into this
project.
"""
if date < self.first_date:
self.first_date = date
self.month = date.month
if date > self.last_date:
self.last_date = date
def write_config(self):
"""Write the config file from a template.
The YAML-format config file is intended to be passed to PopBioWizard.pl
further down the pipeline.
"""
data = get_provider_metadata(prefix=self.prefix)
template_path = 'config.yaml'
config_path = '{}_{}_config.yaml'.format(self.prefix, self.year)
with open(template_path) as template_f, open(config_path, 'w') as config_f:
template = Template(template_f.read())
config_text = template.substitute(
prefix=self.prefix, year=self.year, month=str(self.month).zfill(2),
start_date=self.first_date, end_date=self.last_date, org_name=data['org_name'],
org_email=data['org_email'], org_url=data['org_url'],
contact_first_name=data['contact_first_name'],
contact_last_name=data['contact_last_name'], contact_email=data['contact_email'],
study_tag=data['study_tag'], study_tag_number=data['study_tag_number']
)
config_f.write(config_text)
def close(self):
"""Close the data file and write other files if necessary.
This function calls self.writer's close function, which checks
to see whether any data was written to the data file and deletes
the file if not. If data was written, this function then writes
the config file and returns a dict containing the project
information. Otherwise it returns None.
"""
if self.writer.close():
self.write_config()
return {'prefix': self.prefix, 'year': self.year}
else:
return None
class CSVWriter:
"""Handle CSV data file operations.
Public methods:
is_empty
close
"""
def __init__(self, filename):
"""Initialize the instance."""
self.filename = filename
self.file = open(filename, 'w')
self.writer = csv.writer(self.file, lineterminator='\n')
self.writerow = self.writer.writerow
self.writerow([
'collection_ID', 'sample_ID', 'collection_start_date', 'collection_end_date',
'trap_ID', 'GPS_latitude', 'GPS_longitude', 'location_description', 'trap_type',
'attractant', 'trap_number', 'trap_duration', 'species',
'species_identification_method', 'developmental_stage', 'sex', 'sample_count',
])
self.empty_pos = self.file.tell()
def is_empty(self):
"""Return whether the CSV file is empty of any data rows."""
return self.file.tell() == self.empty_pos
def close(self):
"""Close the CSV file.
Closes the object's file and deletes the file if it has
no data. Returns True if there was data and False if not.
"""
if self.is_empty():
remove = True
else:
remove = False
self.file.close()
if remove:
os.remove(self.filename)
return not remove
if __name__ == '__main__':
args = vars(parse_args())
parse_json(**args)