-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcircus_readfromJSON.py
45 lines (27 loc) · 1.04 KB
/
circus_readfromJSON.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import json
import pprint
import re
from collections import Counter
## Matches everything before the first colon of a title.
## In ~95% of the archive this prefix is the troupe name.
## Compiled once (raw string) so it is not rebuilt for every row.
TROUPE_PATTERN = re.compile(r'^[^:]+')


def extract_troupe(title):
    """Return the troupe name at the start of *title*, or None.

    The troupe name is taken to be the text before the first colon, with
    surrounding whitespace removed so that "Name: x" and "Name : x" are
    counted as the same troupe. A title without a colon yields the whole
    (stripped) title. None is returned when there is nothing usable before
    the colon (empty title, or a title that starts with a colon).
    """
    match = TROUPE_PATTERN.match(title)
    if match is None:
        return None
    troupe = match.group().strip()
    return troupe or None


def collect_troupes(rows):
    """Return the troupe name of every row, duplicates included.

    *rows* is an iterable of dictionaries that each carry a 'title' key
    (a missing key raises KeyError). Rows whose title has no troupe prefix
    are skipped.
    """
    all_troupes = []
    for row in rows:
        troupe = extract_troupe(row['title'])
        ## Append the name itself (a hashable str), never the list of
        ## regex matches: a list cannot be used as a dictionary key.
        if troupe is not None:
            all_troupes.append(troupe)
    return all_troupes


def count_troupes(all_troupes):
    """Return a plain dict mapping each troupe name to its occurrence count."""
    ## dict(...) keeps pprint output as an ordinary dictionary.
    return dict(Counter(all_troupes))


def main():
    """Read the scraped posters, count posters per troupe, print the result."""
    ## opening json file to read, parse (the with-block closes the file)
    with open('scraped_circusposters.json', encoding='utf-8') as json_data:
        data = json.load(json_data)

    ## PRETTY PRINTING TO CHECK SITUATION
    ##pprint.pprint(data)

    all_troupes = collect_troupes(data)
    poster_troupes = count_troupes(all_troupes)
    pprint.pprint(poster_troupes)

    ## writes results back into separate json file
    ##with open ('troupes.json', 'w') as f:
    ##    f.write(json.dumps(all_troupes, indent=4))


if __name__ == '__main__':
    main()