# CircusScrape_JSON.py
# Scrapes poster records from the Ringling eMuseum and saves them as JSON.
from bs4 import BeautifulSoup
import requests
import json
import time
#setting up the list (to be filled with dictionaries)
all_posters = []
#THIS WILL CREATE THE LIST OF ALL URLS FOR INDV POSTERS
ids = range (0,4976,1)
for x in ids:
url = ('http://emuseum.ringling.org/emuseum/view/objects/asitem/16579/'+str(x))
poster_page = requests.get(url)
if poster_page.status_code != 200:
print ("there was an error with", url)
time.sleep(1)
object_html = poster_page.text
soup = BeautifulSoup(object_html, "html.parser")
#Create variable for URLS of Poster IMAGES
image = soup.find("p", attrs = {"class": "centeredtext"})
if image == None:
a_link = "Image not Available"
else:
for image_link in image:
link_str = image.find ("a", attrs = {"onkeypress": "return Emuseum.onkeypressExecuteOnclick(event, this);"})
a_link = ('http://emuseum.ringling.org'+str(link_str.get('href')))
#this step just removes (what I think is) an unnecessary extra div
all_detail = soup.find_all("div", attrs = {"class": "paragraph"})
title = "no title"
maker = "no maker"
date = "no date"
medium = "no_medium"
dimensions = "no dimensions"
credit = "no cred_line"
obj_num = "no obj_num"
description = "no description"
#check if it has the 10 metadata fields
if len(all_detail) > 0:
#loop through for text inside the div
for a_div in all_detail:
all_text = a_div.text
#keyword check to identify data field
if 'Title:' in all_text:
#create the variable for that field
#find/replace for the 'label' span, then trim white space
title = all_text.replace('Title:' , '').strip()
#keyword check to identify data field
if 'Maker:' in all_text:
#create the variable for that field
#find/replace for the 'label' span, then trim white space
maker = all_text.replace('Maker:' , '').strip()
#keyword check to identify data field
if 'Date:' in all_text:
#create the variable for that field
#find/replace for the 'label' span, then trim white space
date = all_text.replace('Date:' , '').strip()
#keyword check to identify data field
if 'Medium:' in all_text:
#create the variable for that field
#find/replace for the 'label' span, then trim white space
medium = all_text.replace('Medium:' , '').strip()
#keyword check to identify data field
if 'Dimensions:' in all_text:
#create the variable for that field
#find/replace for the 'label' span, then trim white space
dimensions = all_text.replace('Dimensions:' , '').strip()
#keyword check to identify data field
if 'Credit Line:' in all_text:
#create the variable for that field
#find/replace for the 'label' span, then trim white space
credit = all_text.replace('Credit line:' , '').strip()
#keyword check to identify data field
if 'Object Number:' in all_text:
#create the variable for that field
#find/replace for the 'label' span, then trim white space
obj_num = all_text.replace('Object Number:' , '').strip()
#keyword check to identify data field
if 'Description:' in all_text:
#create the variable for that field
#find/replace for the 'label' span, then trim white space
description = all_text.replace('Description:' , '').strip()
#create / define a dictionary
this_poster = {}
#all the fields as dictionary keys, with the variables from above
this_poster['title'] = title
this_poster['maker'] = maker
this_poster['date'] = date
this_poster['mediuum'] = medium
this_poster['dimensions'] = dimensions
this_poster['credit'] = credit
this_poster['obj_num'] = obj_num
this_poster['description'] = description
#this is the x from the url defined at the beginning
this_poster['web_id'] = x
#this is the a_link from the image source concatenation done above
this_poster['img_url'] = a_link
print (this_poster)
print ("\n")
#add it to the list created above or return message
all_posters.append(this_poster)
with open('scraped_circusposters.json', 'w') as f:
f.write(json.dumps(all_posters, indent=4))
else:
print ("This page does not meet the element requirement of > 0 :", 'http://emuseum.ringling.org/emuseum/view/objects/asitem/16579/'+str(x))
continue
print (all_posters)