webscraper_inat.py
"""Webscraper iNaturalist"""
"""
Downloads n images for a given species. The user only needs to provide the species id in the url
and a target folder where the images should be downloaded.
Code inspired by:
https://medium.com/swlh/web-scraping-stock-images-using-google-selenium-and-python-8b825ba649b9
https://medium.com/geekculture/scraping-images-using-selenium-f35fab26b122
"""
# Import libraries
import time
import requests
import io
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.firefox import GeckoDriverManager
from PIL import Image
# Webdriver for Firefox downloaded with GeckoDriverManager. For other browsers, search for the specific webdriver service
# https://github.com/mozilla/geckodriver/releases
DRIVER_PATH = r'C:\Users\Teo\.wdm\drivers\geckodriver\win64\v0.30.0\geckodriver.exe'
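# First-run sketch (as noted in search_and_download below): let GeckoDriverManager download
# geckodriver once and point DRIVER_PATH at the returned path, e.g.
# DRIVER_PATH = GeckoDriverManager().install()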
def fetch_image_urls(query:str, max_links_to_fetch:int, wd:webdriver, sleep:int = 10):
"""Find and store the image urls.
:param query: Species ID to complete the url.
:type query: str
:param max_links_to_fetch: Maximum number of urls to download.
:type max_links_to_fetch: int
:param wd: Webdriver specific for your browser.
:type wd: selenium.webdriver
:param sleep: Number of seconds to wait until next iteration. Defaults to 10 seconds.
:type sleep: int, optional
:return: Set of tuples (urls, hrefID_listID)
"""
# Enable infinite scrolling
def scroll_to_end(wd):
wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(sleep)
    # Build the search query
    # Only load photos marked as Research Grade with a CC-BY-NC license
    page_num = 1  # starting page number; increase this to resume from a later results page
    if page_num == 1:
        search_url = f"https://www.inaturalist.org/observations?photo_license=CC-BY-NC&place_id=any&quality_grade=research&subview=table&taxon_id={query}"
    else:
        search_url = f"https://www.inaturalist.org/observations?page={page_num}&photo_license=CC-BY-NC&place_id=any&quality_grade=research&subview=table&taxon_id={query}"
wd.get(search_url)
time.sleep(sleep)
image_urls = set() # will contain tuples of urls along with the hrefID and listID within contribution: (url, hrefID_listID)
# Note: by storing the href and list IDs, each downloaded image can be retraced exactly on the site
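    # Illustrative entry (hypothetical observation ID 98765432, second photo of that observation):
    # ('https://static.inaturalist.org/photos/.../large.jpg', '98765432_2')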
image_count = 0
results_start = 0
reached_max = False
# Get total number of pages
try:
page_links = WebDriverWait(wd, 30).until(EC.presence_of_all_elements_located((By.XPATH, "//li[@class='pagination-page ng-scope']/a")))
num_pages = int( page_links[-1].get_attribute('text') )
    except Exception:
        num_pages = 1  # only one page, no links to further pages
# View all results on page
while image_count < max_links_to_fetch:
scroll_to_end(wd)
thumb = wd.find_elements(By.CSS_SELECTOR, 'a.img') # after each scrolling, the list increases
num_results = len(thumb)
# If the new thumbnail list is as long as before, you have reached the end of the page
# Load next page (in case there is one)
if num_results == results_start:
if page_num < num_pages:
page_num += 1
search_url = f"https://www.inaturalist.org/observations?page={page_num}&photo_license=CC-BY-NC&place_id=any&quality_grade=research&subview=table&taxon_id={query}"
print('\nLoading next page...\n')
wd.get(search_url)
results_start = 0 # the new thumbnail list on the next page will be scanned from 0 again
# The list of page links at the bottom can only show 10 pages at once.
# If there are more than 10 pages, the links to the new ones are only shown as you
# progress through the links. Therefore, whenever you turn a page, check
# whether there are actually more pages than initially visible.
try:
page_links = WebDriverWait(wd, 30).until(EC.presence_of_all_elements_located((By.XPATH, "//li[@class='pagination-page ng-scope']/a")))
num_pages = int( page_links[-1].get_attribute('text') )
continue
except Exception as e:
print(f"ERROR - Could not find page links on page {page_num}. Returning urls stored until now - {e}")
break
else:
print('No more images left!')
break
# Iterate over (new) images in current thumbnail list
for img in thumb[results_start : num_results]:
href_att = img.get_attribute('href')
            href_id = href_att[href_att.rfind('/') + 1 : ]  # keep only the observation ID after the last '/'
# If no image counter is shown, then there is only one image in the contribution
# -> Download its url directly from the table
# If there is a counter visible, the contribution has multiple images
# -> Click on it and get all available urls
if img.find_elements(By.CSS_SELECTOR, 'span.ng-hide'):
# URLs are stored as 'background-image' inside the style-attribute
style_att = img.get_attribute('style')
# Slice the string down to the url
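                # Assumed shape: 'background-image: url("https://.../square.jpg");'
                # find('url') + 5 skips past 'url("', find(')') - 1 stops just before the closing quote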
ind_start, ind_end = style_att.find('url') + 5, style_att.find(')') - 1
url = style_att[ind_start : ind_end]
image_urls.add( (url, str(href_id) + '_1') ) # for consistency, images in one-element-lists get listID = 1
else:
# Open new tab and get all available image urls
href_urls = get_urls_from_href(href_id, wd)
image_urls.update(href_urls) # Note: use update when adding elements of a set/list to a set
# Check if maximum number of images was reached
image_count = len(image_urls)
if image_count >= max_links_to_fetch:
reached_max = True
break
if not reached_max:
print(f"Found: {image_count} image links. Looking for more...\n")
# Move the result startpoint further down
results_start = num_results
print(f"Found: {image_count} image links, done!\n")
return image_urls
def get_urls_from_href(href_id:int, wd:webdriver, sleep:int = 5):
"""Find all image urls from a given observation page index.
:param href_id: Observation page index. Leads to the page containing all images of a single person's contribution.
:type href_id: int
:param wd: Webdriver specific for your browser.
:type wd: selenium.webdriver
:param sleep: Number of seconds to wait until next iteration. Defaults to 5 seconds.
:type sleep: int, optional
:return: Set of tuples (urls, hrefID_listID)
"""
search_url = f"https://www.inaturalist.org/observations/{href_id}"
wd.execute_script("window.open('" + search_url +"');") # open new tab
wd.switch_to.window(wd.window_handles[1]) # focus on the new tab
time.sleep(sleep)
image_urls = set()
img_list = wd.find_elements(By.XPATH, "//div[@class='image-gallery-thumbnail-inner']/img")
for i in range(len(img_list)):
src = img_list[i].get_attribute('src')
large_url = src.replace('square', 'large') # store the image in original dimensions, not thumbnail dims
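        # e.g. '.../photos/12345/square.jpg' -> '.../photos/12345/large.jpg' (illustrative path)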
image_urls.add( (large_url, str(href_id) + '_' + str(i+1)) )
wd.execute_script("window.close('" + search_url +"');") # close new tab
wd.switch_to.window(wd.window_handles[0]) # focus on original tab
return image_urls
def persist_image(folder_path:str, url_id:tuple):
"""Save image from url to a specified folder.
:param folder_path: Path to the folder where the images are saved.
:type folder_path: str
    :param url_id: Tuple of the image URL to download and its hrefID_listID.
    :type url_id: tuple
"""
url, img_id = url_id
try:
        # Download the raw image bytes
headers = {'User-agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0)'}
image_content = requests.get(url, headers=headers).content
    except Exception as e:
        print(f"ERROR - Could not download {url} - {e}")
        return
try:
image_file = io.BytesIO(image_content)
image = Image.open(image_file).convert('RGB')
        spec = os.path.basename(folder_path)  # species name is taken from the target folder name
file_path = os.path.join(folder_path, spec + '_' + img_id + '.jpg')
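        # Resulting name pattern: <species folder>_<hrefID>_<listID>.jpg,
        # e.g. Xylocopa_violacea_98765432_1.jpg (hypothetical observation ID)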
with open(file_path, 'wb') as f:
image.save(f, "JPEG", quality = 95)
print(f"SUCCESS - saved {url} - as {file_path}")
except Exception as e:
print(f"ERROR - Could not save {url} - {e}")
def search_and_download(search_term:str, target_path = './', number_images = 10):
"""Launch query, store urls and download images.
    :param search_term: Species (taxon) ID used as the query in the URL.
:type search_term: str
:param target_path: Path to the folder where the images should be downloaded. Defaults to current folder.
:type target_path: str, optional
:param number_images: Number of images to be downloaded. Defaults to 10.
:type number_images: int, optional
"""
    # Create the download folder if it does not exist yet
if not os.path.exists(target_path):
os.makedirs(target_path)
# Store image urls
# Note: On the first run, install driver with GeckoDriverManager().install() instead of DRIVER_PATH
with webdriver.Firefox(service = Service(DRIVER_PATH)) as wd:
res = fetch_image_urls(search_term, number_images, wd = wd)
# Download images
for elem in res:
persist_image(target_path, elem)
if __name__ == '__main__':
# Test server: a browser window should open and close immediately
# from selenium.webdriver.firefox.service import Service
# service = Service(DRIVER_PATH)
# service.start()
# wd = webdriver.Remote(service.service_url)
# wd.quit()
ind_spec = [
#(60579, 'Andrena_fulva'),
#(62453, 'Anthidium_manicatum'),
#(453068, 'Bombus_cryptarum'),
#(121989, 'Bombus_hortorum'),
#(61803, 'Bombus_hypnorum'),
#(57619, 'Bombus_lapidarius'),
#(61856, 'Bombus_lucorum'),
#(424468, 'Bombus_magnus'),
#(55637, 'Bombus_pascuorum'),
#(124910, 'Bombus_pratorum'),
#(123657, 'Bombus_sylvarum'),
#(746682, 'Dasypoda_hirtipes'),
#(415589, 'Halictus_scabiosae'),
#(207574, 'Osmia_bicolor'),
#(876599, 'Osmia_bicornis'),
#(126630, 'Osmia_cornuta'),
#(154661, 'Sphecodes_albilabris'),
(124145, 'Xylocopa_violacea')
]
for ind, spec in ind_spec:
print('\n******** ' + spec + ' ********\n')
search_and_download(search_term = str(ind),
                            target_path = os.path.join(r'Z:\data\Bees', spec),
                            # target_path = os.path.join(r'C:\Users\Teo\Documents\KInsekten\data\Bees', spec),
                            number_images = 5000)