-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBusImageScraper.py
78 lines (59 loc) · 1.98 KB
/
BusImageScraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from bs4 import BeautifulSoup
import requests
import re
import shutil
def remove_newlines(data_arr):
new_data = ""
for data in data_arr:
if data != "":
new_data += data
return new_data
def get_image(url, i):
url = "https:" + url
print(url)
print("Downloading image...")
r = requests.get(url, stream=True) #Get request on full_urlif r.status_code == 200: #200 status code = OK
with open(f"imgs/img{i}.jpg", 'wb') as f:
r.raw.decode_content = True
shutil.copyfileobj(r.raw, f)
return
def gallery_request(url):
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
dds = soup.find_all("dd", {"class": "col-9"})
if len(dds) == 0:
return False, None, None
service = ""
for dd in dds:
div = dd.div
if div is not None:
div_text = div.text.split("\n")
if re.match("^Serviço", div_text[1]):
service = div_text[2]
if service != "":
img_url = soup.find_all("img", {"id": "photo"})[0]["src"]
return True, service, img_url
return False, None, None
i = 0
csv = open("bus_data.csv", "a")
url = "https://onibusbrasil.com/valtermendonca/page/"
idoridx = 1
while True:
print(f"Acessando imagens de {url + str(idoridx)}")
r = requests.get(url + str(idoridx))
if (r.status_code == 404):
print(f"{url + str(idoridx)} [404]")
break
soup = BeautifulSoup(r.text, "html.parser")
galleries = soup.find_all("div", {"class": "col-6 col-sm-4 col-md-3 col-xl-2 text-center mb-2"})
for gallery in galleries:
gallery_url = gallery.a["href"]
print("Indo para:", gallery_url)
hasService, service, img_url = gallery_request(gallery_url)
if not hasService:
continue
get_image(img_url, i)
csv.write(f"{i},{service}\n")
i += 1
print(f"Número de Imagens Baixadas: {i + 1}")
idoridx += 1