-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfind_beautifulsoup.py
113 lines (95 loc) · 2.34 KB
/
find_beautifulsoup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
from bs4 import BeautifulSoup
from sys import exit, argv
import requests
def parse_people(url):
    """Scrape an IMDb person page and return their 'known for' credits.

    Parameters:
        url: absolute URL of an IMDb name page
            (e.g. https://imdb.com/name/nm0000001/).

    Returns:
        A list of dicts, one per 'known for' credit, each with keys
        'title', 'role', and 'year' (values are the scraped strings,
        possibly None when the tag has nested markup).

    Raises:
        ValueError: if the page has no #knownfor section, or the scraped
            title/role/year lists disagree in length.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Popularity rank. IMDb renders the literal text 'SEE RANK' for people
    # outside the top 5000; the element may also be absent entirely.
    rank_tag = soup.find(id='meterRank')
    if rank_tag is None or rank_tag.string == 'SEE RANK':
        rank = 'Below Rank 5000'
    else:
        rank = rank_tag.string
    print('MeterRank: {}'.format(rank))

    known_for = soup.find(id='knownfor')
    if known_for is None:
        # Original code would die with AttributeError here; fail explicitly.
        raise ValueError('page has no #knownfor section: {}'.format(url))

    # The span.knownfor-ellipsis elements alternate title, year, title, year,
    # ... while the anchors carry the role text — assumed from the scraping
    # pattern below; confirm against current IMDb markup.
    spans = known_for.find_all('span', class_='knownfor-ellipsis')
    anchors = known_for.find_all('a', class_='knownfor-ellipsis')

    list_titles = [tag.string for tag in spans[0::2]]
    list_years = [tag.string for tag in spans[1::2]]
    list_roles = [tag.string for tag in anchors]

    print('{} // {} // {}'.format(
        list_titles,
        list_roles,
        list_years))

    # Validate with an exception, not `assert` — asserts vanish under -O.
    if not (len(list_titles) == len(list_roles) == len(list_years)):
        raise ValueError('mismatched knownfor lists for {}'.format(url))

    return [
        {'title': title, 'role': role, 'year': year}
        for title, role, year in zip(list_titles, list_roles, list_years)
    ]
def main(mode='default', day='11', month='12', year='1996'):
    """Scrape IMDb for people born on a given date and their known-for credits.

    Parameters:
        mode: unused; retained only for backward compatibility with callers.
        day, month, year: birth-date components as strings, interpolated
            into the IMDb name-search URL.

    Returns:
        A list of dicts with keys 'name', 'url' (the relative href of the
        person's page), and 'information' (the list from parse_people).
    """
    url = ('https://www.imdb.com/search/name/'
           '?birth_monthday={}-{}&birth_year={}').format(month, day, year)

    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Each search result's name link lives inside an <h3> element.
    headings = soup.find_all('h3')
    print(headings)
    print('Found {} elements'.format(len(headings)))

    results = []
    for index, heading in enumerate(headings):
        try:
            link = heading.find('a')
            href = link.get('href')
            name = link.string.rstrip()
            parsed_info = parse_people('{}{}'.format(
                'https://imdb.com',
                href))
            print(href)
            print(name)
            results.append({
                'name': name,
                'url': href,
                'information': parsed_info,
            })
        except AttributeError:
            # <h3> elements without an <a> (section headers etc.) are skipped.
            print('Element {} was empty: {}'.format(
                index + 1,
                heading))

    print('Objects: {}'.format(results))
    return results
if __name__ == '__main__':
    # Script entry point: run the scraper with its default birth date.
    main()