-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrequests_letras.py
123 lines (97 loc) · 3.78 KB
/
requests_letras.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import re
import sys
from collections import Counter

import requests
from bs4 import BeautifulSoup
from lxml import html
from unidecode import unidecode
def print_words(words, max_words=20):
    '''Print up to max_words entries, most frequent first.

    Args:
        words: dict mapping word -> occurrence count.
        max_words: maximum number of entries to print (default 20).
    '''
    ranked = sorted(words.items(), key=lambda item: item[1], reverse=True)
    # Slice instead of breaking on `i > max_words`: the original check
    # printed max_words + 1 entries (off-by-one).
    for word, count in ranked[:max_words]:
        print(f'{word}:\t\t{count}')
def make_request(url, timeout=10):
    '''Fetch a URL with browser-like headers.

    Args:
        url: absolute URL to GET.
        timeout: seconds to wait before aborting (new, defaults to 10 so
            existing callers keep working; the original had no timeout and
            could hang indefinitely).

    Returns:
        The requests.Response on HTTP 200, otherwise None (network error,
        timeout, or non-200 status).
    '''
    # Browser-like headers: the site blocks the default requests UA.
    headers = {
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'sec-ch-ua-mobile': '?0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36',
    }
    try:
        # Narrowed from bare `except Exception`: RequestException covers
        # all connection/timeout/protocol failures raised by requests.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        print(f'Error - {e.args}')
        return None
    if response.status_code != 200:
        return None
    return response
def get_songs_tags_lxml(response):
    '''Parse the artist page with lxml and return its song-link anchors.'''
    document = html.fromstring(response.text)
    return document.xpath('//a[@class="song-name"]')
def get_songs_tags_bs4(response):
    '''Parse the artist page with BeautifulSoup and return its song-link anchors.'''
    page = BeautifulSoup(response.text, 'lxml')
    return page.find_all('a', {'class': 'song-name'}, href=True)
def get_lyrics_lxml(response):
    '''Parse a song page with lxml and return the lyrics text fragments.'''
    document = html.fromstring(response.text)
    return document.xpath('//div[@class="cnt-letra p402_premium"]//text()')
def get_lyrics_bs4(response):
    '''Parse a song page with BeautifulSoup and return its verses.

    Returns:
        A list of verse strings (one per <p> in the lyrics container), or
        an empty list when the container is absent — the original raised
        AttributeError in that case (e.g. layout change or instrumental).
    '''
    soup = BeautifulSoup(response.text, 'lxml')
    lyrics_div = soup.find('div', {'class': 'cnt-letra p402_premium'})
    if lyrics_div is None:
        return []
    return [p.get_text(separator=' ') for p in lyrics_div.find_all('p')]
if __name__ == '__main__':
    use_lxml = True     # set which parser backend to use (lxml vs BeautifulSoup)
    max_songs = 10      # set amount of songs to consider
    artist = 'O grilo'  # set artist
    main_url = 'https://www.letras.mus.br'
    # letras.mus.br artist slugs are lowercase with hyphens for spaces
    clean_artist = artist.lower().replace(' ', '-')
    url = main_url + '/' + clean_artist
    # get artist page
    response = make_request(url)
    if response is None:  # PEP 8: compare to None with `is`, not `==`
        print('Failed')
        sys.exit(1)
    # parse artist page
    if use_lxml:
        songs_tags = get_songs_tags_lxml(response)
    else:
        songs_tags = get_songs_tags_bs4(response)
    songs_urls = [main_url + song.get('href') for song in songs_tags]
    # Counter replaces the hand-rolled `if word in dict` bookkeeping;
    # it is a dict subclass, so print_words() accepts it unchanged.
    count_words = Counter()
    num_songs = min(len(songs_urls), max_songs)
    # get each song
    for index, url in enumerate(songs_urls[:num_songs], 1):
        # URLs end with a trailing slash, so the slug is the second-to-last part
        song_name = url.split('/')[-2]
        print(f'[{index}/{num_songs}] - {song_name}')
        response = make_request(url)
        if response is None:
            print(f'{url} failed')
            continue
        # parse lyrics page
        if use_lxml:
            lyrics = get_lyrics_lxml(response)
        else:
            lyrics = get_lyrics_bs4(response)
        # normalize each word (lowercase, strip accents, drop punctuation)
        # and count occurrences
        for verse in lyrics:
            for raw_word in verse.split(' '):
                word = re.sub(r'\W+', '', unidecode(raw_word.lower()))
                if word:
                    count_words[word] += 1
    # show result
    print_words(count_words)