From 3e5e911ec3362bb9077fb055571195254805db36 Mon Sep 17 00:00:00 2001 From: Khaled ElMorshedy Date: Mon, 3 Jan 2022 00:04:05 +0200 Subject: [PATCH] [Re-Fixed] Single albums return relative urls. --- azapi/__init__.py | 12 ++-- azapi/azapi.py | 116 ++++++++++++++++++----------------- azapi/jaro.py | 35 ++++++----- azapi/requester.py | 29 +++------ azapi/tools.py | 150 ++++++++++++++++++++++----------------------- 5 files changed, 168 insertions(+), 174 deletions(-) diff --git a/azapi/__init__.py b/azapi/__init__.py index 9bdce75..ed76014 100644 --- a/azapi/__init__.py +++ b/azapi/__init__.py @@ -1,8 +1,8 @@ from .azapi import AZlyrics -__name__ = "azapi.azapi.AZlyrics" -__author__ = "Khaled H. El-Morshedy" -__url__ = "https://github.com/elmoiv/azapi" -__description__ = "Get Lyrics from AZLyrics.com like a Boss ~(0_0)~" -__license__ = "GPL-v3.0" -__version__ = "3.0.6" +__name__ = 'azapi.azapi.AZlyrics' +__author__ = 'Khaled H. El-Morshedy' +__url__ = 'https://github.com/elmoiv/azapi' +__description__ = 'Get Lyrics from AZLyrics.com like a Boss ~(0_0)~' +__license__ = 'GPL-v3.0' +__version__ = '3.0.7' diff --git a/azapi/azapi.py b/azapi/azapi.py index 092a0c4..ff623a9 100644 --- a/azapi/azapi.py +++ b/azapi/azapi.py @@ -1,10 +1,6 @@ -import os -import time - from .requester import Requester from .tools import * - class AZlyrics(Requester): """ Fast and Secure API for AZLyrics.com @@ -17,33 +13,32 @@ class AZlyrics(Requester): accuracy (float): used to determine accuracy via jaro algorithm proxies (dict): if you want to use proxy while connecting to AZLyrics.com """ - - def __init__(self, search_engine="", accuracy=0.6, proxies={}, title="", artist=""): - self.title = title - self.artist = artist + + def __init__(self, search_engine='', accuracy=0.6, proxies={}): + self.title = '' + self.artist = '' self.search_engine = search_engine - + self.accuracy = accuracy - if not 0 < accuracy <= 1: self.accuracy = 0.6 - + self.proxies = proxies self.lyrics_history = [] - self.lyrics = "" + self.lyrics = '' self.songs = {} - def getLyrics(self, url=None, ext="txt", save=False, path="", sleep=3): + def getLyrics(self, url=None, ext='txt', save=False, path='', sleep=3): """ Reterive Lyrics for a given song details - - Parameters: - url (str): url of the song's Azlyrics page. + + Parameters: + url (str): url of the song's Azlyrics page. ext (str): extension of the lyrics saved file, default is ".txt". save (bool): allow to or not to save lyrics in a file. - sleep (float): cooldown before next request. - + sleep (float): cooldown before next request. + Returns: lyrics (str): Lyrics of the detected song """ @@ -60,32 +55,35 @@ def getLyrics(self, url=None, ext="txt", save=False, path="", sleep=3): if self.search_engine: # If user can't remember the artist, # he can search by title only - + # Get AZlyrics url via Google Search link = googleGet( - self.search_engine, - self.accuracy, - self.get, - self.artist, - self.title, - 0, - self.proxies, - ) + self.search_engine, + self.accuracy, + self.get, + self.artist, + self.title, + 0, + self.proxies + ) if not link: return 0 else: # Sometimes search engines block you # If happened use the normal get method - link = normalGet(self.artist, self.title, 0) + link = normalGet( + self.artist, + self.title, + 0) page = self.get(link, self.proxies) if page.status_code != 200: - print("Error", page.status_code) + print('Error 404!') return 1 # Getting Basic metadata from azlyrics - metadata = [elm.text for elm in htmlFindAll(page)("b")] - + metadata = [elm.text for elm in htmlFindAll(page)('b')] + # v3.0.4: Update title and artist attributes with exact names self.artist = filtr(metadata[0][:-7], True) self.title = filtr(metadata[1][1:-1], True) @@ -98,18 +96,22 @@ def getLyrics(self, url=None, ext="txt", save=False, path="", sleep=3): if save: # v3.0.2: Adding custom path p = os.path.join( - path, - "{} - {}.{}".format(self.title.title(), self.artist.title(), ext), - ) - - with open(p, "w", encoding="utf-8") as f: + path, + '{} - {}.{}'.format( + self.title.title(), + self.artist.title(), + ext + ) + ) + + with open(p, 'w', encoding='utf-8') as f: f.write(lyrics.strip()) - + # Store lyrics for later usage self.lyrics_history.append(self.lyrics) return self.lyrics - self.lyrics = "No lyrics found :(" + self.lyrics = 'No lyrics found :(' return 2 def getSongs(self, sleep=3): @@ -117,39 +119,41 @@ def getSongs(self, sleep=3): Reterive a dictionary of songs with their links Parameters: - sleep (float): cooldown before next request. - + sleep (float): cooldown before next request. + Returns: dict: dictionary of songs with their links """ if not self.artist: raise Exception("Artist can't be empty!") - + # Best cooldown is 5 sec time.sleep(sleep) - + if self.search_engine: link = googleGet( - self.search_engine, - self.accuracy, - self.get, - self.artist, - "", - 1, - self.proxies, - ) + self.search_engine, + self.accuracy, + self.get, + self.artist, + '', + 1, + self.proxies + ) if not link: return {} else: - link = normalGet(self.artist, "", 1) - + link = normalGet( + self.artist, + '', + 1) + albums_page = self.get(link, self.proxies) if albums_page.status_code != 200: - print("Error", albums_page.status_code) + print('Error 404!') return {} - + # Store songs for later usage self.songs = parseSongs(albums_page) - - return self.songs + return self.songs \ No newline at end of file diff --git a/azapi/jaro.py b/azapi/jaro.py index 6824413..43cf170 100644 --- a/azapi/jaro.py +++ b/azapi/jaro.py @@ -2,38 +2,39 @@ # Source: https://www.geeksforgeeks.org/jaro-and-jaro-winkler-similarity/ # This code is contributed by mohit kumar 29 (GeeksforGeeks.com) from math import floor - - -def jaro_distance(s1, s2): - if s1 == s2: + +def jaro_distance(s1, s2): + if (s1 == s2): return 1.0 - + len1, len2 = len(s1), len(s2) max_dist = floor(max(len1, len2) / 2) - 1 match = 0 hash_s1, hash_s2 = [0] * len(s1), [0] * len(s2) - + for i in range(len1): - for j in range(max(0, i - max_dist), min(len2, i + max_dist + 1)): - if s1[i] == s2[j] and hash_s2[j] == 0: + for j in range(max(0, i - max_dist), + min(len2, i + max_dist + 1)): + if (s1[i] == s2[j] and hash_s2[j] == 0): hash_s1[i], hash_s2[j] = 1, 1 match += 1 break - if match == 0: + if (match == 0): return 0.0 t = 0 point = 0 - - for i in range(len1): - if hash_s1[i]: - while hash_s2[point] == 0: + + for i in range(len1): + if (hash_s1[i]): + while (hash_s2[point] == 0): point += 1 - - if s1[i] != s2[point]: + + if (s1[i] != s2[point]): point += 1 t += 1 - t = t // 2 + t = t//2 - return (match / len1 + match / len2 + (match - t + 1) / match) / 3.0 + return (match/ len1 + match / len2 + + (match - t + 1) / match)/ 3.0 \ No newline at end of file diff --git a/azapi/requester.py b/azapi/requester.py index 558aec5..f366c24 100644 --- a/azapi/requester.py +++ b/azapi/requester.py @@ -1,8 +1,6 @@ -import random +import requests, random -import requests - -userAgents = """Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36 +userAgents = '''Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36 Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36 Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.11 Safari/535.19 Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11 @@ -41,23 +39,14 @@ Mozilla/5.0 (Macintosh; U; Mac OS X 10_5_7; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/ Safari/530.5 Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US) AppleWebKit/530.9 (KHTML, like Gecko) Chrome/ Safari/530.9 Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US) AppleWebKit/530.6 (KHTML, like Gecko) Chrome/ Safari/530.6 -Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/ Safari/530.5""" - - -class Requester: - USER_AGENTS = userAgents.split("\n") +Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/ Safari/530.5''' +class Requester(): + USER_AGENTS = userAgents.split('\n') + # Inspired from: https://github.com/brianchesley/Lyrics/blob/master/lyrics_data_scrape.py def get(self, url, _proxies={}): - return requests.get( - url, - headers={"User-Agent": random.choice(self.USER_AGENTS)}, - proxies=_proxies, - ) - + return requests.get(url, headers={'User-Agent': random.choice(self.USER_AGENTS)}, proxies=_proxies) + def head(self, url, _proxies={}): - return requests.head( - url, - headers={"User-Agent": random.choice(self.USER_AGENTS)}, - proxies=_proxies, - ) + return requests.head(url, headers={'User-Agent': random.choice(self.USER_AGENTS)}, proxies=_proxies) \ No newline at end of file diff --git a/azapi/tools.py b/azapi/tools.py index cf25b31..23d49fe 100644 --- a/azapi/tools.py +++ b/azapi/tools.py @@ -1,148 +1,148 @@ -import re +import bs4, re, time, os from urllib.parse import quote - -import bs4 - from .jaro import jaro_distance -letters = "abcdefghijklmnopqrstuvwxyz0123456789" - +letters = 'abcdefghijklmnopqrstuvwxyz0123456789' def htmlFind(page): # v3.0 # Changed page.text -> page.content.decode() to support variant unicodes - soup = bs4.BeautifulSoup(page.content.decode(), "html.parser") + soup = bs4.BeautifulSoup( + page.content.decode(), + "html.parser" + ) return soup.find - def htmlFindAll(page): # v3.0 # Changed page.text -> page.content.decode() to support variant unicodes - soup = bs4.BeautifulSoup(page.content.decode(), "html.parser") + soup = bs4.BeautifulSoup( + page.content.decode(), + "html.parser" + ) return soup.findAll - def filtr(inpt, isFile=False): - return ( - "".join(i for i in inpt if i not in r'<>:"/\|?*') - if isFile - else "".join(i.lower() for i in inpt if i.lower() in letters) - ) - + if isFile: + return ''.join(i for i in inpt if i not in r'<>:"/\|?*') + return ''.join(i.lower() for i in inpt if i.lower() in letters) -def normalGet(artist="", title="", _type=0): +def normalGet(artist='', title='', _type=0): art, tit = filtr(artist), filtr(title) - return ( - "https://www.azlyrics.com/{}/{}.html".format(art[0], art) - if type - else "https://www.azlyrics.com/lyrics/{}/{}.html".format(art, tit) - ) + if _type: + print('https://www.azlyrics.com/{}/{}.html'.format(art[0], art)) + return 'https://www.azlyrics.com/{}/{}.html'.format(art[0], art) + return 'https://www.azlyrics.com/lyrics/{}/{}.html'.format(art, tit) - -def googleGet(srch_eng, acc, get_func, artist="", title="", _type=0, proxies={}): +def googleGet(srch_eng, acc, get_func, artist='', title='', _type=0, proxies={}): # Encode artist and title to avoid url encoding errors - data = artist + " " * (title != "" and artist != "") + title - encoded_data = quote(data.replace(" ", "+")) + data = artist + ' ' * (title != '' and artist != '') + title + encoded_data = quote(data.replace(' ', '+')) # Perform a search (for accuracy) [Custom search engine] search_engines = { - "google": "https://www.google.com/search?q=", - "duckduckgo": "https://duckduckgo.com/html/?q=", + 'google': 'https://www.google.com/search?q=', + 'duckduckgo': 'https://duckduckgo.com/html/?q=' } - slctd_srch_engn = "google" + slctd_srch_engn = 'google' if srch_eng in search_engines: slctd_srch_engn = srch_eng - google_page = get_func( - "{}{}+site%3Aazlyrics.com".format( - search_engines[slctd_srch_engn], encoded_data - ), - proxies, - ) - + google_page = get_func('{}{}+site%3Aazlyrics.com'.format( + search_engines[slctd_srch_engn], + encoded_data + ), + proxies + ) + # Choose between lyrics or song according to function used regex = [ - r"(azlyrics\.com\/lyrics\/(\w+)\/(\w+).html)", - r"(azlyrics\.com\/[a-z0-9]+\/(\w+).html)", + r'(azlyrics\.com\/lyrics\/(\w+)\/(\w+).html)', + r'(azlyrics\.com\/[a-z0-9]+\/(\w+).html)' ] - + # ex result: [('azlyrics.com/t/taylorswift.html', 'taylorswift')] # result[0][0] = 'azlyrics.com/t/taylorswift.html' - results = re.findall(regex[_type], google_page.text) + results = re.findall( + regex[_type], + google_page.text + ) if len(results): # calculate jaro similarity for artist and title jaro_artist = 1.0 jaro_title = 1.0 - + if artist: - jaro_artist = jaro_distance(artist.replace(" ", ""), results[0][1]) + jaro_artist = jaro_distance( + artist.replace(' ', ''), + results[0][1] + ) if title: - jaro_title = jaro_distance(title.replace(" ", ""), results[0][2]) - + jaro_title = jaro_distance( + title.replace(' ', ''), + results[0][2] + ) + if jaro_artist >= acc and jaro_title >= acc: - return "https://www." + results[0][0] + return 'https://www.' + results[0][0] else: - print("Similarity <", acc) + print('Similarity <', acc) else: - print(srch_eng.title(), "found nothing!") - + print(srch_eng.title(), 'found nothing!') + return 0 - # v3.0.5: Re-coded ParseLyrics to be more efficient def parseLyric(page): - divs = [i.text for i in htmlFindAll(page)("div", {"class": None})] + divs = [i.text for i in htmlFindAll(page)('div', {'class': None})] return max(divs, key=len) - def parseSongs(page): songs = {} - Parent = htmlFind(page)("div", {"id": "listAlbum"}) + Parent = htmlFind(page)('div', {'id':'listAlbum'}) if Parent: Raw_Data = Parent.findChildren() - curType, curName, curYear = "", "", "" + curType, curName, curYear = '', '', '' for elmnt in Raw_Data: # v3.0.3: Removed break after script due to google ads inside listAlbum # is using script tag, which results in not all songs retrieved - # if elmnt.name == 'script': + #if elmnt.name == 'script': # break - + # album info are inside divs - if elmnt.name == "div": - if elmnt.text == "other songs:": - curType, curName, curYear = "Others", "", "" + if elmnt.name == 'div': + if elmnt.text == 'other songs:': + curType, curName, curYear = 'Others', '', '' else: # Separating to (album, name, year) rgx = re.findall(r'(.*):\s"(.*)"\s\(([0-9]+)\)', elmnt.text) if rgx: curType, curName, curYear = rgx[0] - if elmnt.name == "a": + if elmnt.name == 'a': songs[elmnt.text] = { - "year": curYear, - "album": curName, - "type": curType, + 'year': curYear, + 'album': curName, + 'type': curType, # Azlyrics puts hrefs with/without base url - "url": "http://www.azlyrics.com" + elmnt["href"][2:] - if elmnt["href"][:2] == ".." - else elmnt["href"], + 'url': 'http://www.azlyrics.com' + elmnt['href'].strip('.') \ + if elmnt['href'].startswith('/lyrics/') else elmnt['href'] } # v 3.0 # Some artists have no albums, so we cover this else: - for div in htmlFindAll(page)("div", {"class": "listalbum-item"}): - a = div.find("a") + for div in htmlFindAll(page)('div', {'class':'listalbum-item'}): + a = div.find('a') songs[a.text] = { - "year": "", - "album": "", - "type": "", + 'year': '', + 'album': '', + 'type': '', # v3.0.1: fix relative urls -> absolute url - "url": "http://www.azlyrics.com" + a["href"][2:] - if a["href"][:2] == ".." - else a["href"], - } - return songs + 'url': 'http://www.azlyrics.com' + a['href'][2:] \ + if a['href'][:2] == '..' else a['href'] + } + return songs \ No newline at end of file