diff --git a/app/scrapers/ask.py b/app/scrapers/ask.py
index cab27029..bef4e771 100644
--- a/app/scrapers/ask.py
+++ b/app/scrapers/ask.py
@@ -9,6 +9,7 @@ def __init__(self):
         self.url = 'http://ask.com/web'
         self.defaultStart = 1
         self.startKey = 'page'
+        self.name = 'ask'
 
     def next_start(self, current_start, prev_results):
         return current_start + 1
diff --git a/app/scrapers/baidu.py b/app/scrapers/baidu.py
index 6837be31..77e13991 100644
--- a/app/scrapers/baidu.py
+++ b/app/scrapers/baidu.py
@@ -11,6 +11,7 @@ def __init__(self):
         self.defaultStart = 0
         self.queryKey = 'wd'
         self.startKey = 'pn'
+        self.name = 'baidu'
 
     def parse_response(self, soup):
         """ Parse the response and return set of urls
diff --git a/app/scrapers/bing.py b/app/scrapers/bing.py
index e601fb82..0d79e960 100644
--- a/app/scrapers/bing.py
+++ b/app/scrapers/bing.py
@@ -10,6 +10,7 @@ def __init__(self):
         self.url = 'http://www.bing.com/search'
         self.defaultStart = 1
         self.startKey = 'first'
+        self.name = 'bing'
 
     def parse_response(self, soup):
         """ Parses the reponse and return set of urls
diff --git a/app/scrapers/dailymotion.py b/app/scrapers/dailymotion.py
index c4b02ced..ade27ec4 100644
--- a/app/scrapers/dailymotion.py
+++ b/app/scrapers/dailymotion.py
@@ -12,6 +12,7 @@ def __init__(self):
         self.queryKey = 'search'
         self.startKey = 'page'
         self.defaultStart = 1
+        self.name = 'dailymotion'
 
     def parse_response(self, soup):
         """ Parse the response and return set of urls
diff --git a/app/scrapers/duckduckgo.py b/app/scrapers/duckduckgo.py
index 300ffdae..08a6b94d 100644
--- a/app/scrapers/duckduckgo.py
+++ b/app/scrapers/duckduckgo.py
@@ -10,6 +10,7 @@ def __init__(self):
         self.url = 'https://duckduckgo.com/html'
         self.defaultStart = 0
         self.startKey = 's'
+        self.name = 'duckduckgo'
 
     def parse_response(self, soup):
         """ Parse the response and return set of urls
diff --git a/app/scrapers/exalead.py b/app/scrapers/exalead.py
index bf0e73ef..0350ee26 100644
--- a/app/scrapers/exalead.py
+++ b/app/scrapers/exalead.py
@@ -10,6 +10,7 @@ def __init__(self):
         self.url = 'https://www.exalead.com/search/web/results/'
         self.defaultStart = 0
         self.startKey = 'start_index'
+        self.name = 'exalead'
 
     def parse_response(self, soup):
         """ Parse the response and return set of urls
diff --git a/app/scrapers/generalized.py b/app/scrapers/generalized.py
index 7e354023..e87df5ff 100644
--- a/app/scrapers/generalized.py
+++ b/app/scrapers/generalized.py
@@ -25,15 +25,24 @@ def get_page(self, query, startIndex=0, qtype=''):
         """ Fetch the google search results page
        Returns : Results Page
        """
+        url = self.url
+        if qtype == 'vid':
+            if self.name in ['yahoo']:
+                url = self.videoURL
+            else:
+                url = self.url
         payload = {self.queryKey: query, self.startKey: startIndex,
                    self.qtype: qtype}
-        response = requests.get(self.url, headers=self.headers, params=payload)
+        response = requests.get(url, headers=self.headers, params=payload)
         print(response.url)
         return response
 
     def parse_response(self, soup):
         raise NotImplementedError
 
+    def parse_video_response(self, soup):
+        raise NotImplementedError
+
     def next_start(self, current_start, prev_results):
         return current_start + len(prev_results)
 
@@ -48,7 +57,13 @@ def search(self, query, num_results, qtype=''):
         while (len(urls) < num_results):
             response = self.get_page(query, current_start, qtype)
             soup = BeautifulSoup(response.text, 'html.parser')
-            new_results = self.parse_response(soup)
+            if qtype == 'vid':
+                if self.name in ['yahoo']:
+                    new_results = self.parse_video_response(soup)
+                else:
+                    new_results = self.parse_response(soup)
+            else:
+                new_results = self.parse_response(soup)
             if new_results is None:
                 break
             urls.extend(new_results)
diff --git a/app/scrapers/google.py b/app/scrapers/google.py
index b8ebd79b..fba9366c 100644
--- a/app/scrapers/google.py
+++ b/app/scrapers/google.py
@@ -11,6 +11,7 @@ def __init__(self):
         self.defaultStart = 0
         self.startKey = 'start'
         self.qtype = 'tbm'
+        self.name = 'google'
 
     def next_start(self, current_start, prev_results):
         return current_start + len(prev_results)
diff --git a/app/scrapers/mojeek.py b/app/scrapers/mojeek.py
index 15d1b39b..652d8f19 100644
--- a/app/scrapers/mojeek.py
+++ b/app/scrapers/mojeek.py
@@ -10,6 +10,7 @@ def __init__(self):
         self.url = 'https://www.mojeek.co.uk/search'
         self.defaultStart = 1
         self.startKey = 's'
+        self.name = 'mojeek'
 
     def parse_response(self, soup):
         """ Parse the response and return set of urls
diff --git a/app/scrapers/parsijoo.py b/app/scrapers/parsijoo.py
index f1ed3fcb..a1134bd9 100644
--- a/app/scrapers/parsijoo.py
+++ b/app/scrapers/parsijoo.py
@@ -10,6 +10,7 @@ def __init__(self):
         self.url = 'https://parsijoo.ir/web'
         self.defaultStart = 0
         self.startKey = 'co'
+        self.name = 'parsijoo'
 
     def parse_response(self, soup):
         """ Parse the response and return set of urls
diff --git a/app/scrapers/quora.py b/app/scrapers/quora.py
index edb5df78..dda68fb4 100644
--- a/app/scrapers/quora.py
+++ b/app/scrapers/quora.py
@@ -8,6 +8,7 @@ class Quora(Scraper):
     def __init__(self):
         Scraper.__init__(self)
         self.url = 'https://www.quora.com/search'
+        self.name = 'quora'
 
     def parse_response(self, soup):
         """ Parse the response and return set of urls
diff --git a/app/scrapers/yahoo.py b/app/scrapers/yahoo.py
index 1b280cdb..9c8d71c8 100644
--- a/app/scrapers/yahoo.py
+++ b/app/scrapers/yahoo.py
@@ -13,8 +13,10 @@ class Yahoo(Scraper):
     def __init__(self):
         Scraper.__init__(self)
         self.url = 'https://search.yahoo.com/search'
+        self.videoURL = 'https://video.search.yahoo.com/search/video'
         self.defaultStart = 1
         self.startKey = 'b'
+        self.name = 'yahoo'
 
     def parse_response(self, soup):
         """ Parse response and returns the urls
@@ -38,3 +40,24 @@ def parse_response(self, soup):
         print('Yahoo parsed: ' + str(urls))
 
         return urls
+
+    def parse_video_response(self, soup):
+        """ Parse response and returns the urls
+
+        Returns: urls (list)
+                [[Title1, url1], [Title2, url2], ...]
+        """
+        urls = []
+        for h in soup.findAll('li', attrs={'class': 'vr vres'}):
+            t = h.find('a', attrs={'class': 'ng'})
+            r = t.get('data-rurl')
+            titleDiv = t.find('div', attrs={'class': 'v-meta bx-bb'})
+            title = titleDiv.find('h3').getText()
+            urls.append({
+                'title': title,
+                'link': r
+            })
+
+        print('Yahoo parsed: ' + str(urls))
+
+        return urls
diff --git a/app/scrapers/youtube.py b/app/scrapers/youtube.py
index 86a1726a..70393d69 100644
--- a/app/scrapers/youtube.py
+++ b/app/scrapers/youtube.py
@@ -9,6 +9,7 @@ def __init__(self):
         Scraper.__init__(self)
         self.url = 'https://www.youtube.com/results'
         self.queryKey = 'search_query'
+        self.name = 'youtube'
 
     def parse_response(self, soup):
         """ Parse the response and return list of urls
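A rough usage sketch of the new video path, based only on the classes shown in this diff (the import path `app.scrapers.yahoo` and a reachable, unchanged video.search.yahoo.com results page are assumptions, not something this PR guarantees): passing `qtype='vid'` makes `get_page()` switch to `self.videoURL` for scrapers in the video branch (currently only `yahoo`) and makes `search()` dispatch to `parse_video_response()`, which builds a list of `{'title': ..., 'link': ...}` dicts.

```python
# Sketch only: assumes the package layout from this diff (app/scrapers/)
# and that Yahoo's live video-search markup still matches the selectors
# used in parse_video_response().
from app.scrapers.yahoo import Yahoo

scraper = Yahoo()
# qtype='vid' routes get_page() to scraper.videoURL and search() to
# parse_video_response() instead of parse_response().
videos = scraper.search('fossasia', 5, qtype='vid')
for video in videos:
    print(video['title'], '->', video['link'])
```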