Addresses #320 Add video search support for Yahoo (#446)
Removed unnecessary line from google scraper

Fixed Codacy and Travis errors

Fixed last Travis error

Removed unnecessary debug print statement

Squashed the commits
Remorax authored and bhaveshAn committed Jan 19, 2018
1 parent 8d319e7 commit 24f17b1
Showing 13 changed files with 51 additions and 2 deletions.
1 change: 1 addition & 0 deletions app/scrapers/ask.py
@@ -9,6 +9,7 @@ def __init__(self):
self.url = 'http://ask.com/web'
self.defaultStart = 1
self.startKey = 'page'
self.name = 'ask'

def next_start(self, current_start, prev_results):
return current_start + 1
1 change: 1 addition & 0 deletions app/scrapers/baidu.py
@@ -11,6 +11,7 @@ def __init__(self):
self.defaultStart = 0
self.queryKey = 'wd'
self.startKey = 'pn'
self.name = 'baidu'

def parse_response(self, soup):
""" Parse the response and return set of urls
1 change: 1 addition & 0 deletions app/scrapers/bing.py
@@ -10,6 +10,7 @@ def __init__(self):
self.url = 'http://www.bing.com/search'
self.defaultStart = 1
self.startKey = 'first'
self.name = 'bing'

def parse_response(self, soup):
""" Parses the reponse and return set of urls
1 change: 1 addition & 0 deletions app/scrapers/dailymotion.py
@@ -12,6 +12,7 @@ def __init__(self):
self.queryKey = 'search'
self.startKey = 'page'
self.defaultStart = 1
self.name = 'dailymotion'

def parse_response(self, soup):
""" Parse the response and return set of urls
1 change: 1 addition & 0 deletions app/scrapers/duckduckgo.py
@@ -10,6 +10,7 @@ def __init__(self):
self.url = 'https://duckduckgo.com/html'
self.defaultStart = 0
self.startKey = 's'
self.name = 'duckduckgo'

def parse_response(self, soup):
""" Parse the response and return set of urls
1 change: 1 addition & 0 deletions app/scrapers/exalead.py
@@ -10,6 +10,7 @@ def __init__(self):
self.url = 'https://www.exalead.com/search/web/results/'
self.defaultStart = 0
self.startKey = 'start_index'
self.name = 'exalead'

def parse_response(self, soup):
""" Parse the response and return set of urls
19 changes: 17 additions & 2 deletions app/scrapers/generalized.py
@@ -25,15 +25,24 @@ def get_page(self, query, startIndex=0, qtype=''):
""" Fetch the google search results page
Returns : Results Page
"""
url = self.url
if qtype == 'vid':
if self.name in ['yahoo']:
url = self.videoURL
else:
url = self.url
payload = {self.queryKey: query, self.startKey: startIndex,
self.qtype: qtype}
response = requests.get(self.url, headers=self.headers, params=payload)
response = requests.get(url, headers=self.headers, params=payload)
print(response.url)
return response

def parse_response(self, soup):
raise NotImplementedError

def parse_video_response(self, soup):
raise NotImplementedError

def next_start(self, current_start, prev_results):
return current_start + len(prev_results)

@@ -48,7 +57,13 @@ def search(self, query, num_results, qtype=''):
while (len(urls) < num_results):
response = self.get_page(query, current_start, qtype)
soup = BeautifulSoup(response.text, 'html.parser')
new_results = self.parse_response(soup)
if qtype == 'vid':
if self.name in ['yahoo']:
new_results = self.parse_video_response(soup)
else:
new_results = self.parse_response(soup)
else:
new_results = self.parse_response(soup)
if new_results is None:
break
urls.extend(new_results)
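
A rough usage sketch of the new dispatch (assuming the package is importable as app.scrapers from the repository root; the query string and result count below are arbitrary): passing qtype='vid' makes get_page() fetch self.videoURL for the Yahoo scraper, and search() then hands the soup to parse_video_response() instead of parse_response().

from app.scrapers.yahoo import Yahoo

scraper = Yahoo()
# Web search: get_page() uses self.url and search() calls parse_response().
web_results = scraper.search('fossasia', 10)
# Video search: self.name is 'yahoo', so get_page() switches to
# self.videoURL and search() calls parse_video_response().
video_results = scraper.search('fossasia', 10, qtype='vid')
print(video_results)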
1 change: 1 addition & 0 deletions app/scrapers/google.py
@@ -11,6 +11,7 @@ def __init__(self):
self.defaultStart = 0
self.startKey = 'start'
self.qtype = 'tbm'
self.name = 'google'

def next_start(self, current_start, prev_results):
return current_start + len(prev_results)
1 change: 1 addition & 0 deletions app/scrapers/mojeek.py
@@ -10,6 +10,7 @@ def __init__(self):
self.url = 'https://www.mojeek.co.uk/search'
self.defaultStart = 1
self.startKey = 's'
self.name = 'mojeek'

def parse_response(self, soup):
""" Parse the response and return set of urls
1 change: 1 addition & 0 deletions app/scrapers/parsijoo.py
@@ -10,6 +10,7 @@ def __init__(self):
self.url = 'https://parsijoo.ir/web'
self.defaultStart = 0
self.startKey = 'co'
self.name = 'parsijoo'

def parse_response(self, soup):
""" Parse the response and return set of urls
1 change: 1 addition & 0 deletions app/scrapers/quora.py
@@ -8,6 +8,7 @@ class Quora(Scraper):
def __init__(self):
Scraper.__init__(self)
self.url = 'https://www.quora.com/search'
self.name = 'quora'

def parse_response(self, soup):
""" Parse the response and return set of urls
23 changes: 23 additions & 0 deletions app/scrapers/yahoo.py
@@ -13,8 +13,10 @@ class Yahoo(Scraper):
def __init__(self):
Scraper.__init__(self)
self.url = 'https://search.yahoo.com/search'
self.videoURL = 'https://video.search.yahoo.com/search/video'
self.defaultStart = 1
self.startKey = 'b'
self.name = 'yahoo'

def parse_response(self, soup):
""" Parse response and returns the urls
@@ -38,3 +40,24 @@ def parse_response(self, soup):
print('Yahoo parsed: ' + str(urls))

return urls

def parse_video_response(self, soup):
""" Parse response and returns the urls
Returns: urls (list)
[[Tile1, url1], [Title2, url2], ...]
"""
urls = []
for h in soup.findAll('li', attrs={'class': 'vr vres'}):
t = h.find('a', attrs={'class': 'ng'})
r = t.get('data-rurl')
titleDiv = t.find('div', attrs={'class': 'v-meta bx-bb'})
title = titleDiv.find('h3').getText()
urls.append({
'title': title,
'link': r
})

print('Yahoo parsed: ' + str(urls))

return urls
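
As a quick illustration of what parse_video_response() expects, here is a minimal, self-contained sketch that runs the same selector logic on a hand-written HTML fragment; the fragment, URL, and title are fabricated to match the classes the scraper targets ('vr vres', 'ng', 'v-meta bx-bb'), and Yahoo's live markup may differ.

from bs4 import BeautifulSoup

# Fabricated fragment mimicking the markup the selectors above look for.
html = '''
<li class="vr vres">
  <a class="ng" data-rurl="https://video.example.com/watch?v=1">
    <div class="v-meta bx-bb"><h3>Sample video title</h3></div>
  </a>
</li>
'''

soup = BeautifulSoup(html, 'html.parser')
urls = []
for h in soup.findAll('li', attrs={'class': 'vr vres'}):
    t = h.find('a', attrs={'class': 'ng'})
    urls.append({
        'title': t.find('div', attrs={'class': 'v-meta bx-bb'}).find('h3').getText(),
        'link': t.get('data-rurl')
    })

print(urls)
# [{'title': 'Sample video title', 'link': 'https://video.example.com/watch?v=1'}]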
1 change: 1 addition & 0 deletions app/scrapers/youtube.py
@@ -9,6 +9,7 @@ def __init__(self):
Scraper.__init__(self)
self.url = 'https://www.youtube.com/results'
self.queryKey = 'search_query'
self.name = 'youtube'

def parse_response(self, soup):
""" Parse the response and return list of urls
