
Commit b6849c5
Addresses fossasia#320 and fossasia#321: Add Image/Video search support for Bing

bhaveshAn committed Jan 19, 2018
1 parent 24f17b1 commit b6849c5
Showing 6 changed files with 127 additions and 3 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
@@ -9,7 +9,7 @@ install:
 - pip install -r requirements.txt

 before_script:
-- flake8 . --count --max-complexity=15 --show-source --statistics
+- flake8 . --count --max-complexity=16 --show-source --statistics
 script:
 - python -m app.server > /dev/null &
 - pytest --cov=./
4 changes: 4 additions & 0 deletions app/scrapers/__init__.py
@@ -42,6 +42,10 @@ def feed_gen(query, engine, count=10, qtype=''):
     engine = old_names.get(engine, engine)
     if engine in ('quora', 'youtube'):
         urls = scrapers[engine].search_without_count(query)
+    elif (engine in ['bing']) and (qtype == 'vid'):
+        urls = scrapers[engine].video_search_without_count(query)
+    elif (engine in ['bing']) and (qtype == 'isch'):
+        urls = scrapers[engine].image_search_without_count(query)
     else:
         urls = scrapers[engine].search(query, count, qtype)
     return urls
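
With these routes in place, a qtype-driven call might look like the following. This is a minimal usage sketch, not part of the commit; it assumes the app.scrapers package layout that server.py originally imported from.

from app.scrapers import feed_gen

web_results = feed_gen('fossasia', 'bing')                  # default web search
video_results = feed_gen('fossasia', 'bing', qtype='vid')   # routes to video_search_without_count
image_results = feed_gen('fossasia', 'bing', qtype='isch')  # routes to image_search_without_count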
40 changes: 40 additions & 0 deletions app/scrapers/bing.py
@@ -8,9 +8,13 @@ class Bing(Scraper):
     def __init__(self):
         Scraper.__init__(self)
         self.url = 'http://www.bing.com/search'
+        self.videoURL = 'https://www.bing.com/videos/search'
+        self.imageURL = 'https://www.bing.com/images/search'
         self.defaultStart = 1
         self.startKey = 'first'
         self.name = 'bing'
+        self.videoKey = 'FORM'
+        self.imageKey = 'FORM'

     def parse_response(self, soup):
         """ Parses the response and returns a set of urls
@@ -30,3 +34,39 @@ def parse_response(self, soup):
         print('Bing parsed: ' + str(urls))

         return urls

+    def parse_video_response(self, soup):
+        """ Parses the response and returns a list of urls
+        Returns: urls (list)
+                [{'title': title1, 'link': url1}, {'title': title2, 'link': url2}, ...]
+        """
+        urls = []
+        for a in soup.findAll('a', attrs={'class': 'mc_vtvc_link'}):
+            title = a.get('aria-label').split(' Duration')[0]
+            url = 'https://www.bing.com' + a.get('href')
+            urls.append({
+                'title': title,
+                'link': url
+            })
+
+        print('Bing parsed: ' + str(urls))
+
+        return urls
+
+    def parse_image_response(self, soup):
+        """ Parses the response and returns a list of urls
+        Returns: urls (list)
+                [{'link': url1}, {'link': url2}, ...]
+        """
+        urls = []
+        for a in soup.findAll('a', attrs={'class': 'iusc'}):
+            url = 'https://www.bing.com' + a.get('href')
+            urls.append({
+                'link': url
+            })
+
+        print('Bing parsed: ' + str(urls))
+
+        return urls
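
To see what parse_video_response expects from Bing's markup, here is a standalone sketch (not part of the commit): an anchor with class 'mc_vtvc_link', an aria-label embedding the title before ' Duration', and a relative href. The sample HTML below is hypothetical.

from bs4 import BeautifulSoup

sample = ('<a class="mc_vtvc_link" href="/videos/search?q=cats&view=detail"'
          ' aria-label="Funny Cats Duration: 2:01">video</a>')
soup = BeautifulSoup(sample, 'html.parser')
for a in soup.findAll('a', attrs={'class': 'mc_vtvc_link'}):
    title = a.get('aria-label').split(' Duration')[0]   # strip the duration suffix
    url = 'https://www.bing.com' + a.get('href')        # make the relative href absolute
    print({'title': title, 'link': url})
# -> {'title': 'Funny Cats', 'link': 'https://www.bing.com/videos/search?q=cats&view=detail'}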
28 changes: 28 additions & 0 deletions app/scrapers/generalized.py
@@ -81,3 +81,31 @@ def search_without_count(self, query):
         soup = BeautifulSoup(response.text, 'html.parser')
         urls = self.parse_response(soup)
         return urls

+    def video_search_without_count(self, query):
+        """
+        Search for the query and return a list of video result urls
+        Returns: list
+        """
+        urls = []
+        if self.name in ['bing']:
+            url = self.videoURL
+            payload = {self.queryKey: query, self.videoKey: 'HDRSC3'}
+            response = requests.get(url, headers=self.headers, params=payload)
+            soup = BeautifulSoup(response.text, 'html.parser')
+            urls = self.parse_video_response(soup)
+        return urls
+
+    def image_search_without_count(self, query):
+        """
+        Search for the query and return a list of image result urls
+        Returns: list
+        """
+        urls = []
+        if self.name in ['bing']:
+            url = self.imageURL
+            payload = {self.queryKey: query, self.imageKey: 'HDRSC2'}
+            response = requests.get(url, headers=self.headers, params=payload)
+            soup = BeautifulSoup(response.text, 'html.parser')
+            urls = self.parse_image_response(soup)
+        return urls
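
A hypothetical call sequence for the two new methods, assuming self.queryKey is 'q' (set in the Scraper base class, not shown in this diff):

from app.scrapers.bing import Bing

bing = Bing()
# GET https://www.bing.com/videos/search?q=fossasia&FORM=HDRSC3
videos = bing.video_search_without_count('fossasia')
# GET https://www.bing.com/images/search?q=fossasia&FORM=HDRSC2
images = bing.image_search_without_count('fossasia')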
5 changes: 3 additions & 2 deletions app/server.py
@@ -7,7 +7,7 @@
 from flask import (Flask, Response, abort, jsonify, make_response,
                    render_template, request)

-from app.scrapers import feed_gen, scrapers
+from scrapers import feed_gen, scrapers

 DISABLE_CACHE = True  # Temporarily disable the MongoDB cache
 if DISABLE_CACHE:
@@ -77,7 +77,8 @@ def search(search_engine):
         unicode  # unicode is undefined in Python 3 so NameError is raised
         for line in result:
             line['link'] = line['link'].encode('utf-8')
-            line['title'] = line['title'].encode('utf-8')
+            if 'desc' in line:
+                line['title'] = line['title'].encode('utf-8')
             if 'desc' in line:
                 line['desc'] = line['desc'].encode('utf-8')
     except NameError:
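The hunk above relies on a Python 2/3 detection idiom: evaluating the bare name unicode raises NameError on Python 3, so the byte-encoding loop only runs on Python 2. A standalone sketch of the idiom:

try:
    unicode          # defined on Python 2, raises NameError on Python 3
    needs_encoding = True
except NameError:
    needs_encoding = False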
51 changes: 51 additions & 0 deletions package-lock.json

Some generated files are not rendered by default.
