
Commit b6849c5
Addresses fossasia#320 and fossasia#321: Add Image/Video search support for Bing

bhaveshAn committed Jan 19, 2018
1 parent 24f17b1 commit b6849c5
Showing 6 changed files with 127 additions and 3 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
@@ -9,7 +9,7 @@ install:
 - pip install -r requirements.txt

 before_script:
-- flake8 . --count --max-complexity=15 --show-source --statistics
+- flake8 . --count --max-complexity=16 --show-source --statistics
 script:
 - python -m app.server > /dev/null &
 - pytest --cov=./
4 changes: 4 additions & 0 deletions app/scrapers/__init__.py
@@ -42,6 +42,10 @@ def feed_gen(query, engine, count=10, qtype=''):
     engine = old_names.get(engine, engine)
     if engine in ('quora', 'youtube'):
         urls = scrapers[engine].search_without_count(query)
+    elif (engine in ['bing']) and (qtype == 'vid'):
+        urls = scrapers[engine].video_search_without_count(query)
+    elif (engine in ['bing']) and (qtype == 'isch'):
+        urls = scrapers[engine].image_search_without_count(query)
     else:
         urls = scrapers[engine].search(query, count, qtype)
     return urls
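
With these routes in place, a qtype-driven call might look like the following. This is a minimal usage sketch, not part of the commit; it assumes the app.scrapers package layout that server.py originally imported from.

from app.scrapers import feed_gen

web_results = feed_gen('fossasia', 'bing')                  # default web search
video_results = feed_gen('fossasia', 'bing', qtype='vid')   # routes to video_search_without_count
image_results = feed_gen('fossasia', 'bing', qtype='isch')  # routes to image_search_without_count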
40 changes: 40 additions & 0 deletions app/scrapers/bing.py
@@ -8,9 +8,13 @@ class Bing(Scraper):
     def __init__(self):
         Scraper.__init__(self)
         self.url = 'http://www.bing.com/search'
+        self.videoURL = 'https://www.bing.com/videos/search'
+        self.imageURL = 'https://www.bing.com/images/search'
         self.defaultStart = 1
         self.startKey = 'first'
         self.name = 'bing'
+        self.videoKey = 'FORM'
+        self.imageKey = 'FORM'

     def parse_response(self, soup):
         """ Parses the response and returns a set of urls
@@ -30,3 +34,39 @@ def parse_response(self, soup):
         print('Bing parsed: ' + str(urls))

         return urls

+    def parse_video_response(self, soup):
+        """ Parses the response and returns a list of urls
+        Returns: urls (list)
+                [{'title': title1, 'link': url1}, {'title': title2, 'link': url2}, ...]
+        """
+        urls = []
+        for a in soup.findAll('a', attrs={'class': 'mc_vtvc_link'}):
+            title = a.get('aria-label').split(' Duration')[0]
+            url = 'https://www.bing.com' + a.get('href')
+            urls.append({
+                'title': title,
+                'link': url
+            })
+
+        print('Bing parsed: ' + str(urls))
+
+        return urls
+
+    def parse_image_response(self, soup):
+        """ Parses the response and returns a list of urls
+        Returns: urls (list)
+                [{'link': url1}, {'link': url2}, ...]
+        """
+        urls = []
+        for a in soup.findAll('a', attrs={'class': 'iusc'}):
+            url = 'https://www.bing.com' + a.get('href')
+            urls.append({
+                'link': url
+            })
+
+        print('Bing parsed: ' + str(urls))
+
+        return urls
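
To see what parse_video_response expects from Bing's markup, here is a standalone sketch (not part of the commit): an anchor with class 'mc_vtvc_link', an aria-label embedding the title before ' Duration', and a relative href. The sample HTML below is hypothetical.

from bs4 import BeautifulSoup

sample = ('<a class="mc_vtvc_link" href="/videos/search?q=cats&view=detail"'
          ' aria-label="Funny Cats Duration: 2:01">video</a>')
soup = BeautifulSoup(sample, 'html.parser')
for a in soup.findAll('a', attrs={'class': 'mc_vtvc_link'}):
    title = a.get('aria-label').split(' Duration')[0]   # strip the duration suffix
    url = 'https://www.bing.com' + a.get('href')        # make the relative href absolute
    print({'title': title, 'link': url})
# -> {'title': 'Funny Cats', 'link': 'https://www.bing.com/videos/search?q=cats&view=detail'}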
28 changes: 28 additions & 0 deletions app/scrapers/generalized.py
@@ -81,3 +81,31 @@ def search_without_count(self, query):
         soup = BeautifulSoup(response.text, 'html.parser')
         urls = self.parse_response(soup)
         return urls

+    def video_search_without_count(self, query):
+        """
+        Search for the query and return a list of video result urls
+        Returns: list
+        """
+        urls = []
+        if self.name in ['bing']:
+            url = self.videoURL
+            payload = {self.queryKey: query, self.videoKey: 'HDRSC3'}
+            response = requests.get(url, headers=self.headers, params=payload)
+            soup = BeautifulSoup(response.text, 'html.parser')
+            urls = self.parse_video_response(soup)
+        return urls
+
+    def image_search_without_count(self, query):
+        """
+        Search for the query and return a list of image result urls
+        Returns: list
+        """
+        urls = []
+        if self.name in ['bing']:
+            url = self.imageURL
+            payload = {self.queryKey: query, self.imageKey: 'HDRSC2'}
+            response = requests.get(url, headers=self.headers, params=payload)
+            soup = BeautifulSoup(response.text, 'html.parser')
+            urls = self.parse_image_response(soup)
+        return urls
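
A hypothetical call sequence for the two new methods, assuming self.queryKey is 'q' (set in the Scraper base class, not shown in this diff):

from app.scrapers.bing import Bing

bing = Bing()
# GET https://www.bing.com/videos/search?q=fossasia&FORM=HDRSC3
videos = bing.video_search_without_count('fossasia')
# GET https://www.bing.com/images/search?q=fossasia&FORM=HDRSC2
images = bing.image_search_without_count('fossasia')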
5 changes: 3 additions & 2 deletions app/server.py
@@ -7,7 +7,7 @@
 from flask import (Flask, Response, abort, jsonify, make_response,
                    render_template, request)

-from app.scrapers import feed_gen, scrapers
+from scrapers import feed_gen, scrapers

 DISABLE_CACHE = True  # Temporarily disable the MongoDB cache
 if DISABLE_CACHE:
@@ -77,7 +77,8 @@ def search(search_engine):
         unicode  # unicode is undefined in Python 3 so NameError is raised
         for line in result:
             line['link'] = line['link'].encode('utf-8')
-            line['title'] = line['title'].encode('utf-8')
+            if 'desc' in line:
+                line['title'] = line['title'].encode('utf-8')
             if 'desc' in line:
                 line['desc'] = line['desc'].encode('utf-8')
     except NameError:
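The hunk above relies on a Python 2/3 detection idiom: evaluating the bare name unicode raises NameError on Python 3, so the byte-encoding loop only runs on Python 2. A standalone sketch of the idiom:

try:
    unicode          # defined on Python 2, raises NameError on Python 3
    needs_encoding = True
except NameError:
    needs_encoding = False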
51 changes: 51 additions & 0 deletions package-lock.json

Some generated files are not rendered by default.
