diff --git a/.travis.yml b/.travis.yml
index a2c45cc4..9541025a 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -9,7 +9,7 @@ install:
   - pip install -r requirements.txt
 before_script:
-  - flake8 . --count --max-complexity=15 --show-source --statistics
+  - flake8 . --count --max-complexity=16 --show-source --statistics --max-line-length=100
 script:
   - python -m app.server > /dev/null &
   - pytest --cov=./
 
diff --git a/app/scrapers/__init__.py b/app/scrapers/__init__.py
index fa6a727b..0e751407 100644
--- a/app/scrapers/__init__.py
+++ b/app/scrapers/__init__.py
@@ -42,6 +42,10 @@ def feed_gen(query, engine, count=10, qtype=''):
     engine = old_names.get(engine, engine)
     if engine in ('quora', 'youtube'):
         urls = scrapers[engine].search_without_count(query)
+    elif engine == 'parsijoo' and qtype == 'isch':
+        urls = scrapers[engine].image_search_without_count(query)
+    elif engine == 'parsijoo' and qtype == 'vid':
+        urls = scrapers[engine].video_search_without_count(query)
     else:
         urls = scrapers[engine].search(query, count, qtype)
     return urls
diff --git a/app/scrapers/generalized.py b/app/scrapers/generalized.py
index e87df5ff..590f52eb 100644
--- a/app/scrapers/generalized.py
+++ b/app/scrapers/generalized.py
@@ -81,3 +81,34 @@ def search_without_count(self, query):
         soup = BeautifulSoup(response.text, 'html.parser')
         urls = self.parse_response(soup)
         return urls
+
+    def video_search_without_count(self, query):
+        """
+        Search for the query and return a list of video result urls
+        Returns: list
+        """
+        urls = []
+        if self.name == 'parsijoo':
+            url = self.videoURL
+            payload = {self.queryKey: query}
+            response = requests.get(url, headers=self.headers, params=payload)
+            soup = BeautifulSoup(response.text, 'html.parser')
+            urls = self.parse_video_response(soup)
+        if not urls:
+            return "No video with this Keyword"
+        else:
+            return urls
+
+    def image_search_without_count(self, query):
+        """
+        Search for the query and return a list of image result urls
+        Returns: list
+        """
+        urls = []
+        if self.name == 'parsijoo':
+            url = self.imageURL
+            payload = {self.queryKey: query}
+            response = requests.get(url, headers=self.headers, params=payload)
+            soup = BeautifulSoup(response.text, 'html.parser')
+            urls = self.parse_image_response(soup)
+        return urls
diff --git a/app/scrapers/parsijoo.py b/app/scrapers/parsijoo.py
index a1134bd9..c462ea1d 100644
--- a/app/scrapers/parsijoo.py
+++ b/app/scrapers/parsijoo.py
@@ -8,6 +8,8 @@ class Parsijoo(Scraper):
     def __init__(self):
         Scraper.__init__(self)
         self.url = 'https://parsijoo.ir/web'
+        self.imageURL = 'https://image.parsijoo.ir/image'
+        self.videoURL = 'https://video.parsijoo.ir/video'
         self.defaultStart = 0
         self.startKey = 'co'
         self.name = 'parsijoo'
@@ -28,3 +30,40 @@ def parse_response(self, soup):
         print('Parsijoo parsed: ' + str(urls))
 
         return urls
+
+    def parse_video_response(self, soup):
+        """ Parse response and return the urls
+
+        Returns: urls (list)
+                [[Title1, url1], [Title2, url2], ...]
+        """
+        urls = []
+        for a in soup.findAll('a', attrs={'class': 'over-page'}):
+            title = a.get('title')
+            url = self.videoURL + a.get('href')
+            urls.append({
+                'title': title,
+                'link': url
+            })
+
+        print('Parsijoo parsed: ' + str(urls))
+
+        return urls
+
+    def parse_image_response(self, soup):
+        """ Parse response and return the urls
+
+        Returns: urls (list)
+                [[url1], [url2], ...]
+ """ + urls = [] + for div in soup.findAll('div', attrs={'class': 'image-container overflow'}): + a = div.find('a') + url = 'https://image.parsijoo.ir' + a.get('href') + urls.append({ + 'link': url + }) + + print('Parsijoo parsed: ' + str(urls)) + + return urls diff --git a/app/server.py b/app/server.py index 104fc655..b4ed0369 100644 --- a/app/server.py +++ b/app/server.py @@ -7,7 +7,7 @@ from flask import (Flask, Response, abort, jsonify, make_response, render_template, request) -from app.scrapers import feed_gen, scrapers +from scrapers import feed_gen, scrapers DISABLE_CACHE = True # Temporarily disable the MongoDB cache if DISABLE_CACHE: @@ -77,7 +77,8 @@ def search(search_engine): unicode # unicode is undefined in Python 3 so NameError is raised for line in result: line['link'] = line['link'].encode('utf-8') - line['title'] = line['title'].encode('utf-8') + if 'title' in line: + line['title'] = line['title'].encode('utf-8') if 'desc' in line: line['desc'] = line['desc'].encode('utf-8') except NameError: