From 70899f8ac5b7105870642c9c26df7a7cadbbb183 Mon Sep 17 00:00:00 2001 From: bhaveshAn Date: Sat, 20 Jan 2018 17:55:47 +0530 Subject: [PATCH 01/12] Addresses #320 and #321 Add Image/Video search support for Parsijoo --- .travis.yml | 2 +- app/scrapers/__init__.py | 4 +++ app/scrapers/generalized.py | 31 ++++++++++++++++++++++ app/scrapers/parsijoo.py | 39 ++++++++++++++++++++++++++++ app/server.py | 9 ++++--- package-lock.json | 51 +++++++++++++++++++++++++++++++++++++ 6 files changed, 132 insertions(+), 4 deletions(-) create mode 100644 package-lock.json diff --git a/.travis.yml b/.travis.yml index a2c45cc4..9541025a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,7 +9,7 @@ install: - pip install -r requirements.txt before_script: - - flake8 . --count --max-complexity=15 --show-source --statistics + - flake8 . --count --max-complexity=16 --show-source --statistics --max-line-length=100 script: - python -m app.server > /dev/null & - pytest --cov=./ diff --git a/app/scrapers/__init__.py b/app/scrapers/__init__.py index fa6a727b..0e751407 100644 --- a/app/scrapers/__init__.py +++ b/app/scrapers/__init__.py @@ -42,6 +42,10 @@ def feed_gen(query, engine, count=10, qtype=''): engine = old_names.get(engine, engine) if engine in ('quora', 'youtube'): urls = scrapers[engine].search_without_count(query) + elif (engine in ['parsijoo']) and (qtype == 'isch'): + urls = scrapers[engine].image_search_without_count(query) + elif (engine in ['parsijoo']) and (qtype == 'vid'): + urls = scrapers[engine].video_search_without_count(query) else: urls = scrapers[engine].search(query, count, qtype) return urls diff --git a/app/scrapers/generalized.py b/app/scrapers/generalized.py index e87df5ff..590f52eb 100644 --- a/app/scrapers/generalized.py +++ b/app/scrapers/generalized.py @@ -81,3 +81,34 @@ def search_without_count(self, query): soup = BeautifulSoup(response.text, 'html.parser') urls = self.parse_response(soup) return urls + + def video_search_without_count(self, query): + """ + Search for the query and return set of urls + Returns: list + """ + urls = [] + if self.name in ['parsijoo']: + url = self.videoURL + payload = {self.queryKey: query} + response = requests.get(url, headers=self.headers, params=payload) + soup = BeautifulSoup(response.text, 'html.parser') + urls = self.parse_video_response(soup) + if urls == []: + return "No video with this Keyword" + else: + return urls + + def image_search_without_count(self, query): + """ + Search for the query and return set of urls + Returns: list + """ + urls = [] + if self.name in ['parsijoo']: + url = self.imageURL + payload = {self.queryKey: query} + response = requests.get(url, headers=self.headers, params=payload) + soup = BeautifulSoup(response.text, 'html.parser') + urls = self.parse_image_response(soup) + return urls diff --git a/app/scrapers/parsijoo.py b/app/scrapers/parsijoo.py index a1134bd9..c462ea1d 100644 --- a/app/scrapers/parsijoo.py +++ b/app/scrapers/parsijoo.py @@ -8,6 +8,8 @@ class Parsijoo(Scraper): def __init__(self): Scraper.__init__(self) self.url = 'https://parsijoo.ir/web' + self.imageURL = 'https://image.parsijoo.ir/image' + self.videoURL = 'https://video.parsijoo.ir/video' self.defaultStart = 0 self.startKey = 'co' self.name = 'parsijoo' @@ -28,3 +30,40 @@ def parse_response(self, soup): print('Parsijoo parsed: ' + str(urls)) return urls + + def parse_video_response(self, soup): + """ Parse response and returns the urls + + Returns: urls (list) + [[Tile1, url1], [Title2, url2], ...] + """ + urls = [] + for a in soup.findAll('a', attrs={'class': 'over-page'}): + title = a.get('title') + url = self.videoURL + a.get('href') + urls.append({ + 'title': title, + 'link': url + }) + + print('Parsijoo parsed: ' + str(urls)) + + return urls + + def parse_image_response(self, soup): + """ Parse response and returns the urls + + Returns: urls (list) + [[url1], [url2], ...] + """ + urls = [] + for div in soup.findAll('div', attrs={'class': 'image-container overflow'}): + a = div.find('a') + url = 'https://image.parsijoo.ir' + a.get('href') + urls.append({ + 'link': url + }) + + print('Parsijoo parsed: ' + str(urls)) + + return urls diff --git a/app/server.py b/app/server.py index 104fc655..fee9a1cd 100644 --- a/app/server.py +++ b/app/server.py @@ -6,8 +6,10 @@ from dicttoxml import dicttoxml from flask import (Flask, Response, abort, jsonify, make_response, render_template, request) - -from app.scrapers import feed_gen, scrapers +try: + from app.scrapers import feed_gen, scrapers +except ImportError: + from scrapers import feed_gen, scrapers DISABLE_CACHE = True # Temporarily disable the MongoDB cache if DISABLE_CACHE: @@ -77,7 +79,8 @@ def search(search_engine): unicode # unicode is undefined in Python 3 so NameError is raised for line in result: line['link'] = line['link'].encode('utf-8') - line['title'] = line['title'].encode('utf-8') + if 'title' in line: + line['title'] = line['title'].encode('utf-8') if 'desc' in line: line['desc'] = line['desc'].encode('utf-8') except NameError: diff --git a/package-lock.json b/package-lock.json new file mode 100644 index 00000000..beab31fb --- /dev/null +++ b/package-lock.json @@ -0,0 +1,51 @@ +{ + "name": "query-server", + "version": "0.1.0", + "lockfileVersion": 1, + "requires": true, + "dependencies": { + "bower": { + "version": "1.8.2", + "resolved": "https://registry.npmjs.org/bower/-/bower-1.8.2.tgz", + "integrity": "sha1-rfU1KcjUrwLvJPuNU0HBQZ0z4vc=" + }, + "ci-info": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/ci-info/-/ci-info-1.1.2.tgz", + "integrity": "sha512-uTGIPNx/nSpBdsF6xnseRXLLtfr9VLqkz8ZqHXr3Y7b6SftyRxBGjwMtJj1OhNbmlc1wZzLNAlAcvyIiE8a6ZA==", + "dev": true + }, + "husky": { + "version": "0.14.3", + "resolved": "https://registry.npmjs.org/husky/-/husky-0.14.3.tgz", + "integrity": "sha512-e21wivqHpstpoiWA/Yi8eFti8E+sQDSS53cpJsPptPs295QTOQR0ZwnHo2TXy1XOpZFD9rPOd3NpmqTK6uMLJA==", + "dev": true, + "requires": { + "is-ci": "1.1.0", + "normalize-path": "1.0.0", + "strip-indent": "2.0.0" + } + }, + "is-ci": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/is-ci/-/is-ci-1.1.0.tgz", + "integrity": "sha512-c7TnwxLePuqIlxHgr7xtxzycJPegNHFuIrBkwbf8hc58//+Op1CqFkyS+xnIMkwn9UsJIwc174BIjkyBmSpjKg==", + "dev": true, + "requires": { + "ci-info": "1.1.2" + } + }, + "normalize-path": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/normalize-path/-/normalize-path-1.0.0.tgz", + "integrity": "sha1-MtDkcvkf80VwHBWoMRAY07CpA3k=", + "dev": true + }, + "strip-indent": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/strip-indent/-/strip-indent-2.0.0.tgz", + "integrity": "sha1-XvjbKV0B5u1sv3qrlpmNeCJSe2g=", + "dev": true + } + } +} From ea4618f2e29860aa50224646e30841f1345705e2 Mon Sep 17 00:00:00 2001 From: bhaveshAn Date: Mon, 22 Jan 2018 13:45:14 +0530 Subject: [PATCH 02/12] Add staticmethod decorator --- app/scrapers/parsijoo.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/app/scrapers/parsijoo.py b/app/scrapers/parsijoo.py index dfc9004c..9719965a 100644 --- a/app/scrapers/parsijoo.py +++ b/app/scrapers/parsijoo.py @@ -31,8 +31,9 @@ def parse_response(soup): print('Parsijoo parsed: ' + str(urls)) return urls - - def parse_video_response(self, soup): + + @staticmethod + def parse_video_response(soup): """ Parse response and returns the urls Returns: urls (list) @@ -51,7 +52,8 @@ def parse_video_response(self, soup): return urls - def parse_image_response(self, soup): + @staticmethod + def parse_image_response(soup): """ Parse response and returns the urls Returns: urls (list) From 5692c94708f7581718e95459c110c63c14006522 Mon Sep 17 00:00:00 2001 From: bhaveshAn Date: Mon, 22 Jan 2018 13:55:02 +0530 Subject: [PATCH 03/12] Add staticmethod decorator --- app/scrapers/parsijoo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/scrapers/parsijoo.py b/app/scrapers/parsijoo.py index 9719965a..e8494ad9 100644 --- a/app/scrapers/parsijoo.py +++ b/app/scrapers/parsijoo.py @@ -42,7 +42,7 @@ def parse_video_response(soup): urls = [] for a in soup.findAll('a', attrs={'class': 'over-page'}): title = a.get('title') - url = self.videoURL + a.get('href') + url = 'https://video.parsijoo.ir/video' + a.get('href') urls.append({ 'title': title, 'link': url From e80eaeb11e3772f3332dcc0034a927f3b41032f3 Mon Sep 17 00:00:00 2001 From: bhaveshAn Date: Mon, 22 Jan 2018 13:58:44 +0530 Subject: [PATCH 04/12] Add staticmethod decorator --- app/scrapers/parsijoo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/scrapers/parsijoo.py b/app/scrapers/parsijoo.py index e8494ad9..fa0ade48 100644 --- a/app/scrapers/parsijoo.py +++ b/app/scrapers/parsijoo.py @@ -31,7 +31,7 @@ def parse_response(soup): print('Parsijoo parsed: ' + str(urls)) return urls - + @staticmethod def parse_video_response(soup): """ Parse response and returns the urls From 7c17a8935133c00c9913ebe17efa67322a0dc3d1 Mon Sep 17 00:00:00 2001 From: bhaveshAn Date: Mon, 22 Jan 2018 14:43:41 +0530 Subject: [PATCH 05/12] Add staticmethod decorator --- app/scrapers/parsijoo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/scrapers/parsijoo.py b/app/scrapers/parsijoo.py index fa0ade48..59546712 100644 --- a/app/scrapers/parsijoo.py +++ b/app/scrapers/parsijoo.py @@ -42,7 +42,7 @@ def parse_video_response(soup): urls = [] for a in soup.findAll('a', attrs={'class': 'over-page'}): title = a.get('title') - url = 'https://video.parsijoo.ir/video' + a.get('href') + url = 'https://video.parsijoo.ir' + a.get('href') urls.append({ 'title': title, 'link': url From 525149153b81076efe5be81b17f682116de9db5d Mon Sep 17 00:00:00 2001 From: bhaveshAn Date: Mon, 22 Jan 2018 15:06:44 +0530 Subject: [PATCH 06/12] Add staticmethod decorator --- app/scrapers/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/app/scrapers/__init__.py b/app/scrapers/__init__.py index 0e751407..8bf1fd25 100644 --- a/app/scrapers/__init__.py +++ b/app/scrapers/__init__.py @@ -42,9 +42,9 @@ def feed_gen(query, engine, count=10, qtype=''): engine = old_names.get(engine, engine) if engine in ('quora', 'youtube'): urls = scrapers[engine].search_without_count(query) - elif (engine in ['parsijoo']) and (qtype == 'isch'): + elif engine in ('parsijoo') and qtype == 'isch': urls = scrapers[engine].image_search_without_count(query) - elif (engine in ['parsijoo']) and (qtype == 'vid'): + elif engine in ('parsijoo') and qtype == 'vid': urls = scrapers[engine].video_search_without_count(query) else: urls = scrapers[engine].search(query, count, qtype) From cea140807bf6db50ac9c3f2142947d7115371dd4 Mon Sep 17 00:00:00 2001 From: bhaveshAn Date: Mon, 22 Jan 2018 15:09:45 +0530 Subject: [PATCH 07/12] Add staticmethod decorator --- app/scrapers/generalized.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/scrapers/generalized.py b/app/scrapers/generalized.py index c7531bfd..d399b175 100644 --- a/app/scrapers/generalized.py +++ b/app/scrapers/generalized.py @@ -97,7 +97,7 @@ def video_search_without_count(self, query): response = requests.get(url, headers=self.headers, params=payload) soup = BeautifulSoup(response.text, 'html.parser') urls = self.parse_video_response(soup) - if urls == []: + if len(urls) == 0: return "No video with this Keyword" else: return urls From c5b0397e975c62ec50fce131c21dc6cef521b314 Mon Sep 17 00:00:00 2001 From: bhaveshAn Date: Mon, 22 Jan 2018 15:19:03 +0530 Subject: [PATCH 08/12] Add staticmethod decorator --- app/scrapers/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/app/scrapers/__init__.py b/app/scrapers/__init__.py index 8bf1fd25..66a1b39f 100644 --- a/app/scrapers/__init__.py +++ b/app/scrapers/__init__.py @@ -42,9 +42,9 @@ def feed_gen(query, engine, count=10, qtype=''): engine = old_names.get(engine, engine) if engine in ('quora', 'youtube'): urls = scrapers[engine].search_without_count(query) - elif engine in ('parsijoo') and qtype == 'isch': + elif engine in ('parsijoo',) and qtype == 'isch': urls = scrapers[engine].image_search_without_count(query) - elif engine in ('parsijoo') and qtype == 'vid': + elif engine in ('parsijoo',) and qtype == 'vid': urls = scrapers[engine].video_search_without_count(query) else: urls = scrapers[engine].search(query, count, qtype) From 8f473381cf6a4a0fd435b7cac80b70b6a8805571 Mon Sep 17 00:00:00 2001 From: bhaveshAn Date: Mon, 22 Jan 2018 15:42:36 +0530 Subject: [PATCH 09/12] Add staticmethod decorator --- .travis.yml | 2 +- app/scrapers/parsijoo.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 9541025a..2be53a4b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,7 +9,7 @@ install: - pip install -r requirements.txt before_script: - - flake8 . --count --max-complexity=16 --show-source --statistics --max-line-length=100 + - flake8 . --count --max-complexity=16 --show-source --statistics script: - python -m app.server > /dev/null & - pytest --cov=./ diff --git a/app/scrapers/parsijoo.py b/app/scrapers/parsijoo.py index 59546712..69e5e40a 100644 --- a/app/scrapers/parsijoo.py +++ b/app/scrapers/parsijoo.py @@ -60,7 +60,8 @@ def parse_image_response(soup): [[url1], [url2], ...] """ urls = [] - for div in soup.findAll('div', attrs={'class': 'image-container overflow'}): + for div in \ + soup.findAll('div', attrs={'class': 'image-container overflow'}): a = div.find('a') url = 'https://image.parsijoo.ir' + a.get('href') urls.append({ From 40e5246c8c0b343aa7b248399e35a8f6f4f16d2e Mon Sep 17 00:00:00 2001 From: bhaveshAn Date: Mon, 22 Jan 2018 15:46:27 +0530 Subject: [PATCH 10/12] Add staticmethod decorator --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 2be53a4b..9541025a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,7 +9,7 @@ install: - pip install -r requirements.txt before_script: - - flake8 . --count --max-complexity=16 --show-source --statistics + - flake8 . --count --max-complexity=16 --show-source --statistics --max-line-length=100 script: - python -m app.server > /dev/null & - pytest --cov=./ From 1e865a9928595a4c750a319e8c6fefba60eca7a1 Mon Sep 17 00:00:00 2001 From: bhaveshAn Date: Mon, 22 Jan 2018 16:26:08 +0530 Subject: [PATCH 11/12] Add staticmethod decorator --- package-lock.json | 51 ----------------------------------------------- 1 file changed, 51 deletions(-) delete mode 100644 package-lock.json diff --git a/package-lock.json b/package-lock.json deleted file mode 100644 index beab31fb..00000000 --- a/package-lock.json +++ /dev/null @@ -1,51 +0,0 @@ -{ - "name": "query-server", - "version": "0.1.0", - "lockfileVersion": 1, - "requires": true, - "dependencies": { - "bower": { - "version": "1.8.2", - "resolved": "https://registry.npmjs.org/bower/-/bower-1.8.2.tgz", - "integrity": "sha1-rfU1KcjUrwLvJPuNU0HBQZ0z4vc=" - }, - "ci-info": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/ci-info/-/ci-info-1.1.2.tgz", - "integrity": "sha512-uTGIPNx/nSpBdsF6xnseRXLLtfr9VLqkz8ZqHXr3Y7b6SftyRxBGjwMtJj1OhNbmlc1wZzLNAlAcvyIiE8a6ZA==", - "dev": true - }, - "husky": { - "version": "0.14.3", - "resolved": "https://registry.npmjs.org/husky/-/husky-0.14.3.tgz", - "integrity": "sha512-e21wivqHpstpoiWA/Yi8eFti8E+sQDSS53cpJsPptPs295QTOQR0ZwnHo2TXy1XOpZFD9rPOd3NpmqTK6uMLJA==", - "dev": true, - "requires": { - "is-ci": "1.1.0", - "normalize-path": "1.0.0", - "strip-indent": "2.0.0" - } - }, - "is-ci": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/is-ci/-/is-ci-1.1.0.tgz", - "integrity": "sha512-c7TnwxLePuqIlxHgr7xtxzycJPegNHFuIrBkwbf8hc58//+Op1CqFkyS+xnIMkwn9UsJIwc174BIjkyBmSpjKg==", - "dev": true, - "requires": { - "ci-info": "1.1.2" - } - }, - "normalize-path": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/normalize-path/-/normalize-path-1.0.0.tgz", - "integrity": "sha1-MtDkcvkf80VwHBWoMRAY07CpA3k=", - "dev": true - }, - "strip-indent": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/strip-indent/-/strip-indent-2.0.0.tgz", - "integrity": "sha1-XvjbKV0B5u1sv3qrlpmNeCJSe2g=", - "dev": true - } - } -} From c37af57d7ffdf2dd6376cacb1bcb9367a8f77b8a Mon Sep 17 00:00:00 2001 From: bhaveshAn Date: Wed, 24 Jan 2018 00:44:29 +0530 Subject: [PATCH 12/12] Add tests --- test/test_parsijoo.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/test/test_parsijoo.py b/test/test_parsijoo.py index e89b5387..8682aedf 100644 --- a/test/test_parsijoo.py +++ b/test/test_parsijoo.py @@ -20,3 +20,28 @@ def test_parse_response(): }] resp = Parsijoo().parse_response(dummy_soup) assert resp == expected_resp + + +def test_parse_video_response(): + html_text = """mock_title""" + dummy_soup = BeautifulSoup(html_text, 'html.parser') + url = 'https://video.parsijoo.ir' + "mock_url" + expected_resp = [{ + 'title': u'mock_title', + 'link': url, + }] + resp = Parsijoo().parse_video_response(dummy_soup) + assert resp == expected_resp + + +def test_parse_image_response(): + html_text = """""" + dummy_soup = BeautifulSoup(html_text, 'html.parser') + image_url = 'https://image.parsijoo.ir' + 'mock_url' + expected_resp = [{ + 'link': image_url, + }] + resp = Parsijoo().parse_image_response(dummy_soup) + assert resp == expected_resp