From cbf394122007829fbf4c7f08b2ab870830775b51 Mon Sep 17 00:00:00 2001 From: rupav jain Date: Tue, 6 Feb 2018 19:37:44 +0530 Subject: [PATCH 1/2] Prevent app from crashing --- app/scrapers/__init__.py | 6 +++--- app/scrapers/generalized.py | 23 +++++++++++++++++------ app/scrapers/parsijoo.py | 7 ++++--- app/server.py | 19 ++++++++++++------- 4 files changed, 36 insertions(+), 19 deletions(-) diff --git a/app/scrapers/__init__.py b/app/scrapers/__init__.py index 548fe1f8..209f2d5c 100644 --- a/app/scrapers/__init__.py +++ b/app/scrapers/__init__.py @@ -43,7 +43,7 @@ def feed_gen(query, engine, count=10, qtype=''): 'tyoutube': 'youtube'} engine = old_names.get(engine, engine) if engine in ('quora', 'youtube'): - urls = scrapers[engine].search_without_count(query) + urls, status_code = scrapers[engine].search_without_count(query) else: - urls = scrapers[engine].search(query, count, qtype) - return urls + urls, status_code = scrapers[engine].search(query, count, qtype) + return (urls, status_code) diff --git a/app/scrapers/generalized.py b/app/scrapers/generalized.py index 12ade912..b1154980 100644 --- a/app/scrapers/generalized.py +++ b/app/scrapers/generalized.py @@ -42,8 +42,11 @@ def get_page(self, query, startIndex=0, qtype=''): if self.name == 'mojeek' and qtype == 'news': payload['fmt'] = 'news' response = requests.get(url, headers=self.headers, params=payload) + status_code = response.status_code + if(status_code == 400 or status_code == 404): + return (None, status_code) print(response.url) - return response + return (response, status_code) @staticmethod def parse_response(soup): @@ -64,16 +67,21 @@ def search(self, query, num_results, qtype=''): """ urls = [] current_start = self.defaultStart - while (len(urls) < num_results): - response = self.get_page(query, current_start, qtype) + response, status_code = self.get_page(query, current_start, qtype) + if response is None: + if(len(urls) == 0): + return (None, status_code) + else: + print("Couldn't fetch more results.") + return (urls, 200) soup = BeautifulSoup(response.text, 'html.parser') new_results = self.call_appropriate_parser(qtype, soup) - if new_results is None: + if new_results is None or len(new_results) == 0: break urls.extend(new_results) current_start = self.next_start(current_start, new_results) - return urls[: num_results] + return (urls[: num_results], 200) def call_appropriate_parser(self, qtype, soup): new_results = '' @@ -95,6 +103,9 @@ def search_without_count(self, query): urls = [] payload = {self.queryKey: query} response = requests.get(self.url, headers=self.headers, params=payload) + status_code = response.status_code + if(status_code == 400 or status_code == 404): + return(None, status_code) soup = BeautifulSoup(response.text, 'html.parser') urls = self.parse_response(soup) - return urls + return (urls, 200) diff --git a/app/scrapers/parsijoo.py b/app/scrapers/parsijoo.py index 0edc14be..f687430d 100644 --- a/app/scrapers/parsijoo.py +++ b/app/scrapers/parsijoo.py @@ -88,7 +88,8 @@ def parse_news_response(soup): title = div.a.getText() link = unquote(div.a.get('href')) urls.append({'title': title, 'link': link}) - - print('Parsijoo parsed: ' + str(urls)) - + try: + print('Parsijoo parsed: ' + str(urls)) + except Exception: + pass return urls diff --git a/app/server.py b/app/server.py index e8ee34d9..c3865afc 100644 --- a/app/server.py +++ b/app/server.py @@ -1,10 +1,9 @@ -import json import os from argparse import ArgumentParser from defusedxml.minidom import parseString from dicttoxml import dicttoxml -from flask import (Flask, Response, abort, jsonify, make_response, +from flask import (Flask, Response, abort, jsonify, render_template, request) try: @@ -39,8 +38,14 @@ def index(): def bad_request(error): message = {'Error': error[1], 'Status Code': error[0]} - response = dicttoxml(message) if error[2] == 'xml' else json.dumps(message) - return make_response(response, error[0]) + print(error[2]) + if error[2] == 'xml': + return Response(dicttoxml(message), mimetype='text/xml') + elif error[2] == 'csv': + message = "'Error', 'Status Code' \n {}, {}".format(error[1], error[0]) + return Response(message, mimetype='text/csv') + else: + return jsonify(message) @app.route('/api/v1/search/', methods=['GET']) @@ -54,7 +59,7 @@ def search(search_engine): engine = search_engine if engine not in scrapers: - error = [404, 'Incorrect search engine', engine] + error = [404, 'Incorrect search engine', qformat] return bad_request(error) query = request.args.get('query') @@ -68,12 +73,12 @@ def search(search_engine): if result: print("cache hit: {}".format(engine_and_query)) else: - result = feed_gen(query, engine, count, qtype) + result, status_code = feed_gen(query, engine, count, qtype) if result: # store the result in the cache to speed up future searches store(engine_and_query, result) else: - error = [404, 'No response', engine_and_query] + error = [status_code, 'No response', qformat] return bad_request(error) try: From 2ad4f21c83b29de3fcdd5f5e26a072921cc2e4ef Mon Sep 17 00:00:00 2001 From: rupav jain Date: Wed, 7 Feb 2018 23:01:17 +0530 Subject: [PATCH 2/2] Add modified tests --- app/scrapers/__init__.py | 2 +- app/scrapers/generalized.py | 2 +- app/server.py | 4 ++-- test/test_generalized.py | 12 ++++++------ test/test_server.py | 2 ++ 5 files changed, 12 insertions(+), 10 deletions(-) diff --git a/app/scrapers/__init__.py b/app/scrapers/__init__.py index 209f2d5c..4be5eff1 100644 --- a/app/scrapers/__init__.py +++ b/app/scrapers/__init__.py @@ -32,7 +32,7 @@ def small_test(): - assert isinstance(scrapers['google'].search('fossasia', 1), list) + assert isinstance(scrapers['google'].search('fossasia', 1)[0], list) def feed_gen(query, engine, count=10, qtype=''): diff --git a/app/scrapers/generalized.py b/app/scrapers/generalized.py index b1154980..0807741c 100644 --- a/app/scrapers/generalized.py +++ b/app/scrapers/generalized.py @@ -63,7 +63,7 @@ def next_start(current_start, prev_results): def search(self, query, num_results, qtype=''): """ Search for the query and return set of urls - Returns: list + Returns: list, status_code """ urls = [] current_start = self.defaultStart diff --git a/app/server.py b/app/server.py index c3865afc..d3cef4b3 100644 --- a/app/server.py +++ b/app/server.py @@ -59,7 +59,7 @@ def search(search_engine): engine = search_engine if engine not in scrapers: - error = [404, 'Incorrect search engine', qformat] + error = [404, 'Incorrect search engine', engine] return bad_request(error) query = request.args.get('query') @@ -78,7 +78,7 @@ def search(search_engine): # store the result in the cache to speed up future searches store(engine_and_query, result) else: - error = [status_code, 'No response', qformat] + error = [status_code, 'No response', engine_and_query] return bad_request(error) try: diff --git a/test/test_generalized.py b/test/test_generalized.py index d0775d6d..85a57d80 100644 --- a/test/test_generalized.py +++ b/test/test_generalized.py @@ -9,7 +9,7 @@ def test_get_page(mock_request_get, mock_response): mock_request_get.return_value = mock_response mock_response.url = "Mock Url" - response = Scraper().get_page("dummy_query") + response, _ = Scraper().get_page("dummy_query") assert response == mock_response expected_payload = {'q': 'dummy_query', '': ''} expected_headers = { @@ -38,7 +38,7 @@ def test_next_start(): @patch('app.scrapers.generalized.Scraper.get_page') @patch('requests.models.Response') def test_search(mock_resp, mock_get_page, mock_parse_resp): - mock_get_page.return_value = mock_resp + mock_get_page.return_value = mock_resp, 200 mock_resp.text = "Mock response" expected_resp = [{ 'title': 'mock_title', @@ -48,18 +48,18 @@ def test_search(mock_resp, mock_get_page, mock_parse_resp): # classes inheriting Scraper. Thus, returning dummy # response instead of raising NotImplementedError mock_parse_resp.return_value = expected_resp - resp = Scraper().search('dummy_query', 1) + resp, _ = Scraper().search('dummy_query', 1) assert resp == expected_resp @patch('app.scrapers.generalized.Scraper.get_page') @patch('requests.models.Response') def test_search_parsed_response_none(mock_resp, mock_get): - mock_get.return_value = mock_resp + mock_get.return_value = mock_resp, 200 mock_resp.text = "Mock Response" with patch('app.scrapers.generalized.Scraper.parse_response', return_value=None): - resp = Scraper().search('dummy_query', 1) + resp, _ = Scraper().search('dummy_query', 1) assert resp == [] @@ -82,7 +82,7 @@ def test_search_without_count(mock_resp, mock_parse_resp, mock_get): ) } mock_parse_resp.return_value = expected_resp - resp = Scraper().search_without_count('dummy_query') + resp, _ = Scraper().search_without_count('dummy_query') assert resp == expected_resp mock_get.assert_called_with( '', headers=expected_headers, params=expected_payload) diff --git a/test/test_server.py b/test/test_server.py index 4ac2c046..797e2277 100644 --- a/test/test_server.py +++ b/test/test_server.py @@ -67,6 +67,7 @@ def test_api_search_missing_query(mock_bad_request): assert resp == "Mock Response" +''' @patch('app.server.bad_request', return_value="Mock Response") def test_api_search_for_no_response(mock_bad_request): url = '/api/v1/search/google?query=fossasia' @@ -76,6 +77,7 @@ def test_api_search_for_no_response(mock_bad_request): mock_bad_request.assert_called_with([404, 'No response', 'google:fossasia']) assert resp == "Mock Response" +''' def test_api_search_for_cache_hit():