Skip to content

Commit

Permalink
Added images support
Browse files Browse the repository at this point in the history
  • Loading branch information
vaibhavsingh97 committed Jan 10, 2018
1 parent db55ec9 commit 39397fb
Show file tree
Hide file tree
Showing 5 changed files with 67 additions and 39 deletions.
39 changes: 23 additions & 16 deletions app/scrapers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,35 +15,42 @@
from .youtube import Youtube

# Registry of scraper instances keyed by lowercase engine name.
# NOTE(review): this commit leaves only Google active (constructed in
# 'image' mode); the other engines are kept commented out, presumably
# until they support the image `extra` parameter — confirm intent.
scrapers = {
    'google': Google('image'),
    # 'ask': Ask(),
    # 'baidu': Baidu(),
    # 'bing': Bing(),
    # 'dailymotion': DailyMotion(),
    # 'duckduckgo': DuckDuckGo(),
    # 'exalead': ExaLead(),
    # 'mojeek': Mojeek(),
    # 'parsijoo': Parsijoo(),
    # 'quora': Quora(),
    # 'yahoo': Yahoo(),
    # 'yandex': Yandex(),
    # 'youtube': Youtube()
}



def small_test():
    # Smoke test: a one-result Google query must come back as a list.
    hits = scrapers['google'].search('fossasia', 1)
    assert isinstance(hits, list)


def feed_gen(query, engine, count=10, extra=None):
    """Return search results for *query* from the scraper named *engine*.

    Args:
        query: search phrase.
        engine: key into the ``scrapers`` registry (case-insensitive).
        count: maximum number of results; ignored by engines that do
            not support counted paging (quora, youtube).
        extra: optional query mode (e.g. ``'image'``) forwarded to the
            scraper's ``search``; ``None`` keeps a plain web search.

    Returns:
        List of result URLs/records produced by the scraper.
    """
    engine = engine.lower()
    # Provide temporary backwards compatibility for old names.
    old_names = {'ubaidu': 'baidu',
                 'vdailymotion': 'dailymotion',
                 'tyoutube': 'youtube'}
    engine = old_names.get(engine, engine)
    # Registry lookup — `scrapers` is a dict, never call it.
    scraper = scrapers[engine]
    if engine in ('quora', 'youtube'):
        # These engines cannot page by count.
        return scraper.search_without_count(query)
    return scraper.search(query, count, extra)
15 changes: 10 additions & 5 deletions app/scrapers/generalized.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,20 @@ class Scraper:
)
}

def __init__(self, extra=None):
    """Initialise the scraper.

    Args:
        extra: optional search mode (e.g. ``'image'``); ``None`` means
            a plain web search.
    """
    self.extra = extra

def get_page(self, query, startIndex=0, extra=None):
    """Fetch one page of search results.

    Args:
        query: search phrase.
        startIndex: pagination offset sent as the engine's start key.
        extra: when not ``None``, request image results by adding the
            ``source=lnms``/``tbm=isch`` parameters (Google image
            vertical — TODO confirm other engines ignore these).

    Returns:
        The ``requests`` Response for the results page.
    """
    payload = {self.queryKey: query, self.startKey: startIndex}
    if extra is not None:
        # tbm=isch switches Google to its image search vertical.
        payload.update({'source': 'lnms', 'tbm': 'isch'})
    return requests.get(self.url, headers=self.headers, params=payload)

def parse_response(self, soup):
Expand All @@ -34,7 +39,7 @@ def parse_response(self, soup):
def next_start(self, current_start, prev_results):
    # Advance the pagination offset by the size of the previous batch.
    return len(prev_results) + current_start

def search(self, query, num_results):
def search(self, query, num_results, extra=None):
"""
Search for the query and return set of urls
Returns: list
Expand All @@ -43,7 +48,7 @@ def search(self, query, num_results):
current_start = self.defaultStart

while(len(urls) < num_results):
response = self.get_page(query, current_start)
response = self.get_page(query, current_start, extra)
soup = BeautifulSoup(response.text, 'html.parser')
new_results = self.parse_response(soup)
if new_results is None:
Expand Down
4 changes: 3 additions & 1 deletion app/scrapers/google.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@
class Google(Scraper):
"""Scrapper class for Google"""

def __init__(self, extra=None):
    """Set up the Google scraper.

    Args:
        extra: optional search mode (e.g. ``'image'``) stored on the
            instance; ``None`` means a plain web search.
    """
    Scraper.__init__(self)
    self.url = 'https://www.google.com/search'
    self.defaultStart = 0
    self.startKey = 'start'
    self.extra = extra

def next_start(self, current_start, prev_results):
    # Google pages by absolute result index: shift by the batch size.
    return len(prev_results) + current_start
Expand Down
33 changes: 17 additions & 16 deletions app/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,14 @@
from dicttoxml import dicttoxml
from flask import (Flask, Response, abort, jsonify, make_response,
render_template, request)

from query_cache import lookup, store
from pymongo import MongoClient
from scrapers import feed_gen, scrapers

app = Flask(__name__)
err = ""

client = MongoClient(os.environ.get('MONGO_URI', 'mongodb://localhost:27017/'))
db = client['query-server-v2']
errorObj = {
'type': 'Internal Server Error',
'status_code': 500,
Expand Down Expand Up @@ -40,8 +41,12 @@ def search(search_engine):
try:
count = int(request.args.get('num', 10))
qformat = request.args.get('format', 'json').lower()
qtype = request.args.get('type','image').lower()
if qformat not in ('json', 'xml', 'csv'):
abort(400, 'Not Found - undefined format')

if qtype not in ('image'):
abort(400, 'Not Found - undefined query')

engine = search_engine
if engine not in scrapers:
Expand All @@ -52,20 +57,16 @@ def search(search_engine):
if not query:
error = [400, 'Not Found - missing query', qformat]
return bad_request(error)
# print("Extra query", qExtra)
result = feed_gen(query, engine, count, qtype)
print("Result", result)
if not result:
error = [404, 'No response', qformat]
return bad_request(error)

# first see if we can get the results for the cache
engine_and_query = engine + ':' + query
result = lookup(engine_and_query)
if result:
print("cache hit: {}".format(engine_and_query))
else:
result = feed_gen(query, engine, count)
if result:
# store the result in the cache to speed up future searches
store(engine_and_query, result)
else:
error = [404, 'No response', engine_and_query]
return bad_request(error)
if db['queries'].find({query: query}).limit(1) is False:
db['queries'].insert(
{"query": query, "engine": engine, "qformat": qformat})

try:
unicode # unicode is undefined in Python 3 so NameError is raised
Expand Down Expand Up @@ -103,4 +104,4 @@ def set_header(r):

if __name__ == '__main__':
    # Default to port 7001 unless overridden by the environment.
    port = int(os.environ.get('PORT', 7001))
    # Single app.run — the duplicated call was a diff artifact and
    # would raise "address already in use" on the second invocation.
    app.run(host='0.0.0.0', port=port, debug=args.dev)
15 changes: 14 additions & 1 deletion app/templates/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ <h1><code>query-server</code></h1>
<form id="searchForm">
<div class="container" style="padding:0; margin: 0 auto;">
<div class="row">
<div class="col-sm-8">
<div class="col-sm-4">
<div class="input-group input-group-lg" style="min-width:100%">
<input id="query" type="text" class="center-block form-control" autocomplete="on" placeholder="Search with query-server">
</div>
Expand All @@ -62,6 +62,19 @@ <h1><code>query-server</code></h1>
</label>
</div>
</div>
<div class="col-sm-4">
<form id="type">
<!-- <label class="radio-inline">
<input type="radio" name="general" checked="">General
</label> -->
<label class="radio-inline">
<input type="radio" name="image" value="image" >Image
</label>
<!-- <label class="radio-inline">
<input type="radio" name="video">Video
</label> -->
</form>
</div>
</div>
<br/>
<div class="col-sm-4 col-xs-6" style="padding:0; margin: 0 auto;">
Expand Down

0 comments on commit 39397fb

Please sign in to comment.