[WIP] Added images support #427

Closed
39 changes: 23 additions & 16 deletions app/scrapers/__init__.py
@@ -15,35 +15,42 @@
from .youtube import Youtube

scrapers = {
'ask': Ask(),
'baidu': Baidu(),
'bing': Bing(),
'dailymotion': DailyMotion(),
'duckduckgo': DuckDuckGo(),
'exalead': ExaLead(),
'google': Google(),
'mojeek': Mojeek(),
'parsijoo': Parsijoo(),
'quora': Quora(),
'yahoo': Yahoo(),
'yandex': Yandex(),
'youtube': Youtube()
'google': Google('image')
# 'ask': Ask(),
# 'baidu': Baidu(),
# 'bing': Bing(),
# 'dailymotion': DailyMotion(),
# 'duckduckgo': DuckDuckGo(),
# 'exalead': ExaLead(),
# 'google': Google(),
# 'mojeek': Mojeek(),
# 'parsijoo': Parsijoo(),
# 'quora': Quora(),
# 'yahoo': Yahoo(),
# 'yandex': Yandex(),
# 'youtube': Youtube()
}



def small_test():
assert isinstance(scrapers['google'].search('fossasia', 1), list)


def feed_gen(query, engine, count=10):
def feed_gen(query, engine, count=10, extra=None):
print("Extra variable:", extra)
engine = engine.lower()
print("Engine: ", engine)
print("Searcgh Engine", scrapers[engine])
# provide temporary backwards compatibility for old names
old_names = {'ubaidu': 'baidu',
'vdailymotion': 'dailymotion',
'tyoutube': 'youtube'}
engine = old_names.get(engine, engine)
scraper_class = scrapers  # dict of engine name -> scraper instance; extra is passed to search() below

if engine in ('quora', 'youtube'):
urls = scrapers[engine].search_without_count(query)
urls = scraper_class[engine].search_without_count(query)
else:
urls = scrapers[engine].search(query, count)
urls = scraper_class[engine].search(query, count, extra)
return urls
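
For context, a quick sketch of how the revised feed_gen would be exercised once the fixes above are applied; the import path and query values are illustrative, not from the PR:

# Illustrative only: calling the reworked feed_gen with the new
# `extra` argument (assumes the app/scrapers package is importable).
from scrapers import feed_gen

urls = feed_gen('fossasia', 'google', count=5, extra='image')
print(urls)
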
15 changes: 10 additions & 5 deletions app/scrapers/generalized.py
@@ -17,15 +17,20 @@ class Scraper:
)
}

def __init__(self):
def __init__(self, extra=None):
self.extra = extra

def get_page(self, query, startIndex=0):
def get_page(self, query, startIndex=0, extra=None):
""" Fetch the google search results page
Returns : Results Page
"""
payload = {self.queryKey: query, self.startKey: startIndex}
if extra is None:
payload = {self.queryKey: query, self.startKey: startIndex}
else:
payload = {self.queryKey: query, self.startKey: startIndex, "source": "lnms", "tbm": "isch"}
response = requests.get(self.url, headers=self.headers, params=payload)
print("Response URL: ", response.url)
return response

def parse_response(self, soup):
@@ -34,7 +39,7 @@ def parse_response(self, soup):
def next_start(self, current_start, prev_results):
return current_start + len(prev_results)

def search(self, query, num_results):
def search(self, query, num_results, extra=None):
"""
Search for the query and return set of urls
Returns: list
@@ -43,7 +48,7 @@ def parse_response(self, soup):
current_start = self.defaultStart

while len(urls) < num_results:
response = self.get_page(query, current_start)
response = self.get_page(query, current_start, extra)
soup = BeautifulSoup(response.text, 'html.parser')
new_results = self.parse_response(soup)
if new_results is None:
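
The image branch in get_page adds two Google query parameters: source=lnms and tbm=isch, which switch the results page to the image vertical. A minimal standalone sketch of the request this builds, assuming the Google scraper's queryKey is 'q' and startKey is 'start' (the User-Agent below is an assumption; the real headers live on the Scraper class):

import requests

# Sketch of the image-search request get_page builds when `extra` is set.
payload = {'q': 'fossasia', 'start': 0, 'source': 'lnms', 'tbm': 'isch'}
response = requests.get('https://www.google.com/search',
                        params=payload,
                        headers={'User-Agent': 'Mozilla/5.0'})
print(response.url)  # .../search?q=fossasia&start=0&source=lnms&tbm=isch
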
4 changes: 3 additions & 1 deletion app/scrapers/google.py
@@ -5,11 +5,13 @@
class Google(Scraper):
"""Scrapper class for Google"""

def __init__(self):
def __init__(self, extra=None):
Scraper.__init__(self)
self.url = 'https://www.google.com/search'
self.defaultStart = 0
self.startKey = 'start'
self.extra = extra
print("Type", extra)

def next_start(self, current_start, prev_results):
return current_start + len(prev_results)
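
The Google scraper now stores extra, but this WIP does not yet override parse_response for image results. A hypothetical shape for such an override, using BeautifulSoup; the tag and attribute names are guesses, since Google's image-results markup changes frequently:

# Hypothetical, not part of this PR: parsing image results out of the
# fetched page. Selector and attribute names are assumptions.
def parse_response(self, soup):
    results = []
    for img in soup.find_all('img'):
        src = img.get('src') or img.get('data-src')
        if src:
            results.append({'title': img.get('alt', ''), 'link': src})
    return results
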
33 changes: 17 additions & 16 deletions app/server.py
@@ -5,13 +5,14 @@
from dicttoxml import dicttoxml
from flask import (Flask, Response, abort, jsonify, make_response,
render_template, request)

from query_cache import lookup, store
from pymongo import MongoClient
from scrapers import feed_gen, scrapers

app = Flask(__name__)
err = ""

client = MongoClient(os.environ.get('MONGO_URI', 'mongodb://localhost:27017/'))
db = client['query-server-v2']
errorObj = {
'type': 'Internal Server Error',
'status_code': 500,
@@ -40,8 +41,12 @@ def search(search_engine):
try:
count = int(request.args.get('num', 10))
qformat = request.args.get('format', 'json').lower()
qtype = request.args.get('type', 'image').lower()
if qformat not in ('json', 'xml', 'csv'):
abort(400, 'Not Found - undefined format')

if qtype not in ('image',):
abort(400, 'Not Found - undefined type')

engine = search_engine
if engine not in scrapers:
@@ -52,20 +57,16 @@
if not query:
error = [400, 'Not Found - missing query', qformat]
return bad_request(error)
# print("Extra query", qExtra)
result = feed_gen(query, engine, count, qtype)
print("Result", result)
if not result:
error = [404, 'No response', qformat]
return bad_request(error)

# first see if we can get the results for the cache
engine_and_query = engine + ':' + query
result = lookup(engine_and_query)
if result:
print("cache hit: {}".format(engine_and_query))
else:
result = feed_gen(query, engine, count)
if result:
# store the result in the cache to speed up future searches
store(engine_and_query, result)
else:
error = [404, 'No response', engine_and_query]
return bad_request(error)
if db['queries'].find_one({"query": query}) is None:
db['queries'].insert_one(
{"query": query, "engine": engine, "qformat": qformat})

try:
unicode # unicode is undefined in Python 3 so NameError is raised
@@ -103,4 +104,4 @@ def set_header(r):

if __name__ == '__main__':
port = int(os.environ.get('PORT', 7001))
app.run(host='0.0.0.0', port=port, debug=args.dev)
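
End to end, an image query against a locally running server would look roughly like this; the route path is not visible in this diff, so the URL below is an assumption, while the parameter names match the handler above:

import requests

# Assumed route; only the handler body appears in this diff.
resp = requests.get('http://localhost:7001/api/v1/search/google',
                    params={'query': 'fossasia', 'type': 'image',
                            'format': 'json', 'num': 5})
print(resp.json())
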
15 changes: 14 additions & 1 deletion app/templates/index.html
@@ -44,7 +44,7 @@ <h1><code>query-server</code></h1>
<form id="searchForm">
<div class="container" style="padding:0; margin: 0 auto;">
<div class="row">
<div class="col-sm-8">
<div class="col-sm-4">
<div class="input-group input-group-lg" style="min-width:100%">
<input id="query" type="text" class="center-block form-control" autocomplete="on" placeholder="Search with query-server">
</div>
@@ -62,6 +62,19 @@ <h1><code>query-server</code></h1>
</label>
</div>
</div>
<div class="col-sm-4">
<form id="type">
<!-- <label class="radio-inline">
<input type="radio" name="general" checked="">General
</label> -->
<label class="radio-inline">
<input type="radio" name="image" value="image" >Image
</label>
<!-- <label class="radio-inline">
<input type="radio" name="video">Video
</label> -->
</div>
</div>
</div>
<br/>
<div class="col-sm-4 col-xs-6" style="padding:0; margin: 0 auto;">