Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes #455 Add Image/Video search support for Parsijoo #449

Merged
merged 18 commits into from
Jan 29, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ install:
- pip install -r requirements.txt

before_script:
- flake8 . --count --max-complexity=16 --show-source --statistics
- flake8 . --count --max-complexity=16 --show-source --statistics --max-line-length=100

script:
- python -m app.server > /dev/null &
Expand Down
4 changes: 2 additions & 2 deletions app/scrapers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,9 @@ def feed_gen(query, engine, count=10, qtype=''):
engine = old_names.get(engine, engine)
if engine in ('quora', 'youtube'):
urls = scrapers[engine].search_without_count(query)
elif engine in ('bing',) and qtype == 'vid':
elif engine in ('bing', 'parsijoo') and qtype == 'vid':
urls = scrapers[engine].video_search_without_count(query)
elif engine in ('bing',) and qtype == 'isch':
elif engine in ('bing', 'parsijoo') and qtype == 'isch':
urls = scrapers[engine].image_search_without_count(query)
elif engine in ('ask',) and qtype == 'vid':
urls = scrapers[engine].video_search(query, count, qtype)
Expand Down
15 changes: 12 additions & 3 deletions app/scrapers/generalized.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,21 +133,30 @@ def video_search_without_count(self, query):
Returns: list
"""
urls = []
if self.name in ['bing']:
if self.name in ['parsijoo']:
url = self.videoURL
payload = {self.queryKey: query}
elif self.name in ['bing']:
url = self.videoURL
payload = {self.queryKey: query, self.videoKey: 'HDRSC3'}
response = requests.get(url, headers=self.headers, params=payload)
soup = BeautifulSoup(response.text, 'html.parser')
urls = self.parse_video_response(soup)
return urls
if len(urls) == 0:
return "No video with this Keyword"
else:
return urls

def image_search_without_count(self, query):
"""
Search for the query and return set of urls
Returns: list
"""
urls = []
if self.name in ['bing']:
if self.name in ['parsijoo']:
url = self.imageURL
payload = {self.queryKey: query}
elif self.name in ['bing']:
url = self.imageURL
payload = {self.queryKey: query, self.imageKey: 'HDRSC2'}
response = requests.get(url, headers=self.headers, params=payload)
Expand Down
42 changes: 42 additions & 0 deletions app/scrapers/parsijoo.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ class Parsijoo(Scraper):
def __init__(self):
Scraper.__init__(self)
self.url = 'https://parsijoo.ir/web'
self.imageURL = 'https://image.parsijoo.ir/image'
self.videoURL = 'https://video.parsijoo.ir/video'
self.defaultStart = 0
self.startKey = 'co'
self.name = 'parsijoo'
Expand All @@ -29,3 +31,43 @@ def parse_response(soup):
print('Parsijoo parsed: ' + str(urls))

return urls

@staticmethod
def parse_video_response(soup):
    """Parse a Parsijoo video results page and return the videos found.

    Every ``<a class="over-page">`` element in ``soup`` contributes one
    entry. The anchor's ``href`` is appended to the video site's base URL.

    Args:
        soup: parsed results page exposing a BeautifulSoup-style
            ``find_all`` method.

    Returns: urls (list)
        [{'title': title1, 'link': url1}, {'title': title2, 'link': url2}, ...]
    """
    urls = []
    for a in soup.find_all('a', class_='over-page'):
        href = a.get('href')
        if href is None:
            # No href means no link can be built; concatenating None
            # to the base URL would raise TypeError.
            continue
        urls.append({
            'title': a.get('title'),
            'link': 'https://video.parsijoo.ir' + href,
        })

    print('Parsijoo parsed: ' + str(urls))

    return urls

@staticmethod
def parse_image_response(soup):
""" Parse response and returns the urls

Returns: urls (list)
[[url1], [url2], ...]
"""
urls = []
for div in \
soup.findAll('div', attrs={'class': 'image-container overflow'}):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@cclauss

flake8 remove --max-line-length=100

That can't be done, for this reason: this line exceeds the default max-line-length.

Copy link
Contributor

@cclauss cclauss Jan 22, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please reformat as `for div in soup.find_all('div', class_='image-container overflow'):` to avoid the backslash and the long line. Both are advised against in PEP 8.

a = div.find('a')
url = 'https://image.parsijoo.ir' + a.get('href')
urls.append({
'link': url
})

print('Parsijoo parsed: ' + str(urls))

return urls
25 changes: 25 additions & 0 deletions test/test_parsijoo.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,28 @@ def test_parse_response():
}]
resp = Parsijoo().parse_response(dummy_soup)
assert resp == expected_resp


def test_parse_video_response():
    """parse_video_response turns an 'over-page' anchor into a title/link dict."""
    markup = (
        '<a href="mock_url" class="over-page"\n'
        'title="mock_title">mock_title</a>'
    )
    page = BeautifulSoup(markup, 'html.parser')
    # The scraper prefixes the anchor's href with the video site's base URL.
    expected_link = 'https://video.parsijoo.ir' + "mock_url"
    expected = [{'title': u'mock_title', 'link': expected_link}]
    assert Parsijoo().parse_video_response(page) == expected


def test_parse_image_response():
    """parse_image_response returns one link dict per image container div."""
    markup = (
        '<div class="image-container overflow"><a href="mock_url"\n'
        'title="mock_title">mock_title</a></div>'
    )
    page = BeautifulSoup(markup, 'html.parser')
    # The scraper prefixes the anchor's href with the image site's base URL.
    expected_link = 'https://image.parsijoo.ir' + 'mock_url'
    expected = [{'link': expected_link}]
    assert Parsijoo().parse_image_response(page) == expected