-
Notifications
You must be signed in to change notification settings - Fork 264
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fixes #455 Add Image/Video search support for Parsijoo #449
Changes from 15 commits
70899f8
afc397d
6ff5499
ea4618f
5692c94
e80eaeb
7c17a89
5251491
cea1408
c5b0397
8f47338
40e5246
1e865a9
3df4a23
c37af57
ac01b4b
d45d132
8d328d9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -84,3 +84,34 @@ def search_without_count(self, query): | |
soup = BeautifulSoup(response.text, 'html.parser') | ||
urls = self.parse_response(soup) | ||
return urls | ||
|
||
def video_search_without_count(self, query):
    """Search the engine's video endpoint for ``query``.

    Only engines with a configured video endpoint (currently just
    ``parsijoo``) are queried; for any other engine no request is made.

    Args:
        query: Search keywords, sent as the value of ``self.queryKey``.

    Returns:
        The non-empty result of ``self.parse_video_response``, or the
        string ``"No video with this Keyword"`` when nothing was found.
    """
    no_results = "No video with this Keyword"
    if self.name not in ('parsijoo',):
        # No video endpoint for this engine: never reach the request
        # with an undefined URL or payload.
        return no_results

    payload = {self.queryKey: query}
    response = requests.get(
        self.videoURL, headers=self.headers, params=payload)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Any falsy parse result (not only an empty list) maps to the message.
    return self.parse_video_response(soup) or no_results
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could we just do `return urls or "No video with this Keyword"`? All other falsy values are just as dangerous as an empty list. |
||
|
||
def image_search_without_count(self, query):
    """Search the engine's image endpoint for ``query``.

    Only engines with a configured image endpoint (currently just
    ``parsijoo``) are queried; for any other engine no request is made.

    Args:
        query: Search keywords, sent as the value of ``self.queryKey``.

    Returns:
        The result of ``self.parse_image_response`` for the fetched page,
        or an empty list when the engine has no image endpoint.
    """
    if self.name not in ('parsijoo',):
        # No image endpoint for this engine: never reach the request
        # with an undefined URL or payload.
        return []

    payload = {self.queryKey: query}
    response = requests.get(
        self.imageURL, headers=self.headers, params=payload)
    soup = BeautifulSoup(response.text, 'html.parser')
    return self.parse_image_response(soup)
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,6 +8,8 @@ class Parsijoo(Scraper): | |
def __init__(self): | ||
Scraper.__init__(self) | ||
self.url = 'https://parsijoo.ir/web' | ||
self.imageURL = 'https://image.parsijoo.ir/image' | ||
self.videoURL = 'https://video.parsijoo.ir/video' | ||
self.defaultStart = 0 | ||
self.startKey = 'co' | ||
self.name = 'parsijoo' | ||
|
@@ -29,3 +31,43 @@ def parse_response(soup): | |
print('Parsijoo parsed: ' + str(urls)) | ||
|
||
return urls | ||
|
||
@staticmethod
def parse_video_response(soup):
    """Extract video results from a parsed Parsijoo video page.

    Every ``<a class="over-page">`` element is treated as one result.
    Anchors without an ``href`` are skipped, since no link can be built
    from them.

    Args:
        soup: Parsed page exposing ``find_all``.

    Returns: urls (list)
        [{'title': title1, 'link': url1},
         {'title': title2, 'link': url2}, ...]
    """
    urls = []
    for anchor in soup.find_all('a', attrs={'class': 'over-page'}):
        href = anchor.get('href')
        if not href:
            # Concatenating a missing href (None) would raise TypeError.
            continue
        urls.append({
            'title': anchor.get('title'),
            'link': 'https://video.parsijoo.ir' + href,
        })

    print('Parsijoo parsed: ' + str(urls))

    return urls
|
||
@staticmethod | ||
def parse_image_response(soup): | ||
""" Parse response and returns the urls | ||
|
||
Returns: urls (list) | ||
[[url1], [url2], ...] | ||
""" | ||
urls = [] | ||
for div in \ | ||
soup.findAll('div', attrs={'class': 'image-container overflow'}): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Can't be done, for this reason: it's exceeding the default line length. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please reformat as `for div in soup.find_all('div', class_='image-container overflow'):` to avoid the backslash and the long line. Both are advised against in PEP 8. |
||
a = div.find('a') | ||
url = 'https://image.parsijoo.ir' + a.get('href') | ||
urls.append({ | ||
'link': url | ||
}) | ||
|
||
print('Parsijoo parsed: ' + str(urls)) | ||
|
||
return urls |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do we need to go back to violating PEP8 line length?