Skip to content

Commit

Permalink
Merge pull request #42 from 2globalnomads/patch-1
Browse files: browse the repository at this point in the history.
Ignore possible errors in UTF-8 encoding
  • Loading branch information
c4software authored Oct 26, 2017
2 parents 18683d0 + 99af38d commit 6b57ef2
Showing 1 changed file with 2 additions and 7 deletions.
9 changes: 2 additions & 7 deletions crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ def __crawling(self):
# Search for images in the current page.
images = self.imageregex.findall(msg)
for image_link in list(set(images)):
- image_link = image_link.decode("utf-8")
+ image_link = image_link.decode("utf-8", errors="ignore")

# Ignore link starting with data:
if image_link.startswith("data:"):
Expand Down Expand Up @@ -215,12 +215,7 @@ def __crawling(self):
# Found links
links = self.linkregex.findall(msg)
for link in links:
- try:
- link = link.decode("utf-8")
- except Exception as e:
- logging.debug("Error decoding : {0}".format(link))
- continue
-
+ link = link.decode("utf-8", errors="ignore")
link = self.clean_link(link)
logging.debug("Found : {0}".format(link))

Expand Down

0 comments on commit 6b57ef2

Please sign in to comment.