From 99af38d866aca23e53182f109ffe407d72d1a701 Mon Sep 17 00:00:00 2001
From: Santeri Kannisto
Date: Thu, 26 Oct 2017 17:23:18 +0400
Subject: [PATCH] Ignore possible errors in UTF-8 encoding

All the URLs should be UTF-8 encodable so it is safe to drop the
characters with encoding errors. Fix to issue #35.
---
 crawler.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/crawler.py b/crawler.py
index b61d9e4..f3c5f79 100644
--- a/crawler.py
+++ b/crawler.py
@@ -172,7 +172,7 @@ def __crawling(self):
 		# Search for images in the current page.
 		images = self.imageregex.findall(msg)
 		for image_link in list(set(images)):
-			image_link = image_link.decode("utf-8")
+			image_link = image_link.decode("utf-8", errors="ignore")
 
 			# Ignore link starting with data:
 			if image_link.startswith("data:"):
@@ -215,12 +215,7 @@ def __crawling(self):
 		# Found links
 		links = self.linkregex.findall(msg)
 		for link in links:
-			try:
-				link = link.decode("utf-8")
-			except Exception as e:
-				logging.debug("Error decoding : {0}".format(link))
-				continue
-
+			link = link.decode("utf-8", errors="ignore")
 			link = self.clean_link(link)
 			logging.debug("Found : {0}".format(link))
 