diff --git a/crawler.py b/crawler.py index b61d9e4..f3c5f79 100644 --- a/crawler.py +++ b/crawler.py @@ -172,7 +172,7 @@ def __crawling(self): # Search for images in the current page. images = self.imageregex.findall(msg) for image_link in list(set(images)): - image_link = image_link.decode("utf-8") + image_link = image_link.decode("utf-8", errors="ignore") # Ignore link starting with data: if image_link.startswith("data:"): @@ -215,12 +215,7 @@ def __crawling(self): # Found links links = self.linkregex.findall(msg) for link in links: - try: - link = link.decode("utf-8") - except Exception as e: - logging.debug("Error decoding : {0}".format(link)) - continue - + link = link.decode("utf-8", errors="ignore") link = self.clean_link(link) logging.debug("Found : {0}".format(link))