From 99af38d866aca23e53182f109ffe407d72d1a701 Mon Sep 17 00:00:00 2001
From: Santeri Kannisto <info@2globalnomads.info>
Date: Thu, 26 Oct 2017 17:23:18 +0400
Subject: [PATCH] Ignore possible errors in UTF-8 encoding

All URLs should be valid UTF-8, so it is safe to drop any bytes that fail to decode instead of raising or skipping the link. Fixes issue #35.
---
 crawler.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/crawler.py b/crawler.py
index b61d9e4..f3c5f79 100644
--- a/crawler.py
+++ b/crawler.py
@@ -172,7 +172,7 @@ def __crawling(self):
 			# Search for images in the current page.
 			images = self.imageregex.findall(msg)
 			for image_link in list(set(images)):
-				image_link = image_link.decode("utf-8")
+				image_link = image_link.decode("utf-8", errors="ignore")
 
 				# Ignore link starting with data:
 				if image_link.startswith("data:"):
@@ -215,12 +215,7 @@ def __crawling(self):
 		# Found links
 		links = self.linkregex.findall(msg)
 		for link in links:
-			try:
-				link = link.decode("utf-8")
-			except Exception as e:
-				logging.debug("Error decoding : {0}".format(link))
-				continue
-
+			link = link.decode("utf-8", errors="ignore")
 			link = self.clean_link(link)
 			logging.debug("Found : {0}".format(link))