From 7998d94d13e37049c56ad1ff5b3dca3e0bcc4645 Mon Sep 17 00:00:00 2001
From: Arthur Taylor
Date: Mon, 22 Jan 2024 17:36:04 +0100
Subject: [PATCH] Switch html.parser for lxml in BeautifulSoup invocations

---
 flathunter/abstract_crawler.py       | 6 +++---
 flathunter/crawler/wggesucht.py      | 4 ++--
 test/crawler/test_crawl_wggesucht.py | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/flathunter/abstract_crawler.py b/flathunter/abstract_crawler.py
index 0ff121a0..a8deaf08 100644
--- a/flathunter/abstract_crawler.py
+++ b/flathunter/abstract_crawler.py
@@ -73,7 +73,7 @@ def get_soup_from_url(
             elif re.search("g-recaptcha", driver.page_source):
                 self.resolve_recaptcha(
                     driver, checkbox, afterlogin_string or "")
-            return BeautifulSoup(driver.page_source, 'html.parser')
+            return BeautifulSoup(driver.page_source, 'lxml')
 
         resp = requests.get(url, headers=self.HEADERS, timeout=30)
         if resp.status_code not in (200, 405):
@@ -83,7 +83,7 @@ def get_soup_from_url(
             logger.error("Got response (%i): %s\n%s",
                          resp.status_code,
                          resp.content, user_agent)
-        return BeautifulSoup(resp.content, 'html.parser')
+        return BeautifulSoup(resp.content, 'lxml')
 
     def get_soup_with_proxy(self, url) -> BeautifulSoup:
         """Will try proxies until it's possible to crawl and return a soup"""
@@ -124,7 +124,7 @@ def get_soup_with_proxy(self, url) -> BeautifulSoup:
             raise ProxyException(
                 "An error occurred while fetching proxies or content")
 
-        return BeautifulSoup(resp.content, 'html.parser')
+        return BeautifulSoup(resp.content, 'lxml')
 
     def extract_data(self, soup):
         """Should be implemented in subclass"""
diff --git a/flathunter/crawler/wggesucht.py b/flathunter/crawler/wggesucht.py
index b42f8112..f20a2a00 100644
--- a/flathunter/crawler/wggesucht.py
+++ b/flathunter/crawler/wggesucht.py
@@ -230,5 +230,5 @@ def get_soup_from_url(
             elif re.search("g-recaptcha", driver.page_source):
                 self.resolve_recaptcha(
                     driver, checkbox, afterlogin_string or "")
-            return BeautifulSoup(driver.page_source, 'html.parser')
-        return BeautifulSoup(resp.content, 'html.parser')
+            return BeautifulSoup(driver.page_source, 'lxml')
+        return BeautifulSoup(resp.content, 'lxml')
diff --git a/test/crawler/test_crawl_wggesucht.py b/test/crawler/test_crawl_wggesucht.py
index 45c2c785..11e448b7 100644
--- a/test/crawler/test_crawl_wggesucht.py
+++ b/test/crawler/test_crawl_wggesucht.py
@@ -31,7 +31,7 @@ def test(self):
 
     def test_filter_spotahome_ads(self):
         with open(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                "fixtures", "wg-gesucht-spotahome.html")) as fixture:
-            soup = BeautifulSoup(fixture, 'html.parser')
+            soup = BeautifulSoup(fixture, 'lxml')
             entries = self.crawler.extract_data(soup)
             assert len(entries) == 20