Switch html.parser for lxml in BeautifulSoup invocations

flathunters · Jan 22, 2024 · 7998d94
1 parent 77a1b57
commit 7998d94
Show file tree

Hide file tree

Showing 3 changed files with 6 additions and 6 deletions.
diff --git a/flathunter/abstract_crawler.py b/flathunter/abstract_crawler.py
@@ -73,7 +73,7 @@ def get_soup_from_url(
             elif re.search("g-recaptcha", driver.page_source):
                 self.resolve_recaptcha(
                     driver, checkbox, afterlogin_string or "")
-            return BeautifulSoup(driver.page_source, 'html.parser')
+            return BeautifulSoup(driver.page_source, 'lxml')
 
         resp = requests.get(url, headers=self.HEADERS, timeout=30)
         if resp.status_code not in (200, 405):
@@ -83,7 +83,7 @@ def get_soup_from_url(
             logger.error("Got response (%i): %s\n%s",
                          resp.status_code, resp.content, user_agent)
 
-        return BeautifulSoup(resp.content, 'html.parser')
+        return BeautifulSoup(resp.content, 'lxml')
 
     def get_soup_with_proxy(self, url) -> BeautifulSoup:
         """Will try proxies until it's possible to crawl and return a soup"""
@@ -124,7 +124,7 @@ def get_soup_with_proxy(self, url) -> BeautifulSoup:
             raise ProxyException(
                 "An error occurred while fetching proxies or content")
 
-        return BeautifulSoup(resp.content, 'html.parser')
+        return BeautifulSoup(resp.content, 'lxml')
 
     def extract_data(self, soup):
         """Should be implemented in subclass"""

diff --git a/flathunter/crawler/wggesucht.py b/flathunter/crawler/wggesucht.py
@@ -230,5 +230,5 @@ def get_soup_from_url(
             elif re.search("g-recaptcha", driver.page_source):
                 self.resolve_recaptcha(
                     driver, checkbox, afterlogin_string or "")
-            return BeautifulSoup(driver.page_source, 'html.parser')
-        return BeautifulSoup(resp.content, 'html.parser')
+            return BeautifulSoup(driver.page_source, 'lxml')
+        return BeautifulSoup(resp.content, 'lxml')
diff --git a/test/crawler/test_crawl_wggesucht.py b/test/crawler/test_crawl_wggesucht.py
@@ -31,7 +31,7 @@ def test(self):
 
     def test_filter_spotahome_ads(self):
         with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures", "wg-gesucht-spotahome.html")) as fixture:
-            soup = BeautifulSoup(fixture, 'html.parser')
+            soup = BeautifulSoup(fixture, 'lxml')
         entries = self.crawler.extract_data(soup)
         assert len(entries) == 20