From 7998d94d13e37049c56ad1ff5b3dca3e0bcc4645 Mon Sep 17 00:00:00 2001
From: Arthur Taylor
Date: Mon, 22 Jan 2024 17:36:04 +0100
Subject: [PATCH] Switch html.parser for lxml in BeautifulSoup invocations

---
 flathunter/abstract_crawler.py       | 6 +++---
 flathunter/crawler/wggesucht.py      | 4 ++--
 test/crawler/test_crawl_wggesucht.py | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/flathunter/abstract_crawler.py b/flathunter/abstract_crawler.py
index 0ff121a0..a8deaf08 100644
--- a/flathunter/abstract_crawler.py
+++ b/flathunter/abstract_crawler.py
@@ -73,7 +73,7 @@ def get_soup_from_url(
             elif re.search("g-recaptcha", driver.page_source):
                 self.resolve_recaptcha(
                     driver, checkbox, afterlogin_string or "")
-            return BeautifulSoup(driver.page_source, 'html.parser')
+            return BeautifulSoup(driver.page_source, 'lxml')
 
         resp = requests.get(url, headers=self.HEADERS, timeout=30)
         if resp.status_code not in (200, 405):
@@ -83,7 +83,7 @@ def get_soup_from_url(
             logger.error("Got response (%i): %s\n%s",
                          resp.status_code,
                          resp.content, user_agent)
-        return BeautifulSoup(resp.content, 'html.parser')
+        return BeautifulSoup(resp.content, 'lxml')
 
     def get_soup_with_proxy(self, url) -> BeautifulSoup:
         """Will try proxies until it's possible to crawl and return a soup"""
@@ -124,7 +124,7 @@ def get_soup_with_proxy(self, url) -> BeautifulSoup:
             raise ProxyException(
                 "An error occurred while fetching proxies or content")
 
-        return BeautifulSoup(resp.content, 'html.parser')
+        return BeautifulSoup(resp.content, 'lxml')
 
     def extract_data(self, soup):
         """Should be implemented in subclass"""
diff --git a/flathunter/crawler/wggesucht.py b/flathunter/crawler/wggesucht.py
index b42f8112..f20a2a00 100644
--- a/flathunter/crawler/wggesucht.py
+++ b/flathunter/crawler/wggesucht.py
@@ -230,5 +230,5 @@ def get_soup_from_url(
             elif re.search("g-recaptcha", driver.page_source):
                 self.resolve_recaptcha(
                     driver, checkbox, afterlogin_string or "")
-            return BeautifulSoup(driver.page_source, 'html.parser')
-        return BeautifulSoup(resp.content, 'html.parser')
+            return BeautifulSoup(driver.page_source, 'lxml')
+        return BeautifulSoup(resp.content, 'lxml')
diff --git a/test/crawler/test_crawl_wggesucht.py b/test/crawler/test_crawl_wggesucht.py
index 45c2c785..11e448b7 100644
--- a/test/crawler/test_crawl_wggesucht.py
+++ b/test/crawler/test_crawl_wggesucht.py
@@ -31,7 +31,7 @@ def test(self):
 
     def test_filter_spotahome_ads(self):
         with open(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                "fixtures", "wg-gesucht-spotahome.html")) as fixture:
-            soup = BeautifulSoup(fixture, 'html.parser')
+            soup = BeautifulSoup(fixture, 'lxml')
             entries = self.crawler.extract_data(soup)
             assert len(entries) == 20