Skip to content

Commit

Permalink
Switch html.parser for lxml in BeautifulSoup invocations
Browse files Browse the repository at this point in the history
  • Loading branch information
codders committed Jan 22, 2024
1 parent 77a1b57 commit 7998d94
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 6 deletions.
6 changes: 3 additions & 3 deletions flathunter/abstract_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def get_soup_from_url(
elif re.search("g-recaptcha", driver.page_source):
self.resolve_recaptcha(
driver, checkbox, afterlogin_string or "")
-return BeautifulSoup(driver.page_source, 'html.parser')
+return BeautifulSoup(driver.page_source, 'lxml')

resp = requests.get(url, headers=self.HEADERS, timeout=30)
if resp.status_code not in (200, 405):
Expand All @@ -83,7 +83,7 @@ def get_soup_from_url(
logger.error("Got response (%i): %s\n%s",
resp.status_code, resp.content, user_agent)

-return BeautifulSoup(resp.content, 'html.parser')
+return BeautifulSoup(resp.content, 'lxml')

def get_soup_with_proxy(self, url) -> BeautifulSoup:
"""Will try proxies until it's possible to crawl and return a soup"""
Expand Down Expand Up @@ -124,7 +124,7 @@ def get_soup_with_proxy(self, url) -> BeautifulSoup:
raise ProxyException(
"An error occurred while fetching proxies or content")

-return BeautifulSoup(resp.content, 'html.parser')
+return BeautifulSoup(resp.content, 'lxml')

def extract_data(self, soup):
"""Should be implemented in subclass"""
Expand Down
4 changes: 2 additions & 2 deletions flathunter/crawler/wggesucht.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,5 +230,5 @@ def get_soup_from_url(
elif re.search("g-recaptcha", driver.page_source):
self.resolve_recaptcha(
driver, checkbox, afterlogin_string or "")
-return BeautifulSoup(driver.page_source, 'html.parser')
-return BeautifulSoup(resp.content, 'html.parser')
+return BeautifulSoup(driver.page_source, 'lxml')
+return BeautifulSoup(resp.content, 'lxml')
2 changes: 1 addition & 1 deletion test/crawler/test_crawl_wggesucht.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def test(self):

def test_filter_spotahome_ads(self):
with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures", "wg-gesucht-spotahome.html")) as fixture:
-soup = BeautifulSoup(fixture, 'html.parser')
+soup = BeautifulSoup(fixture, 'lxml')
entries = self.crawler.extract_data(soup)
assert len(entries) == 20

0 comments on commit 7998d94

Please sign in to comment.