clean: Use Python's built-in html.parser instead of lxml (#71)

* clean: Use Python's built-in html.parser instead of lxml
* doc: Update CHANGELOG.md
prcr · May 16, 2021 · 319765a · 319765a
1 parent aa619fc
commit 319765a
Show file tree

Hide file tree

Showing 5 changed files with 4 additions and 4 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 ### Fixed
 
 -   Write full page URLs in `meta-descriptions.csv` if `site_url` is defined. Fixes [#68](https://github.com/prcr/mkdocs-meta-descriptions-plugin/issues/68).
+-   Drop the `lxml` dependency by using Python's built-in `html.parser` instead.
 
 ## [v1.0.0](https://www.github.com/prcr/mkdocs-meta-descriptions-plugin/compare/v0.0.5...v1.0.0) (2021-05-15)
 

diff --git a/mkdocs_meta_descriptions_plugin/export.py b/mkdocs_meta_descriptions_plugin/export.py
@@ -29,7 +29,7 @@ def _read_meta_descriptions(self, pages):
                 html = page_file.read()
                 # Strip page body to improve performance
                 html = re.split(self._body_pattern, html, maxsplit=1)[0]
-                soup = BeautifulSoup(html, features="lxml")
+                soup = BeautifulSoup(html, "html.parser")
                 meta_tag = soup.select_one('meta[name="description"]')
                 if meta_tag:
                     meta_descriptions[page.url] = meta_tag.get("content")

diff --git a/mkdocs_meta_descriptions_plugin/plugin.py b/mkdocs_meta_descriptions_plugin/plugin.py
@@ -22,7 +22,7 @@ def _get_first_paragraph_text(self, html):
         # Strip page subsections to improve performance
         html = re.split(self._headings_pattern, html, maxsplit=1)[0]
         # Select first paragraph directly under body
-        first_paragraph = BeautifulSoup(html, features="lxml").select_one("body > p")
+        first_paragraph = BeautifulSoup(html, "html.parser").select_one("p")
         if first_paragraph is not None:
             # Found the first paragraph, return stripped and escaped text
             return escape(first_paragraph.get_text().strip())

diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,3 @@
 mkdocs>=1.1
 pymdown-extensions>=7.0
 beautifulsoup4>=4.9
-lxml>=4.6
diff --git a/tests/test_plugin.py b/tests/test_plugin.py
@@ -20,7 +20,7 @@ def get_meta_description(files, markdown_file_path):
     html_file_path = files.get_file_from_path(markdown_file_path).abs_dest_path
     with open(html_file_path) as file:
         html = file.read()
-        soup = BeautifulSoup(html, features="lxml")
+        soup = BeautifulSoup(html, "html.parser")
         result = soup.select_one('meta[name="description"]')
         return result["content"] if result else None