scrap lxml.html.Cleaner

adbar · Jan 31, 2024 · 481238a
1 parent 97dc088
commit 481238a
Show file tree

Hide file tree

Showing 2 changed files with 34 additions and 46 deletions.
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
@@ -8,6 +8,8 @@
 import sys
 import time
 
+from copy import copy
+
 import pytest
 
 from lxml import etree, html
@@ -284,6 +286,7 @@ def test_formatting():
     my_document = html.fromstring('<html><body><article><div><strong>Wild text</strong></div></article></body></html>')
     my_result = extract(my_document, output_format='xml', include_formatting=True, config=ZERO_CONFIG)
     assert '<p>' in my_result and '<hi rend="#b">Wild text</hi>' in my_result  # no rend so far
+    my_document = html.fromstring('<html><body><article><div><strong>Wild text</strong></div></article></body></html>')
     my_result = extract(my_document, config=ZERO_CONFIG)
     assert my_result == 'Wild text'
     # links
@@ -315,13 +318,15 @@ def test_formatting():
 
     # XML and Markdown formatting within <p>-tag
     my_document = html.fromstring('<html><body><p><b>bold</b>, <i>italics</i>, <tt>tt</tt>, <strike>deleted</strike>, <u>underlined</u>, <a href="test.html">link</a> and additional text to bypass detection.</p></body></html>')
-    my_result = extract(my_document, no_fallback=True, include_formatting=False, config=ZERO_CONFIG)
+    my_result = extract(copy(my_document), no_fallback=True, include_formatting=False, config=ZERO_CONFIG)
     # TXT: newline problem here
     assert my_result == 'bold, italics, tt,\ndeleted, underlined, link and additional text to bypass detection.'
-    my_result = extract(my_document, output_format='xml', no_fallback=True, include_formatting=True, config=ZERO_CONFIG)
+
+    my_result = extract(copy(my_document), output_format='xml', no_fallback=True, include_formatting=True, config=ZERO_CONFIG)
     assert '<p><hi rend="#b">bold</hi>, <hi rend="#i">italics</hi>, <hi rend="#t">tt</hi>, <del>deleted</del>, <hi rend="#u">underlined</hi>, link and additional text to bypass detection.</p>' in my_result
     assert 'rend="#b"' in my_result and 'rend="#i"' in my_result and 'rend="#t"' in my_result and 'rend="#u"' in my_result and '<del>' in my_result
-    my_result = extract(my_document, output_format='xml', include_formatting=True, include_links=True, no_fallback=True, config=ZERO_CONFIG)
+
+    my_result = extract(copy(my_document), output_format='xml', include_formatting=True, include_links=True, no_fallback=True, config=ZERO_CONFIG)
     assert '<p><hi rend="#b">bold</hi>, <hi rend="#i">italics</hi>, <hi rend="#t">tt</hi>, <del>deleted</del>, <hi rend="#u">underlined</hi>, <ref target="test.html">link</ref> and additional text to bypass detection.</p>' in my_result
     my_result = extract(my_document, output_format='txt', no_fallback=True, include_formatting=True, config=ZERO_CONFIG)
     assert my_result == '**bold**, *italics*, `tt`, ~~deleted~~, __underlined__, link and additional text to bypass detection.'
@@ -450,13 +455,13 @@ def test_links():
     assert extract(mydoc) is not None
     # link with target
     mydoc = html.fromstring('<html><body><p><a href="testlink.html">Test link text.</a> This part of the text has to be long enough.</p></body></html>')
-    assert 'testlink.html' not in extract(mydoc)
-    assert '[Test link text.](testlink.html) This part of the text has to be long enough.' in extract(mydoc, include_links=True, no_fallback=True, config=ZERO_CONFIG)
+    assert 'testlink.html' not in extract(copy(mydoc))
+    assert '[Test link text.](testlink.html) This part of the text has to be long enough.' in extract(copy(mydoc), include_links=True, no_fallback=True, config=ZERO_CONFIG)
     # relative link conversion
-    assert '[Test link text.](https://www.example.com/testlink.html) This part of the text has to be long enough.' in extract(mydoc, url='https://www.example.com/', include_links=True, no_fallback=True, config=ZERO_CONFIG)
+    assert '[Test link text.](https://www.example.com/testlink.html) This part of the text has to be long enough.' in extract(copy(mydoc), url='https://www.example.com/', include_links=True, no_fallback=True, config=ZERO_CONFIG)
     # link without target
     mydoc = html.fromstring('<html><body><p><a>Test link text.</a> This part of the text has to be long enough.</p></body></html>')
-    assert '[Test link text.] This part of the text has to be long enough.' in extract(mydoc, include_links=True, no_fallback=True, config=ZERO_CONFIG)
+    assert '[Test link text.] This part of the text has to be long enough.' in extract(copy(mydoc), include_links=True, no_fallback=True, config=ZERO_CONFIG)
     mydoc = html.fromstring('<html><body><article><a>Segment 1</a><h1><a>Segment 2</a></h1><p>Segment 3</p></article></body></html>')
     result = extract(mydoc, output_format='xml', include_links=True, no_fallback=True, config=ZERO_CONFIG)
     assert '1' in result and '2' in result and '3' in result
@@ -684,8 +689,8 @@ def test_htmlprocessing():
     myconverted = trafilatura.htmlprocessing.tree_cleaning(mydoc, options)
     assert myconverted.xpath('.//graphic') and not myconverted.xpath('.//table')
     mydoc = html.fromstring('<html><body><article><h1>Test headline</h1><p>Test</p></article></body></html>')
-    assert '<head rend="h1">Test headline</head>' in extract(mydoc, output_format='xml', config=ZERO_CONFIG, no_fallback=True)
-    assert '<ab rend="h1" type="header">Test headline</ab>' in extract(mydoc, output_format='xmltei', config=ZERO_CONFIG, no_fallback=True)
+    assert '<head rend="h1">Test headline</head>' in extract(copy(mydoc), output_format='xml', config=ZERO_CONFIG, no_fallback=True)
+    assert '<ab rend="h1" type="header">Test headline</ab>' in extract(copy(mydoc), output_format='xmltei', config=ZERO_CONFIG, no_fallback=True)
     # merge with parent function
     element = etree.Element('test')
     xml.merge_with_parent(element)

diff --git a/trafilatura/htmlprocessing.py b/trafilatura/htmlprocessing.py
@@ -12,37 +12,13 @@
 
 from courlan.urlutils import fix_relative_urls, get_base_url
 from lxml.etree import strip_tags
-from lxml.html.clean import Cleaner
 
 from .filters import duplicate_test, textfilter
 from .settings import CUT_EMPTY_ELEMS, MANUALLY_CLEANED, MANUALLY_STRIPPED
 from .utils import trim, uniquify_list
 
 LOGGER = logging.getLogger(__name__)
 
-# HTML_CLEANER config
-# https://lxml.de/api/lxml.html.clean.Cleaner-class.html
-# https://lxml.de/apidoc/lxml.html.clean.html
-HTML_CLEANER = Cleaner(
-    annoying_tags = False,  # True
-    comments = True,
-    embedded = False,  # True
-    forms = False,  # True
-    frames = False,  # True
-    javascript = False,
-    links = False,
-    meta = False,
-    page_structure = False,
-    processing_instructions = True,
-    remove_unknown_tags = False,
-    safe_attrs_only = False,
-    scripts = False,
-    style = False,
-    # remove_tags = MANUALLY_STRIPPED,
-    # kill_tags = MANUALLY_CLEANED,
-)
-
-
 REND_TAG_MAPPING = {
     'em': '#i',
     'i': '#i',
@@ -58,8 +34,16 @@
 }
 
 
+def delete_element(element):
+    "Remove the element from the LXML tree."
+    try:
+        element.drop_tree()  # faster when applicable
+    except AttributeError:
+        element.getparent().remove(element)
+
+
 def tree_cleaning(tree, options):
-    '''Prune the tree by discarding unwanted elements'''
+    "Prune the tree by discarding unwanted elements."
     # determine cleaning strategy, use lists to keep it deterministic
     cleaning_list, stripping_list = \
         MANUALLY_CLEANED.copy(), MANUALLY_STRIPPED.copy()
@@ -74,26 +58,25 @@ def tree_cleaning(tree, options):
         cleaning_list = [e for e in cleaning_list if e
                          not in ('figure', 'picture', 'source')]
         stripping_list.remove('img')
+
+    # strip targeted elements
+    strip_tags(tree, stripping_list)
+
     # delete targeted elements
     for expression in cleaning_list:
         for element in tree.getiterator(expression):
-            try:
-                element.drop_tree() # faster when applicable
-            except AttributeError:
-                element.getparent().remove(element)
-    HTML_CLEANER.kill_tags, HTML_CLEANER.remove_tags = cleaning_list, stripping_list
-    # save space and processing time
-    return HTML_CLEANER.clean_html(prune_html(tree))
+            delete_element(element)
+
+    return prune_html(tree)
 
 
 def prune_html(tree):
-    '''Delete selected empty elements'''
+    "Delete selected empty elements to save space and processing time."
+    # //processing-instruction()
+    # //comment() needed for date extraction
     for element in tree.xpath(".//*[not(node())]"):
         if element.tag in CUT_EMPTY_ELEMS:
-            try:
-                element.drop_tree()
-            except AttributeError:
-                element.getparent().remove(element)
+            delete_element(element)
     return tree