diff --git a/tests/unit_tests.py b/tests/unit_tests.py
index 9847afc0..b28a9aa5 100644
--- a/tests/unit_tests.py
+++ b/tests/unit_tests.py
@@ -8,6 +8,8 @@
import sys
import time
+from copy import copy
+
import pytest
from lxml import etree, html
@@ -284,6 +286,7 @@ def test_formatting():
my_document = html.fromstring('
Wild text
')
my_result = extract(my_document, output_format='xml', include_formatting=True, config=ZERO_CONFIG)
assert '' in my_result and 'Wild text' in my_result # no rend so far
+ my_document = html.fromstring('
Wild text
')
my_result = extract(my_document, config=ZERO_CONFIG)
assert my_result == 'Wild text'
# links
@@ -315,13 +318,15 @@ def test_formatting():
# XML and Markdown formatting within -tag
my_document = html.fromstring('
bold, italics, tt, deleted, underlined, link and additional text to bypass detection.
')
- my_result = extract(my_document, no_fallback=True, include_formatting=False, config=ZERO_CONFIG)
+ my_result = extract(copy(my_document), no_fallback=True, include_formatting=False, config=ZERO_CONFIG)
# TXT: newline problem here
assert my_result == 'bold, italics, tt,\ndeleted, underlined, link and additional text to bypass detection.'
- my_result = extract(my_document, output_format='xml', no_fallback=True, include_formatting=True, config=ZERO_CONFIG)
+
+ my_result = extract(copy(my_document), output_format='xml', no_fallback=True, include_formatting=True, config=ZERO_CONFIG)
assert 'bold, italics, tt, deleted, underlined, link and additional text to bypass detection.
' in my_result
assert 'rend="#b"' in my_result and 'rend="#i"' in my_result and 'rend="#t"' in my_result and 'rend="#u"' in my_result and '' in my_result
- my_result = extract(my_document, output_format='xml', include_formatting=True, include_links=True, no_fallback=True, config=ZERO_CONFIG)
+
+ my_result = extract(copy(my_document), output_format='xml', include_formatting=True, include_links=True, no_fallback=True, config=ZERO_CONFIG)
assert 'bold, italics, tt, deleted, underlined, [link](test.html) and additional text to bypass detection.
' in my_result
my_result = extract(my_document, output_format='txt', no_fallback=True, include_formatting=True, config=ZERO_CONFIG)
assert my_result == '**bold**, *italics*, `tt`, ~~deleted~~, __underlined__, link and additional text to bypass detection.'
@@ -450,13 +455,13 @@ def test_links():
assert extract(mydoc) is not None
# link with target
mydoc = html.fromstring('Test link text. This part of the text has to be long enough.
')
- assert 'testlink.html' not in extract(mydoc)
- assert '[Test link text.](testlink.html) This part of the text has to be long enough.' in extract(mydoc, include_links=True, no_fallback=True, config=ZERO_CONFIG)
+ assert 'testlink.html' not in extract(copy(mydoc))
+ assert '[Test link text.](testlink.html) This part of the text has to be long enough.' in extract(copy(mydoc), include_links=True, no_fallback=True, config=ZERO_CONFIG)
# relative link conversion
- assert '[Test link text.](https://www.example.com/testlink.html) This part of the text has to be long enough.' in extract(mydoc, url='https://www.example.com/', include_links=True, no_fallback=True, config=ZERO_CONFIG)
+ assert '[Test link text.](https://www.example.com/testlink.html) This part of the text has to be long enough.' in extract(copy(mydoc), url='https://www.example.com/', include_links=True, no_fallback=True, config=ZERO_CONFIG)
# link without target
mydoc = html.fromstring('Test link text. This part of the text has to be long enough.
')
- assert '[Test link text.] This part of the text has to be long enough.' in extract(mydoc, include_links=True, no_fallback=True, config=ZERO_CONFIG)
+    assert '[Test link text.] This part of the text has to be long enough.' in extract(copy(mydoc), include_links=True, no_fallback=True, config=ZERO_CONFIG)
mydoc = html.fromstring('Segment 1Segment 3
')
result = extract(mydoc, output_format='xml', include_links=True, no_fallback=True, config=ZERO_CONFIG)
assert '1' in result and '2' in result and '3' in result
@@ -684,8 +689,8 @@ def test_htmlprocessing():
myconverted = trafilatura.htmlprocessing.tree_cleaning(mydoc, options)
assert myconverted.xpath('.//graphic') and not myconverted.xpath('.//table')
mydoc = html.fromstring('Test headline
Test
')
- assert 'Test headline' in extract(mydoc, output_format='xml', config=ZERO_CONFIG, no_fallback=True)
- assert 'Test headline ' in extract(mydoc, output_format='xmltei', config=ZERO_CONFIG, no_fallback=True)
+ assert 'Test headline' in extract(copy(mydoc), output_format='xml', config=ZERO_CONFIG, no_fallback=True)
+ assert 'Test headline ' in extract(copy(mydoc), output_format='xmltei', config=ZERO_CONFIG, no_fallback=True)
# merge with parent function
element = etree.Element('test')
xml.merge_with_parent(element)
diff --git a/trafilatura/htmlprocessing.py b/trafilatura/htmlprocessing.py
index f4dadb57..32a2fa48 100644
--- a/trafilatura/htmlprocessing.py
+++ b/trafilatura/htmlprocessing.py
@@ -12,7 +12,6 @@
from courlan.urlutils import fix_relative_urls, get_base_url
from lxml.etree import strip_tags
-from lxml.html.clean import Cleaner
from .filters import duplicate_test, textfilter
from .settings import CUT_EMPTY_ELEMS, MANUALLY_CLEANED, MANUALLY_STRIPPED
@@ -20,29 +19,6 @@
LOGGER = logging.getLogger(__name__)
-# HTML_CLEANER config
-# https://lxml.de/api/lxml.html.clean.Cleaner-class.html
-# https://lxml.de/apidoc/lxml.html.clean.html
-HTML_CLEANER = Cleaner(
- annoying_tags = False, # True
- comments = True,
- embedded = False, # True
- forms = False, # True
- frames = False, # True
- javascript = False,
- links = False,
- meta = False,
- page_structure = False,
- processing_instructions = True,
- remove_unknown_tags = False,
- safe_attrs_only = False,
- scripts = False,
- style = False,
- # remove_tags = MANUALLY_STRIPPED,
- # kill_tags = MANUALLY_CLEANED,
-)
-
-
REND_TAG_MAPPING = {
'em': '#i',
'i': '#i',
@@ -58,8 +34,16 @@
}
+def delete_element(element):
+ "Remove the element from the LXML tree."
+ try:
+ element.drop_tree() # faster when applicable
+ except AttributeError:
+ element.getparent().remove(element)
+
+
def tree_cleaning(tree, options):
- '''Prune the tree by discarding unwanted elements'''
+ "Prune the tree by discarding unwanted elements."
# determine cleaning strategy, use lists to keep it deterministic
cleaning_list, stripping_list = \
MANUALLY_CLEANED.copy(), MANUALLY_STRIPPED.copy()
@@ -74,26 +58,25 @@ def tree_cleaning(tree, options):
cleaning_list = [e for e in cleaning_list if e
not in ('figure', 'picture', 'source')]
stripping_list.remove('img')
+
+ # strip targeted elements
+ strip_tags(tree, stripping_list)
+
# delete targeted elements
for expression in cleaning_list:
for element in tree.getiterator(expression):
- try:
- element.drop_tree() # faster when applicable
- except AttributeError:
- element.getparent().remove(element)
- HTML_CLEANER.kill_tags, HTML_CLEANER.remove_tags = cleaning_list, stripping_list
- # save space and processing time
- return HTML_CLEANER.clean_html(prune_html(tree))
+ delete_element(element)
+
+ return prune_html(tree)
def prune_html(tree):
- '''Delete selected empty elements'''
+ "Delete selected empty elements to save space and processing time."
+ # //processing-instruction()
+ # //comment() needed for date extraction
for element in tree.xpath(".//*[not(node())]"):
if element.tag in CUT_EMPTY_ELEMS:
- try:
- element.drop_tree()
- except AttributeError:
- element.getparent().remove(element)
+ delete_element(element)
return tree