diff --git a/tests/unit_tests.py b/tests/unit_tests.py index 9847afc0..b28a9aa5 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -8,6 +8,8 @@ import sys import time +from copy import copy + import pytest from lxml import etree, html @@ -284,6 +286,7 @@ def test_formatting(): my_document = html.fromstring('
Wild text
') my_result = extract(my_document, output_format='xml', include_formatting=True, config=ZERO_CONFIG) assert '

' in my_result and 'Wild text' in my_result # no rend so far + my_document = html.fromstring('

Wild text
') my_result = extract(my_document, config=ZERO_CONFIG) assert my_result == 'Wild text' # links @@ -315,13 +318,15 @@ def test_formatting(): # XML and Markdown formatting within

-tag my_document = html.fromstring('

bold, italics, tt, deleted, underlined, link and additional text to bypass detection.

') - my_result = extract(my_document, no_fallback=True, include_formatting=False, config=ZERO_CONFIG) + my_result = extract(copy(my_document), no_fallback=True, include_formatting=False, config=ZERO_CONFIG) # TXT: newline problem here assert my_result == 'bold, italics, tt,\ndeleted, underlined, link and additional text to bypass detection.' - my_result = extract(my_document, output_format='xml', no_fallback=True, include_formatting=True, config=ZERO_CONFIG) + + my_result = extract(copy(my_document), output_format='xml', no_fallback=True, include_formatting=True, config=ZERO_CONFIG) assert '

bold, italics, tt, deleted, underlined, link and additional text to bypass detection.

' in my_result assert 'rend="#b"' in my_result and 'rend="#i"' in my_result and 'rend="#t"' in my_result and 'rend="#u"' in my_result and '' in my_result - my_result = extract(my_document, output_format='xml', include_formatting=True, include_links=True, no_fallback=True, config=ZERO_CONFIG) + + my_result = extract(copy(my_document), output_format='xml', include_formatting=True, include_links=True, no_fallback=True, config=ZERO_CONFIG) assert '

bold, italics, tt, deleted, underlined, link and additional text to bypass detection.

' in my_result my_result = extract(my_document, output_format='txt', no_fallback=True, include_formatting=True, config=ZERO_CONFIG) assert my_result == '**bold**, *italics*, `tt`, ~~deleted~~, __underlined__, link and additional text to bypass detection.' @@ -450,13 +455,13 @@ def test_links(): assert extract(mydoc) is not None # link with target mydoc = html.fromstring('

Test link text. This part of the text has to be long enough.

') - assert 'testlink.html' not in extract(mydoc) - assert '[Test link text.](testlink.html) This part of the text has to be long enough.' in extract(mydoc, include_links=True, no_fallback=True, config=ZERO_CONFIG) + assert 'testlink.html' not in extract(copy(mydoc)) + assert '[Test link text.](testlink.html) This part of the text has to be long enough.' in extract(copy(mydoc), include_links=True, no_fallback=True, config=ZERO_CONFIG) # relative link conversion - assert '[Test link text.](https://www.example.com/testlink.html) This part of the text has to be long enough.' in extract(mydoc, url='https://www.example.com/', include_links=True, no_fallback=True, config=ZERO_CONFIG) + assert '[Test link text.](https://www.example.com/testlink.html) This part of the text has to be long enough.' in extract(copy(mydoc), url='https://www.example.com/', include_links=True, no_fallback=True, config=ZERO_CONFIG) # link without target mydoc = html.fromstring('

Test link text. This part of the text has to be long enough.

') - assert '[Test link text.] This part of the text has to be long enough.' in extract(mydoc, include_links=True, no_fallback=True, config=ZERO_CONFIG) + assert '[Test link text.] This part of the text has to be long enough.' in extract(copy(mydoc), include_links=True, no_fallback=True, config=ZERO_CONFIG) mydoc = html.fromstring('
Segment 1

Segment 2

Segment 3

') result = extract(mydoc, output_format='xml', include_links=True, no_fallback=True, config=ZERO_CONFIG) assert '1' in result and '2' in result and '3' in result @@ -684,8 +689,8 @@ def test_htmlprocessing(): myconverted = trafilatura.htmlprocessing.tree_cleaning(mydoc, options) assert myconverted.xpath('.//graphic') and not myconverted.xpath('.//table') mydoc = html.fromstring('

Test headline

Test

') - assert 'Test headline' in extract(mydoc, output_format='xml', config=ZERO_CONFIG, no_fallback=True) - assert 'Test headline' in extract(mydoc, output_format='xmltei', config=ZERO_CONFIG, no_fallback=True) + assert 'Test headline' in extract(copy(mydoc), output_format='xml', config=ZERO_CONFIG, no_fallback=True) + assert 'Test headline' in extract(copy(mydoc), output_format='xmltei', config=ZERO_CONFIG, no_fallback=True) # merge with parent function element = etree.Element('test') xml.merge_with_parent(element) diff --git a/trafilatura/htmlprocessing.py b/trafilatura/htmlprocessing.py index f4dadb57..32a2fa48 100644 --- a/trafilatura/htmlprocessing.py +++ b/trafilatura/htmlprocessing.py @@ -12,7 +12,6 @@ from courlan.urlutils import fix_relative_urls, get_base_url from lxml.etree import strip_tags -from lxml.html.clean import Cleaner from .filters import duplicate_test, textfilter from .settings import CUT_EMPTY_ELEMS, MANUALLY_CLEANED, MANUALLY_STRIPPED @@ -20,29 +19,6 @@ LOGGER = logging.getLogger(__name__) -# HTML_CLEANER config -# https://lxml.de/api/lxml.html.clean.Cleaner-class.html -# https://lxml.de/apidoc/lxml.html.clean.html -HTML_CLEANER = Cleaner( - annoying_tags = False, # True - comments = True, - embedded = False, # True - forms = False, # True - frames = False, # True - javascript = False, - links = False, - meta = False, - page_structure = False, - processing_instructions = True, - remove_unknown_tags = False, - safe_attrs_only = False, - scripts = False, - style = False, - # remove_tags = MANUALLY_STRIPPED, - # kill_tags = MANUALLY_CLEANED, -) - - REND_TAG_MAPPING = { 'em': '#i', 'i': '#i', @@ -58,8 +34,16 @@ } +def delete_element(element): + "Remove the element from the LXML tree." + try: + element.drop_tree() # faster when applicable + except AttributeError: + element.getparent().remove(element) + + def tree_cleaning(tree, options): - '''Prune the tree by discarding unwanted elements''' + "Prune the tree by discarding unwanted elements." 
# determine cleaning strategy, use lists to keep it deterministic cleaning_list, stripping_list = \ MANUALLY_CLEANED.copy(), MANUALLY_STRIPPED.copy() @@ -74,26 +58,25 @@ def tree_cleaning(tree, options): cleaning_list = [e for e in cleaning_list if e not in ('figure', 'picture', 'source')] stripping_list.remove('img') + + # strip targeted elements + strip_tags(tree, stripping_list) + # delete targeted elements for expression in cleaning_list: for element in tree.getiterator(expression): - try: - element.drop_tree() # faster when applicable - except AttributeError: - element.getparent().remove(element) - HTML_CLEANER.kill_tags, HTML_CLEANER.remove_tags = cleaning_list, stripping_list - # save space and processing time - return HTML_CLEANER.clean_html(prune_html(tree)) + delete_element(element) + + return prune_html(tree) def prune_html(tree): - '''Delete selected empty elements''' + "Delete selected empty elements to save space and processing time." + # //processing-instruction() + # //comment() needed for date extraction for element in tree.xpath(".//*[not(node())]"): if element.tag in CUT_EMPTY_ELEMS: - try: - element.drop_tree() - except AttributeError: - element.getparent().remove(element) + delete_element(element) return tree