From 08457c4070e97ed73a197fc7301d7a6e47e98993 Mon Sep 17 00:00:00 2001 From: Luise Koehler Date: Tue, 2 Apr 2024 16:00:54 +0200 Subject: [PATCH] Avoid inserting tail before children --- tests/unit_tests.py | 13 +++++++++++++ trafilatura/htmlprocessing.py | 6 +++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/tests/unit_tests.py b/tests/unit_tests.py index a9631b53..bc0055a4 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -762,6 +762,19 @@ def test_htmlprocessing(): trafilatura.htmlprocessing.process_node(node, options) assert node.text == "some text" assert node.tail == "tail" + node = etree.fromstring("

<p><ref target='url'><hi rend='#b'>bold</hi>inner</ref>outer</p>

")[0] + processed = trafilatura.htmlprocessing.handle_textnode(node, options) + assert processed.tail == "outer" + node = etree.fromstring("

<p><ref target='url'>text</ref>tail</p>

")[0] + processed = trafilatura.htmlprocessing.handle_textnode(node, options) + assert processed.tail == "tail" and processed.text == "text" + node = etree.fromstring("

<p><ref target='url'></ref>tail</p>

")[0] + processed = trafilatura.htmlprocessing.handle_textnode(node, options) + assert processed.tail == "" and processed.text == "tail" + node = etree.fromstring("

<p><ref target='url'>text<hi rend='#b'>bold</hi></ref>tail</p>

")[0] + processed = trafilatura.htmlprocessing.handle_textnode(node, options) + assert processed.tail == "tail" and processed.text == "text" + def test_extraction_options(): diff --git a/trafilatura/htmlprocessing.py b/trafilatura/htmlprocessing.py index 268ed93d..5feef3f9 100644 --- a/trafilatura/htmlprocessing.py +++ b/trafilatura/htmlprocessing.py @@ -308,7 +308,7 @@ def convert_tags(tree, options, url=None): def handle_textnode(element, options, comments_fix=True, preserve_spaces=False): '''Convert, format, and probe potential text elements''' - if element.text is None and element.tail is None: + if element.text is None and element.tail is None and len(element) == 0: return None # lb bypass if comments_fix is False and element.tag == 'lb': @@ -318,7 +318,7 @@ def handle_textnode(element, options, comments_fix=True, preserve_spaces=False): # return None # duplicate_test(subelement)? return element - if element.text is None: + if element.text is None and len(element) == 0: # try the tail # LOGGER.debug('using tail for element %s', element.tag) element.text, element.tail = element.tail, '' @@ -332,7 +332,7 @@ def handle_textnode(element, options, comments_fix=True, preserve_spaces=False): element.tail = trim(element.tail) # filter content # or not re.search(r'\w', element.text): # text_content()? - if not element.text or textfilter(element) is True: + if not element.text and textfilter(element) is True: return None if options.dedup and duplicate_test(element, options.config) is True: return None