From 3f50d4e83a29442e53a7755e13fd83f5c5051369 Mon Sep 17 00:00:00 2001 From: Hugo Bauer Date: Thu, 25 Jan 2024 18:44:16 +0100 Subject: [PATCH] add a min extracted size when merging multiple nodes --- trafilatura/core.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/trafilatura/core.py b/trafilatura/core.py index b2592743..87e101f2 100644 --- a/trafilatura/core.py +++ b/trafilatura/core.py @@ -550,11 +550,13 @@ def extract_content(tree, options): new_subtree = HtmlElement(subtrees[0].tag) for _subtree in subtrees: for child in _subtree: - # if len(' '.join(child.itertext()).strip()) > MIN_EXTRACTED_SIZE ? - new_subtree.append(child) + if len(''.join(child.itertext()).strip()) > options.config.getint('DEFAULT', 'MIN_EXTRACTED_SIZE'): + new_subtree.append(child) subtree = new_subtree - else: + elif len(subtrees) == 1: subtree = subtrees[0] + else: + continue except IndexError: continue # prune the subtree