Skip to content

Commit

Permalink
add a min extracted size when merging multiple nodes
Browse files Browse the repository at this point in the history
  • Loading branch information
Hugo Bauer committed Jan 25, 2024
1 parent 65eccab commit 3f50d4e
Showing 1 changed file with 5 additions and 3 deletions.
8 changes: 5 additions & 3 deletions trafilatura/core.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -550,11 +550,13 @@ def extract_content(tree, options):
         new_subtree = HtmlElement(subtrees[0].tag)
         for _subtree in subtrees:
             for child in _subtree:
-                # if len(' '.join(child.itertext()).strip()) > MIN_EXTRACTED_SIZE ?
-                new_subtree.append(child)
+                if len(''.join(child.itertext()).strip()) > options.config.getint('DEFAULT', 'MIN_EXTRACTED_SIZE'):
+                    new_subtree.append(child)
         subtree = new_subtree
-    else:
+    elif len(subtrees) == 1:
         subtree = subtrees[0]
+    else:
+        continue
 except IndexError:
     continue
 # prune the subtree
Expand Down

0 comments on commit 3f50d4e

Please sign in to comment.