Skip to content

Commit

Permalink
handles cases where multiple nodes are returned by the XPath (such as <article>)
Browse files Browse the repository at this point in the history
  • Loading branch information
Hugo Bauer committed Jan 24, 2024
1 parent 3bd925e commit 65eccab
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 3 deletions.
13 changes: 11 additions & 2 deletions trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from copy import deepcopy

from lxml.etree import Element, SubElement, strip_elements, strip_tags
from lxml.html import tostring
from lxml.html import tostring, Element as HtmlElement

# own
from .external import (SANITIZED_XPATH, justext_rescue, sanitize_tree,
Expand Down Expand Up @@ -545,7 +545,16 @@ def extract_content(tree, options):
for expr in BODY_XPATH:
# select tree if the expression has been found
try:
subtree = tree.xpath(expr)[0]
subtrees = tree.xpath(expr)
if len(subtrees) > 1 and options.recall is True:
new_subtree = HtmlElement(subtrees[0].tag)
for _subtree in subtrees:
for child in _subtree:

Check warning on line 552 in trafilatura/core.py

View check run for this annotation

Codecov / codecov/patch

trafilatura/core.py#L550-L552

Added lines #L550 - L552 were not covered by tests
# if len(' '.join(child.itertext()).strip()) > MIN_EXTRACTED_SIZE ?
new_subtree.append(child)
subtree = new_subtree

Check warning on line 555 in trafilatura/core.py

View check run for this annotation

Codecov / codecov/patch

trafilatura/core.py#L554-L555

Added lines #L554 - L555 were not covered by tests
else:
subtree = subtrees[0]
except IndexError:
continue
# prune the subtree
Expand Down
2 changes: 1 addition & 1 deletion trafilatura/xpaths.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
contains(@id, "body-text") or contains(@class, "body-text") or
contains(@class, "article__container") or contains(@id, "art-content") or contains(@class, "art-content")][1]''',
# (…)[1] = first occurrence
'(.//article)[1]',
'(.//article)',
"""(.//*[(self::article or self::div or self::main or self::section)][
contains(@class, 'post-bodycopy') or
contains(@class, 'storycontent') or contains(@class, 'story-content') or
Expand Down

0 comments on commit 65eccab

Please sign in to comment.