Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle changed behaviour of lxml addnext method #484

Merged
merged 4 commits into from
Jan 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions tests/xml_tei_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,9 +316,9 @@ def test_ab_with_p_parent_resolved():
</TEI>"""
)
cleaned = check_tei(xml_doc, "fake_url")
result = [(elem.tag, elem.text, elem.tail) for elem in xml_doc.iter(["p", "ab"])]
result = [(elem.tag, elem.text, elem.tail if elem.tail is None else elem.tail.strip()) for elem in xml_doc.iter(["p", "ab"])]
assert result == [
("p", "text1", None),
("p", "text1", ""),
("ab", "text2", None),
("p", "text3", None),
("ab", "text4", None),
Expand All @@ -339,10 +339,10 @@ def test_ab_with_p_parent_resolved():
</TEI>"""
)
cleaned = check_tei(xml_doc, "fake_url")
result = [(elem.tag, elem.text, elem.tail) for elem in xml_doc.iter(["p", "ab"])]
result = [(elem.tag, elem.text, elem.tail if elem.tail is None else elem.tail.strip()) for elem in xml_doc.iter(["p", "ab"])]
assert result == [
("p", "text0", None),
("ab", "text1", None),
("p", "text0", ""),
("ab", "text1", ""),
("p", None, None),
("ab", "text3", None),
("p", "text4", None),
Expand Down
24 changes: 18 additions & 6 deletions trafilatura/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ def replace_element_text(element, include_formatting):
def merge_with_parent(element, include_formatting=False):
'''Merge element with its parent and convert formatting to markdown.'''
parent = element.getparent()
if not parent:
if parent is None:
return

full_text = replace_element_text(element, include_formatting)
Expand Down Expand Up @@ -491,16 +491,28 @@ def _wrap_unwanted_siblings_of_div(div_element):


def _move_element_one_level_up(element):
"""
Fix TEI compatibility issues by moving certain p-elems up in the XML tree.
There is always an n+2 nesting for p-elements with the minimal structure ./TEI/text/body/p.
"""
parent = element.getparent()
grand_parent = parent.getparent()

new_elem = Element("p")
new_elem.extend(sibling for sibling in element.itersiblings())

parent.addnext(element)
grand_parent.insert(grand_parent.index(parent) + 1, element)

if element.tail is not None and element.tail.strip():
if element.tail and element.tail.strip():
new_elem.text = element.tail.strip()
element.tail = None
if len(new_elem) != 0 or new_elem.text:
element.addnext(new_elem)

if parent.tail and parent.tail.strip():
new_elem.tail = parent.tail.strip()
parent.tail = None

if len(new_elem) != 0 or new_elem.text or new_elem.tail:
grand_parent.insert(grand_parent.index(element) + 1, new_elem)

if len(parent) == 0 and parent.text is None:
parent.getparent().remove(parent)
grand_parent.remove(parent)
Loading