Skip to content

Commit

Permalink
simplify code and address warning
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar committed Jan 23, 2024
1 parent 860f17a commit e9b33e1
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 9 deletions.
6 changes: 3 additions & 3 deletions tests/xml_tei_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,7 +318,7 @@ def test_ab_with_p_parent_resolved():
cleaned = check_tei(xml_doc, "fake_url")
result = [(elem.tag, elem.text, elem.tail if elem.tail is None else elem.tail.strip()) for elem in xml_doc.iter(["p", "ab"])]
assert result == [
("p", "text1", ''),
("p", "text1", ""),
("ab", "text2", None),
("p", "text3", None),
("ab", "text4", None),
Expand All @@ -341,8 +341,8 @@ def test_ab_with_p_parent_resolved():
cleaned = check_tei(xml_doc, "fake_url")
result = [(elem.tag, elem.text, elem.tail if elem.tail is None else elem.tail.strip()) for elem in xml_doc.iter(["p", "ab"])]
assert result == [
("p", "text0", ''),
("ab", "text1", ''),
("p", "text0", ""),
("ab", "text1", ""),
("p", None, None),
("ab", "text3", None),
("p", "text4", None),
Expand Down
22 changes: 16 additions & 6 deletions trafilatura/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ def replace_element_text(element, include_formatting):
def merge_with_parent(element, include_formatting=False):
'''Merge element with its parent and convert formatting to markdown.'''
parent = element.getparent()
if not parent:
if parent is None:
return

full_text = replace_element_text(element, include_formatting)
Expand Down Expand Up @@ -491,18 +491,28 @@ def _wrap_unwanted_siblings_of_div(div_element):


def _move_element_one_level_up(element):
    """
    Fix TEI compatibility issues by moving certain p-elems up in the XML tree.
    There is always a n+2 nesting for p-elements with the minimal structure ./TEI/text/body/p

    The element is re-inserted into its grandparent, directly after its
    former parent. Everything that followed the element inside the parent
    (its following siblings and its non-blank tail text) is collected into
    a new <p> element placed right after the moved element, so document
    order is preserved.

    Args:
        element: the element to move; it must have both a parent and a
            grandparent, since neither is checked for None here.

    Returns:
        None. The tree is modified in place.
    """
    parent = element.getparent()
    grand_parent = parent.getparent()

    # Collect the element's following siblings into a fresh <p> container.
    new_elem = Element("p")
    new_elem.extend(sibling for sibling in element.itersiblings())

    # Assumes lxml semantics: inserting an element that already has a
    # parent moves it rather than copying it.
    grand_parent.insert(grand_parent.index(parent) + 1, element)

    # Non-blank text trailing the element becomes the new container's text;
    # whitespace-only tails are left untouched.
    if element.tail and element.tail.strip():
        new_elem.text = element.tail.strip()
        element.tail = None

    # Non-blank text trailing the old parent is carried over as the new
    # container's tail.
    if parent.tail and parent.tail.strip():
        new_elem.tail = parent.tail.strip()
        parent.tail = None

    # Only insert the container if it actually holds something.
    if len(new_elem) != 0 or new_elem.text or new_elem.tail:
        grand_parent.insert(grand_parent.index(element) + 1, new_elem)

    # Drop the former parent when it has neither children nor text left.
    if len(parent) == 0 and parent.text is None:
        grand_parent.remove(parent)

0 comments on commit e9b33e1

Please sign in to comment.