Skip to content

Commit

Permalink
Better implementation
Browse files: browse the repository at this point in the history
  • Loading branch information
mikhainin committed Apr 5, 2024
1 parent daa8773 commit 791c55e
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 3 deletions.
3 changes: 1 addition & 2 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -1028,8 +1028,7 @@ def test_table_processing():
(el.tag, el.text) if el.text is not None and el.text.strip() else el.tag
for el in processed_table.iter()
]
# assert result == ["table", "row", "cell", ("p", "a list"), "list", ("item", "one"), ("item", "two"),]
assert result == ['table', 'row', 'cell', ('p', 'a list'), 'list']
assert result == ["table", "row", "cell", ("p", "a list"), 'list', ("item", "one"), ("item", "two"),]
broken_table = html.fromstring("<table><td>cell1</td><tr><td>cell2</td></tr></table>")
processed_table = handle_table(broken_table, TAG_CATALOG, options)
result = [el.tag for el in processed_table.iter()]
Expand Down
6 changes: 5 additions & 1 deletion trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,13 +391,17 @@ def handle_table(table_elem, potential_tags, options):
child.tag = 'cell'
processed_subchild = handle_textnode(child, options, preserve_spaces=True, comments_fix=True)
# todo: lists in table cells
elif child.tag == "list":
if (processed_subchild := handle_lists(child, options)) is not None:
newchildelem.append(processed_subchild)
processed_subchild = None # don't handle it anymore
else:
# subcell_elem = Element(child.tag)
processed_subchild = handle_textelem(child, potential_tags.union(['div']), options)
# add child element to processed_element
if processed_subchild is not None:
subchildelem = SubElement(newchildelem, processed_subchild.tag)
subchildelem.text, subchildelem.tail = ''.join(processed_subchild.itertext()), processed_subchild.tail
subchildelem.text, subchildelem.tail = processed_subchild.text, processed_subchild.tail
child.tag = 'done'
# add to tree
if newchildelem.text or len(newchildelem) > 0:
Expand Down

0 comments on commit 791c55e

Please sign in to comment.