Skip to content

Commit

Permalink
favor_recall: process lists inside tables (#534)
Browse files (browse the repository at this point in the history)
* Fixed lists inside tables when include_tables=True

* Better implementation

* This operator does not exist in Python 3.8

* Process lists inside tables only if recall is preferred (options.recall)

* update and fix variable name

---------

Co-authored-by: Mikhail Galanin <[email protected]>
Co-authored-by: Adrien Barbaresi <[email protected]>
Co-authored-by: Adrien Barbaresi <[email protected]>
  • Loading branch information
4 people authored Apr 11, 2024
1 parent 54ad86c commit 5ca01a8
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 2 deletions.
12 changes: 10 additions & 2 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -1038,13 +1038,21 @@ def test_table_processing():
</td>
</tr></table>
""")
processed_table = handle_table(table_with_list, TAG_CATALOG, options)
processed_table = handle_table(copy(table_with_list), TAG_CATALOG, options)
result = [
(el.tag, el.text) if el.text is not None and el.text.strip() else el.tag
for el in processed_table.iter()
]
# assert result == ["table", "row", "cell", ("p", "a list"), "list", ("item", "one"), ("item", "two"),]
assert result == ['table', 'row', 'cell', ('p', 'a list'), 'list']

options.recall = True
processed_table = handle_table(copy(table_with_list), TAG_CATALOG, options)
result = [
(el.tag, el.text) if el.text is not None and el.text.strip() else el.tag
for el in processed_table.iter()
]
assert result == ["table", "row", "cell", ("p", "a list"), 'list', ("item", "one"), ("item", "two"),]

broken_table = html.fromstring("<table><td>cell1</td><tr><td>cell2</td></tr></table>")
processed_table = handle_table(broken_table, TAG_CATALOG, options)
result = [el.tag for el in processed_table.iter()]
Expand Down
5 changes: 5 additions & 0 deletions trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -409,6 +409,11 @@ def handle_table(table_elem, potential_tags, options):
child.tag = "cell"
processed_subchild = handle_textnode(child, options, preserve_spaces=True, comments_fix=True)
# todo: lists in table cells
elif child.tag == "list" and options.recall:
processed_subchild = handle_lists(child, options)
if processed_subchild is not None:
new_child_elem.append(processed_subchild)
processed_subchild = None # don't handle it anymore
else:
# subcell_elem = Element(child.tag)
processed_subchild = handle_textelem(child, potential_tags.union(["div"]), options)
Expand Down

0 comments on commit 5ca01a8

Please sign in to comment.