diff --git a/tests/unit_tests.py b/tests/unit_tests.py index 6487ac60..db88158c 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -1038,13 +1038,21 @@ def test_table_processing(): """) - processed_table = handle_table(table_with_list, TAG_CATALOG, options) + processed_table = handle_table(copy(table_with_list), TAG_CATALOG, options) result = [ (el.tag, el.text) if el.text is not None and el.text.strip() else el.tag for el in processed_table.iter() ] - # assert result == ["table", "row", "cell", ("p", "a list"), "list", ("item", "one"), ("item", "two"),] assert result == ['table', 'row', 'cell', ('p', 'a list'), 'list'] + + options.recall = True + processed_table = handle_table(copy(table_with_list), TAG_CATALOG, options) + result = [ + (el.tag, el.text) if el.text is not None and el.text.strip() else el.tag + for el in processed_table.iter() + ] + assert result == ["table", "row", "cell", ("p", "a list"), 'list', ("item", "one"), ("item", "two"),] + broken_table = html.fromstring("
cell1
cell2
") processed_table = handle_table(broken_table, TAG_CATALOG, options) result = [el.tag for el in processed_table.iter()] diff --git a/trafilatura/core.py b/trafilatura/core.py index e313a5f0..c75f49b3 100644 --- a/trafilatura/core.py +++ b/trafilatura/core.py @@ -409,6 +409,11 @@ def handle_table(table_elem, potential_tags, options): child.tag = "cell" processed_subchild = handle_textnode(child, options, preserve_spaces=True, comments_fix=True) # todo: lists in table cells + elif child.tag == "list" and options.recall: + processed_subchild = handle_lists(child, options) + if processed_subchild is not None: + new_child_elem.append(processed_subchild) + processed_subchild = None # don't handle it anymore else: # subcell_elem = Element(child.tag) processed_subchild = handle_textelem(child, potential_tags.union(["div"]), options)