From 5ca01a8e3c40be6de677d2d4cc0e41c9636753a2 Mon Sep 17 00:00:00 2001 From: Mikhail Galanin <195510+mikhainin@users.noreply.github.com> Date: Thu, 11 Apr 2024 12:50:20 +0100 Subject: [PATCH] favor_recall: process lists inside tables (#534) * Fixed lists inside tables when include_tables=True * Better implementation * This operator does not exist in Python 3.8 * use only if recall is preferred * update and fix variable name --------- Co-authored-by: Mikhail Galanin Co-authored-by: Adrien Barbaresi Co-authored-by: Adrien Barbaresi --- tests/unit_tests.py | 12 ++++++++++-- trafilatura/core.py | 5 +++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/tests/unit_tests.py b/tests/unit_tests.py index 6487ac60..db88158c 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -1038,13 +1038,21 @@ def test_table_processing(): """) - processed_table = handle_table(table_with_list, TAG_CATALOG, options) + processed_table = handle_table(copy(table_with_list), TAG_CATALOG, options) result = [ (el.tag, el.text) if el.text is not None and el.text.strip() else el.tag for el in processed_table.iter() ] - # assert result == ["table", "row", "cell", ("p", "a list"), "list", ("item", "one"), ("item", "two"),] assert result == ['table', 'row', 'cell', ('p', 'a list'), 'list'] + + options.recall = True + processed_table = handle_table(copy(table_with_list), TAG_CATALOG, options) + result = [ + (el.tag, el.text) if el.text is not None and el.text.strip() else el.tag + for el in processed_table.iter() + ] + assert result == ["table", "row", "cell", ("p", "a list"), 'list', ("item", "one"), ("item", "two"),] + broken_table = html.fromstring("
cell1
cell2
") processed_table = handle_table(broken_table, TAG_CATALOG, options) result = [el.tag for el in processed_table.iter()] diff --git a/trafilatura/core.py b/trafilatura/core.py index e313a5f0..c75f49b3 100644 --- a/trafilatura/core.py +++ b/trafilatura/core.py @@ -409,6 +409,11 @@ def handle_table(table_elem, potential_tags, options): child.tag = "cell" processed_subchild = handle_textnode(child, options, preserve_spaces=True, comments_fix=True) # todo: lists in table cells + elif child.tag == "list" and options.recall: + processed_subchild = handle_lists(child, options) + if processed_subchild is not None: + new_child_elem.append(processed_subchild) + processed_subchild = None # don't handle it anymore else: # subcell_elem = Element(child.tag) processed_subchild = handle_textelem(child, potential_tags.union(["div"]), options)