From daa8773d27eb5f85e6455e94e858a2c3ab01ab00 Mon Sep 17 00:00:00 2001 From: Mikhail Galanin Date: Tue, 2 Apr 2024 13:52:53 +0100 Subject: [PATCH 1/5] Fixed lists inside tables when include_tables=True --- trafilatura/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/trafilatura/core.py b/trafilatura/core.py index 63699a4b..1970c250 100644 --- a/trafilatura/core.py +++ b/trafilatura/core.py @@ -397,7 +397,7 @@ def handle_table(table_elem, potential_tags, options): # add child element to processed_element if processed_subchild is not None: subchildelem = SubElement(newchildelem, processed_subchild.tag) - subchildelem.text, subchildelem.tail = processed_subchild.text, processed_subchild.tail + subchildelem.text, subchildelem.tail = ''.join(processed_subchild.itertext()), processed_subchild.tail child.tag = 'done' # add to tree if newchildelem.text or len(newchildelem) > 0: From 791c55ef94becb5979830b4078a72682eab75dcd Mon Sep 17 00:00:00 2001 From: Mikhail Galanin Date: Fri, 5 Apr 2024 12:27:26 +0100 Subject: [PATCH 2/5] Better implementation --- tests/unit_tests.py | 3 +-- trafilatura/core.py | 6 +++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/unit_tests.py b/tests/unit_tests.py index bc0055a4..e417134f 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -1028,8 +1028,7 @@ def test_table_processing(): (el.tag, el.text) if el.text is not None and el.text.strip() else el.tag for el in processed_table.iter() ] - # assert result == ["table", "row", "cell", ("p", "a list"), "list", ("item", "one"), ("item", "two"),] - assert result == ['table', 'row', 'cell', ('p', 'a list'), 'list'] + assert result == ["table", "row", "cell", ("p", "a list"), 'list', ("item", "one"), ("item", "two"),] broken_table = html.fromstring("
cell1
cell2
") processed_table = handle_table(broken_table, TAG_CATALOG, options) result = [el.tag for el in processed_table.iter()] diff --git a/trafilatura/core.py b/trafilatura/core.py index 1970c250..bfd4a8e3 100644 --- a/trafilatura/core.py +++ b/trafilatura/core.py @@ -391,13 +391,17 @@ def handle_table(table_elem, potential_tags, options): child.tag = 'cell' processed_subchild = handle_textnode(child, options, preserve_spaces=True, comments_fix=True) # todo: lists in table cells + elif child.tag == "list": + if (processed_subchild := handle_lists(child, options)) is not None: + newchildelem.append(processed_subchild) + processed_subchild = None # don't handle it anymore else: # subcell_elem = Element(child.tag) processed_subchild = handle_textelem(child, potential_tags.union(['div']), options) # add child element to processed_element if processed_subchild is not None: subchildelem = SubElement(newchildelem, processed_subchild.tag) - subchildelem.text, subchildelem.tail = ''.join(processed_subchild.itertext()), processed_subchild.tail + subchildelem.text, subchildelem.tail = processed_subchild.text, processed_subchild.tail child.tag = 'done' # add to tree if newchildelem.text or len(newchildelem) > 0: From 954897d2f660d8af381fb4924c0c2258fbc580e3 Mon Sep 17 00:00:00 2001 From: Mikhail Galanin Date: Fri, 5 Apr 2024 13:25:41 +0100 Subject: [PATCH 3/5] This operator does not exist in Python 3.8 --- trafilatura/core.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/trafilatura/core.py b/trafilatura/core.py index bfd4a8e3..46edad06 100644 --- a/trafilatura/core.py +++ b/trafilatura/core.py @@ -392,7 +392,8 @@ def handle_table(table_elem, potential_tags, options): processed_subchild = handle_textnode(child, options, preserve_spaces=True, comments_fix=True) # todo: lists in table cells elif child.tag == "list": - if (processed_subchild := handle_lists(child, options)) is not None: + processed_subchild = handle_lists(child, options) + if processed_subchild is not None: newchildelem.append(processed_subchild) processed_subchild = None # don't handle it anymore else: From f5b604507bae4039f03fae3f0b7aba34a3b03441 Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Thu, 11 Apr 2024 13:27:55 +0200 Subject: [PATCH 4/5] use only if recall is preferred --- tests/unit_tests.py | 11 ++++++++++- trafilatura/core.py | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/tests/unit_tests.py b/tests/unit_tests.py index e417134f..c65248f1 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -1023,12 +1023,21 @@ def test_table_processing(): """) - processed_table = handle_table(table_with_list, TAG_CATALOG, options) + processed_table = handle_table(copy(table_with_list), TAG_CATALOG, options) + result = [ + (el.tag, el.text) if el.text is not None and el.text.strip() else el.tag + for el in processed_table.iter() + ] + assert result == ['table', 'row', 'cell', ('p', 'a list'), 'list'] + + options.recall = True + processed_table = handle_table(copy(table_with_list), TAG_CATALOG, options) result = [ (el.tag, el.text) if el.text is not None and el.text.strip() else el.tag for el in processed_table.iter() ] assert result == ["table", "row", "cell", ("p", "a list"), 'list', ("item", "one"), ("item", "two"),] + broken_table = html.fromstring("
cell1
cell2
") processed_table = handle_table(broken_table, TAG_CATALOG, options) result = [el.tag for el in processed_table.iter()] diff --git a/trafilatura/core.py b/trafilatura/core.py index 46edad06..0a302eb2 100644 --- a/trafilatura/core.py +++ b/trafilatura/core.py @@ -391,7 +391,7 @@ def handle_table(table_elem, potential_tags, options): child.tag = 'cell' processed_subchild = handle_textnode(child, options, preserve_spaces=True, comments_fix=True) # todo: lists in table cells - elif child.tag == "list": + elif child.tag == "list" and options.recall: processed_subchild = handle_lists(child, options) if processed_subchild is not None: newchildelem.append(processed_subchild) From 7d9c440fdc5e3895e206da1e8fef23bbbc14c06a Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Thu, 11 Apr 2024 13:42:07 +0200 Subject: [PATCH 5/5] update and fix variable name --- trafilatura/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/trafilatura/core.py b/trafilatura/core.py index 80374576..c75f49b3 100644 --- a/trafilatura/core.py +++ b/trafilatura/core.py @@ -412,7 +412,7 @@ def handle_table(table_elem, potential_tags, options): elif child.tag == "list" and options.recall: processed_subchild = handle_lists(child, options) if processed_subchild is not None: - newchildelem.append(processed_subchild) + new_child_elem.append(processed_subchild) processed_subchild = None # don't handle it anymore else: # subcell_elem = Element(child.tag)