From bdf3b8be7574c6123a3e1aefab8c6f938d34be69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= Date: Tue, 14 Jan 2025 12:06:09 +0200 Subject: [PATCH 1/5] [en] Ignore too big rowspan and colspan For some reason some tables now use colspan=999 to make separators, but we don't want that when creating InflCells and parsing tables so let's just collapse anything above 30 to 1. --- src/wiktextract/extractor/en/inflection.py | 24 ++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/wiktextract/extractor/en/inflection.py b/src/wiktextract/extractor/en/inflection.py index 040607b3b..85d0ad2b5 100644 --- a/src/wiktextract/extractor/en/inflection.py +++ b/src/wiktextract/extractor/en/inflection.py @@ -3046,6 +3046,10 @@ def handle_wikitext_or_html_table( # Imported here to avoid a circular import from wiktextract.page import clean_node, recursively_extract + # from wikitextprocessor.parser import print_tree + # print_tree(tree) + # print("-------==========-------") + if not tablecontext: tablecontext = TableContext() @@ -3089,6 +3093,8 @@ def handle_table1( sub_ret = [] + # from wikitextprocessor.parser import print_tree + # print_tree(tree) for node in tree.children: if not isinstance(node, WikiNode): continue @@ -3169,6 +3175,19 @@ def handle_table1( colspan = 1 # print("COL:", col) + if colspan > 30: + wxr.wtp.error( + f"Colspan {colspan} over 30, set to 1", + sortid="inflection/20250113a", + ) + colspan = 1 + if rowspan > 30: + wxr.wtp.error( + f"Rowspan {rowspan} over 30, set to 1", + sortid="inflection/20250113b", + ) + rowspan = 1 + # Process any nested tables recursively. tables, rest = recursively_extract( col, @@ -3179,6 +3198,7 @@ def handle_table1( # Clean the rest of the cell. celltext = clean_node(wxr, None, rest) # print("CLEANED:", celltext) + # print(f"SUBTABLES: {tables}") # Handle nested tables. 
for tbl in tables: @@ -3392,6 +3412,10 @@ def parse_inflection_section( titleparts = [] preceding_bolded_title = "" + # from wikitextprocessor.parser import print_tree + # print_tree(tree) + # print("--------------******************----------------") + def process_tables(): for kind, node, titles, after in tables: after = "".join(after).strip() From 30a1d2483a954c10d6a0ba1217f0c73e2dc29dd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= Date: Tue, 14 Jan 2025 12:08:13 +0200 Subject: [PATCH 2/5] [en] Inflection sections: keep tables intact Issue: sometimes inflection tables also contain naked tables (or in the case of issue #973, pre-expanded templates that generate table sections), and sometimes these tables break because we split on templates. Fix: keep track of table end-tokens and prioritize tables. --- src/wiktextract/extractor/en/page.py | 67 +++++++++++++++++++++------- 1 file changed, 51 insertions(+), 16 deletions(-) diff --git a/src/wiktextract/extractor/en/page.py b/src/wiktextract/extractor/en/page.py index 5f5aa0edd..fa95b9251 100644 --- a/src/wiktextract/extractor/en/page.py +++ b/src/wiktextract/extractor/en/page.py @@ -2340,7 +2340,14 @@ def inflection_template_fn( text = wxr.wtp.node_to_wikitext(node.children) # Split text into separate sections for each to-level template - brace_matches = re.split("({{+|}}+)", text) # ["{{", "template", "}}"] + brace_matches = re.split(r"((?:^|\n)\s*{\||\n\s*\|}|{{+|}}+)", text) + # ["{{", "template", "}}"] or ["^{|", "table contents", "\n|}"] + # The (?:...) creates a non-capturing regex group; if it was capturing, + # like the group around it, it would create elements in brace_matches, + # including None if it doesn't match. + # 20250114: Added {| and |} into the regex because tables were being + # cut into pieces by this code. Issue #973, introduction of two-part + # book-end templates similar to trans-top and trans-bottom. 
template_sections = [] template_nesting = 0 # depth of SINGLE BRACES { { nesting } } # Because there is the possibility of triple curly braces @@ -2352,9 +2359,13 @@ def inflection_template_fn( # about the outer-most delimiters (the highest level template) # we can just count the single braces when those single # braces are part of a group. + table_nesting = 0 + # However, if we have a stray table ({| ... |}) that should always + # be its own section, and should prevent templates from cutting it + # into sections. # print(f"Parse inflection: {text=}") - # print(repr(brace_matches)) + # print(f"Brace matches: {repr('///'.join(brace_matches))}") if len(brace_matches) > 1: tsection: list[str] = [] after_templates = False # kludge to keep any text @@ -2368,25 +2379,49 @@ def inflection_template_fn( template_sections.append(tsection) tsection = [] tsection.append(m) - elif m.startswith("{{"): - if template_nesting == 0 and after_templates: + elif m.startswith("{{") or m.endswith("{|"): + if ( + template_nesting == 0 + and after_templates + and table_nesting == 0 + ): template_sections.append(tsection) tsection = [] # start new section after_templates = True - template_nesting += len(m) + if m.startswith("{{"): + template_nesting += 1 + else: + # m.endswith("{|") + table_nesting += 1 tsection.append(m) - elif m.startswith("}}"): - template_nesting -= len(m) - if template_nesting < 0: - wxr.wtp.error( - "Negatively nested braces, " - "couldn't split inflection templates, " - "{}/{} section {}".format(word, language, section), - sortid="page/1871", - ) - template_sections = [] # use whole text - break + elif m.startswith("}}") or m.endswith("|}"): + if m.startswith("}}"): + template_nesting -= 1 + if template_nesting < 0: + wxr.wtp.error( + "Negatively nested braces, " + "couldn't split inflection templates, " + "{}/{} section {}".format( + word, language, section + ), + sortid="page/1871", + ) + template_sections = [] # use whole text + break + else: + table_nesting -= 1 
+ if table_nesting < 0: + wxr.wtp.error( + "Negatively nested table braces, " + "couldn't split inflection section, " + "{}/{} section {}".format( + word, language, section + ), + sortid="page/20250114", + ) + template_sections = [] # use whole text + break tsection.append(m) else: tsection.append(m) From f680ac18cf8e9cc9b0d8ee97b68fd2b646bb1b97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= Date: Tue, 14 Jan 2025 12:10:50 +0200 Subject: [PATCH 3/5] [en] Newly introduced book-end templates inflection-table-top These should be pre-expanded, and also exposed some other bugs in the process, but thankfully it seems nothing much needs to be done because the bookends generate complete tables that can be handled easily as complete entities, unlike the mess with trans-top etc. --- src/wiktextract/extractor/en/page.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/wiktextract/extractor/en/page.py b/src/wiktextract/extractor/en/page.py index fa95b9251..e9a3b5cfb 100644 --- a/src/wiktextract/extractor/en/page.py +++ b/src/wiktextract/extractor/en/page.py @@ -322,6 +322,10 @@ "ru-alt-ё", "inflection of", "no deprecated lang param usage", + # These separated top and bottom templates for inflection tables were + # introduced at the end of 2024... 
+ "inflection-table-top", + "inflection-table-bottom", } # Inverse linkage for those that have them From 5525cd019d6766c627f63d4d16515a233d6d0b41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= Date: Tue, 14 Jan 2025 12:12:12 +0200 Subject: [PATCH 4/5] [en] More inflectiondata for Assyrian Neo-Aramaic --- src/wiktextract/extractor/en/inflectiondata.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/wiktextract/extractor/en/inflectiondata.py b/src/wiktextract/extractor/en/inflectiondata.py index a1e3a9fc0..06aa199f9 100644 --- a/src/wiktextract/extractor/en/inflectiondata.py +++ b/src/wiktextract/extractor/en/inflectiondata.py @@ -4030,6 +4030,10 @@ "lang": ["Assyrian Neo-Aramaic",], "then": "stem", }, + "base form": { + "lang": ["Assyrian Neo-Aramaic",], + "then": "stem", + }, "Personal-pronoun- including forms": { "lang": [ "Arabic", From d995c99e8498dbd25f22f26b48197f708f9b7858 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= Date: Tue, 14 Jan 2025 12:24:56 +0200 Subject: [PATCH 5/5] [en] Tests for table broken by template splitting Put under test_en_inflection_aii --- tests/test_en_inflection_aii.py | 242 ++++++++++++++++++++++++++++++++ 1 file changed, 242 insertions(+) create mode 100644 tests/test_en_inflection_aii.py diff --git a/tests/test_en_inflection_aii.py b/tests/test_en_inflection_aii.py new file mode 100644 index 000000000..9c3c14dd8 --- /dev/null +++ b/tests/test_en_inflection_aii.py @@ -0,0 +1,242 @@ +# -*- fundamental -*- +# +# Tests for parsing inflection tables +# +# Copyright (c) 2021, 2022 Tatu Ylonen. 
See file LICENSE and https://ylonen.org +import unittest + +from wikitextprocessor import Wtp + +from wiktextract.config import WiktionaryConfig +from wiktextract.extractor.en.inflection import parse_inflection_section +from wiktextract.thesaurus import close_thesaurus_db +from wiktextract.wxr_context import WiktextractContext + + +class InflTests(unittest.TestCase): + def setUp(self): + self.maxDiff = None + self.wxr = WiktextractContext(Wtp(), WiktionaryConfig()) + self.wxr.wtp.start_page("testpage") + self.wxr.wtp.start_section("Assyrian Neo-Aramaic") + + def tearDown(self) -> None: + self.wxr.wtp.close_db_conn() + close_thesaurus_db( + self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn + ) + + def xinfl(self, word, lang, pos, section, text): + """Runs a single inflection table parsing test, and returns ``data``.""" + self.wxr.wtp.start_page(word) + self.wxr.wtp.start_section(lang) + self.wxr.wtp.start_subsection(pos) + tree = self.wxr.wtp.parse(text) + data = {} + parse_inflection_section(self.wxr, data, word, lang, pos, section, tree) + return data + + def test_aii_table(self): + ret = self.xinfl( + "ܛܠܐ", + "Assyrian Neo-Aramaic", + "prep", + "Conjugation", + """ +
+ + +{| class="inflection-table%2B" + +|+ + class="inflection-table-title" + Inflection of ܛܠܵܐ + + +|- + +! colspan="3" class="outer" | base form + + + + +| [[ܛܠܐ#Assyrian_Neo-Aramaic|ܛܠܵܐ]] (ṭlā) + + + + +|- + +! colspan="999" class="separator" | + + + + +|- + +! colspan="4" class="outer" | Personal-pronoun including forms + + + + +|- + +! rowspan="2" class="outer" | + + + + +! colspan="2" | singular + + + + +! rowspan="2" | plural + + + + +|- + +! class="secondary" | m + + + + +! class="secondary" | f + + + + +|- + +! class="outer" |1st person + + + + +| colspan="2" | [[ܛܠܬܝ#Assyrian_Neo-Aramaic|ܛܠܵܬ݂ܝܼ]] (ṭlāṯī) + + + + +| [[ܛܠܬܢ#Assyrian_Neo-Aramaic|ܛܠܵܬ݂ܲܢ]] (ṭlāṯan) + + + + +|- class="vsHide" + +! class="outer" | 2nd person + + + + +| [[ܛܠܬܘܟ#Assyrian_Neo-Aramaic|ܛܠܵܬ݂ܘܼܟ݂]] (ṭlāṯūḵ) + + + + +| [[ܛܠܬܟܝ#Assyrian_Neo-Aramaic|ܛܠܵܬ݂ܵܟ݂ܝ]] (ṭlāṯāḵ) + + + + +| [[ܛܠܬܘܟܘܢ#Assyrian_Neo-Aramaic|ܛܠܵܬ݂ܵܘܟ݂ܘܿܢ]] (ṭlāṯāwḵōn) + + + + +|- + +! class="outer" | 3rd person + + + + +| [[ܛܠܬܗ#Assyrian_Neo-Aramaic|ܛܠܵܬ݂ܹܗ]] (ṭlāṯēh) + + + + +| [[ܛܠܬܗ#Assyrian_Neo-Aramaic|ܛܠܵܬ݂ܵܗ̇]] (ṭlāṯāh) + + + + +| [[ܛܠܬܗܘܢ#Assyrian_Neo-Aramaic|ܛܠܵܬ݂ܗܘܿܢ]] (ṭlāṯhōn) + + + + +|} + + + +
+""", # noqa: E501 W291 + ) + expected = { + "forms": [ + { + "form": "no-table-tags", + "tags": ["table-tags"], + "source": "Conjugation", + }, + { + "form": "ܛܠܵܐ", + "roman": "ṭlā", + "source": "Conjugation", + "tags": ["stem"], + }, + { + "form": "ܛܠܵܬ݂ܝܼ", + "roman": "ṭlāṯī", + "source": "Conjugation", + "tags": ["first-person", "singular", "stem"], + }, + { + "form": "ܛܠܵܬ݂ܲܢ", + "roman": "ṭlāṯan", + "source": "Conjugation", + "tags": ["first-person", "plural"], + }, + { + "form": "ܛܠܵܬ݂ܘܼܟ݂", + "roman": "ṭlāṯūḵ", + "source": "Conjugation", + "tags": ["masculine", "second-person", "singular", "stem"], + }, + { + "form": "ܛܠܵܬ݂ܵܟ݂ܝ", + "roman": "ṭlāṯāḵ", + "source": "Conjugation", + "tags": ["feminine", "second-person", "singular", "stem"], + }, + { + "form": "ܛܠܵܬ݂ܵܘܟ݂ܘܿܢ", + "roman": "ṭlāṯāwḵōn", + "source": "Conjugation", + "tags": ["plural", "second-person"], + }, + { + "form": "ܛܠܵܬ݂ܹܗ", + "roman": "ṭlāṯēh", + "source": "Conjugation", + "tags": ["masculine", "singular", "stem", "third-person"], + }, + { + "form": "ܛܠܵܬ݂ܵܗ̇", + "roman": "ṭlāṯāh", + "source": "Conjugation", + "tags": ["feminine", "singular", "stem", "third-person"], + }, + { + "form": "ܛܠܵܬ݂ܗܘܿܢ", + "roman": "ṭlāṯhōn", + "source": "Conjugation", + "tags": ["plural", "third-person"], + }, + ] + } + self.assertEqual(expected, ret)