From bdf3b8be7574c6123a3e1aefab8c6f938d34be69 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= <kristian@clausal.com>
Date: Tue, 14 Jan 2025 12:06:09 +0200
Subject: [PATCH 1/5] [en] Ignore too big rowspan and colspan

For some reason some tables now use colspan=999
to make separators, but we don't want that when
creating InflCells and parsing tables so let's
just collapse anything above 30 to 1.
---
 src/wiktextract/extractor/en/inflection.py | 24 ++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/src/wiktextract/extractor/en/inflection.py b/src/wiktextract/extractor/en/inflection.py
index 040607b3b..85d0ad2b5 100644
--- a/src/wiktextract/extractor/en/inflection.py
+++ b/src/wiktextract/extractor/en/inflection.py
@@ -3046,6 +3046,10 @@ def handle_wikitext_or_html_table(
     # Imported here to avoid a circular import
     from wiktextract.page import clean_node, recursively_extract
 
+    # from wikitextprocessor.parser import print_tree
+    # print_tree(tree)
+    # print("-------==========-------")
+
     if not tablecontext:
         tablecontext = TableContext()
 
@@ -3089,6 +3093,8 @@ def handle_table1(
 
         sub_ret = []
 
+        # from wikitextprocessor.parser import print_tree
+        # print_tree(tree)
         for node in tree.children:
             if not isinstance(node, WikiNode):
                 continue
@@ -3169,6 +3175,19 @@ def handle_table1(
                         colspan = 1
                     # print("COL:", col)
 
+                    if colspan > 30:
+                        wxr.wtp.error(
+                            f"Colspan {colspan} over 30, set to 1",
+                            sortid="inflection/20250113a",
+                        )
+                        colspan = 1
+                    if rowspan > 30:
+                        wxr.wtp.error(
+                            f"Rowspan {rowspan} over 30, set to 1",
+                            sortid="inflection/20250113b",
+                        )
+                        rowspan = 1
+
                     # Process any nested tables recursively.
                     tables, rest = recursively_extract(
                         col,
@@ -3179,6 +3198,7 @@ def handle_table1(
                     # Clean the rest of the cell.
                     celltext = clean_node(wxr, None, rest)
                     # print("CLEANED:", celltext)
+                    # print(f"SUBTABLES: {tables}")
 
                     # Handle nested tables.
                     for tbl in tables:
@@ -3392,6 +3412,10 @@ def parse_inflection_section(
     titleparts = []
     preceding_bolded_title = ""
 
+    # from wikitextprocessor.parser import print_tree
+    # print_tree(tree)
+    # print("--------------******************----------------")
+
     def process_tables():
         for kind, node, titles, after in tables:
             after = "".join(after).strip()

From 30a1d2483a954c10d6a0ba1217f0c73e2dc29dd8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= <kristian@clausal.com>
Date: Tue, 14 Jan 2025 12:08:13 +0200
Subject: [PATCH 2/5] [en] Inflection sections: keep tables intact

Issue: sometimes inflection sections also contain naked
tables (or in the case of issue #973, pre-expanded
templates that generate table sections), and sometimes
these tables break because we split on templates.

Fix: keep track of table end-tokens and prioritize
tables.
---
 src/wiktextract/extractor/en/page.py | 67 +++++++++++++++++++++-------
 1 file changed, 51 insertions(+), 16 deletions(-)

diff --git a/src/wiktextract/extractor/en/page.py b/src/wiktextract/extractor/en/page.py
index 5f5aa0edd..fa95b9251 100644
--- a/src/wiktextract/extractor/en/page.py
+++ b/src/wiktextract/extractor/en/page.py
@@ -2340,7 +2340,14 @@ def inflection_template_fn(
         text = wxr.wtp.node_to_wikitext(node.children)
 
         # Split text into separate sections for each to-level template
-        brace_matches = re.split("({{+|}}+)", text)  # ["{{", "template", "}}"]
+        brace_matches = re.split(r"((?:^|\n)\s*{\||\n\s*\|}|{{+|}}+)", text)
+        # ["{{", "template", "}}"] or ["^{|", "table contents", "\n|}"]
+        # The (?:...) creates a non-capturing regex group; if it was capturing,
+        # like the group around it, it would create elements in brace_matches,
+        # including None if it doesn't match.
+        # 20250114: Added {| and |} into the regex because tables were being
+        # cut into pieces by this code. Issue #973, introduction of two-part
+        # book-end templates similar to trans-top and trans-bottom.
         template_sections = []
         template_nesting = 0  # depth of SINGLE BRACES { { nesting } }
         # Because there is the possibility of triple curly braces
@@ -2352,9 +2359,13 @@ def inflection_template_fn(
         # about the outer-most delimiters (the highest level template)
         # we can just count the single braces when those single
         # braces are part of a group.
+        table_nesting = 0
+        # However, if we have a stray table ({| ... |}) that should always
+        # be its own section, and should prevent templates from cutting it
+        # into sections.
 
         # print(f"Parse inflection: {text=}")
-        # print(repr(brace_matches))
+        # print(f"Brace matches: {repr('///'.join(brace_matches))}")
         if len(brace_matches) > 1:
             tsection: list[str] = []
             after_templates = False  # kludge to keep any text
@@ -2368,25 +2379,49 @@ def inflection_template_fn(
                     template_sections.append(tsection)
                     tsection = []
                     tsection.append(m)
-                elif m.startswith("{{"):
-                    if template_nesting == 0 and after_templates:
+                elif m.startswith("{{") or m.endswith("{|"):
+                    if (
+                        template_nesting == 0
+                        and after_templates
+                        and table_nesting == 0
+                    ):
                         template_sections.append(tsection)
                         tsection = []
                         # start new section
                     after_templates = True
-                    template_nesting += len(m)
+                    if m.startswith("{{"):
+                        template_nesting += 1
+                    else:
+                        # m.endswith("{|")
+                        table_nesting += 1
                     tsection.append(m)
-                elif m.startswith("}}"):
-                    template_nesting -= len(m)
-                    if template_nesting < 0:
-                        wxr.wtp.error(
-                            "Negatively nested braces, "
-                            "couldn't split inflection templates, "
-                            "{}/{} section {}".format(word, language, section),
-                            sortid="page/1871",
-                        )
-                        template_sections = []  # use whole text
-                        break
+                elif m.startswith("}}") or m.endswith("|}"):
+                    if m.startswith("}}"):
+                        template_nesting -= 1
+                        if template_nesting < 0:
+                            wxr.wtp.error(
+                                "Negatively nested braces, "
+                                "couldn't split inflection templates, "
+                                "{}/{} section {}".format(
+                                    word, language, section
+                                ),
+                                sortid="page/1871",
+                            )
+                            template_sections = []  # use whole text
+                            break
+                    else:
+                        table_nesting -= 1
+                        if table_nesting < 0:
+                            wxr.wtp.error(
+                                "Negatively nested table braces, "
+                                "couldn't split inflection section, "
+                                "{}/{} section {}".format(
+                                    word, language, section
+                                ),
+                                sortid="page/20250114",
+                            )
+                            template_sections = []  # use whole text
+                            break
                     tsection.append(m)
                 else:
                     tsection.append(m)

From f680ac18cf8e9cc9b0d8ee97b68fd2b646bb1b97 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= <kristian@clausal.com>
Date: Tue, 14 Jan 2025 12:10:50 +0200
Subject: [PATCH 3/5] [en] Newly introduced book-end templates
 inflection-table-top

These should be pre-expanded. They also exposed some other
bugs in the process, but thankfully it seems nothing
much needs to be done because the bookends generate
complete tables that can be handled easily as complete
entities, unlike the mess with trans-top etc.
---
 src/wiktextract/extractor/en/page.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/wiktextract/extractor/en/page.py b/src/wiktextract/extractor/en/page.py
index fa95b9251..e9a3b5cfb 100644
--- a/src/wiktextract/extractor/en/page.py
+++ b/src/wiktextract/extractor/en/page.py
@@ -322,6 +322,10 @@
     "ru-alt-ё",
     "inflection of",
     "no deprecated lang param usage",
+    # These separated top and bottom templates for inflection tables were
+    # introduced at the end of 2024...
+    "inflection-table-top",
+    "inflection-table-bottom",
 }
 
 # Inverse linkage for those that have them

From 5525cd019d6766c627f63d4d16515a233d6d0b41 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= <kristian@clausal.com>
Date: Tue, 14 Jan 2025 12:12:12 +0200
Subject: [PATCH 4/5] [en] More inflectiondata for Assyrian Neo-Aramaic

---
 src/wiktextract/extractor/en/inflectiondata.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/wiktextract/extractor/en/inflectiondata.py b/src/wiktextract/extractor/en/inflectiondata.py
index a1e3a9fc0..06aa199f9 100644
--- a/src/wiktextract/extractor/en/inflectiondata.py
+++ b/src/wiktextract/extractor/en/inflectiondata.py
@@ -4030,6 +4030,10 @@
         "lang": ["Assyrian Neo-Aramaic",],
         "then": "stem",
     },
+    "base form": {
+        "lang": ["Assyrian Neo-Aramaic",],
+        "then": "stem",
+    },
     "Personal-pronoun- including forms": {
         "lang": [
             "Arabic",

From d995c99e8498dbd25f22f26b48197f708f9b7858 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= <kristian@clausal.com>
Date: Tue, 14 Jan 2025 12:24:56 +0200
Subject: [PATCH 5/5] [en] Tests for table broken by template splitting

Put under test_en_inflection_aii
---
 tests/test_en_inflection_aii.py | 242 ++++++++++++++++++++++++++++++++
 1 file changed, 242 insertions(+)
 create mode 100644 tests/test_en_inflection_aii.py

diff --git a/tests/test_en_inflection_aii.py b/tests/test_en_inflection_aii.py
new file mode 100644
index 000000000..9c3c14dd8
--- /dev/null
+++ b/tests/test_en_inflection_aii.py
@@ -0,0 +1,242 @@
+# -*- fundamental -*-
+#
+# Tests for parsing inflection tables
+#
+# Copyright (c) 2021, 2022 Tatu Ylonen.  See file LICENSE and https://ylonen.org
+import unittest
+
+from wikitextprocessor import Wtp
+
+from wiktextract.config import WiktionaryConfig
+from wiktextract.extractor.en.inflection import parse_inflection_section
+from wiktextract.thesaurus import close_thesaurus_db
+from wiktextract.wxr_context import WiktextractContext
+
+
+class InflTests(unittest.TestCase):
+    def setUp(self):
+        self.maxDiff = None
+        self.wxr = WiktextractContext(Wtp(), WiktionaryConfig())
+        self.wxr.wtp.start_page("testpage")
+        self.wxr.wtp.start_section("Assyrian Neo-Aramaic")
+
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()
+        close_thesaurus_db(
+            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
+        )
+
+    def xinfl(self, word, lang, pos, section, text):
+        """Runs a single inflection table parsing test, and returns ``data``."""
+        self.wxr.wtp.start_page(word)
+        self.wxr.wtp.start_section(lang)
+        self.wxr.wtp.start_subsection(pos)
+        tree = self.wxr.wtp.parse(text)
+        data = {}
+        parse_inflection_section(self.wxr, data, word, lang, pos, section, tree)
+        return data
+
+    def test_aii_table(self):
+        ret = self.xinfl(
+            "ܛܠܐ",
+            "Assyrian Neo-Aramaic",
+            "prep",
+            "Conjugation",
+            """
+<div class="inflection-table-wrapper%2Binflection-table-narrow%2Binflection-table-red%2B%2Binflection-table-collapsible%2Binflection-table-collapsed%2Bno-vc%2B" style="width%253A%2Bfit-content" data-toggle-category="inflection"><templatestyles src="Template%253Ainflection-table-top%252Fstyle.css">
+
+
+{| class="inflection-table%2B"
+
+|+ 
+ class="inflection-table-title"
+ Inflection of <i class="Syrc%2Bmention" lang="aii">ܛܠܵܐ</i>
+
+
+|- 
+
+! colspan="3" class="outer" | base form
+
+
+
+
+| <span class="Syrc" lang="aii">[[ܛܠܐ#Assyrian&#95;Neo-Aramaic|ܛܠܵܐ]]</span> <span class="mention-gloss-paren+annotation-paren">(</span><span lang="aii-Latn" class="tr+Latn">ṭlā</span><span class="mention-gloss-paren+annotation-paren">)</span>
+
+
+
+
+|- 
+
+! colspan="999" class="separator" |
+
+
+
+
+|- 
+
+! colspan="4" class="outer" | Personal-pronoun including forms
+
+
+
+
+|- 
+
+! rowspan="2" class="outer" |
+
+
+
+
+! colspan="2" | singular
+
+
+
+
+! rowspan="2" | plural
+
+
+
+
+|- 
+
+! class="secondary" | <span class="gender"><abbr title="masculine+gender">m</abbr></span>
+
+
+
+
+! class="secondary" | <span class="gender"><abbr title="feminine+gender">f</abbr></span>
+
+
+
+
+|- 
+
+! class="outer" |1<sup>st</sup> person
+
+
+
+
+| colspan="2" | <span class="Syrc" lang="aii">[[ܛܠܬܝ#Assyrian&#95;Neo-Aramaic|ܛܠܵܬ݂ܝܼ]]</span> <span class="mention-gloss-paren+annotation-paren">(</span><span lang="aii-Latn" class="tr+Latn">ṭlāṯī</span><span class="mention-gloss-paren+annotation-paren">)</span>
+
+
+
+
+| <span class="Syrc" lang="aii">[[ܛܠܬܢ#Assyrian&#95;Neo-Aramaic|ܛܠܵܬ݂ܲܢ]]</span> <span class="mention-gloss-paren+annotation-paren">(</span><span lang="aii-Latn" class="tr+Latn">ṭlāṯan</span><span class="mention-gloss-paren+annotation-paren">)</span>
+
+
+
+
+|- class="vsHide"
+
+! class="outer" | 2<sup>nd</sup> person
+
+
+
+
+| <span class="Syrc" lang="aii">[[ܛܠܬܘܟ#Assyrian&#95;Neo-Aramaic|ܛܠܵܬ݂ܘܼܟ݂]]</span> <span class="mention-gloss-paren+annotation-paren">(</span><span lang="aii-Latn" class="tr+Latn">ṭlāṯūḵ</span><span class="mention-gloss-paren+annotation-paren">)</span>
+
+
+
+
+| <span class="Syrc" lang="aii">[[ܛܠܬܟܝ#Assyrian&#95;Neo-Aramaic|ܛܠܵܬ݂ܵܟ݂ܝ]]</span> <span class="mention-gloss-paren+annotation-paren">(</span><span lang="aii-Latn" class="tr+Latn">ṭlāṯāḵ</span><span class="mention-gloss-paren+annotation-paren">)</span>
+
+
+
+
+| <span class="Syrc" lang="aii">[[ܛܠܬܘܟܘܢ#Assyrian&#95;Neo-Aramaic|ܛܠܵܬ݂ܵܘܟ݂ܘܿܢ]]</span> <span class="mention-gloss-paren+annotation-paren">(</span><span lang="aii-Latn" class="tr+Latn">ṭlāṯāwḵōn</span><span class="mention-gloss-paren+annotation-paren">)</span>
+
+
+
+
+|- 
+
+! class="outer" | 3<sup>rd</sup> person
+
+
+
+
+| <span class="Syrc" lang="aii">[[ܛܠܬܗ#Assyrian&#95;Neo-Aramaic|ܛܠܵܬ݂ܹܗ]]</span> <span class="mention-gloss-paren+annotation-paren">(</span><span lang="aii-Latn" class="tr+Latn">ṭlāṯēh</span><span class="mention-gloss-paren+annotation-paren">)</span>
+
+
+
+
+| <span class="Syrc" lang="aii">[[ܛܠܬܗ#Assyrian&#95;Neo-Aramaic|ܛܠܵܬ݂ܵܗ̇]]</span> <span class="mention-gloss-paren+annotation-paren">(</span><span lang="aii-Latn" class="tr+Latn">ṭlāṯāh</span><span class="mention-gloss-paren+annotation-paren">)</span>
+
+
+
+
+| <span class="Syrc" lang="aii">[[ܛܠܬܗܘܢ#Assyrian&#95;Neo-Aramaic|ܛܠܵܬ݂ܗܘܿܢ]]</span> <span class="mention-gloss-paren+annotation-paren">(</span><span lang="aii-Latn" class="tr+Latn">ṭlāṯhōn</span><span class="mention-gloss-paren+annotation-paren">)</span>
+
+
+
+
+|}
+
+
+
+</div>
+""",  # noqa: E501 W291
+        )
+        expected = {
+            "forms": [
+                {
+                    "form": "no-table-tags",
+                    "tags": ["table-tags"],
+                    "source": "Conjugation",
+                },
+                {
+                    "form": "ܛܠܵܐ",
+                    "roman": "ṭlā",
+                    "source": "Conjugation",
+                    "tags": ["stem"],
+                },
+                {
+                    "form": "ܛܠܵܬ݂ܝܼ",
+                    "roman": "ṭlāṯī",
+                    "source": "Conjugation",
+                    "tags": ["first-person", "singular", "stem"],
+                },
+                {
+                    "form": "ܛܠܵܬ݂ܲܢ",
+                    "roman": "ṭlāṯan",
+                    "source": "Conjugation",
+                    "tags": ["first-person", "plural"],
+                },
+                {
+                    "form": "ܛܠܵܬ݂ܘܼܟ݂",
+                    "roman": "ṭlāṯūḵ",
+                    "source": "Conjugation",
+                    "tags": ["masculine", "second-person", "singular", "stem"],
+                },
+                {
+                    "form": "ܛܠܵܬ݂ܵܟ݂ܝ",
+                    "roman": "ṭlāṯāḵ",
+                    "source": "Conjugation",
+                    "tags": ["feminine", "second-person", "singular", "stem"],
+                },
+                {
+                    "form": "ܛܠܵܬ݂ܵܘܟ݂ܘܿܢ",
+                    "roman": "ṭlāṯāwḵōn",
+                    "source": "Conjugation",
+                    "tags": ["plural", "second-person"],
+                },
+                {
+                    "form": "ܛܠܵܬ݂ܹܗ",
+                    "roman": "ṭlāṯēh",
+                    "source": "Conjugation",
+                    "tags": ["masculine", "singular", "stem", "third-person"],
+                },
+                {
+                    "form": "ܛܠܵܬ݂ܵܗ̇",
+                    "roman": "ṭlāṯāh",
+                    "source": "Conjugation",
+                    "tags": ["feminine", "singular", "stem", "third-person"],
+                },
+                {
+                    "form": "ܛܠܵܬ݂ܗܘܿܢ",
+                    "roman": "ṭlāṯhōn",
+                    "source": "Conjugation",
+                    "tags": ["plural", "third-person"],
+                },
+            ]
+        }
+        self.assertEqual(expected, ret)