Merge pull request #986 from tatuylonen/split-templates

Fix pre-expanded templates being split by inflection section template detection
tatuylonen · Jan 14, 2025 · e6a833d · e6a833d
2 parents debb661 + d995c99
commit e6a833d
Show file tree

Hide file tree

Showing 4 changed files with 325 additions and 16 deletions.
diff --git a/src/wiktextract/extractor/en/inflection.py b/src/wiktextract/extractor/en/inflection.py
@@ -3046,6 +3046,10 @@ def handle_wikitext_or_html_table(
     # Imported here to avoid a circular import
     from wiktextract.page import clean_node, recursively_extract
 
+    # from wikitextprocessor.parser import print_tree
+    # print_tree(tree)
+    # print("-------==========-------")
+
     if not tablecontext:
         tablecontext = TableContext()
 
@@ -3089,6 +3093,8 @@ def handle_table1(
 
         sub_ret = []
 
+        # from wikitextprocessor.parser import print_tree
+        # print_tree(tree)
         for node in tree.children:
             if not isinstance(node, WikiNode):
                 continue
@@ -3169,6 +3175,19 @@ def handle_table1(
                         colspan = 1
                     # print("COL:", col)
 
+                    if colspan > 30:
+                        wxr.wtp.error(
+                            f"Colspan {colspan} over 30, set to 1",
+                            sortid="inflection/20250113a",
+                        )
+                        colspan = 1
+                    if rowspan > 30:
+                        wxr.wtp.error(
+                            f"Rowspan {rowspan} over 30, set to 1",
+                            sortid="inflection/20250113b",
+                        )
+                        rowspan = 1
+
                     # Process any nested tables recursively.
                     tables, rest = recursively_extract(
                         col,
@@ -3179,6 +3198,7 @@ def handle_table1(
                     # Clean the rest of the cell.
                     celltext = clean_node(wxr, None, rest)
                     # print("CLEANED:", celltext)
+                    # print(f"SUBTABLES: {tables}")
 
                     # Handle nested tables.
                     for tbl in tables:
@@ -3392,6 +3412,10 @@ def parse_inflection_section(
     titleparts = []
     preceding_bolded_title = ""
 
+    # from wikitextprocessor.parser import print_tree
+    # print_tree(tree)
+    # print("--------------******************----------------")
+
     def process_tables():
         for kind, node, titles, after in tables:
             after = "".join(after).strip()

diff --git a/src/wiktextract/extractor/en/inflectiondata.py b/src/wiktextract/extractor/en/inflectiondata.py
@@ -4030,6 +4030,10 @@
         "lang": ["Assyrian Neo-Aramaic",],
         "then": "stem",
     },
+    "base form": {
+        "lang": ["Assyrian Neo-Aramaic",],
+        "then": "stem",
+    },
     "Personal-pronoun- including forms": {
         "lang": [
             "Arabic",

diff --git a/src/wiktextract/extractor/en/page.py b/src/wiktextract/extractor/en/page.py
@@ -322,6 +322,10 @@
     "ru-alt-ё",
     "inflection of",
     "no deprecated lang param usage",
+    # These separated top and bottom templates for inflection tables were
+    # introduced at the end of 2024...
+    "inflection-table-top",
+    "inflection-table-bottom",
 }
 
 # Inverse linkage for those that have them
@@ -2340,7 +2344,14 @@ def inflection_template_fn(
         text = wxr.wtp.node_to_wikitext(node.children)
 
         # Split text into separate sections for each top-level template
-        brace_matches = re.split("({{+|}}+)", text)  # ["{{", "template", "}}"]
+        brace_matches = re.split(r"((?:^|\n)\s*{\||\n\s*\|}|{{+|}}+)", text)
+        # ["{{", "template", "}}"] or ["^{|", "table contents", "\n|}"]
+        # The (?:...) creates a non-capturing regex group; if it was capturing,
+        # like the group around it, it would create elements in brace_matches,
+        # including None if it doesn't match.
+        # 20250114: Added {| and |} into the regex because tables were being
+        # cut into pieces by this code. Issue #973, introduction of two-part
+        # book-end templates similar to trans-top and trans-bottom.
         template_sections = []
         template_nesting = 0  # depth of SINGLE BRACES { { nesting } }
         # Because there is the possibility of triple curly braces
@@ -2352,9 +2363,13 @@ def inflection_template_fn(
         # about the outer-most delimiters (the highest level template)
         # we can just count the single braces when those single
         # braces are part of a group.
+        table_nesting = 0
+        # However, if we have a stray table ({| ... |}) that should always
+        # be its own section, and should prevent templates from cutting it
+        # into sections.
 
         # print(f"Parse inflection: {text=}")
-        # print(repr(brace_matches))
+        # print(f"Brace matches: {repr('///'.join(brace_matches))}")
         if len(brace_matches) > 1:
             tsection: list[str] = []
             after_templates = False  # kludge to keep any text
@@ -2368,25 +2383,49 @@ def inflection_template_fn(
                     template_sections.append(tsection)
                     tsection = []
                     tsection.append(m)
-                elif m.startswith("{{"):
-                    if template_nesting == 0 and after_templates:
+                elif m.startswith("{{") or m.endswith("{|"):
+                    if (
+                        template_nesting == 0
+                        and after_templates
+                        and table_nesting == 0
+                    ):
                         template_sections.append(tsection)
                         tsection = []
                         # start new section
                     after_templates = True
-                    template_nesting += len(m)
+                    if m.startswith("{{"):
+                        template_nesting += 1
+                    else:
+                        # m.endswith("{|")
+                        table_nesting += 1
                     tsection.append(m)
-                elif m.startswith("}}"):
-                    template_nesting -= len(m)
-                    if template_nesting < 0:
-                        wxr.wtp.error(
-                            "Negatively nested braces, "
-                            "couldn't split inflection templates, "
-                            "{}/{} section {}".format(word, language, section),
-                            sortid="page/1871",
-                        )
-                        template_sections = []  # use whole text
-                        break
+                elif m.startswith("}}") or m.endswith("|}"):
+                    if m.startswith("}}"):
+                        template_nesting -= 1
+                        if template_nesting < 0:
+                            wxr.wtp.error(
+                                "Negatively nested braces, "
+                                "couldn't split inflection templates, "
+                                "{}/{} section {}".format(
+                                    word, language, section
+                                ),
+                                sortid="page/1871",
+                            )
+                            template_sections = []  # use whole text
+                            break
+                    else:
+                        table_nesting -= 1
+                        if table_nesting < 0:
+                            wxr.wtp.error(
+                                "Negatively nested table braces, "
+                                "couldn't split inflection section, "
+                                "{}/{} section {}".format(
+                                    word, language, section
+                                ),
+                                sortid="page/20250114",
+                            )
+                            template_sections = []  # use whole text
+                            break
                     tsection.append(m)
                 else:
                     tsection.append(m)