Skip to content

Commit

Permalink
Merge pull request #986 from tatuylonen/split-templates
Browse files Browse the repository at this point in the history
Fix pre-expanded templates being split by inflection section template detection
  • Loading branch information
kristian-clausal authored Jan 14, 2025
2 parents debb661 + d995c99 commit e6a833d
Show file tree
Hide file tree
Showing 4 changed files with 325 additions and 16 deletions.
24 changes: 24 additions & 0 deletions src/wiktextract/extractor/en/inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -3046,6 +3046,10 @@ def handle_wikitext_or_html_table(
# Imported here to avoid a circular import
from wiktextract.page import clean_node, recursively_extract

# from wikitextprocessor.parser import print_tree
# print_tree(tree)
# print("-------==========-------")

if not tablecontext:
tablecontext = TableContext()

Expand Down Expand Up @@ -3089,6 +3093,8 @@ def handle_table1(

sub_ret = []

# from wikitextprocessor.parser import print_tree
# print_tree(tree)
for node in tree.children:
if not isinstance(node, WikiNode):
continue
Expand Down Expand Up @@ -3169,6 +3175,19 @@ def handle_table1(
colspan = 1
# print("COL:", col)

if colspan > 30:
wxr.wtp.error(
f"Colspan {colspan} over 30, set to 1",
sortid="inflection/20250113a",
)
colspan = 1
if rowspan > 30:
wxr.wtp.error(
f"Rowspan {rowspan} over 30, set to 1",
sortid="inflection/20250113b",
)
rowspan = 1

# Process any nested tables recursively.
tables, rest = recursively_extract(
col,
Expand All @@ -3179,6 +3198,7 @@ def handle_table1(
# Clean the rest of the cell.
celltext = clean_node(wxr, None, rest)
# print("CLEANED:", celltext)
# print(f"SUBTABLES: {tables}")

# Handle nested tables.
for tbl in tables:
Expand Down Expand Up @@ -3392,6 +3412,10 @@ def parse_inflection_section(
titleparts = []
preceding_bolded_title = ""

# from wikitextprocessor.parser import print_tree
# print_tree(tree)
# print("--------------******************----------------")

def process_tables():
for kind, node, titles, after in tables:
after = "".join(after).strip()
Expand Down
4 changes: 4 additions & 0 deletions src/wiktextract/extractor/en/inflectiondata.py
Original file line number Diff line number Diff line change
Expand Up @@ -4030,6 +4030,10 @@
"lang": ["Assyrian Neo-Aramaic",],
"then": "stem",
},
"base form": {
"lang": ["Assyrian Neo-Aramaic",],
"then": "stem",
},
"Personal-pronoun- including forms": {
"lang": [
"Arabic",
Expand Down
71 changes: 55 additions & 16 deletions src/wiktextract/extractor/en/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,10 @@
"ru-alt-ё",
"inflection of",
"no deprecated lang param usage",
# These separated top and bottom templates for inflection tables were
# introduced at the end of 2024...
"inflection-table-top",
"inflection-table-bottom",
}

# Inverse linkage for those that have them
Expand Down Expand Up @@ -2340,7 +2344,14 @@ def inflection_template_fn(
text = wxr.wtp.node_to_wikitext(node.children)

# Split text into separate sections for each top-level template
brace_matches = re.split("({{+|}}+)", text) # ["{{", "template", "}}"]
brace_matches = re.split(r"((?:^|\n)\s*{\||\n\s*\|}|{{+|}}+)", text)
# ["{{", "template", "}}"] or ["^{|", "table contents", "\n|}"]
# The (?:...) creates a non-capturing regex group; if it were capturing,
# like the group around it, it would add its own elements to
# brace_matches, including None whenever it doesn't match.
# 20250114: Added {| and |} into the regex because tables were being
# cut into pieces by this code. Issue #973, introduction of two-part
# book-end templates similar to trans-top and trans-bottom.
template_sections = []
template_nesting = 0 # depth of SINGLE BRACES { { nesting } }
# Because there is the possibility of triple curly braces
Expand All @@ -2352,9 +2363,13 @@ def inflection_template_fn(
# about the outer-most delimiters (the highest level template)
# we can just count the single braces when those single
# braces are part of a group.
table_nesting = 0
# However, if we have a stray table ({| ... |}) that should always
# be its own section, and should prevent templates from cutting it
# into sections.

# print(f"Parse inflection: {text=}")
# print(repr(brace_matches))
# print(f"Brace matches: {repr('///'.join(brace_matches))}")
if len(brace_matches) > 1:
tsection: list[str] = []
after_templates = False # kludge to keep any text
Expand All @@ -2368,25 +2383,49 @@ def inflection_template_fn(
template_sections.append(tsection)
tsection = []
tsection.append(m)
elif m.startswith("{{"):
if template_nesting == 0 and after_templates:
elif m.startswith("{{") or m.endswith("{|"):
if (
template_nesting == 0
and after_templates
and table_nesting == 0
):
template_sections.append(tsection)
tsection = []
# start new section
after_templates = True
template_nesting += len(m)
if m.startswith("{{"):
template_nesting += 1
else:
# m.endswith("{|")
table_nesting += 1
tsection.append(m)
elif m.startswith("}}"):
template_nesting -= len(m)
if template_nesting < 0:
wxr.wtp.error(
"Negatively nested braces, "
"couldn't split inflection templates, "
"{}/{} section {}".format(word, language, section),
sortid="page/1871",
)
template_sections = [] # use whole text
break
elif m.startswith("}}") or m.endswith("|}"):
if m.startswith("}}"):
template_nesting -= 1
if template_nesting < 0:
wxr.wtp.error(
"Negatively nested braces, "
"couldn't split inflection templates, "
"{}/{} section {}".format(
word, language, section
),
sortid="page/1871",
)
template_sections = [] # use whole text
break
else:
table_nesting -= 1
if table_nesting < 0:
wxr.wtp.error(
"Negatively nested table braces, "
"couldn't split inflection section, "
"{}/{} section {}".format(
word, language, section
),
sortid="page/20250114",
)
template_sections = [] # use whole text
break
tsection.append(m)
else:
tsection.append(m)
Expand Down
Loading

0 comments on commit e6a833d

Please sign in to comment.