diff --git a/src/wiktextract/clean.py b/src/wiktextract/clean.py
index 7cb7e46da..1f078fefd 100644
--- a/src/wiktextract/clean.py
+++ b/src/wiktextract/clean.py
@@ -9,13 +9,12 @@
import re
import html
import unicodedata
-from typing import (
- Callable,
- Optional,
- Union
-)
+from typing import Callable, Optional, Union
from wikitextprocessor.common import MAGIC_FIRST, MAGIC_LAST
-from wikitextprocessor.core import NamespaceDataEntry
+from wikitextprocessor.core import (
+ NamespaceDataEntry,
+ TemplateArgs,
+)
from .wxr_context import WiktextractContext
######################################################################
@@ -94,7 +93,7 @@
"ι": "ᶥ",
"φ": "ᵠ",
"χ": "ᵡ",
- "∞": "\u2002᪲" # This is a KLUDGE
+ "∞": "\u2002᪲", # This is a KLUDGE
}
subscript_ht: dict[str, str] = {
@@ -137,6 +136,7 @@
"χ": "ᵪ",
}
+
def to_superscript(text: str) -> str:
"Converts text to superscript."
if not text:
@@ -147,6 +147,7 @@ def to_superscript(text: str) -> str:
return "^" + text
return "^({})".format(text)
+
def to_subscript(text: str) -> str:
"""Converts text to subscript."""
if not text:
@@ -157,10 +158,11 @@ def to_subscript(text: str) -> str:
return "_" + text
return "_({})".format(text)
+
def to_chem(text: str) -> str:
"""Converts text to chemical formula, making digits subscript."""
- return "".join(to_subscript(x) if x.isdigit() else x
- for x in text)
+ return "".join(to_subscript(x) if x.isdigit() else x for x in text)
+
# Mapping from Latex names to Unicode characters/strings. This is the
# default mapping (some cases are handled specially in the code).
@@ -886,7 +888,6 @@ def to_chem(text: str) -> str:
"zpipe": "⨠",
"zproject": "⨡",
"|": "‖",
-
# Accents XXX these really should be handled specially with diacritics
# after argument
"acute": "́",
@@ -906,8 +907,6 @@ def to_chem(text: str) -> str:
"overline": "◌̅",
"tilde": "̃",
"vec": "⃑",
-
-
# Some ignored operators
"bigl": "",
"bigr": "",
@@ -973,7 +972,7 @@ def to_chem(text: str) -> str:
"z": "𝓏",
}
-mathfrak_map: dict[str, str]= {
+mathfrak_map: dict[str, str] = {
"A": "𝔄",
"B": "𝔅",
"C": "ℭ",
@@ -1070,15 +1069,19 @@ def to_chem(text: str) -> str:
"9": "𝟡",
}
+
def mathcal_fn(text: str) -> str:
return "".join(mathcal_map.get(x, x) for x in text)
+
def mathfrak_fn(text: str) -> str:
return "".join(mathfrak_map.get(x, x) for x in text)
+
def mathbb_fn(text: str) -> str:
return "".join(mathbb_map.get(x, x) for x in text)
+
def to_math(text: str) -> str:
"""Converts a mathematical formula to ASCII."""
# print("to_math: {!r}".format(text))
@@ -1088,22 +1091,25 @@ def expand(text: str) -> str:
while True:
orig = text
# formatting with {:c} converts input into character
- text = re.sub(r"[{:c}-{:c}]".format(MAGIC_FIRST, MAGIC_LAST),
- lambda m: magic_vec[ord(m.group(0)) - MAGIC_FIRST],
- text)
+ text = re.sub(
+ r"[{:c}-{:c}]".format(MAGIC_FIRST, MAGIC_LAST),
+ lambda m: magic_vec[ord(m.group(0)) - MAGIC_FIRST],
+ text,
+ )
if text == orig:
break
return text
def recurse(text: str) -> str:
- def math_magic(text: str,
- left: str,
- right: str,
- fn: Callable[[str], str]
+ def math_magic(
+ text: str, left: str, right: str, fn: Callable[[str], str]
) -> str:
regexp_str = r"{}([^{}{}]+){}".format(
- re.escape(left), re.escape(left),
- re.escape(right), re.escape(right))
+ re.escape(left),
+ re.escape(left),
+ re.escape(right),
+ re.escape(right),
+ )
regexp = re.compile(regexp_str)
def repl(m: re.Match) -> str:
@@ -1150,8 +1156,11 @@ def expand_group(v: str) -> str:
elif re.match(r"\\sqrt($|[0-9]|\b)", v):
v = "√"
elif re.match(r"\\(frac|binom)($|[0-9]|\b)", v):
- m = re.match(r"\\(frac|binom)\s*(\\[a-zA-Z]+|\\.|.)\s*"
- r"(\\[a-zA-Z]+|\\.|.)$", v)
+ m = re.match(
+ r"\\(frac|binom)\s*(\\[a-zA-Z]+|\\.|.)\s*"
+ r"(\\[a-zA-Z]+|\\.|.)$",
+ v,
+ )
if not m:
print("MATH FRAC/BINOM ERROR: {!r}".format(v))
return v
@@ -1198,31 +1207,37 @@ def expand_group(v: str) -> str:
text = math_magic(text, "{", "}", recurse)
if text == orig:
break
- for m in re.finditer(r"\s+|"
- r"\\frac\s*(\\[a-zA-Z]+|\\.|.)\s*"
- r"(\\dot\\(bigvee|cup|cap|lor|vee)|"
- r"\\not\\(subset|supset|subseteq|supseteq|in|ni|"
- r"preceq|succeq|vartrianglelefteq|"
- r"vartrianglerighteq|trianglelefteq|"
- r"trianglerighteq)|"
- r"\\widehat\{=\}|\\widehat=|"
- r"\\overset\{?\}\{=\}|"
- r"\\overset\?=|"
- r"\\overset\{\\operatorname\{def\}\}\{=\}|"
- r"\\[a-zA-Z]+|\\.|.)|"
- r"(\\(mathcal|mathfrak|mathbb|text|begin|end|pmod)"
- r"\b\s*|"
- r"\\sqrt\b(\[\d+\])?)?"
- r"[_^]?(\\[a-zA-Z]+\s*|\\.|\w+|.)", text):
+ for m in re.finditer(
+ r"\s+|"
+ r"\\frac\s*(\\[a-zA-Z]+|\\.|.)\s*"
+ r"(\\dot\\(bigvee|cup|cap|lor|vee)|"
+ r"\\not\\(subset|supset|subseteq|supseteq|in|ni|"
+ r"preceq|succeq|vartrianglelefteq|"
+ r"vartrianglerighteq|trianglelefteq|"
+ r"trianglerighteq)|"
+ r"\\widehat\{=\}|\\widehat=|"
+ r"\\overset\{?\}\{=\}|"
+ r"\\overset\?=|"
+ r"\\overset\{\\operatorname\{def\}\}\{=\}|"
+ r"\\[a-zA-Z]+|\\.|.)|"
+ r"(\\(mathcal|mathfrak|mathbb|text|begin|end|pmod)"
+ r"\b\s*|"
+ r"\\sqrt\b(\[\d+\])?)?"
+ r"[_^]?(\\[a-zA-Z]+\s*|\\.|\w+|.)",
+ text,
+ ):
v = m.group(0).strip()
if not v:
continue
v = expand_group(v)
if v:
- if ((parts and parts[-1][-1].isalpha() and
- v[0] in "0123456789") or
- (parts and parts[-1][-1] in "0123456789" and
- v[0] in "0123456789")):
+ if (
+ parts and parts[-1][-1].isalpha() and v[0] in "0123456789"
+ ) or (
+ parts
+ and parts[-1][-1] in "0123456789"
+ and v[0] in "0123456789"
+ ):
v = " " + v
parts.append(v)
@@ -1237,7 +1252,7 @@ def expand_group(v: str) -> str:
def bold_follows(parts: list[str], i: int) -> bool:
"""Checks if there is a bold (''') in parts after parts[i]. We allow
intervening italics ('')."""
- parts = parts[i + 1:]
+ parts = parts[i + 1 :]
for p in parts:
if not p.startswith("''"):
continue
@@ -1308,13 +1323,12 @@ def remove_italic_and_bold(text: str) -> str:
continue
new_text_parts.append(part)
new_text_parts.append("\n")
- new_text_parts = new_text_parts[:-1] # remove last \n
+ new_text_parts = new_text_parts[:-1] # remove last \n
return "".join(new_text_parts)
-def clean_value(wxr: WiktextractContext,
- title: str,
- no_strip=False,
- no_html_strip=False
+
+def clean_value(
+ wxr: WiktextractContext, title: str, no_strip=False, no_html_strip=False
) -> str:
"""Cleans a title or value into a normal string. This should basically
remove any Wikimedia formatting from it: HTML tags, templates, links,
@@ -1334,17 +1348,18 @@ def repl_exturl(m: re.Match) -> str:
break
i += 1
return " ".join(args[i:])
+
def repl_link(m: re.Match) -> str:
if m.group(2) and m.group(2).lower() in ("file", "image"):
return ""
v = m.group(3).split("|")
return clean_value(wxr, v[0], no_strip=True)
+
def repl_link_bars(m: re.Match) -> str:
lnk = m.group(1)
if re.match(r"(?si)(File|Image)\s*:", lnk):
return ""
- return clean_value(wxr, m.group(4) or m.group(2) or "",
- no_strip=True)
+ return clean_value(wxr, m.group(4) or m.group(2) or "", no_strip=True)
def repl_1_sup(m: re.Match) -> str:
return to_superscript(clean_value(wxr, m.group(1)))
@@ -1373,34 +1388,47 @@ def repl_1_syntaxhighlight(m: re.Match) -> str:
    # Remove references (<ref>...</ref>).
    title = re.sub(r"(?is)<ref\b[^>]*?>\s*.*?</ref\s*>", "", title)
    # Replace <span>...</span> by stripped content without newlines
-    title = re.sub(r"(?is)<span\b[^>]*?>(.*?)\s*</span\s*>",
-                   lambda m: re.sub(r"\s+", " ", m.group(1)),
-                   title)
+    title = re.sub(
+        r"(?is)<span\b[^>]*?>(.*?)\s*</span\s*>",
+        lambda m: re.sub(r"\s+", " ", m.group(1)),
+        title,
+    )
    # Replace <br/> by comma space (it is used to express alternatives in some
    # declensions)
    title = re.sub(r"(?si)\s*<br\s*/?>\n*", "\n", title)
# Remove divs with floatright class (generated e.g. by {{ja-kanji|...}})
-    title = re.sub(r'(?si)<div\b[^>]*?\bclass="[^"]*?\bfloatright\b[^>]*?>'
-                   r'(((<div\b[^>]*>|.)*?</div\s*>)|.)*?'
-                   r'</div\s*>',
-                   "", title)
+    title = re.sub(
+        r'(?si)<div\b[^>]*?\bclass="[^"]*?\bfloatright\b[^>]*?>'
+        r"(((<div\b[^>]*>|.)*?</div\s*>)|.)*?"
+        r"</div\s*>",
+        "",
+        title,
+    )
# Remove divs with float: attribute
-    title = re.sub(r'(?si)<div\b[^>]*?\bstyle="[^"]*?\bfloat:[^>]*?>'
-                   r'(((<div\b[^>]*>|.)*?</div\s*>)|.)*?'
-                   r'</div\s*>',
-                   "", title)
+    title = re.sub(
+        r'(?si)<div\b[^>]*?\bstyle="[^"]*?\bfloat:[^>]*?>'
+        r"(((<div\b[^>]*>|.)*?</div\s*>)|.)*?"
+        r"</div\s*>",
+        "",
+        title,
+    )
    # Remove <span> with previewonly class (generated e.g. by {{taxlink|...}})
-    title = re.sub(r'(?si)<span\b[^>]*?\bclass="[^"<>]*?'
-                   r'\bpreviewonly\b[^>]*?>'
-                   r'.+?</span\s*>',
-                   "", title)
+    title = re.sub(
+        r'(?si)<span\b[^>]*?\bclass="[^"<>]*?'
+        r"\bpreviewonly\b[^>]*?>"
+        r".+?</span\s*>",
+        "",
+        title,
+    )
    # Remove <strong class="error">...</strong>
-    title = re.sub(r'(?si)<strong\b[^>]*?\bclass="[^"]*?\berror\b[^>]*?>'
-                   r'.+?</strong\s*>',
-                   "", title)
+    title = re.sub(
+        r'(?si)<strong\b[^>]*?\bclass="[^"]*?\berror\b[^>]*?>'
+        r".+?</strong\s*>",
+        "",
+        title,
+    )
    # Change <div> and </div> to newlines. Ditto for tr, li, table, dl, ul, ol
-    title = re.sub(r"(?si)</?(div|tr|li|table|dl|ul|ol)\b[^>]*>",
-                   "\n", title)
+    title = re.sub(r"(?si)</?(div|tr|li|table|dl|ul|ol)\b[^>]*>", "\n", title)
    # Change <dt>, <dd>, </dt> and </dd> into newlines;
    # these generate new rows/lines.
    title = re.sub(r"(?i)</?d[dt]\s*>", "\n", title)
@@ -1408,22 +1436,20 @@ def repl_1_syntaxhighlight(m: re.Match) -> str:
    title = re.sub(r"(?si)</?(td|th)\b[^>]*>", " ", title)
    # Change <sup>...</sup> to ^
    title = re.sub(r"(?si)<sup\b[^>]*>\s*</sup>", "", title)
-    title = re.sub(r"(?si)<sup\b[^>]*>(.*?)</sup>",
-                   repl_1_sup, title)
+    title = re.sub(r"(?si)<sup\b[^>]*>(.*?)</sup>", repl_1_sup, title)
    # Change <sub>...</sub> to _
    title = re.sub(r"(?si)<sub\b[^>]*>\s*</sub>", "", title)
-    title = re.sub(r"(?si)<sub\b[^>]*>(.*?)</sub>",
-                   repl_1_sub, title)
+    title = re.sub(r"(?si)<sub\b[^>]*>(.*?)</sub>", repl_1_sub, title)
    # Change <chem>...</chem> using subscripts for digits
-    title = re.sub(r"(?si)<chem\b[^>]*>(.*?)</chem>",
-                   repl_1_chem, title)
+    title = re.sub(r"(?si)<chem\b[^>]*>(.*?)</chem>", repl_1_chem, title)
    # Change <math>...</math> using special formatting.
-    title = re.sub(r"(?si)<math\b[^>]*>(.*?)</math>",
-                   repl_1_math, title)
+    title = re.sub(r"(?si)<math\b[^>]*>(.*?)</math>", repl_1_math, title)
    # Change <syntaxhighlight>...</syntaxhighlight> using special formatting.
-    title = re.sub(r"(?si)<syntaxhighlight\b[^>]*>(.*?)"
-                   r"</syntaxhighlight>",
-                   repl_1_syntaxhighlight, title)
+    title = re.sub(
+        r"(?si)<syntaxhighlight\b[^>]*>(.*?)" r"</syntaxhighlight>",
+        repl_1_syntaxhighlight,
+        title,
+    )
# Remove any remaining HTML tags.
if not no_html_strip:
title = re.sub(r"(?s)<[/!a-zA-Z][^>]*>", "", title)
@@ -1441,7 +1467,7 @@ def repl_1_syntaxhighlight(m: re.Match) -> str:
category_ns_data: NamespaceDataEntry
# XXX "Category" -> config variable for portability
- category_ns_data = wxr.wtp.NAMESPACE_DATA.get("Category", {}) # type: ignore[typeddict-item]
+ category_ns_data = wxr.wtp.NAMESPACE_DATA.get("Category", {}) # type: ignore[typeddict-item]
# Fail if we received empty dict from .get()
category_ns_names = {category_ns_data["name"]} | set(
category_ns_data["aliases"]
@@ -1455,22 +1481,30 @@ def repl_1_syntaxhighlight(m: re.Match) -> str:
"",
title,
)
- title = re.sub(r"(?s)\[\[\s*:?([^]|#<>]+?)\s*(#[^][|<>]*?)?\]\]",
- repl_1, title)
- title = re.sub(r"(?s)\[\[\s*(([a-zA-Z0-9]+)\s*:)?\s*([^][#|<>]+?)"
- r"\s*(#[^][|]*?)?\|?\]\]",
- repl_link, title)
- title = re.sub(r"(?s)\[\[\s*([^][|<>]+?)\s*\|"
- r"\s*(([^][|]|\[[^]]*\])+?)"
- r"(\s*\|\s*(([^]|]|\[[^]]*\])+?))*\s*\]\]",
- repl_link_bars, title)
+ title = re.sub(
+ r"(?s)\[\[\s*:?([^]|#<>]+?)\s*(#[^][|<>]*?)?\]\]", repl_1, title
+ )
+ title = re.sub(
+ r"(?s)\[\[\s*(([a-zA-Z0-9]+)\s*:)?\s*([^][#|<>]+?)"
+ r"\s*(#[^][|]*?)?\|?\]\]",
+ repl_link,
+ title,
+ )
+ title = re.sub(
+ r"(?s)\[\[\s*([^][|<>]+?)\s*\|"
+ r"\s*(([^][|]|\[[^]]*\])+?)"
+ r"(\s*\|\s*(([^]|]|\[[^]]*\])+?))*\s*\]\]",
+ repl_link_bars,
+ title,
+ )
if title == orig:
break
# Replace remaining HTML links by the URL.
while True:
orig = title
- title = re.sub(r"\[\s*((https?:|mailto:)?//([^][]+?))\s*\]",
- repl_exturl, title)
+ title = re.sub(
+ r"\[\s*((https?:|mailto:)?//([^][]+?))\s*\]", repl_exturl, title
+ )
if title == orig:
break
@@ -1508,14 +1542,16 @@ def repl_1_syntaxhighlight(m: re.Match) -> str:
return title
-def clean_template_args(wxr: WiktextractContext,
- ht: dict[Union[int, str], str], # XXX -> "TemplateArgs"
- no_strip=False
+def clean_template_args(
+ wxr: WiktextractContext, ht: TemplateArgs, no_strip=False
) -> dict[str, str]:
"""Cleans all values in a template argument dictionary and returns the
cleaned dictionary."""
assert isinstance(wxr, WiktextractContext)
assert isinstance(ht, dict)
- return {clean_value(wxr, str(k), no_html_strip=True):
- clean_value(wxr, str(v), no_strip=no_strip, no_html_strip=True)
- for k, v in ht.items()}
+ return {
+ clean_value(wxr, str(k), no_html_strip=True): clean_value(
+ wxr, str(v), no_strip=no_strip, no_html_strip=True
+ )
+ for k, v in ht.items()
+ }
diff --git a/src/wiktextract/extractor/en/page.py b/src/wiktextract/extractor/en/page.py
index 00e924b46..23156cb44 100644
--- a/src/wiktextract/extractor/en/page.py
+++ b/src/wiktextract/extractor/en/page.py
@@ -11,14 +11,22 @@
from functools import partial
from re import Pattern
from typing import (
+ TYPE_CHECKING,
+ Callable,
Optional,
Set,
Union,
+ cast,
)
from mediawiki_langcodes import get_all_names, name_to_code
from wikitextprocessor import NodeKind, WikiNode
-from wikitextprocessor.core import TemplateArgs
+from wikitextprocessor.core import (
+ TemplateArgs,
+ TemplateFnCallable,
+ PostTemplateFnCallable,
+)
+from wikitextprocessor.parser import GeneralNode
from wiktextract.clean import clean_template_args
from wiktextract.datautils import (
data_append,
@@ -44,7 +52,11 @@
from wiktextract.parts_of_speech import PARTS_OF_SPEECH
from wiktextract.tags import valid_tags
from wiktextract.translations import parse_translation_item_text
-from wiktextract.type_utils import WordData
+from wiktextract.type_utils import (
+ SenseData,
+ SoundData,
+ WordData,
+)
from wiktextract.wxr_context import WiktextractContext
from ..ruby import extract_ruby, parse_ruby
@@ -53,174 +65,177 @@
# Matches head tag
HEAD_TAG_RE: Pattern = re.compile(
- r"^(head|Han char|arabic-noun|arabic-noun-form|"
- r"hangul-symbol|syllable-hangul)$|" +
- r"^(latin|" +
- "|".join(lang_code for lang_code, *_ in get_all_names("en")) +
- r")-(" +
- "|".join([
- "abbr",
- "adj",
- "adjective",
- "adjective form",
- "adjective-form",
- "adv",
- "adverb",
- "affix",
- "animal command",
- "art",
- "article",
- "aux",
- "bound pronoun",
- "bound-pronoun",
- "Buyla",
- "card num",
- "card-num",
- "cardinal",
- "chunom",
- "classifier",
- "clitic",
- "cls",
- "cmene",
- "cmavo",
- "colloq-verb",
- "colverbform",
- "combining form",
- "combining-form",
- "comparative",
- "con",
- "concord",
- "conj",
- "conjunction",
- "conjug",
- "cont",
- "contr",
- "converb",
- "daybox",
- "decl",
- "decl noun",
- "def",
- "dem",
- "det",
- "determ",
- "Deva",
- "ending",
- "entry",
- "form",
- "fuhivla",
- "gerund",
- "gismu",
- "hanja",
- "hantu",
- "hanzi",
- "head",
- "ideophone",
- "idiom",
- "inf",
- "indef",
- "infixed pronoun",
- "infixed-pronoun",
- "infl",
- "inflection",
- "initialism",
- "int",
- "interfix",
- "interj",
- "interjection",
- "jyut",
- "latin",
- "letter",
- "locative",
- "lujvo",
- "monthbox",
- "mutverb",
- "name",
- "nisba",
- "nom",
- "noun",
- "noun form",
- "noun-form",
- "noun plural",
- "noun-plural",
- "nounprefix",
- "num",
- "number",
- "numeral",
- "ord",
- "ordinal",
- "par",
- "part",
- "part form",
- "part-form",
- "participle",
- "particle",
- "past",
- "past neg",
- "past-neg",
- "past participle",
- "past-participle",
- "perfect participle",
- "perfect-participle",
- "personal pronoun",
- "personal-pronoun",
- "pref",
- "prefix",
- "phrase",
- "pinyin",
- "plural noun",
- "plural-noun",
- "pos",
- "poss-noun",
- "post",
- "postp",
- "postposition",
- "PP",
- "pp",
- "ppron",
- "pred",
- "predicative",
- "prep",
- "prep phrase",
- "prep-phrase",
- "preposition",
- "present participle",
- "present-participle",
- "pron",
- "prondem",
- "pronindef",
- "pronoun",
- "prop",
- "proper noun",
- "proper-noun",
- "proper noun form",
- "proper-noun form",
- "proper noun-form",
- "proper-noun-form",
- "prov",
- "proverb",
- "prpn",
- "prpr",
- "punctuation mark",
- "punctuation-mark",
- "regnoun",
- "rel",
- "rom",
- "romanji",
- "root",
- "sign",
- "suff",
- "suffix",
- "syllable",
- "symbol",
- "verb",
- "verb form",
- "verb-form",
- "verbal noun",
- "verbal-noun",
- "verbnec",
- "vform",
- ]) +
- r")(-|/|\+|$)")
+ r"^(head|Han char|arabic-noun|arabic-noun-form|"
+ r"hangul-symbol|syllable-hangul)$|"
+ + r"^(latin|"
+ + "|".join(lang_code for lang_code, *_ in get_all_names("en"))
+ + r")-("
+ + "|".join(
+ [
+ "abbr",
+ "adj",
+ "adjective",
+ "adjective form",
+ "adjective-form",
+ "adv",
+ "adverb",
+ "affix",
+ "animal command",
+ "art",
+ "article",
+ "aux",
+ "bound pronoun",
+ "bound-pronoun",
+ "Buyla",
+ "card num",
+ "card-num",
+ "cardinal",
+ "chunom",
+ "classifier",
+ "clitic",
+ "cls",
+ "cmene",
+ "cmavo",
+ "colloq-verb",
+ "colverbform",
+ "combining form",
+ "combining-form",
+ "comparative",
+ "con",
+ "concord",
+ "conj",
+ "conjunction",
+ "conjug",
+ "cont",
+ "contr",
+ "converb",
+ "daybox",
+ "decl",
+ "decl noun",
+ "def",
+ "dem",
+ "det",
+ "determ",
+ "Deva",
+ "ending",
+ "entry",
+ "form",
+ "fuhivla",
+ "gerund",
+ "gismu",
+ "hanja",
+ "hantu",
+ "hanzi",
+ "head",
+ "ideophone",
+ "idiom",
+ "inf",
+ "indef",
+ "infixed pronoun",
+ "infixed-pronoun",
+ "infl",
+ "inflection",
+ "initialism",
+ "int",
+ "interfix",
+ "interj",
+ "interjection",
+ "jyut",
+ "latin",
+ "letter",
+ "locative",
+ "lujvo",
+ "monthbox",
+ "mutverb",
+ "name",
+ "nisba",
+ "nom",
+ "noun",
+ "noun form",
+ "noun-form",
+ "noun plural",
+ "noun-plural",
+ "nounprefix",
+ "num",
+ "number",
+ "numeral",
+ "ord",
+ "ordinal",
+ "par",
+ "part",
+ "part form",
+ "part-form",
+ "participle",
+ "particle",
+ "past",
+ "past neg",
+ "past-neg",
+ "past participle",
+ "past-participle",
+ "perfect participle",
+ "perfect-participle",
+ "personal pronoun",
+ "personal-pronoun",
+ "pref",
+ "prefix",
+ "phrase",
+ "pinyin",
+ "plural noun",
+ "plural-noun",
+ "pos",
+ "poss-noun",
+ "post",
+ "postp",
+ "postposition",
+ "PP",
+ "pp",
+ "ppron",
+ "pred",
+ "predicative",
+ "prep",
+ "prep phrase",
+ "prep-phrase",
+ "preposition",
+ "present participle",
+ "present-participle",
+ "pron",
+ "prondem",
+ "pronindef",
+ "pronoun",
+ "prop",
+ "proper noun",
+ "proper-noun",
+ "proper noun form",
+ "proper-noun form",
+ "proper noun-form",
+ "proper-noun-form",
+ "prov",
+ "proverb",
+ "prpn",
+ "prpr",
+ "punctuation mark",
+ "punctuation-mark",
+ "regnoun",
+ "rel",
+ "rom",
+ "romanji",
+ "root",
+ "sign",
+ "suff",
+ "suffix",
+ "syllable",
+ "symbol",
+ "verb",
+ "verb form",
+ "verb-form",
+ "verbal noun",
+ "verbal-noun",
+ "verbnec",
+ "vform",
+ ]
+ )
+ + r")(-|/|\+|$)"
+)
FLOATING_TABLE_TEMPLATES: set[str] = {
# az-suffix-form creates a style=floatright div that is otherwise
@@ -439,8 +454,11 @@
"wtorw",
}
for x in PANEL_PREFIXES & wikipedia_templates:
- print("WARNING: {!r} in both panel_templates and wikipedia_templates"
- .format(x))
+ print(
+ "WARNING: {!r} in both panel_templates and wikipedia_templates".format(
+ x
+ )
+ )
# Mapping from a template name (without language prefix) for the main word
# (e.g., fi-noun, fi-adj, en-verb) to permitted parts-of-speech in which
@@ -482,8 +500,10 @@
for k, v in template_allowed_pos_map.items():
for x in v:
if x not in PARTS_OF_SPEECH:
- print("BAD PART OF SPEECH {!r} IN template_allowed_pos_map: {}={}"
- "".format(x, k, v))
+ print(
+ "BAD PART OF SPEECH {!r} IN template_allowed_pos_map: {}={}"
+ "".format(x, k, v)
+ )
assert False
@@ -526,9 +546,10 @@
# Regexp for matching ignored etymology template names. This adds certain
# prefixes to the names listed above.
ignored_etymology_templates_re = re.compile(
- r"^((cite-|R:|RQ:).*|" +
- r"|".join(re.escape(x) for x in ignored_etymology_templates) +
- r")$")
+ r"^((cite-|R:|RQ:).*|"
+ + r"|".join(re.escape(x) for x in ignored_etymology_templates)
+ + r")$"
+)
# Regexp for matching ignored descendants template names. Right now we just
# copy the ignored etymology templates
@@ -618,19 +639,38 @@
# Template name component to linkage section listing. Integer section means
# default section, starting at that argument.
-template_linkage_mappings: list[list[Union[str, int]]] = [
- ["syn", "synonyms"],
- ["synonyms", "synonyms"],
- ["ant", "antonyms"],
- ["antonyms", "antonyms"],
- ["hyp", "hyponyms"],
- ["hyponyms", "hyponyms"],
- ["der", "derived"],
- ["derived terms", "derived"],
- ["coordinate terms", "coordinate_terms"],
- ["rel", "related"],
- ["col", 2],
-]
+# XXX not used anymore, except for the first elements: moved to
+# template_linkages
+# template_linkage_mappings: list[list[Union[str, int]]] = [
+# ["syn", "synonyms"],
+# ["synonyms", "synonyms"],
+# ["ant", "antonyms"],
+# ["antonyms", "antonyms"],
+# ["hyp", "hyponyms"],
+# ["hyponyms", "hyponyms"],
+# ["der", "derived"],
+# ["derived terms", "derived"],
+# ["coordinate terms", "coordinate_terms"],
+# ["rel", "related"],
+# ["col", 2],
+# ]
+
+# Template names; this was extracted from template_linkage_mappings,
+# because the code using template_linkage_mappings was actually not used
+# (but not removed).
+template_linkages: set[str] = {
+ "syn",
+ "synonyms",
+ "ant",
+ "antonyms",
+ "hyp",
+ "hyponyms",
+ "der",
+ "derived terms",
+ "coordinate terms",
+ "rel",
+ "col",
+}
# Maps template name used in a word sense to a linkage field that it adds.
sense_linkage_templates: dict[str, str] = {
@@ -655,11 +695,11 @@ def decode_html_entities(v: Union[str, int]) -> str:
return html.unescape(v)
-def parse_sense_linkage(wxr:
- WiktextractContext,
- data: WordData,
- name: str,
- ht: TemplateArgs,
+def parse_sense_linkage(
+ wxr: WiktextractContext,
+ data: SenseData,
+ name: str,
+ ht: TemplateArgs,
) -> None:
"""Parses a linkage (synonym, etc) specified in a word sense."""
assert isinstance(wxr, WiktextractContext)
@@ -670,13 +710,15 @@ def parse_sense_linkage(wxr:
for i in range(2, 20):
w = ht.get(i) or ""
w = clean_node(wxr, data, w)
- if w.startswith(ns_title_prefix_tuple(wxr, "Thesaurus")):
- w = w[10:]
+ for alias in ns_title_prefix_tuple(wxr, "Thesaurus"):
+ if w.startswith(alias):
+ w = w[len(alias) :]
+ break
if not w:
break
tags: list[str] = []
topics: list[str] = []
- english = None
+ english: Optional[str] = None
# Try to find qualifiers for this synonym
q = ht.get("q{}".format(i - 1))
if q:
@@ -703,7 +745,7 @@ def parse_sense_linkage(wxr:
alt = None
m = re.search(r"\(([^)]+)\)$", w)
if m:
- w = w[:m.start()].strip()
+ w = w[: m.start()].strip()
alt = m.group(1)
dt = {"word": w}
@@ -718,15 +760,15 @@ def parse_sense_linkage(wxr:
data_append(data, field, dt)
-def parse_language(wxr: WiktextractContext,
- langnode: WikiNode,
- language: str,
- lang_code: str) -> list[WordData]:
+def parse_language(
+ wxr: WiktextractContext, langnode: WikiNode, language: str, lang_code: str
+) -> list[WordData]:
"""Iterates over the text of the page, returning words (parts-of-speech)
defined on the page one at a time. (Individual word senses for the
same part-of-speech are typically encoded in the same entry.)"""
# imported here to avoid circular import
from wiktextract.pronunciations import parse_pronunciation
+
assert isinstance(wxr, WiktextractContext)
assert isinstance(langnode, WikiNode)
assert isinstance(language, str)
@@ -737,85 +779,110 @@ def parse_language(wxr: WiktextractContext,
word = wxr.wtp.title
unsupported_prefix = "Unsupported titles/"
if word.startswith(unsupported_prefix):
- w = word[len(unsupported_prefix):]
+ w = word[len(unsupported_prefix) :]
if w in unsupported_title_map:
word = unsupported_title_map[w]
else:
- wxr.wtp.error("Unimplemented unsupported title: {}".format(word),
- sortid="page/870")
+ wxr.wtp.error(
+ "Unimplemented unsupported title: {}".format(word),
+ sortid="page/870",
+ )
word = w
elif word.startswith("Reconstruction:"):
- word = word[word.find("/") + 1:]
+ word = word[word.find("/") + 1 :]
is_reconstruction = True
- base_data = {"word": word, "lang": language, "lang_code": lang_code}
+ base_data: WordData = {
+ "word": word,
+ "lang": language,
+ "lang_code": lang_code,
+ }
if is_reconstruction:
data_append(base_data, "tags", "reconstruction")
- sense_data = {}
- pos_data = {} # For a current part-of-speech
- etym_data = {} # For one etymology
- pos_datas = []
- etym_datas = []
- page_datas = []
+ sense_data: SenseData = {}
+ pos_data: WordData = {} # For a current part-of-speech
+ etym_data: WordData = {} # For one etymology
+ pos_datas: list[SenseData] = []
+ etym_datas: list[WordData] = []
+ page_datas: list[WordData] = []
have_etym = False
- stack = []
+ stack: list[str] = [] # names of items on the "stack"
- def merge_base(data, base):
+ def merge_base(data: WordData, base: WordData) -> None:
for k, v in base.items():
# Copy the value to ensure that we don't share lists or
# dicts between structures (even nested ones).
v = copy.deepcopy(v)
if k not in data:
# The list was copied above, so this will not create shared ref
- data[k] = v
+ data[k] = v # type: ignore[literal-required]
continue
- if data[k] == v:
+ if data[k] == v: # type: ignore[literal-required]
continue
- if (isinstance(data[k], (list, tuple)) or
- isinstance(v, (list, tuple))):
- data[k] = list(data[k]) + list(v)
- elif data[k] != v:
- wxr.wtp.warning("conflicting values for {} in merge_base: "
- "{!r} vs {!r}"
- .format(k, data[k], v),
- sortid="page/904")
-
- def complementary_pop(pron, key):
+ if (
+ isinstance(data[k], (list, tuple)) # type: ignore[literal-required]
+ or isinstance(
+ v,
+ (list, tuple), # Should this be "and"?
+ )
+ ):
+ data[k] = list(data[k]) + list(v) # type: ignore
+ elif data[k] != v: # type: ignore[literal-required]
+ wxr.wtp.warning(
+ "conflicting values for {} in merge_base: "
+ "{!r} vs {!r}".format(k, data[k], v), # type: ignore[literal-required]
+ sortid="page/904",
+ )
+
+ def complementary_pop(pron: SoundData, key: str) -> SoundData:
"""Remove unnecessary keys from dict values
in a list comprehension..."""
if key in pron:
- pron.pop(key)
+ pron.pop(key) # type: ignore
return pron
# If the result has sounds, eliminate sounds that have a prefix that
# does not match "word" or one of "forms"
if "sounds" in data and "word" in data:
accepted = [data["word"]]
- accepted.extend(f["form"] for f in data.get("forms", ()))
- data["sounds"] = list(complementary_pop(s, "pos")
- for s in data["sounds"]
- if "form" not in s or s["form"] in accepted)
+ accepted.extend(f["form"] for f in data.get("forms", dict()))
+ data["sounds"] = list(
+ s
+ for s in data["sounds"]
+ if "form" not in s or s["form"] in accepted
+ )
# If the result has sounds, eliminate sounds that have a pos that
# does not match "pos"
if "sounds" in data and "pos" in data:
- data["sounds"] = list(s for s in data["sounds"]
- if "pos" not in s or s["pos"] == data["pos"])
+ data["sounds"] = list(
+ complementary_pop(s, "pos")
+ for s in data["sounds"]
+ # "pos" is not a field of SoundData, correctly, so we're
+ # removing it here. It's a kludge on a kludge on a kludge.
+ if "pos" not in s or s["pos"] == data["pos"] # type: ignore[typeddict-item]
+ )
- def push_sense():
+ def push_sense() -> bool:
"""Starts collecting data for a new word sense. This returns True
if a sense was added."""
nonlocal sense_data
tags = sense_data.get("tags", ())
- if (not sense_data.get("glosses") and
- "translation-hub" not in tags and
- "no-gloss" not in tags):
+ if (
+ not sense_data.get("glosses")
+ and "translation-hub" not in tags
+ and "no-gloss" not in tags
+ ):
return False
- if (("participle" in sense_data.get("tags", ()) or
- "infinitive" in sense_data.get("tags", ())) and
- "alt_of" not in sense_data and
- "form_of" not in sense_data and
- "etymology_text" in etym_data):
+ if (
+ (
+ "participle" in sense_data.get("tags", ())
+ or "infinitive" in sense_data.get("tags", ())
+ )
+ and "alt_of" not in sense_data
+ and "form_of" not in sense_data
+ and "etymology_text" in etym_data
+ ):
etym = etym_data["etymology_text"]
etym = etym.split(". ")[0]
ret = parse_alt_or_inflection_of(wxr, etym, set())
@@ -829,28 +896,29 @@ def push_sense():
data_extend(sense_data, "alt_of", lst)
data_extend(sense_data, "tags", tags)
- if (not sense_data.get("glosses") and
- "no-gloss" not in sense_data.get("tags", ())):
+ if not sense_data.get("glosses") and "no-gloss" not in sense_data.get(
+ "tags", ()
+ ):
data_append(sense_data, "tags", "no-gloss")
pos_datas.append(sense_data)
sense_data = {}
return True
- def push_pos():
+ def push_pos() -> None:
"""Starts collecting data for a new part-of-speech."""
nonlocal pos_data
nonlocal pos_datas
push_sense()
if wxr.wtp.subsection:
- data = {"senses": pos_datas}
+ data: WordData = {"senses": pos_datas}
merge_base(data, pos_data)
etym_datas.append(data)
pos_data = {}
pos_datas = []
wxr.wtp.start_subsection(None)
- def push_etym():
+ def push_etym() -> None:
"""Starts collecting data for a new etymology."""
nonlocal etym_data
nonlocal etym_datas
@@ -863,7 +931,7 @@ def push_etym():
etym_data = {}
etym_datas = []
- def select_data():
+ def select_data() -> WordData:
"""Selects where to store data (pos or etym) based on whether we
are inside a pos (part-of-speech)."""
if wxr.wtp.subsection is not None:
@@ -872,7 +940,9 @@ def select_data():
return base_data
return etym_data
- def head_post_template_fn(name, ht, expansion):
+ def head_post_template_fn(
+ name: str, ht: TemplateArgs, expansion: str
+ ) -> Optional[str]:
"""Handles special templates in the head section of a word. Head
section is the text after part-of-speech subtitle and before word
sense list. Typically it generates the bold line for the word, but
@@ -934,15 +1004,15 @@ def head_post_template_fn(name, ht, expansion):
return None
- def parse_part_of_speech(posnode, pos):
+ def parse_part_of_speech(posnode: WikiNode, pos: str) -> None:
"""Parses the subsection for a part-of-speech under a language on
a page."""
assert isinstance(posnode, WikiNode)
assert isinstance(pos, str)
# print("parse_part_of_speech", pos)
pos_data["pos"] = pos
- pre = [[]] # list of lists
- lists = [[]] # list of lists
+ pre: list[list[Union[str, WikiNode]]] = [[]] # list of lists
+ lists: list[list[WikiNode]] = [[]] # list of lists
first_para = True
first_head_tmplt = True
collecting_head = True
@@ -965,13 +1035,13 @@ def parse_part_of_speech(posnode, pos):
floaters, poschildren = recursively_extract(
posnode.children,
lambda x: (
- isinstance(x, WikiNode) and
- x.kind == NodeKind.TEMPLATE and
- x.largs[0][0] in FLOATING_TABLE_TEMPLATES
- )
+ isinstance(x, WikiNode)
+ and x.kind == NodeKind.TEMPLATE
+ and x.largs[0][0] in FLOATING_TABLE_TEMPLATES
+ ),
)
tempnode = WikiNode(NodeKind.LEVEL5, 0)
- tempnode.largs = ['Inflection']
+ tempnode.largs = [["Inflection"]]
tempnode.children = floaters
parse_inflection(tempnode, "Floating Div", pos)
# print(poschildren)
@@ -981,12 +1051,12 @@ def parse_part_of_speech(posnode, pos):
if not floaters:
wxr.wtp.debug(
"PoS section without contents",
- sortid="en/page/1051/20230612"
+ sortid="en/page/1051/20230612",
)
else:
wxr.wtp.debug(
"PoS section without contents except for a floating table",
- sortid="en/page/1056/20230612"
+ sortid="en/page/1056/20230612",
)
return
@@ -1019,16 +1089,19 @@ def parse_part_of_speech(posnode, pos):
elif collecting_head and kind == NodeKind.LINK:
# We might collect relevant links as they are often pictures
# relating to the word
- if (len(node.largs[0]) >= 1 and
- isinstance(node.largs[0][0], str)):
- if node.largs[0][0].startswith(ns_title_prefix_tuple(
- wxr, "Category")):
+ if len(node.largs[0]) >= 1 and isinstance(
+ node.largs[0][0], str
+ ):
+ if node.largs[0][0].startswith(
+ ns_title_prefix_tuple(wxr, "Category")
+ ):
# [[Category:...]]
# We're at the end of the file, probably, so stop
# here. Otherwise the head will get garbage.
break
- if node.largs[0][0].startswith(ns_title_prefix_tuple(
- wxr, "File")):
+ if node.largs[0][0].startswith(
+ ns_title_prefix_tuple(wxr, "File")
+ ):
# Skips file links
continue
start_of_paragraph = False
@@ -1040,8 +1113,12 @@ def parse_part_of_speech(posnode, pos):
lists.append([]) # Lists parallels pre
collecting_head = True
start_of_paragraph = True
- elif (collecting_head and
- node.sarg not in ("gallery", "ref", "cite", "caption")):
+ elif collecting_head and node.sarg not in (
+ "gallery",
+ "ref",
+ "cite",
+ "caption",
+ ):
start_of_paragraph = False
pre[-1].append(node)
else:
@@ -1061,21 +1138,23 @@ def parse_part_of_speech(posnode, pos):
# skip these templates; panel_templates is already used
# to skip certain templates else, but it also applies to
# head parsing quite well.
- if is_panel_template(wxr, node.largs[0][0]):
+ # node.largs[0][0] should always be str, but can't type-check
+ # that.
+ if is_panel_template(wxr, node.largs[0][0]): # type: ignore[arg-type]
continue
# skip these templates
# if node.largs[0][0] in skip_these_templates_in_head:
- # first_head_tmplt = False # no first_head_tmplt at all
- # start_of_paragraph = False
- # continue
+ # first_head_tmplt = False # no first_head_tmplt at all
+ # start_of_paragraph = False
+ # continue
if first_head_tmplt and pre[-1]:
first_head_tmplt = False
start_of_paragraph = False
pre[-1].append(node)
elif pre[-1] and start_of_paragraph:
- pre.append([]) # Switch to the next head
- lists.append([]) # lists parallel pre
+ pre.append([]) # Switch to the next head
+ lists.append([]) # lists parallel pre
collecting_head = True
start_of_paragraph = False
pre[-1].append(node)
@@ -1092,8 +1171,8 @@ def parse_part_of_speech(posnode, pos):
# Clean up empty pairs, and fix messes with extra newlines that
# separate templates that are followed by lists wiktextract issue #314
- cleaned_pre = []
- cleaned_lists = []
+ cleaned_pre: list[list[Union[str, WikiNode]]] = []
+ cleaned_lists: list[list[WikiNode]] = []
pairless_pre_index = None
for pre1, ls in zip(pre, lists):
@@ -1102,8 +1181,9 @@ def parse_part_of_speech(posnode, pos):
if not pre1 and not ls:
# skip [] + []
continue
- if not ls and all((isinstance(x, str) and not x.strip())
- for x in pre1):
+ if not ls and all(
+ (isinstance(x, str) and not x.strip()) for x in pre1
+ ):
# skip ["\n", " "] + []
continue
if ls and not pre1:
@@ -1118,7 +1198,7 @@ def parse_part_of_speech(posnode, pos):
lists = cleaned_lists
there_are_many_heads = len(pre) > 1
- header_tags = []
+ header_tags: list[str] = []
if not any(g for g in lists):
process_gloss_without_list(poschildren, pos, pos_data, header_tags)
@@ -1128,60 +1208,75 @@ def parse_part_of_speech(posnode, pos):
# # don't have gloss list
# # XXX add code here to filter out 'garbage', like text
# # that isn't a head template or head.
- # continue
+ # continue
if all(not sl for sl in lists[i:]):
if i == 0:
if isinstance(node, str):
- wxr.wtp.debug("first head without list of senses,"
- "string: '{}[...]', {}/{}".format(
- node[:20], word, language),
- sortid="page/1689/20221215")
+ wxr.wtp.debug(
+ "first head without list of senses,"
+ "string: '{}[...]', {}/{}".format(
+ node[:20], word, language
+ ),
+ sortid="page/1689/20221215",
+ )
if isinstance(node, WikiNode):
- if node.largs and node.largs[0][0] in ["Han char",]:
+ if node.largs and node.largs[0][0] in [
+ "Han char",
+ ]:
# just ignore these templates
pass
else:
- wxr.wtp.debug("first head without "
- "list of senses, "
- "template node "
- "{}, {}/{}".format(
- node.largs, word, language),
- sortid="page/1694/20221215")
+ wxr.wtp.debug(
+ "first head without "
+ "list of senses, "
+ "template node "
+ "{}, {}/{}".format(
+ node.largs, word, language
+ ),
+ sortid="page/1694/20221215",
+ )
else:
- wxr.wtp.debug("first head without list of senses, "
- "{}/{}".format(
- word, language),
- sortid="page/1700/20221215")
+ wxr.wtp.debug(
+ "first head without list of senses, "
+ "{}/{}".format(word, language),
+ sortid="page/1700/20221215",
+ )
# no break here so that the first head always
# gets processed.
else:
if isinstance(node, str):
- wxr.wtp.debug("later head without list of senses,"
- "string: '{}[...]', {}/{}".format(
- node[:20], word, language),
- sortid="page/1708/20221215")
+ wxr.wtp.debug(
+ "later head without list of senses,"
+ "string: '{}[...]', {}/{}".format(
+ node[:20], word, language
+ ),
+ sortid="page/1708/20221215",
+ )
if isinstance(node, WikiNode):
- wxr.wtp.debug("later head without list of senses,"
- "template node "
- "{}, {}/{}".format(
- node.sarg if node.sarg else node.largs,
- word, language),
- sortid="page/1713/20221215")
+ wxr.wtp.debug(
+ "later head without list of senses,"
+ "template node "
+ "{}, {}/{}".format(
+ node.sarg if node.sarg else node.largs,
+ word,
+ language,
+ ),
+ sortid="page/1713/20221215",
+ )
else:
- wxr.wtp.debug("later head without list of senses, "
- "{}/{}".format(
- word, language),
- sortid="page/1719/20221215")
+ wxr.wtp.debug(
+ "later head without list of senses, "
+ "{}/{}".format(word, language),
+ sortid="page/1719/20221215",
+ )
break
head_group = i + 1 if there_are_many_heads else None
# print("parse_part_of_speech: {}: {}: pre={}"
- # .format(wxr.wtp.section, wxr.wtp.subsection, pre1))
- process_gloss_header(pre1,
- pos,
- head_group,
- pos_data,
- header_tags)
+ # .format(wxr.wtp.section, wxr.wtp.subsection, pre1))
+ process_gloss_header(
+ pre1, pos, head_group, pos_data, header_tags
+ )
for l in ls:
# Parse each list associated with this head.
for node in l.children:
@@ -1194,10 +1289,10 @@ def parse_part_of_speech(posnode, pos):
# the data is already pushed into a sub-gloss
# downstream, unless the higher level has examples
# that need to be put somewhere.
- common_data = {"tags": list(header_tags)}
+ common_data: SenseData = {"tags": list(header_tags)}
if head_group:
common_data["head_nr"] = head_group
- parse_sense_node(node, common_data, pos)
+ parse_sense_node(node, common_data, pos) # type: ignore[arg-type]
# If there are no senses extracted, add a dummy sense. We want to
# keep tags extracted from the head for the dummy sense.
@@ -1211,7 +1306,7 @@ def process_gloss_header(
header_nodes: list[Union[WikiNode, str]],
pos_type: str,
header_group: Optional[int],
- pos_data: dict,
+ pos_data: WordData,
header_tags: list[str],
) -> None:
ruby = []
@@ -1223,10 +1318,14 @@ def process_gloss_header(
exp.children,
lambda x: isinstance(x, WikiNode)
and x.kind == NodeKind.HTML
- and x.sarg == "ruby"
+ and x.sarg == "ruby",
)
if rub is not None:
for r in rub:
+ if TYPE_CHECKING:
+ # we know the lambda above in recursively_extract
+ # returns only WikiNodes in rub
+ assert isinstance(r, WikiNode)
rt = parse_ruby(wxr, r)
if rt is not None:
ruby.append(rt)
@@ -1244,27 +1343,30 @@ def process_gloss_header(
ruby=ruby,
)
if "tags" in pos_data:
- header_tags[:] = pos_data["tags"]
- del pos_data["tags"]
+        # pos_data can get "tags" data from some source; the type-checker
+        # doesn't like it, so let's ignore it.
+ header_tags[:] = pos_data["tags"] # type: ignore[typeddict-item]
+ del pos_data["tags"] # type: ignore[typeddict-item]
else:
header_tags.clear()
def process_gloss_without_list(
nodes: list[Union[WikiNode, str]],
pos_type: str,
- pos_data: dict,
+ pos_data: WordData,
header_tags: list[str],
) -> None:
# gloss text might not inside a list
- header_nodes = []
- gloss_nodes = []
+ header_nodes: list[Union[str, WikiNode]] = []
+ gloss_nodes: list[Union[str, WikiNode]] = []
for node in strip_nodes(nodes):
if isinstance(node, WikiNode):
if node.kind == NodeKind.TEMPLATE:
template_name = node.largs[0][0]
- if (
- template_name == "head"
- or template_name.startswith(f"{lang_code}-")
+ if TYPE_CHECKING:
+ assert isinstance(template_name, str)
+ if template_name == "head" or template_name.startswith(
+ f"{lang_code}-"
):
header_nodes.append(node)
continue
@@ -1281,7 +1383,11 @@ def process_gloss_without_list(
gloss_nodes, pos_type, {"tags": list(header_tags)}
)
- def parse_sense_node(node, sense_base, pos):
+ def parse_sense_node(
+ node: Union[str, WikiNode], # never receives str
+ sense_base: SenseData,
+ pos: str,
+ ) -> bool:
"""Recursively (depth first) parse LIST_ITEM nodes for sense data.
Uses push_sense() to attempt adding data to pos_data in the scope
of parse_language() when it reaches deep in the recursion. push_sense()
@@ -1292,14 +1398,18 @@ def parse_sense_node(node, sense_base, pos):
"""
assert isinstance(sense_base, dict) # Added to every sense deeper in
if not isinstance(node, WikiNode):
- wxr.wtp.debug("{}: parse_sense_node called with"
- "something that isn't a WikiNode".format(pos),
- sortid="page/1287/20230119")
+ # This doesn't seem to ever happen in practice.
+ wxr.wtp.debug(
+ "{}: parse_sense_node called with"
+ "something that isn't a WikiNode".format(pos),
+ sortid="page/1287/20230119",
+ )
return False
if node.kind != NodeKind.LIST_ITEM:
- wxr.wtp.debug("{}: non-list-item inside list".format(pos),
- sortid="page/1678")
+ wxr.wtp.debug(
+ "{}: non-list-item inside list".format(pos), sortid="page/1678"
+ )
return False
if node.sarg == ":":
@@ -1315,7 +1425,7 @@ def parse_sense_node(node, sense_base, pos):
# added |= push_sense() or added |= parse_sense_node(...) to OR.
added = False
- gloss_template_args = set()
+ gloss_template_args: set[str] = set()
# For LISTs and LIST_ITEMS, their argument is something like
# "##" or "##:", and using that we can rudimentally determine
@@ -1330,26 +1440,34 @@ def parse_sense_node(node, sense_base, pos):
# of subglosses below this. The list's
# argument ends with #, and its depth should
# be bigger than parent node.
- subentries = [x for x in children
- if isinstance(x, WikiNode) and
- x.kind == NodeKind.LIST and
- x.sarg == current_depth + "#"]
+ subentries = [
+ x
+ for x in children
+ if isinstance(x, WikiNode)
+ and x.kind == NodeKind.LIST
+ and x.sarg == current_depth + "#"
+ ]
# sublists of examples and quotations. .sarg
# does not end with "#".
- others = [x for x in children
- if isinstance(x, WikiNode) and
- x.kind == NodeKind.LIST and
- x.sarg != current_depth + "#"]
+ others = [
+ x
+ for x in children
+ if isinstance(x, WikiNode)
+ and x.kind == NodeKind.LIST
+ and x.sarg != current_depth + "#"
+ ]
# the actual contents of this particular node.
# can be a gloss (or a template that expands into
# many glosses which we can't easily pre-expand)
# or could be an "outer gloss" with more specific
# subglosses, or could be a qualfier for the subglosses.
- contents = [x for x in children
- if not isinstance(x, WikiNode) or
- x.kind != NodeKind.LIST]
+ contents = [
+ x
+ for x in children
+ if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
+ ]
# If this entry has sublists of entries, we should combine
# gloss information from both the "outer" and sublist content.
# Sometimes the outer gloss
@@ -1371,28 +1489,29 @@ def parse_sense_node(node, sense_base, pos):
# copy current node and modify it so it doesn't
# loop infinitely.
cropped_node = copy.copy(node)
- cropped_node.children = [x for x in children
- if not (isinstance(x, WikiNode) and
- x.kind == NodeKind.LIST and
- x.sarg == current_depth + "#")]
- added |= parse_sense_node(cropped_node,
- sense_base,
- pos)
+ cropped_node.children = [
+ x
+ for x in children
+ if not (
+ isinstance(x, WikiNode)
+ and x.kind == NodeKind.LIST
+ and x.sarg == current_depth + "#"
+ )
+ ]
+ added |= parse_sense_node(cropped_node, sense_base, pos)
nonlocal sense_data # this kludge causes duplicated raw_
- # glosses data if this is not done;
- # if the top-level (cropped_node)
- # does not push_sense() properly or
- # parse_sense_node() returns early,
- # sense_data is not reset. This happens
- # for example when you have a no-gloss
- # string like "(intransitive)":
- # no gloss, push_sense() returns early
- # and sense_data has duplicate data with
- # sense_base
+ # glosses data if this is not done;
+ # if the top-level (cropped_node)
+ # does not push_sense() properly or
+ # parse_sense_node() returns early,
+ # sense_data is not reset. This happens
+ # for example when you have a no-gloss
+ # string like "(intransitive)":
+ # no gloss, push_sense() returns early
+ # and sense_data has duplicate data with
+ # sense_base
sense_data = {}
- added |= parse_sense_node(slc[0],
- sense_base,
- pos)
+ added |= parse_sense_node(slc[0], sense_base, pos)
return added
return process_gloss_contents(
@@ -1408,7 +1527,7 @@ def parse_sense_node(node, sense_base, pos):
def process_gloss_contents(
contents: list[Union[str, WikiNode]],
pos: str,
- sense_base: dict,
+ sense_base: SenseData,
subentries: list[WikiNode] = [],
others: list[WikiNode] = [],
gloss_template_args: Set[str] = set(),
@@ -1430,8 +1549,7 @@ def sense_template_fn(
arg = clean_node(wxr, sense_base, ht.get(2, ()))
if re.match(r"Q\d+$", arg):
data_append(sense_base, "wikidata", arg)
- data_append(sense_base, "senseid",
- langid + ":" + arg)
+ data_append(sense_base, "senseid", langid + ":" + arg)
if name in sense_linkage_templates:
# print(f"SENSE_TEMPLATE_FN: {name}")
parse_sense_linkage(wxr, sense_base, name, ht)
@@ -1470,7 +1588,7 @@ def sense_template_fn(
if is_gloss:
wxr.wtp.warning(
"Example template is used for gloss text",
- sortid="extractor.en.page.sense_template_fn/1415"
+ sortid="extractor.en.page.sense_template_fn/1415",
)
else:
return ""
@@ -1483,7 +1601,7 @@ def sense_template_fn(
gloss_template_args.add(v)
return None
- def extract_link_texts(item):
+ def extract_link_texts(item: GeneralNode) -> None:
"""Recursively extracts link texts from the gloss source. This
information is used to select whether to remove final "." from
form_of/alt_of (e.g., ihm/Hunsrik)."""
@@ -1504,8 +1622,11 @@ def extract_link_texts(item):
return
if item.kind == NodeKind.LINK:
v = item.largs[-1]
- if (isinstance(v, list) and len(v) == 1 and
- isinstance(v[0], str)):
+ if (
+ isinstance(v, list)
+ and len(v) == 1
+ and isinstance(v[0], str)
+ ):
gloss_template_args.add(v[0].strip())
for x in item.children:
extract_link_texts(x)
@@ -1514,11 +1635,16 @@ def extract_link_texts(item):
# get the raw text of non-list contents of this node, and other stuff
# like tag and category data added to sense_base
+ # cast = no-op type-setter for the type-checker
+ partial_template_fn = cast(
+ TemplateFnCallable,
+ partial(sense_template_fn, is_gloss=True),
+ )
rawgloss = clean_node(
wxr,
sense_base,
contents,
- template_fn=partial(sense_template_fn, is_gloss=True),
+ template_fn=partial_template_fn,
collect_links=True,
)
@@ -1542,7 +1668,7 @@ def extract_link_texts(item):
strip_ends = [", particularly:"]
for x in strip_ends:
if rawgloss.endswith(x):
- rawgloss = rawgloss[:-len(x)]
+ rawgloss = rawgloss[: -len(x)]
break
# The gloss could contain templates that produce more list items.
@@ -1562,19 +1688,19 @@ def extract_link_texts(item):
if rawgloss and rawgloss not in sense_base.get("raw_glosses", ()):
data_append(sense_base, "raw_glosses", subglosses[1])
m = re.match(r"\(([^()]+)\):?\s*", rawgloss)
- # ( ..\1.. ): ... or ( ..\1.. ) ...
+ # ( ..\1.. ): ... or ( ..\1.. ) ...
if m:
q = m.group(1)
- rawgloss = rawgloss[m.end():].strip()
+ rawgloss = rawgloss[m.end() :].strip()
parse_sense_qualifier(wxr, q, sense_base)
if rawgloss == "A pejorative:":
data_append(sense_base, "tags", "pejorative")
- rawgloss = None
+ rawgloss = ""
elif rawgloss == "Short forms.":
data_append(sense_base, "tags", "abbreviation")
- rawgloss = None
+ rawgloss = ""
elif rawgloss == "Technical or specialized senses.":
- rawgloss = None
+ rawgloss = ""
if rawgloss:
data_append(sense_base, "glosses", rawgloss)
if rawgloss in ("A person:",):
@@ -1583,15 +1709,20 @@ def extract_link_texts(item):
# The main recursive call (except for the exceptions at the
# start of this function).
for sublist in subentries:
- if not (isinstance(sublist, WikiNode) and
- sublist.kind == NodeKind.LIST):
- wxr.wtp.debug(f"'{repr(rawgloss[:20])}.' gloss has `subentries`"
- f"with items that are not LISTs",
- sortid="page/1511/20230119")
+ if not (
+ isinstance(sublist, WikiNode) and sublist.kind == NodeKind.LIST
+ ):
+ wxr.wtp.debug(
+ f"'{repr(rawgloss[:20])}.' gloss has `subentries`"
+ f"with items that are not LISTs",
+ sortid="page/1511/20230119",
+ )
continue
for item in sublist.children:
- if not (isinstance(item, WikiNode) and
- item.kind == NodeKind.LIST_ITEM):
+ if not (
+ isinstance(item, WikiNode)
+ and item.kind == NodeKind.LIST_ITEM
+ ):
continue
# copy sense_base to prevent cross-contamination between
# subglosses and other subglosses and superglosses
@@ -1611,20 +1742,22 @@ def extract_link_texts(item):
if added:
if examples:
# this higher-up gloss has examples that we do not want to skip
- wxr.wtp.debug("'{}[...]' gloss has examples we want to keep, "
- "but there are subglosses."
- .format(repr(rawgloss[:30])),
- sortid="page/1498/20230118")
+ wxr.wtp.debug(
+ "'{}[...]' gloss has examples we want to keep, "
+ "but there are subglosses.".format(repr(rawgloss[:30])),
+ sortid="page/1498/20230118",
+ )
else:
return True
# Some entries, e.g., "iacebam", have weird sentences in quotes
# after the gloss, but these sentences don't seem to be intended
# as glosses. Skip them.
- subglosses = list(gl for gl in subglosses
- if gl.strip() and
- not re.match(r'\s*(\([^)]*\)\s*)?"[^"]*"\s*$',
- gl))
+ subglosses = list(
+ gl
+ for gl in subglosses
+ if gl.strip() and not re.match(r'\s*(\([^)]*\)\s*)?"[^"]*"\s*$', gl)
+ )
if len(subglosses) > 1 and "form_of" not in sense_base:
gl = subglosses[0].strip()
@@ -1633,8 +1766,7 @@ def extract_link_texts(item):
parsed = parse_alt_or_inflection_of(wxr, gl, gloss_template_args)
if parsed is not None:
infl_tags, infl_dts = parsed
- if (infl_dts and "form-of" in infl_tags and
- len(infl_tags) == 1):
+ if infl_dts and "form-of" in infl_tags and len(infl_tags) == 1:
# Interpret others as a particular form under
# "inflection of"
data_extend(sense_base, "tags", infl_tags)
@@ -1677,13 +1809,13 @@ def extract_link_texts(item):
data_extend(sense_data, k, v)
else:
assert k not in ("tags", "categories", "topics")
- sense_data[k] = v
+ sense_data[k] = v # type:ignore[literal-required]
# Parse the gloss for this particular sense
m = re.match(r"^\((([^()]|\([^()]*\))*)\):?\s*", gloss)
- # (...): ... or (...(...)...): ...
+ # (...): ... or (...(...)...): ...
if m:
parse_sense_qualifier(wxr, m.group(1), sense_data)
- gloss = gloss[m.end():].strip()
+ gloss = gloss[m.end() :].strip()
# Remove common suffix "[from 14th c.]" and similar
gloss = re.sub(r"\s\[[^]]*\]\s*$", "", gloss)
@@ -1691,12 +1823,15 @@ def extract_link_texts(item):
# Check to make sure we don't have unhandled list items in gloss
ofs = max(gloss.find("#"), gloss.find("* "))
if ofs > 10 and "(#)" not in gloss:
- wxr.wtp.debug("gloss may contain unhandled list items: {}"
- .format(gloss),
- sortid="page/1412")
+ wxr.wtp.debug(
+ "gloss may contain unhandled list items: {}".format(gloss),
+ sortid="page/1412",
+ )
elif "\n" in gloss:
- wxr.wtp.debug("gloss contains newline: {}".format(gloss),
- sortid="page/1416")
+ wxr.wtp.debug(
+ "gloss contains newline: {}".format(gloss),
+ sortid="page/1416",
+ )
# Kludge, some glosses have a comma after initial qualifiers in
# parentheses
@@ -1706,7 +1841,7 @@ def extract_link_texts(item):
if gloss.endswith(":"):
gloss = gloss[:-1].strip()
if gloss.startswith("N. of "):
- gloss = "Name of " + gloss[6:]
+ gloss = "Name of " + gloss[6:]
if gloss.startswith("†"):
data_append(sense_data, "tags", "obsolete")
gloss = gloss[1:]
@@ -1729,16 +1864,19 @@ def extract_link_texts(item):
if tag not in sense_tags:
data_append(sense_data, "tags", tag)
if countability_tags:
- if ("countable" not in sense_tags and
- "uncountable" not in sense_tags):
+ if (
+ "countable" not in sense_tags
+ and "uncountable" not in sense_tags
+ ):
data_extend(sense_data, "tags", countability_tags)
# If outer gloss specifies a form-of ("inflection of", see
# aquamarine/German), try to parse the inner glosses as
# tags for an inflected form.
if "form-of" in sense_base.get("tags", ()):
- parsed = parse_alt_or_inflection_of(wxr, gloss,
- gloss_template_args)
+ parsed = parse_alt_or_inflection_of(
+ wxr, gloss, gloss_template_args
+ )
if parsed is not None:
infl_tags, infl_dts = parsed
if not infl_dts and infl_tags:
@@ -1758,18 +1896,23 @@ def extract_link_texts(item):
split_glosses = []
for m in re.finditer(r"Abbreviation of ", gloss):
if m.start() != position:
- split_glosses.append(gloss[position: m.start()])
+ split_glosses.append(gloss[position : m.start()])
position = m.start()
split_glosses.append(gloss[position:])
for gloss in split_glosses:
# Check if this gloss describes an alt-of or inflection-of
- if (lang_code != "en" and " " not in gloss and distw([word], gloss) < 0.3):
+ if (
+ lang_code != "en"
+ and " " not in gloss
+ and distw([word], gloss) < 0.3
+ ):
# Don't try to parse gloss if it is one word
# that is close to the word itself for non-English words
# (probable translations of a tag/form name)
continue
- parsed = parse_alt_or_inflection_of(wxr, gloss,
- gloss_template_args)
+ parsed = parse_alt_or_inflection_of(
+ wxr, gloss, gloss_template_args
+ )
if parsed is None:
continue
tags, dts = parsed
@@ -1797,7 +1940,7 @@ def extract_link_texts(item):
data_append(sense_data, "form_of", dt)
if len(sense_data) == 0:
- if len(sense_base.get("tags")) == 0:
+ if len(sense_base.get("tags", [])) == 0:
del sense_base["tags"]
sense_data.update(sense_base)
if push_sense():
@@ -1806,7 +1949,9 @@ def extract_link_texts(item):
# print("PARSE_SENSE DONE:", pos_datas[-1])
return added
- def parse_inflection(node, section, pos):
+ def parse_inflection(
+ node: WikiNode, section: str, pos: Optional[str]
+ ) -> None:
"""Parses inflection data (declension, conjugation) from the given
page. This retrieves the actual inflection template
parameters, which are very useful for applications that need
@@ -1818,11 +1963,14 @@ def parse_inflection(node, section, pos):
# print("parse_inflection:", node)
if pos is None:
- wxr.wtp.debug("inflection table outside part-of-speech",
- sortid="page/1812")
+ wxr.wtp.debug(
+ "inflection table outside part-of-speech", sortid="page/1812"
+ )
return
- def inflection_template_fn(name, ht):
+ def inflection_template_fn(
+ name: str, ht: TemplateArgs
+ ) -> Optional[str]:
# print("decl_conj_template_fn", name, ht)
if is_panel_template(wxr, name):
return ""
@@ -1830,8 +1978,11 @@ def inflection_template_fn(name, ht):
# These are not to be captured as an exception to the
# generic code below
return None
- m = re.search(r"-(conj|decl|ndecl|adecl|infl|conjugation|"
- r"declension|inflection|mut|mutation)($|-)", name)
+ m = re.search(
+ r"-(conj|decl|ndecl|adecl|infl|conjugation|"
+ r"declension|inflection|mut|mutation)($|-)",
+ name,
+ )
if m:
args_ht = clean_template_args(wxr, ht)
dt = {"name": name, "args": args_ht}
@@ -1844,7 +1995,7 @@ def inflection_template_fn(name, ht):
text = wxr.wtp.node_to_wikitext(node.children)
# Split text into separate sections for each to-level template
- brace_matches = re.split("({{+|}}+)", text) # ["{{", "template", "}}"]
+ brace_matches = re.split("({{+|}}+)", text) # ["{{", "template", "}}"]
template_sections = []
template_nesting = 0 # depth of SINGLE BRACES { { nesting } }
# Because there is the possibility of triple curly braces
@@ -1860,16 +2011,15 @@ def inflection_template_fn(name, ht):
# print(text)
# print(repr(brace_matches))
if len(brace_matches) > 1:
- tsection = []
+ tsection: list[str] = []
after_templates = False # kludge to keep any text
- # before first template
- # with the first template;
- # otherwise, text
- # goes with preceding template
+ # before first template
+ # with the first template;
+ # otherwise, text
+ # goes with preceding template
for m in brace_matches:
if m.startswith("{{"):
- if (template_nesting == 0 and
- after_templates):
+ if template_nesting == 0 and after_templates:
template_sections.append(tsection)
tsection = []
# start new section
@@ -1879,12 +2029,13 @@ def inflection_template_fn(name, ht):
elif m.startswith("}}"):
template_nesting -= len(m)
if template_nesting < 0:
- wxr.wtp.error("Negatively nested braces, "
- "couldn't split inflection templates, "
- "{}/{} section {}"
- .format(word, language, section),
- sortid="page/1871")
- template_sections = [] # use whole text
+ wxr.wtp.error(
+ "Negatively nested braces, "
+ "couldn't split inflection templates, "
+ "{}/{} section {}".format(word, language, section),
+ sortid="page/1871",
+ )
+ template_sections = [] # use whole text
break
tsection.append(m)
else:
@@ -1904,16 +2055,20 @@ def inflection_template_fn(name, ht):
for tsection in template_sections:
texts.append("".join(tsection))
if template_nesting != 0:
- wxr.wtp.error("Template nesting error: "
- "template_nesting = {} "
- "couldn't split inflection templates, "
- "{}/{} section {}"
- .format(template_nesting, word, language, section),
- sortid="page/1896")
+ wxr.wtp.error(
+ "Template nesting error: "
+ "template_nesting = {} "
+ "couldn't split inflection templates, "
+ "{}/{} section {}".format(
+ template_nesting, word, language, section
+ ),
+ sortid="page/1896",
+ )
texts = [text]
for text in texts:
- tree = wxr.wtp.parse(text, expand_all=True,
- template_fn=inflection_template_fn)
+ tree = wxr.wtp.parse(
+ text, expand_all=True, template_fn=inflection_template_fn
+ )
# Parse inflection tables from the section. The data is stored
# under "forms".
@@ -1924,12 +2079,20 @@ def inflection_template_fn(name, ht):
template_name = m.group(1)
tablecontext = TableContext(template_name)
- parse_inflection_section(wxr, pos_data,
- word, language,
- pos, section, tree,
- tablecontext=tablecontext)
+ parse_inflection_section(
+ wxr,
+ pos_data,
+ word,
+ language,
+ pos,
+ section,
+ tree,
+ tablecontext=tablecontext,
+ )
- def get_subpage_section(title, subtitle, seq):
+ def get_subpage_section(
+ title: str, subtitle: str, seq: Union[list[str], tuple[str, ...]]
+ ) -> Optional[Union[WikiNode, str]]:
"""Loads a subpage of the given page, and finds the section
for the given language, part-of-speech, and section title. This
is used for finding translations and other sections on subpages."""
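+        # A typical seq is e.g. ["Finnish", "Noun", "translations"]
+        # (illustrative; the actual titles come from the page and the config).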
@@ -1942,11 +2105,16 @@ def get_subpage_section(title, subtitle, seq):
subpage_title = word + "/" + subtitle
subpage_content = wxr.wtp.get_page_body(subpage_title, 0)
if subpage_content is None:
- wxr.wtp.error("/translations not found despite "
- "{{see translation subpage|...}}",
- sortid="page/1934")
+ wxr.wtp.error(
+ "/translations not found despite "
+ "{{see translation subpage|...}}",
+ sortid="page/1934",
+ )
+ return None
- def recurse(node, seq):
+ def recurse(
+ node: Union[str, WikiNode], seq: Union[list[str], tuple[str, ...]]
+ ) -> Optional[Union[str, WikiNode]]:
# print(f"seq: {seq}")
if not seq:
return node
@@ -1970,17 +2138,22 @@ def recurse(node, seq):
subpage_content,
pre_expand=True,
additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
- do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES
+ do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES,
)
assert tree.kind == NodeKind.ROOT
ret = recurse(tree, seq)
if ret is None:
- wxr.wtp.debug("Failed to find subpage section {}/{} seq {}"
- .format(title, subtitle, seq),
- sortid="page/1963")
+ wxr.wtp.debug(
+ "Failed to find subpage section {}/{} seq {}".format(
+ title, subtitle, seq
+ ),
+ sortid="page/1963",
+ )
return ret
- def parse_linkage(data, field, linkagenode):
+ def parse_linkage(
+ data: WordData, field: str, linkagenode: WikiNode
+ ) -> None:
assert isinstance(data, dict)
assert isinstance(field, str)
assert isinstance(linkagenode, WikiNode)
@@ -1995,7 +2168,11 @@ def parse_linkage(data, field, linkagenode):
toplevel_text = []
next_navframe_sense = None # Used for "(sense):" before NavFrame
- def parse_linkage_item(contents, field, sense):
+ def parse_linkage_item(
+ contents: list[Union[str, WikiNode]],
+ field: str,
+ sense: Optional[str] = None,
+ ):
assert isinstance(contents, (list, tuple))
assert isinstance(field, str)
assert sense is None or isinstance(sense, str)
@@ -2003,11 +2180,13 @@ def parse_linkage_item(contents, field, sense):
# print("PARSE_LINKAGE_ITEM: {} ({}): {}"
# .format(field, sense, contents))
- parts = []
- ruby = []
- urls = []
+ parts: list[str] = []
+ ruby: list[tuple[str, str]] = []
+ urls: list[str] = []
- def item_recurse(contents, italic=False):
+ def item_recurse(
+ contents: list[Union[str, WikiNode]], italic=False
+ ) -> None:
assert isinstance(contents, (list, tuple))
nonlocal sense
nonlocal ruby
@@ -2022,24 +2201,34 @@ def item_recurse(contents, italic=False):
# node.sarg if node.sarg else node.largs)
if kind == NodeKind.LIST:
if parts:
+ sense1: Optional[str]
sense1 = clean_node(wxr, None, parts)
if sense1.endswith(":"):
sense1 = sense1[:-1].strip()
if sense1.startswith("(") and sense1.endswith(")"):
sense1 = sense1[1:-1].strip()
- if sense1.lower() == wxr.config.OTHER_SUBTITLES["translations"]:
+ if (
+ sense1.lower()
+ == wxr.config.OTHER_SUBTITLES["translations"]
+ ):
sense1 = None
# print("linkage item_recurse LIST sense1:", sense1)
- parse_linkage_recurse(node.children, field,
- sense=sense1 or sense)
+ parse_linkage_recurse(
+ node.children, field, sense=sense1 or sense
+ )
parts = []
else:
parse_linkage_recurse(node.children, field, sense)
- elif kind in (NodeKind.TABLE, NodeKind.TABLE_ROW,
- NodeKind.TABLE_CELL):
+ elif kind in (
+ NodeKind.TABLE,
+ NodeKind.TABLE_ROW,
+ NodeKind.TABLE_CELL,
+ ):
parse_linkage_recurse(node.children, field, sense)
- elif kind in (NodeKind.TABLE_HEADER_CELL,
- NodeKind.TABLE_CAPTION):
+ elif kind in (
+ NodeKind.TABLE_HEADER_CELL,
+ NodeKind.TABLE_CAPTION,
+ ):
continue
elif kind == NodeKind.HTML:
classes = (node.attrs.get("class") or "").split()
@@ -2065,37 +2254,42 @@ def item_recurse(contents, italic=False):
elif kind == NodeKind.LINK:
ignore = False
if isinstance(node.largs[0][0], str):
- v = node.largs[0][0].strip().lower()
- if v.startswith(ns_title_prefix_tuple(wxr,
- "Category", True) \
- + ns_title_prefix_tuple(wxr,
- "File", True)):
+ v1 = node.largs[0][0].strip().lower()
+ if v1.startswith(
+ ns_title_prefix_tuple(wxr, "Category", True)
+ + ns_title_prefix_tuple(wxr, "File", True)
+ ):
ignore = True
if not ignore:
v = node.largs[-1]
- if (len(node.largs) == 1 and
- len(v) > 0 and
- isinstance(v[0], str) and
- v[0][0] == ":"):
- v = [v[0][1:]] + list(v[1:])
+ if (
+ len(node.largs) == 1
+ and len(v) > 0
+ and isinstance(v[0], str)
+ and v[0][0] == ":"
+ ):
+ v = [v[0][1:]] + list(v[1:]) # type:ignore
item_recurse(v, italic=italic)
elif kind == NodeKind.URL:
if len(node.largs) < 2 and node.largs:
# Naked url captured
- urls.extend(node.largs[-1])
+ urls.extend(node.largs[-1]) # type:ignore[arg-type]
continue
if len(node.largs) == 2:
# Url from link with text
- urls.append(node.largs[0][-1])
+ urls.append(node.largs[0][-1]) # type:ignore[arg-type]
# print(f"{node.largs=!r}")
# print("linkage recurse URL {}".format(node))
item_recurse(node.largs[-1], italic=italic)
elif kind in (NodeKind.PREFORMATTED, NodeKind.BOLD):
item_recurse(node.children, italic=italic)
else:
- wxr.wtp.debug("linkage item_recurse unhandled {}: {}"
- .format(node.kind, node),
- sortid="page/2073")
+ wxr.wtp.debug(
+ "linkage item_recurse unhandled {}: {}".format(
+ node.kind, node
+ ),
+ sortid="page/2073",
+ )
# print("LINKAGE CONTENTS BEFORE ITEM_RECURSE: {!r}"
# .format(contents))
@@ -2105,48 +2299,18 @@ def item_recurse(contents, italic=False):
# print("CLEANED ITEM: {!r}".format(item))
# print(f"URLS {urls=!r}")
- return parse_linkage_item_text(wxr, word, data, field, item,
- sense, ruby, pos_datas,
- is_reconstruction, urls)
-
- def parse_linkage_template(node):
- nonlocal have_panel_template
- # XXX remove this function but check how to handle the
- # template_linkage_mappings
- # print("LINKAGE TEMPLATE:", node)
-
- def linkage_template_fn(name, ht):
- # print("LINKAGE_TEMPLATE_FN:", name, ht)
- nonlocal field
- nonlocal have_panel_template
- if is_panel_template(wxr, name):
- have_panel_template = True
- return ""
- for prefix, t in template_linkage_mappings:
- if re.search(r"(^|[-/\s]){}($|\b|[0-9])".format(prefix),
- name):
- f = t if isinstance(t, str) else field
- if (name.endswith("-top") or name.endswith("-bottom") or
- name.endswith("-mid")):
- field = f
- return ""
- i = t if isinstance(t, int) else 2
- while True:
- v = ht.get(i, None)
- if v is None:
- break
- v = clean_node(wxr, None, v)
- parse_linkage_item(v, f)
- i += 1
- return ""
- # print("UNHANDLED LINKAGE TEMPLATE:", name, ht)
- return None
-
- # Main body of parse_linkage_template()
- text = wxr.wtp.node_to_wikitext(node)
- parsed = wxr.wtp.parse(text, expand_all=True,
- template_fn=linkage_template_fn)
- parse_linkage_recurse(parsed.children, field, None)
+ return parse_linkage_item_text(
+ wxr,
+ word,
+ data,
+ field,
+ item,
+ sense,
+ ruby,
+ pos_datas,
+ is_reconstruction,
+ urls,
+ )
def parse_linkage_recurse(contents, field, sense):
assert isinstance(contents, (list, tuple))
@@ -2177,9 +2341,12 @@ def parse_linkage_recurse(contents, field, sense):
parse_linkage_recurse(node.children, field, sense)
elif kind == NodeKind.TABLE_CELL:
parse_linkage_item(node.children, field, sense)
- elif kind in (NodeKind.TABLE_CAPTION,
- NodeKind.TABLE_HEADER_CELL,
- NodeKind.PREFORMATTED, NodeKind.BOLD):
+ elif kind in (
+ NodeKind.TABLE_CAPTION,
+ NodeKind.TABLE_HEADER_CELL,
+ NodeKind.PREFORMATTED,
+ NodeKind.BOLD,
+ ):
continue
elif kind == NodeKind.HTML:
# Recurse to process inside the HTML for most tags
@@ -2196,16 +2363,18 @@ def parse_linkage_recurse(contents, field, sense):
if sense1.endswith(":"):
sense1 = sense1[:-1].strip()
if sense and sense1:
- wxr.wtp.debug("linkage qualifier-content on multiple "
- "levels: {!r} and {!r}"
- .format(sense, sense1),
- sortid="page/2170")
+ wxr.wtp.debug(
+ "linkage qualifier-content on multiple "
+ "levels: {!r} and {!r}".format(sense, sense1),
+ sortid="page/2170",
+ )
parse_linkage_recurse(node.children, field, sense1)
elif "NavFrame" in classes:
# NavFrame uses previously assigned next_navframe_sense
# (from a "(sense):" item) and clears it afterwards
- parse_linkage_recurse(node.children, field,
- sense or next_navframe_sense)
+ parse_linkage_recurse(
+ node.children, field, sense or next_navframe_sense
+ )
next_navframe_sense = None
else:
parse_linkage_recurse(node.children, field, sense)
@@ -2222,9 +2391,12 @@ def parse_linkage_recurse(contents, field, sense):
# initial value
parse_linkage_recurse(node.largs[-1], field, sense)
else:
- wxr.wtp.debug("parse_linkage_recurse unhandled {}: {}"
- .format(kind, node),
- sortid="page/2196")
+ wxr.wtp.debug(
+ "parse_linkage_recurse unhandled {}: {}".format(
+ kind, node
+ ),
+ sortid="page/2196",
+ )
def linkage_template_fn1(name, ht):
nonlocal have_panel_template
@@ -2239,10 +2411,14 @@ def parse_zh_synonyms(parsed, data, hdrs, root_word):
if isinstance(item, WikiNode):
if item.kind == NodeKind.TABLE_ROW:
cleaned = clean_node(wxr, None, item.children)
- #print("cleaned:", repr(cleaned))
- if any(["Variety" in cleaned,
- "Location" in cleaned,
- "Words" in cleaned]):
+ # print("cleaned:", repr(cleaned))
+ if any(
+ [
+ "Variety" in cleaned,
+ "Location" in cleaned,
+ "Words" in cleaned,
+ ]
+ ):
pass
else:
split = cleaned.split("\n")
@@ -2268,11 +2444,15 @@ def parse_zh_synonyms(parsed, data, hdrs, root_word):
if tag in zh_tag_lookup:
tags.extend(zh_tag_lookup[tag])
else:
- print(f"MISSING ZH SYNONYM TAG for root {root_word}, word {words}: {tag}")
+ print(
+ f"MISSING ZH SYNONYM TAG for root {root_word}, word {words}: {tag}"
+ )
sys.stdout.flush()
for word in words:
- data.append({"word": word.strip(), "tags": tags})
+ data.append(
+ {"word": word.strip(), "tags": tags}
+ )
elif item.kind == NodeKind.HTML:
cleaned = clean_node(wxr, None, item.children)
if "Synonyms of" in cleaned:
@@ -2288,10 +2468,14 @@ def parse_zh_synonyms_list(parsed, data, hdrs, root_word):
if isinstance(item, WikiNode):
if item.kind == NodeKind.LIST_ITEM:
cleaned = clean_node(wxr, None, item.children)
- #print("cleaned:", repr(cleaned))
- if any(["Variety" in cleaned,
- "Location" in cleaned,
- "Words" in cleaned]):
+ # print("cleaned:", repr(cleaned))
+ if any(
+ [
+ "Variety" in cleaned,
+ "Location" in cleaned,
+ "Words" in cleaned,
+ ]
+ ):
pass
else:
cleaned = cleaned.replace("(", ",")
@@ -2309,11 +2493,15 @@ def parse_zh_synonyms_list(parsed, data, hdrs, root_word):
tags.append(tag)
elif tag in zh_tag_lookup:
tags.extend(zh_tag_lookup[tag])
- elif classify_desc(tag) == "romanization" \
- and roman is None:
+ elif (
+ classify_desc(tag) == "romanization"
+ and roman is None
+ ):
roman = tag
else:
- print(f"MISSING ZH SYNONYM TAG (possibly pinyin) - root {root_word}, word {words}: {tag}")
+ print(
+ f"MISSING ZH SYNONYM TAG (possibly pinyin) - root {root_word}, word {words}: {tag}"
+ )
sys.stdout.flush()
for word in words:
@@ -2328,9 +2516,13 @@ def parse_zh_synonyms_list(parsed, data, hdrs, root_word):
if cleaned.find("Synonyms of") >= 0:
cleaned = cleaned.replace("Synonyms of ", "")
root_word = cleaned
- parse_zh_synonyms_list(item.children, data, hdrs, root_word)
+ parse_zh_synonyms_list(
+ item.children, data, hdrs, root_word
+ )
else:
- parse_zh_synonyms_list(item.children, data, hdrs, root_word)
+ parse_zh_synonyms_list(
+ item.children, data, hdrs, root_word
+ )
def contains_kind(children, nodekind):
assert isinstance(children, list)
@@ -2345,21 +2537,21 @@ def contains_kind(children, nodekind):
# Main body of parse_linkage()
text = wxr.wtp.node_to_wikitext(linkagenode.children)
- parsed = wxr.wtp.parse(text, expand_all=True,
- template_fn=linkage_template_fn1)
+ parsed = wxr.wtp.parse(
+ text, expand_all=True, template_fn=linkage_template_fn1
+ )
if field == "synonyms" and lang_code == "zh":
synonyms = []
if contains_kind(parsed.children, NodeKind.LIST):
parse_zh_synonyms_list(parsed.children, synonyms, [], "")
else:
parse_zh_synonyms(parsed.children, synonyms, [], "")
- #print(json.dumps(synonyms, indent=4, ensure_ascii=False))
+ # print(json.dumps(synonyms, indent=4, ensure_ascii=False))
data_extend(data, "synonyms", synonyms)
parse_linkage_recurse(parsed.children, field, None)
if not data.get(field) and not have_panel_template:
text = "".join(toplevel_text).strip()
- if ("\n" not in text and "," in text and
- text.count(",") > 3):
+ if "\n" not in text and "," in text and text.count(",") > 3:
if not text.startswith("See "):
parse_linkage_item([text], field, None)
@@ -2388,8 +2580,10 @@ def parse_translation_item(contents, lang=None):
# print("sense <- clean_node: ", sense)
idx = sense.find("See also translations at")
if idx > 0:
- wxr.wtp.debug("Skipping translation see also: {}".format(sense),
- sortid="page/2361")
+ wxr.wtp.debug(
+ "Skipping translation see also: {}".format(sense),
+ sortid="page/2361",
+ )
sense = sense[:idx].strip()
if sense.endswith(":"):
sense = sense[:-1].strip()
@@ -2412,10 +2606,13 @@ def translation_item_template_fn(name, ht):
code = ht.get(1)
if code:
if langcode and code != langcode:
- wxr.wtp.debug("inconsistent language codes {} vs "
- "{} in translation item: {!r} {}"
- .format(langcode, code, name, ht),
- sortid="page/2386")
+ wxr.wtp.debug(
+ "inconsistent language codes {} vs "
+ "{} in translation item: {!r} {}".format(
+ langcode, code, name, ht
+ ),
+ sortid="page/2386",
+ )
langcode = code
tr = ht.get(2)
if tr:
@@ -2431,8 +2628,9 @@ def translation_item_template_fn(name, ht):
langcode = code
return None
if name == "trans-see":
- wxr.wtp.error("UNIMPLEMENTED trans-see template",
- sortid="page/2405")
+ wxr.wtp.error(
+ "UNIMPLEMENTED trans-see template", sortid="page/2405"
+ )
return ""
if name.endswith("-top"):
return ""
@@ -2440,28 +2638,41 @@ def translation_item_template_fn(name, ht):
return ""
if name.endswith("-mid"):
return ""
- #wxr.wtp.debug("UNHANDLED TRANSLATION ITEM TEMPLATE: {!r}"
+ # wxr.wtp.debug("UNHANDLED TRANSLATION ITEM TEMPLATE: {!r}"
# .format(name),
# sortid="page/2414")
return None
- sublists = list(x for x in contents
- if isinstance(x, WikiNode) and
- x.kind == NodeKind.LIST)
- contents = list(x for x in contents
- if not isinstance(x, WikiNode) or
- x.kind != NodeKind.LIST)
+ sublists = list(
+ x
+ for x in contents
+ if isinstance(x, WikiNode) and x.kind == NodeKind.LIST
+ )
+ contents = list(
+ x
+ for x in contents
+ if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
+ )
- item = clean_node(wxr, data, contents,
- template_fn=translation_item_template_fn)
+ item = clean_node(
+ wxr, data, contents, template_fn=translation_item_template_fn
+ )
# print(" TRANSLATION ITEM: {!r} [{}]".format(item, sense))
# Parse the translation item.
if item:
- lang = parse_translation_item_text(wxr, word, data, item, sense,
- pos_datas, lang, langcode,
- translations_from_template,
- is_reconstruction)
+ lang = parse_translation_item_text(
+ wxr,
+ word,
+ data,
+ item,
+ sense,
+ pos_datas,
+ lang,
+ langcode,
+ translations_from_template,
+ is_reconstruction,
+ )
# Handle sublists. They are frequently used for different scripts
# for the language and different variants of the language. We will
@@ -2495,8 +2706,9 @@ def template_fn(name, ht):
sense = None
sub = ht.get(1, "")
if sub:
- m = re.match(r"\s*(([^:\d]*)\s*\d*)\s*:\s*([^:]*)\s*",
- sub)
+ m = re.match(
+ r"\s*(([^:\d]*)\s*\d*)\s*:\s*([^:]*)\s*", sub
+ )
else:
m = None
etym = ""
@@ -2507,51 +2719,83 @@ def template_fn(name, ht):
etym = m.group(2)
pos = m.group(3)
if not sub:
- wxr.wtp.debug("no part-of-speech in "
- "{{see translation subpage|...}}, "
- "defaulting to just wxr.wtp.section "
- "(= language)",
- sortid="page/2468")
+ wxr.wtp.debug(
+ "no part-of-speech in "
+ "{{see translation subpage|...}}, "
+ "defaulting to just wxr.wtp.section "
+ "(= language)",
+ sortid="page/2468",
+ )
# seq sent to get_subpage_section without sub and pos
- seq = [language, wxr.config.OTHER_SUBTITLES["translations"]]
- elif (m and etym.lower().strip()
- in wxr.config.OTHER_SUBTITLES["etymology"]
- and pos.lower() in wxr.config.POS_SUBTITLES):
- seq = [language,
- etym_numbered,
- pos,
- wxr.config.OTHER_SUBTITLES["translations"]]
+ seq = [
+ language,
+ wxr.config.OTHER_SUBTITLES["translations"],
+ ]
+ elif (
+ m
+ and etym.lower().strip()
+ in wxr.config.OTHER_SUBTITLES["etymology"]
+ and pos.lower() in wxr.config.POS_SUBTITLES
+ ):
+ seq = [
+ language,
+ etym_numbered,
+ pos,
+ wxr.config.OTHER_SUBTITLES["translations"],
+ ]
elif sub.lower() in wxr.config.POS_SUBTITLES:
# seq with sub but not pos
- seq = [language,
- sub,
- wxr.config.OTHER_SUBTITLES["translations"]]
+ seq = [
+ language,
+ sub,
+ wxr.config.OTHER_SUBTITLES["translations"],
+ ]
else:
# seq with sub and pos
pos = wxr.wtp.subsection
if pos.lower() not in wxr.config.POS_SUBTITLES:
- wxr.wtp.debug("unhandled see translation subpage: "
- "language={} sub={} wxr.wtp.subsection={}"
- .format(language, sub, wxr.wtp.subsection),
- sortid="page/2478")
- seq = [language,
- sub,
- pos,
- wxr.config.OTHER_SUBTITLES["translations"]]
+ wxr.wtp.debug(
+ "unhandled see translation subpage: "
+ "language={} sub={} wxr.wtp.subsection={}".format(
+ language, sub, wxr.wtp.subsection
+ ),
+ sortid="page/2478",
+ )
+ seq = [
+ language,
+ sub,
+ pos,
+ wxr.config.OTHER_SUBTITLES["translations"],
+ ]
subnode = get_subpage_section(
- wxr.wtp.title, wxr.config.OTHER_SUBTITLES["translations"], seq)
+ wxr.wtp.title,
+ wxr.config.OTHER_SUBTITLES["translations"],
+ seq,
+ )
if subnode is not None:
parse_translations(data, subnode)
else:
# Failed to find the normal subpage section
seq = [wxr.config.OTHER_SUBTITLES["translations"]]
subnode = get_subpage_section(
- wxr.wtp.title, wxr.config.OTHER_SUBTITLES["translations"], seq)
+ wxr.wtp.title,
+ wxr.config.OTHER_SUBTITLES["translations"],
+ seq,
+ )
if subnode is not None:
parse_translations(data, subnode)
return ""
- if name in ("c", "C", "categorize", "cat", "catlangname",
- "topics", "top", "qualifier", "cln"):
+ if name in (
+ "c",
+ "C",
+ "categorize",
+ "cat",
+ "catlangname",
+ "topics",
+ "top",
+ "qualifier",
+ "cln",
+ ):
# These are expanded in the default way
return None
if name in ("trans-top",):
@@ -2564,8 +2808,12 @@ def template_fn(name, ht):
sense_parts = []
sense = None
return None
- if name in ("trans-bottom", "trans-mid",
- "checktrans-mid", "checktrans-bottom"):
+ if name in (
+ "trans-bottom",
+ "trans-mid",
+ "checktrans-mid",
+ "checktrans-bottom",
+ ):
return None
if name == "checktrans-top":
sense_parts = []
@@ -2576,11 +2824,17 @@ def template_fn(name, ht):
sense_parts = []
sense = None
return ""
- wxr.wtp.error("UNIMPLEMENTED parse_translation_template: {} {}"
- .format(name, ht),
- sortid="page/2517")
+ wxr.wtp.error(
+ "UNIMPLEMENTED parse_translation_template: {} {}".format(
+ name, ht
+ ),
+ sortid="page/2517",
+ )
return ""
- wxr.wtp.expand(wxr.wtp.node_to_wikitext(node), template_fn=template_fn)
+
+ wxr.wtp.expand(
+ wxr.wtp.node_to_wikitext(node), template_fn=template_fn
+ )
def parse_translation_recurse(xlatnode):
nonlocal sense
@@ -2590,9 +2844,11 @@ def parse_translation_recurse(xlatnode):
if isinstance(node, str):
if sense:
if not node.isspace():
- wxr.wtp.debug("skipping string in the middle of "
- "translations: {}".format(node),
- sortid="page/2530")
+ wxr.wtp.debug(
+ "skipping string in the middle of "
+ "translations: {}".format(node),
+ sortid="page/2530",
+ )
continue
# Add a part to the sense
sense_parts.append(node)
@@ -2616,8 +2872,11 @@ def parse_translation_recurse(xlatnode):
pass
elif kind == NodeKind.TEMPLATE:
parse_translation_template(node)
- elif kind in (NodeKind.TABLE, NodeKind.TABLE_ROW,
- NodeKind.TABLE_CELL):
+ elif kind in (
+ NodeKind.TABLE,
+ NodeKind.TABLE_ROW,
+ NodeKind.TABLE_CELL,
+ ):
parse_translation_recurse(node)
elif kind == NodeKind.HTML:
if node.attrs.get("class") == "NavFrame":
@@ -2636,8 +2895,7 @@ def parse_translation_recurse(xlatnode):
elif kind in LEVEL_KINDS:
# Sub-levels will be recursed elsewhere
pass
- elif kind in (NodeKind.ITALIC,
- NodeKind.BOLD):
+ elif kind in (NodeKind.ITALIC, NodeKind.BOLD):
parse_translation_recurse(node)
elif kind == NodeKind.PREFORMATTED:
print("parse_translation_recurse: PREFORMATTED:", node)
@@ -2650,29 +2908,53 @@ def parse_translation_recurse(xlatnode):
# handle them. Note: must be careful not to read other
# links, particularly things like in "human being":
# "a human being -- see [[man/translations]]" (group title)
- if (isinstance(arg0, (list, tuple)) and
- arg0 and
- isinstance(arg0[0], str) and
- arg0[0].endswith("/" + wxr.config.OTHER_SUBTITLES["translations"]) and
- arg0[0][:-(1 + len(wxr.config.OTHER_SUBTITLES["translations"]))] == wxr.wtp.title):
- wxr.wtp.debug("translations subpage link found on main "
- "page instead "
- "of normal {{see translation subpage|...}}",
- sortid="page/2595")
+ if (
+ isinstance(arg0, (list, tuple))
+ and arg0
+ and isinstance(arg0[0], str)
+ and arg0[0].endswith(
+ "/" + wxr.config.OTHER_SUBTITLES["translations"]
+ )
+ and arg0[0][
+ : -(
+ 1
+ + len(
+ wxr.config.OTHER_SUBTITLES["translations"]
+ )
+ )
+ ]
+ == wxr.wtp.title
+ ):
+ wxr.wtp.debug(
+ "translations subpage link found on main "
+ "page instead "
+ "of normal {{see translation subpage|...}}",
+ sortid="page/2595",
+ )
sub = wxr.wtp.subsection
if sub.lower() in wxr.config.POS_SUBTITLES:
- seq = [language, sub, wxr.config.OTHER_SUBTITLES["translations"]]
+ seq = [
+ language,
+ sub,
+ wxr.config.OTHER_SUBTITLES["translations"],
+ ]
subnode = get_subpage_section(
- wxr.wtp.title, wxr.config.OTHER_SUBTITLES["translations"], seq)
+ wxr.wtp.title,
+ wxr.config.OTHER_SUBTITLES["translations"],
+ seq,
+ )
if subnode is not None:
parse_translations(data, subnode)
else:
- wxr.wtp.errors("/translations link outside "
- "part-of-speech")
+                        wxr.wtp.error(
+                            "/translations link outside part-of-speech"
+                        )
- if (len(arg0) >= 1 and
- isinstance(arg0[0], str) and
- not arg0[0].lower().startswith("category:")):
+ if (
+ len(arg0) >= 1
+ and isinstance(arg0[0], str)
+ and not arg0[0].lower().startswith("category:")
+ ):
for x in node.largs[-1]:
if isinstance(x, str):
sense_parts.append(x)
@@ -2681,9 +2963,11 @@ def parse_translation_recurse(xlatnode):
elif not sense:
sense_parts.append(node)
else:
- wxr.wtp.debug("skipping text between translation items/senses: "
- "{}".format(node),
- sortid="page/2621")
+ wxr.wtp.debug(
+ "skipping text between translation items/senses: "
+ "{}".format(node),
+ sortid="page/2621",
+ )
# Main code of parse_translation(). We want ``sense`` to be assigned
# regardless of recursion levels, and thus the code is structured
@@ -2720,17 +3004,25 @@ def etym_post_template_fn(name, ht, expansion):
if ignore_count == 0:
ht = clean_template_args(wxr, ht)
expansion = clean_node(wxr, None, expansion)
- templates.append({"name": name, "args": ht, "expansion": expansion})
+ templates.append(
+ {"name": name, "args": ht, "expansion": expansion}
+ )
return None
# Remove any subsections
- contents = list(x for x in node.children
- if not isinstance(x, WikiNode) or
- x.kind not in LEVEL_KINDS)
+ contents = list(
+ x
+ for x in node.children
+ if not isinstance(x, WikiNode) or x.kind not in LEVEL_KINDS
+ )
# Convert to text, also capturing templates using post_template_fn
- text = clean_node(wxr, None, contents,
- template_fn=etym_template_fn,
- post_template_fn=etym_post_template_fn)
+ text = clean_node(
+ wxr,
+ None,
+ contents,
+ template_fn=etym_template_fn,
+ post_template_fn=etym_post_template_fn,
+ )
# Save the collected information.
data["etymology_text"] = text
data["etymology_templates"] = templates
@@ -2804,20 +3096,23 @@ def desc_post_template_fn(name, ht, expansion):
# same proto-language, then we tag this descendant entry with
# "derived"
is_derived = (
- is_proto_root_derived_section and
- (name == "l" or name == "link") and
- ("1" in ht and ht["1"] == lang_code)
+ is_proto_root_derived_section
+ and (name == "l" or name == "link")
+ and ("1" in ht and ht["1"] == lang_code)
)
expansion = clean_node(wxr, None, expansion)
- templates.append({
- "name": name, "args": ht, "expansion": expansion
- })
+ templates.append(
+ {"name": name, "args": ht, "expansion": expansion}
+ )
return None
- text = clean_node(wxr, None, children,
- template_fn=desc_template_fn,
- post_template_fn=desc_post_template_fn
- )
+ text = clean_node(
+ wxr,
+ None,
+ children,
+ template_fn=desc_template_fn,
+ post_template_fn=desc_post_template_fn,
+ )
item_data["templates"] = templates
item_data["text"] = text
if is_derived:
@@ -2837,11 +3132,15 @@ def get_sublist_index(list_item):
def get_descendants(node):
"""Appends the data for every list item in every list in node
- to descendants."""
+ to descendants."""
for _, c in node_children(node):
- if (c.kind == NodeKind.TEMPLATE and c.largs
- and len(c.largs[0]) == 1 and isinstance(c.largs[0][0], str)
- and c.largs[0][0] in unignored_non_list_templates):
+ if (
+ c.kind == NodeKind.TEMPLATE
+ and c.largs
+ and len(c.largs[0]) == 1
+ and isinstance(c.largs[0][0], str)
+ and c.largs[0][0] in unignored_non_list_templates
+ ):
# Some Descendants sections have no wikitext list. Rather,
# the list is entirely generated by a single template (see
# e.g. the use of {{CJKV}} in Chinese entries).
@@ -2914,40 +3213,48 @@ def skip_template_fn(name, ht):
if node.kind not in LEVEL_KINDS:
# XXX handle e.g. wikipedia links at the top of a language
# XXX should at least capture "also" at top of page
- if node.kind in (NodeKind.HLINE, NodeKind.LIST,
- NodeKind.LIST_ITEM):
+ if node.kind in (
+ NodeKind.HLINE,
+ NodeKind.LIST,
+ NodeKind.LIST_ITEM,
+ ):
continue
# print(" UNEXPECTED: {}".format(node))
# Clean the node to collect category links
- clean_node(wxr, etym_data, node,
- template_fn=skip_template_fn)
+ clean_node(wxr, etym_data, node, template_fn=skip_template_fn)
continue
- t = clean_node(wxr, etym_data,
- node.sarg if node.sarg else node.largs)
+ t = clean_node(
+ wxr, etym_data, node.sarg if node.sarg else node.largs
+ )
t = t.lower()
# XXX these counts were never implemented fully, and even this
# gets discarded: Search STATISTICS_IMPLEMENTATION
wxr.config.section_counts[t] += 1
# print("PROCESS_CHILDREN: T:", repr(t))
if t.startswith(tuple(wxr.config.OTHER_SUBTITLES["pronunciation"])):
- if t.startswith(tuple(
+ if t.startswith(
+ tuple(
pron_title + " "
- for pron_title in
- wxr.config.OTHER_SUBTITLES.get("pronunciation", []))):
+ for pron_title in wxr.config.OTHER_SUBTITLES.get(
+ "pronunciation", []
+ )
+ )
+ ):
# Pronunciation 1, etc, are used in Chinese Glyphs,
# and each of them may have senses under Definition
push_etym()
wxr.wtp.start_subsection(None)
if wxr.config.capture_pronunciation:
data = select_data()
- parse_pronunciation(wxr,
- node,
- data,
- etym_data,
- have_etym,
- base_data,
- lang_code,
- )
+ parse_pronunciation(
+ wxr,
+ node,
+ data,
+ etym_data,
+ have_etym,
+ base_data,
+ lang_code,
+ )
elif t.startswith(tuple(wxr.config.OTHER_SUBTITLES["etymology"])):
push_etym()
wxr.wtp.start_subsection(None)
@@ -2963,11 +3270,13 @@ def skip_template_fn(name, ht):
data = select_data()
parse_descendants(data, node)
elif (
- t in wxr.config.OTHER_SUBTITLES.get(
+ t
+ in wxr.config.OTHER_SUBTITLES.get(
"proto_root_derived_sections", []
)
- and pos == "root" and is_reconstruction and
- wxr.config.capture_descendants
+ and pos == "root"
+ and is_reconstruction
+ and wxr.config.capture_descendants
):
data = select_data()
parse_descendants(data, node, True)
@@ -2989,17 +3298,20 @@ def skip_template_fn(name, ht):
pos = dt["pos"]
wxr.wtp.start_subsection(t)
if "debug" in dt:
- wxr.wtp.debug("{} in section {}"
- .format(dt["debug"], t),
- sortid="page/2755")
+ wxr.wtp.debug(
+ "{} in section {}".format(dt["debug"], t),
+ sortid="page/2755",
+ )
if "warning" in dt:
- wxr.wtp.warning("{} in section {}"
- .format(dt["warning"], t),
- sortid="page/2759")
+ wxr.wtp.warning(
+ "{} in section {}".format(dt["warning"], t),
+ sortid="page/2759",
+ )
if "error" in dt:
- wxr.wtp.error("{} in section {}"
- .format(dt["error"], t),
- sortid="page/2763")
+ wxr.wtp.error(
+ "{} in section {}".format(dt["error"], t),
+ sortid="page/2763",
+ )
# Parse word senses for the part-of-speech
parse_part_of_speech(node, pos)
if "tags" in dt:
@@ -3056,10 +3368,10 @@ def usex_template_fn(name, ht):
usex_type = "example"
elif name in quotation_templates:
usex_type = "quotation"
- for prefix, t in template_linkage_mappings:
- if re.search(r"(^|[-/\s]){}($|\b|[0-9])"
- .format(prefix),
- name):
+ for prefix in template_linkages:
+ if re.search(
+ r"(^|[-/\s]){}($|\b|[0-9])".format(prefix), name
+ ):
return ""
return None
@@ -3068,23 +3380,32 @@ def usex_template_fn(name, ht):
contents = item.children
if lang_code == "ja":
# print(contents)
- if (contents and isinstance(contents, str) and
- re.match(r"\s*$", contents[0])):
+ if (
+ contents
+ and isinstance(contents, str)
+ and re.match(r"\s*$", contents[0])
+ ):
contents = contents[1:]
- exp = wxr.wtp.parse(wxr.wtp.node_to_wikitext(contents),
- # post_template_fn=head_post_template_fn,
- expand_all=True)
+ exp = wxr.wtp.parse(
+ wxr.wtp.node_to_wikitext(contents),
+ # post_template_fn=head_post_template_fn,
+ expand_all=True,
+ )
rub, rest = extract_ruby(wxr, exp.children)
if rub:
for r in rub:
ruby.append(r)
contents = rest
- subtext = clean_node(wxr, sense_base, contents,
- template_fn=usex_template_fn)
- subtext = re.sub(r"\s*\(please add an English "
- r"translation of this "
- r"(example|usage example|quote)\)",
- "", subtext).strip()
+ subtext = clean_node(
+ wxr, sense_base, contents, template_fn=usex_template_fn
+ )
+ subtext = re.sub(
+ r"\s*\(please add an English "
+ r"translation of this "
+ r"(example|usage example|quote)\)",
+ "",
+ subtext,
+ ).strip()
subtext = re.sub(r"\^\([^)]*\)", "", subtext)
subtext = re.sub(r"\s*[―—]+$", "", subtext)
# print("subtext:", repr(subtext))
@@ -3093,17 +3414,21 @@ def usex_template_fn(name, ht):
# print(lines)
lines = list(re.sub(r"^[#:*]*", "", x).strip() for x in lines)
- lines = list(x for x in lines
- if not re.match(
- r"(Synonyms: |Antonyms: |Hyponyms: |"
- r"Synonym: |Antonym: |Hyponym: |"
- r"Hypernyms: |Derived terms: |"
- r"Related terms: |"
- r"Hypernym: |Derived term: |"
- r"Coordinate terms:|"
- r"Related term: |"
- r"For more quotations using )",
- x))
+ lines = list(
+ x
+ for x in lines
+ if not re.match(
+ r"(Synonyms: |Antonyms: |Hyponyms: |"
+ r"Synonym: |Antonym: |Hyponym: |"
+ r"Hypernyms: |Derived terms: |"
+ r"Related terms: |"
+ r"Hypernym: |Derived term: |"
+ r"Coordinate terms:|"
+ r"Related term: |"
+ r"For more quotations using )",
+ x,
+ )
+ )
tr = ""
ref = ""
roman = ""
@@ -3112,26 +3437,28 @@ def usex_template_fn(name, ht):
# print(classify_desc(line))
if len(lines) == 1 and lang_code != "en":
parts = re.split(r"\s*[―—]+\s*", lines[0])
- if (len(parts) == 2 and
- classify_desc(parts[1]) == "english"):
+ if len(parts) == 2 and classify_desc(parts[1]) == "english":
lines = [parts[0].strip()]
tr = parts[1].strip()
- elif (len(parts) == 3 and
- classify_desc(parts[1]) in ("romanization",
- "english") and
- classify_desc(parts[2]) == "english"):
+ elif (
+ len(parts) == 3
+ and classify_desc(parts[1])
+ in ("romanization", "english")
+ and classify_desc(parts[2]) == "english"
+ ):
lines = [parts[0].strip()]
roman = parts[1].strip()
tr = parts[2].strip()
else:
parts = re.split(r"\s+-\s+", lines[0])
- if (len(parts) == 2 and
- classify_desc(parts[1]) == "english"):
+ if (
+ len(parts) == 2
+ and classify_desc(parts[1]) == "english"
+ ):
lines = [parts[0].strip()]
tr = parts[1].strip()
elif len(lines) > 1:
- if any(re.search(r"[]\d:)]\s*$", x)
- for x in lines[:-1]):
+ if any(re.search(r"[]\d:)]\s*$", x) for x in lines[:-1]):
ref = []
for i in range(len(lines)):
if re.match(r"^[#*]*:+(\s*$|\s+)", lines[i]):
@@ -3140,13 +3467,17 @@ def usex_template_fn(name, ht):
if re.search(r"[]\d:)]\s*$", lines[i]):
break
ref = " ".join(ref)
- lines = lines[i + 1:]
- if (lang_code != "en" and len(lines) >= 2 and
- classify_desc(lines[-1]) == "english"):
+ lines = lines[i + 1 :]
+ if (
+ lang_code != "en"
+ and len(lines) >= 2
+ and classify_desc(lines[-1]) == "english"
+ ):
i = len(lines) - 1
- while (i > 1 and
- classify_desc(lines[i - 1])
- == "english"):
+ while (
+ i > 1
+ and classify_desc(lines[i - 1]) == "english"
+ ):
i -= 1
tr = "\n".join(lines[i:])
lines = lines[:i]
@@ -3155,8 +3486,7 @@ def usex_template_fn(name, ht):
roman = lines[-1].strip()
lines = lines[:-1]
- elif (lang_code == "en" and
- re.match(r"^[#*]*:+", lines[1])):
+ elif lang_code == "en" and re.match(r"^[#*]*:+", lines[1]):
ref = lines[0]
lines = lines[1:]
elif lang_code != "en" and len(lines) == 2:
@@ -3168,9 +3498,13 @@ def usex_template_fn(name, ht):
elif cls1 == "english" and cls2 != "english":
tr = lines[0]
lines = [lines[1]]
- elif (re.match(r"^[#*]*:+", lines[1]) and
- classify_desc(re.sub(r"^[#*:]+\s*", "",
- lines[1])) == "english"):
+ elif (
+ re.match(r"^[#*]*:+", lines[1])
+ and classify_desc(
+ re.sub(r"^[#*:]+\s*", "", lines[1])
+ )
+ == "english"
+ ):
tr = re.sub(r"^[#*:]+\s*", "", lines[1])
lines = [lines[0]]
elif cls1 == "english" and cls2 == "english":
@@ -3179,20 +3513,27 @@ def usex_template_fn(name, ht):
# non-English, as that seems more common.
tr = lines[1]
lines = [lines[0]]
- elif (usex_type != "quotation" and
- lang_code != "en" and
- len(lines) == 3):
+ elif (
+ usex_type != "quotation"
+ and lang_code != "en"
+ and len(lines) == 3
+ ):
cls1 = classify_desc(lines[0])
cls2 = classify_desc(lines[1])
cls3 = classify_desc(lines[2])
- if (cls3 == "english" and
- cls2 in ["english", "romanization"] and
- cls1 != "english"):
+ if (
+ cls3 == "english"
+ and cls2 in ["english", "romanization"]
+ and cls1 != "english"
+ ):
tr = lines[2].strip()
roman = lines[1].strip()
lines = [lines[0].strip()]
- elif (usex_type == "quotation" and
- lang_code != "en" and len(lines) > 2):
+ elif (
+ usex_type == "quotation"
+ and lang_code != "en"
+ and len(lines) > 2
+ ):
# for x in lines:
# print(" LINE: {}: {}"
# .format(classify_desc(x), x))
@@ -3202,9 +3543,10 @@ def usex_template_fn(name, ht):
cls1 = classify_desc(lines[-1])
if cls1 == "english":
i = len(lines) - 1
- while (i > 1 and
- classify_desc(lines[i - 1])
- == "english"):
+ while (
+ i > 1
+ and classify_desc(lines[i - 1]) == "english"
+ ):
i -= 1
tr = "\n".join(lines[i:])
lines = lines[:i]
@@ -3215,10 +3557,13 @@ def usex_template_fn(name, ht):
tr = re.sub(r"[ \t\r]+", " ", tr).strip()
tr = re.sub(r"\[\s*…\s*\]", "[…]", tr)
ref = re.sub(r"^[#*:]+\s*", "", ref)
- ref = re.sub(r", (volume |number |page )?“?"
- r"\(please specify ([^)]|\(s\))*\)”?|"
- ", text here$",
- "", ref)
+ ref = re.sub(
+ r", (volume |number |page )?“?"
+ r"\(please specify ([^)]|\(s\))*\)”?|"
+ ", text here$",
+ "",
+ ref,
+ )
ref = re.sub(r"\[\s*…\s*\]", "[…]", ref)
lines = list(re.sub(r"^[#*:]+\s*", "", x) for x in lines)
subtext = "\n".join(x for x in lines if x)
@@ -3226,30 +3571,41 @@ def usex_template_fn(name, ht):
m = re.search(r"([.!?])\s+\(([^)]+)\)\s*$", subtext)
if m and classify_desc(m.group(2)) == "english":
tr = m.group(2)
- subtext = subtext[:m.start()] + m.group(1)
+ subtext = subtext[: m.start()] + m.group(1)
elif lines:
parts = re.split(r"\s*[―—]+\s*", lines[0])
- if (len(parts) == 2 and
- classify_desc(parts[1]) == "english"):
+ if (
+ len(parts) == 2
+ and classify_desc(parts[1]) == "english"
+ ):
subtext = parts[0].strip()
tr = parts[1].strip()
- subtext = re.sub(r'^[“"`]([^“"`”\']*)[”"\']$', r"\1",
- subtext)
- subtext = re.sub(r"(please add an English translation of "
- r"this (quote|usage example))",
- "", subtext)
- subtext = re.sub(r"\s*→New International Version "
- "translation$",
- "", subtext) # e.g. pis/Tok Pisin (Bible)
+ subtext = re.sub(r'^[“"`]([^“"`”\']*)[”"\']$', r"\1", subtext)
+ subtext = re.sub(
+ r"(please add an English translation of "
+ r"this (quote|usage example))",
+ "",
+ subtext,
+ )
+ subtext = re.sub(
+ r"\s*→New International Version " "translation$",
+ "",
+ subtext,
+ ) # e.g. pis/Tok Pisin (Bible)
subtext = re.sub(r"[ \t\r]+", " ", subtext).strip()
subtext = re.sub(r"\[\s*…\s*\]", "[…]", subtext)
note = None
m = re.match(r"^\(([^)]*)\):\s+", subtext)
- if (m is not None and lang_code != "en" and
- (m.group(1).startswith("with ") or
- classify_desc(m.group(1)) == "english")):
+ if (
+ m is not None
+ and lang_code != "en"
+ and (
+ m.group(1).startswith("with ")
+ or classify_desc(m.group(1)) == "english"
+ )
+ ):
note = m.group(1)
- subtext = subtext[m.end():]
+ subtext = subtext[m.end() :]
ref = re.sub(r"\s*\(→ISBN\)", "", ref)
ref = re.sub(r",\s*→ISBN", "", ref)
ref = ref.strip()
@@ -3278,7 +3634,6 @@ def usex_template_fn(name, ht):
return examples
-
# Main code of parse_language()
# Process the section
stack.append(language)
@@ -3358,9 +3713,10 @@ def top_template_fn(name, ht):
if arg.startswith("Q") or arg.startswith("Lexeme:L"):
data_append(data, "wikidata", arg)
return ""
- wxr.wtp.debug("UNIMPLEMENTED top-level template: {} {}"
- .format(name, ht),
- sortid="page/2870")
+ wxr.wtp.debug(
+ "UNIMPLEMENTED top-level template: {} {}".format(name, ht),
+ sortid="page/2870",
+ )
return ""
clean_node(wxr, None, [node], template_fn=top_template_fn)
@@ -3373,9 +3729,9 @@ def fix_subtitle_hierarchy(wxr: WiktextractContext, text: str) -> str:
# Known lowercase PoS names are in part_of_speech_map
# Known lowercase linkage section names are in linkage_map
- old = re.split(r"(?m)^(==+)[ \t]*([^= \t]([^=\n]|=[^=])*?)"
- r"[ \t]*(==+)[ \t]*$",
- text)
+ old = re.split(
+ r"(?m)^(==+)[ \t]*([^= \t]([^=\n]|=[^=])*?)" r"[ \t]*(==+)[ \t]*$", text
+ )
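+    # re.split with capturing groups interleaves the body text with the four
+    # heading groups: [before, "==+", title, inner, "==+", after, ...]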
parts = []
npar = 4 # Number of parentheses in above expression
@@ -3389,22 +3745,29 @@ def fix_subtitle_hierarchy(wxr: WiktextractContext, text: str) -> str:
level = len(left)
part = old[i + npar]
if level != len(right):
- wxr.wtp.debug("subtitle has unbalanced levels: "
- "{!r} has {} on the left and {} on the right"
- .format(title, left, right),
- sortid="page/2904")
+ wxr.wtp.debug(
+ "subtitle has unbalanced levels: "
+ "{!r} has {} on the left and {} on the right".format(
+ title, left, right
+ ),
+ sortid="page/2904",
+ )
lc = title.lower()
if name_to_code(title, "en") != "":
if level > 2:
- wxr.wtp.debug("subtitle has language name {} at level {}"
- .format(title, level),
- sortid="page/2911")
+ wxr.wtp.debug(
+ "subtitle has language name {} at level {}".format(
+ title, level
+ ),
+ sortid="page/2911",
+ )
level = 2
elif lc.startswith(tuple(wxr.config.OTHER_SUBTITLES["etymology"])):
if level > 3:
- wxr.wtp.debug("etymology section {} at level {}"
- .format(title, level),
- sortid="page/2917")
+ wxr.wtp.debug(
+ "etymology section {} at level {}".format(title, level),
+ sortid="page/2917",
+ )
level = 3
elif lc.startswith(tuple(wxr.config.OTHER_SUBTITLES["pronunciation"])):
level = 3
@@ -3473,7 +3836,7 @@ def parse_page(
text,
pre_expand=True,
additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
- do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES
+ do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES,
)
# from wikitextprocessor.parser import print_tree
# print("PAGE PARSE:", print_tree(tree))
@@ -3521,7 +3884,7 @@ def parse_page(
if "lang" not in data:
wxr.wtp.debug(
"internal error -- no lang in data: {}".format(data),
- sortid="page/3034"
+ sortid="page/3034",
)
continue
for k, v in top_data.items():
@@ -3552,16 +3915,26 @@ def parse_page(
if not conjs:
continue
cpos = dt.get("pos")
- if (pos == cpos or
- (pos, cpos) in (("noun", "adj"),
- ("noun", "name"),
- ("name", "noun"),
- ("name", "adj"),
- ("adj", "noun"),
- ("adj", "name")) or
- (pos == "adj" and cpos == "verb" and
- any("participle" in s.get("tags", ())
- for s in dt.get("senses", ())))):
+ if (
+ pos == cpos
+ or (pos, cpos)
+ in (
+ ("noun", "adj"),
+ ("noun", "name"),
+ ("name", "noun"),
+ ("name", "adj"),
+ ("adj", "noun"),
+ ("adj", "name"),
+ )
+ or (
+ pos == "adj"
+ and cpos == "verb"
+ and any(
+ "participle" in s.get("tags", ())
+ for s in dt.get("senses", ())
+ )
+ )
+ ):
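+                # e.g. a conjugation table captured under the "noun" entry is
+                # reused for the "adj" entry of the same word (illustrative).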
data["conjugation"] = list(conjs) # Copy list!
break
# Add topics from the last sense of a language to its other senses,
@@ -3579,13 +3952,14 @@ def parse_page(
for x in ret:
if x["word"] != word:
if word.startswith("Unsupported titles/"):
- wxr.wtp.debug(f"UNSUPPORTED TITLE: '{word}' -> '{x['word']}'",
- sortid="20231101/3578page.py"
- )
+ wxr.wtp.debug(
+ f"UNSUPPORTED TITLE: '{word}' -> '{x['word']}'",
+ sortid="20231101/3578page.py",
+ )
else:
- wxr.wtp.debug(f"DIFFERENT ORIGINAL TITLE: '{word}' "
- f"-> '{x['word']}'",
- sortid="20231101/3582page.py"
- )
+ wxr.wtp.debug(
+ f"DIFFERENT ORIGINAL TITLE: '{word}' " f"-> '{x['word']}'",
+ sortid="20231101/3582page.py",
+ )
x["original_title"] = word
return ret
diff --git a/src/wiktextract/extractor/ruby.py b/src/wiktextract/extractor/ruby.py
index 43e2ee38f..1a287758c 100644
--- a/src/wiktextract/extractor/ruby.py
+++ b/src/wiktextract/extractor/ruby.py
@@ -1,8 +1,12 @@
from typing import List, Optional, Tuple, Union
from wikitextprocessor import NodeKind, WikiNode
-from wikitextprocessor.parser import HTMLNode, LevelNode, TemplateNode
-
+from wikitextprocessor.parser import (
+ GeneralNode,
+ HTMLNode,
+ LevelNode,
+ TemplateNode,
+)
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext
@@ -13,8 +17,9 @@ def parse_ruby(
"""Parse a HTML 'ruby' node for a kanji part and a furigana (ruby) part,
and return a tuple containing those. Discard the rp-element's parentheses,
we don't do anything with them."""
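+    # Illustrative input/output for a well-formed ruby element:
+    #   <ruby>漢字<rp>(</rp><rt>かんじ</rt><rp>)</rp></ruby> -> ("漢字", "かんじ")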
- ruby_nodes = []
- furi_nodes = []
+ ruby_nodes: list[Union[str, WikiNode]] = []
+ furi_nodes: list[Union[str, WikiNode]] = [] # furi_nodes is technically
+ # just list[WikiNode], but this appeases the type-checker for clean_node()
for child in node.children:
if (
not isinstance(child, WikiNode)
@@ -31,14 +36,14 @@ def parse_ruby(
# element with an empty something (apparently, seeing as how this
# works), leaving no trace of the broken ruby element in the final
# HTML source of the page!
- return
+ return None
return ruby_kanji, furigana
def extract_ruby(
wxr: WiktextractContext,
- contents: Union[WikiNode, List[Union[WikiNode, str]]],
-) -> Tuple[List[Tuple[str]], List[Union[WikiNode, str]]]:
+ contents: GeneralNode,
+) -> tuple[list[tuple[str, str]], list[Union[WikiNode, str]]]:
# If contents is a list, process each element separately
extracted = []
new_contents = []
@@ -69,7 +74,7 @@ def extract_ruby(
}:
# Process args and children
if kind != NodeKind.LINK:
- new_node = LevelNode(new_node.loc)
+ new_node = LevelNode(kind, new_node.loc)
new_args = []
for arg in contents.largs:
e1, c1 = extract_ruby(wxr, arg)
diff --git a/src/wiktextract/linkages.py b/src/wiktextract/linkages.py
index 5efbbea42..0de6f427d 100644
--- a/src/wiktextract/linkages.py
+++ b/src/wiktextract/linkages.py
@@ -8,21 +8,33 @@
from wikitextprocessor import Wtp
from typing import Dict, List, Union, Optional
from .datautils import split_at_comma_semi, data_append
-from .form_descriptions import (classify_desc, parse_head_final_tags,
- parse_sense_qualifier,
- head_final_bantu_langs, head_final_bantu_re,
- head_final_other_langs, head_final_other_re,
- head_final_numeric_langs, head_final_re)
+from .form_descriptions import (
+ classify_desc,
+ parse_head_final_tags,
+ parse_sense_qualifier,
+ head_final_bantu_langs,
+ head_final_bantu_re,
+ head_final_other_langs,
+ head_final_other_re,
+ head_final_numeric_langs,
+ head_final_re,
+)
from .tags import linkage_beginning_tags
+from .type_utils import WordData
# Linkage will be ignored if it matches this regexp before splitting
linkage_pre_split_ignore_re = re.compile(
- r"^(" + "|".join(re.escape(x) for x in [
- "For more variations, see ",
- "Signal flag:",
- "Semaphore:",
- ]) +
- r")")
+ r"^("
+ + "|".join(
+ re.escape(x)
+ for x in [
+ "For more variations, see ",
+ "Signal flag:",
+ "Semaphore:",
+ ]
+ )
+ + r")"
+)
# Linkage will be ignored if it has one of these prefixes
linkage_ignore_prefixes = [
@@ -63,31 +75,40 @@
# Linkage will be ignored if it matches this regexp
linkage_ignore_re = re.compile(
- r"^(" + "|".join(re.escape(x) for x in linkage_ignore_whole) +
- r")$|^(" + "|".join(re.escape(x) for x in linkage_ignore_prefixes) +
- r")|(" + "|".join(re.escape(x) for x in linkage_ignore_suffixes) +
- r")$")
+ r"^("
+ + "|".join(re.escape(x) for x in linkage_ignore_whole)
+ + r")$|^("
+ + "|".join(re.escape(x) for x in linkage_ignore_prefixes)
+ + r")|("
+ + "|".join(re.escape(x) for x in linkage_ignore_suffixes)
+ + r")$"
+)
# These prefixes will be removed from linkages, leaving the rest. This is
# considered separately for each linkage in a list.
linkage_remove_prefixes_re = re.compile(
- r"^(" +
- r"|".join(re.escape(x) for x in [
- ":",
- "see Thesaurus:",
- "See Thesaurus:",
- "see also Thesaurus:",
- "See also Thesaurus:",
- "see also ",
- "See also ",
- "see ",
- "See ",
- "from ",
- "abbreviation of ",
- "ISO 639-1 code ",
- "ISO 639-3 code ",
- "Thesaurus:"]) +
- ")")
+ r"^("
+ + r"|".join(
+ re.escape(x)
+ for x in [
+ ":",
+ "see Thesaurus:",
+ "See Thesaurus:",
+ "see also Thesaurus:",
+ "See also Thesaurus:",
+ "see also ",
+ "See also ",
+ "see ",
+ "See ",
+ "from ",
+ "abbreviation of ",
+ "ISO 639-1 code ",
+ "ISO 639-3 code ",
+ "Thesaurus:",
+ ]
+ )
+ + ")"
+)
# When removing prefix from linkage, this dictionary can be used to map
# the removed prefix to a space-separated list of tags to add
@@ -101,17 +122,22 @@
r"(\s+on (Wikispecies|Wikimedia Commons|"
r"[A-Z]\w+ Wiktionary|[A-Z]\w+ Wikipedia)\.?|"
r"\s*[-–] Pre-reform orthography.*)"
- r"$")
+ r"$"
+)
# Ignore linkage parenthesized sections that contain one of these strings
linkage_paren_ignore_contains_re = re.compile(
- r"\b(" +
- "|".join(re.escape(x) for x in [
- "from Etymology",
- "used as",
- "usage notes",
- ]) +
- ")([, ]|$)")
+ r"\b("
+ + "|".join(
+ re.escape(x)
+ for x in [
+ "from Etymology",
+ "used as",
+ "usage notes",
+ ]
+ )
+ + ")([, ]|$)"
+)
taxonomic_ending_map = {
"superkingdoms": "superkingdom",
@@ -133,7 +159,9 @@
taxonomic_ending_map[v] = v # Also add singular -> singular
taxonomic_ending_re = re.compile(
r"\s+[-‐‑‒–—]\s+({})$".format(
- "|".join(re.escape(x) for x in taxonomic_ending_map)))
+ "|".join(re.escape(x) for x in taxonomic_ending_map)
+ )
+)
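+# e.g. a linkage item ending in " - superkingdoms" (illustrative) matches here,
+# and taxonomic_ending_map maps that ending back to the singular "superkingdom".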
# Exceptional splits for linkages. This can be used to fix particular linkages
# that are not handled correctly by the default code. This can also be used
@@ -146,10 +174,14 @@
# Truncate linkage word if it matches any of these strings
linkage_truncate_re = re.compile(
- "|".join(re.escape(x) for x in [
- " and its derived terms",
- " UTF-16 0x214C",
- ]))
+ "|".join(
+ re.escape(x)
+ for x in [
+ " and its derived terms",
+ " UTF-16 0x214C",
+ ]
+ )
+)
# Regexp for identifying special linkages containing lists of letters, digits,
# or characters
@@ -161,39 +193,47 @@
r" digits)(;|$)|"
r"(^|; )(Letters using |Letters of the |"
r"Variations of letter )|"
- r"^(Hiragana|Katakana)$")
+ r"^(Hiragana|Katakana)$"
+)
# Matches a Unicode character including any combining diacritics (even if
# separate characters)
-unicode_dc_re = re.compile(r"\w[{}]|.".format(
- "".join(chr(x) for x in range(0, 0x110000)
- if unicodedata.category(chr(x)) == "Mn")))
-
-
-def parse_linkage_item_text(wxr: Wtp,
- word: str,
- data: Dict[str, Union[list, str, dict]],
- field: str,
- item: str,
- sense: Optional[str],
- ruby: list,
- pos_datas: list,
- is_reconstruction: bool,
- urls: Optional[List[str]] = None
- ) -> Optional[str]:
+unicode_dc_re = re.compile(
+ r"\w[{}]|.".format(
+ "".join(
+ chr(x)
+ for x in range(0, 0x110000)
+ if unicodedata.category(chr(x)) == "Mn"
+ )
+ )
+)
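+# e.g. "e" followed by U+0301 (combining acute) is matched as a single unit by
+# the "\w[...]" branch; any other character falls through to the "." branch.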
+
+
+def parse_linkage_item_text(
+ wxr: WiktextractContext,
+ word: str,
+ data: WordData,
+ field: str,
+ item: str,
+ sense: Optional[str],
+ ruby: list,
+ pos_datas: list,
+ is_reconstruction: bool,
+ urls: Optional[List[str]] = None,
+) -> Optional[str]:
"""Parses a linkage item once it has been converted to a string. This
may add one or more linkages to ``data`` under ``field``. This
    returns None or a string that contains tags that should be applied
to additional linkages (commonly used in tables for Asian characters)."""
assert isinstance(wxr, WiktextractContext)
- assert isinstance(word, str) # Main word (derived from page title)
+ assert isinstance(word, str) # Main word (derived from page title)
assert isinstance(data, dict) # Parsed linkages are stored here under field
assert isinstance(field, str) # The field under which to store linkage
- assert isinstance(item, str) # The string to parse
+ assert isinstance(item, str) # The string to parse
assert sense is None or isinstance(sense, str)
- assert isinstance(ruby, list) # Captured ruby (hiragana/katakana) or ""
+ assert isinstance(ruby, list) # Captured ruby (hiragana/katakana) or ""
assert isinstance(pos_datas, list) # List of senses (containing "glosses")
- assert urls is None or isinstance(urls, list) # Captured urls
+ assert urls is None or isinstance(urls, list) # Captured urls
assert is_reconstruction in (True, False)
item = item.replace("()", "")
@@ -229,7 +269,7 @@ def parse_linkage_item_text(wxr: Wtp,
# Replace occurrences of ~ in the item by the page title
safetitle = wxr.wtp.title.replace("\\", "\\\\")
- item = item.replace(" ~ ", " " + safetitle + " ")
+ item = item.replace(" ~ ", " " + safetitle + " ")
item = re.sub(r"^~ ", safetitle + " ", item)
item = re.sub(r" ~$", " " + safetitle, item)
@@ -239,7 +279,7 @@ def parse_linkage_item_text(wxr: Wtp,
m = re.search(taxonomic_ending_re, item)
if m:
base_english = taxonomic_ending_map[m.group(1)]
- item = item[:m.start()]
+ item = item[: m.start()]
# Some Korean and Japanese words use "word (romanized): english" pattern
# Sometimes the parenthesized part contains comma-separated alt and roman.
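+    # e.g. "개 (gae): dog" (illustrative item: word, romanization, English).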
@@ -248,13 +288,17 @@ def parse_linkage_item_text(wxr: Wtp,
rom = m.group(2)
eng = m.group(3)
rest = m.group(1)
- if (classify_desc(rest, no_unknown_starts=True) == "other" and
- classify_desc(eng, no_unknown_starts=True) == "english"):
+ if (
+ classify_desc(rest, no_unknown_starts=True) == "other"
+ and classify_desc(eng, no_unknown_starts=True) == "english"
+ ):
item = rest
base_roman = rom
lst = base_roman.split(", ")
- if (len(lst) == 2 and
- classify_desc(lst[0], no_unknown_starts=True) == "other"):
+ if (
+ len(lst) == 2
+ and classify_desc(lst[0], no_unknown_starts=True) == "other"
+ ):
base_alt = lst[0]
base_roman = lst[1]
if base_english:
@@ -265,9 +309,10 @@ def parse_linkage_item_text(wxr: Wtp,
# Many words have tags or similar descriptions in the beginning
# followed by a colon and one or more linkages (e.g.,
# panetella/Finnish)
- m = (re.match(r"^\((([^():]|\([^()]*\))+)\): ([^:]*)$", item) or
- re.match(r"^([a-zA-Z][-'a-zA-Z0-9 ]*"
- r"(\([^()]+\)[-'a-zA-Z0-9 ]*)*): ([^:]*)$", item))
+ m = re.match(r"^\((([^():]|\([^()]*\))+)\): ([^:]*)$", item) or re.match(
+ r"^([a-zA-Z][-'a-zA-Z0-9 ]*" r"(\([^()]+\)[-'a-zA-Z0-9 ]*)*): ([^:]*)$",
+ item,
+ )
if m:
desc = m.group(1)
rest = m.group(len(m.groups()))
@@ -326,12 +371,22 @@ def parse_linkage_item_text(wxr: Wtp,
e1 = wxr.wtp.page_exists(desc)
e2 = wxr.wtp.page_exists(rest)
if cls != "tags":
- if (cls2 == "tags" or
- (e1 and not e1) or
- (e1 and e2 and cls2 == "english" and
- cls in ("other", "romanization")) or
- (not e1 and not e2 and cls2 == "english" and
- cls in ("other", "romanization"))):
+ if (
+ cls2 == "tags"
+ or (e1 and not e1)
+ or (
+ e1
+ and e2
+ and cls2 == "english"
+ and cls in ("other", "romanization")
+ )
+ or (
+ not e1
+ and not e2
+ and cls2 == "english"
+ and cls in ("other", "romanization")
+ )
+ ):
desc, rest = rest, desc # Looks like swapped syntax
cls = cls2
if re.search(linkage_paren_ignore_contains_re, desc):
@@ -364,48 +419,56 @@ def parse_linkage_item_text(wxr: Wtp,
d = pos_datas[idx]
gl = "; ".join(d.get("glosses", ()))
if not gl:
- wxr.wtp.debug("parenthesized numeric linkage prefix, "
- "but the referenced sense has no gloss: "
- "{}".format(desc),
- sortid="linkages/355")
+ wxr.wtp.debug(
+ "parenthesized numeric linkage prefix, "
+ "but the referenced sense has no gloss: "
+ "{}".format(desc),
+ sortid="linkages/355",
+ )
elif sense:
sense += "; " + gl
else:
sense = gl
item = rest
else:
- wxr.wtp.debug("parenthesized numeric linkage prefix, "
- "but there is no sense with such index: {}"
- .format(desc),
- sortid="linkages/365")
+ wxr.wtp.debug(
+ "parenthesized numeric linkage prefix, "
+ "but there is no sense with such index: {}".format(desc),
+ sortid="linkages/365",
+ )
item = rest
else:
- wxr.wtp.debug("unrecognized linkage prefix: {} desc={} rest={} "
- "cls={} cls2={} e1={} e2={}"
- .format(item, desc, rest, cls, cls2, e1, e2),
- sortid="linkages/371")
+ wxr.wtp.debug(
+ "unrecognized linkage prefix: {} desc={} rest={} "
+ "cls={} cls2={} e1={} e2={}".format(
+ item, desc, rest, cls, cls2, e1, e2
+ ),
+ sortid="linkages/371",
+ )
item = rest
base_sense = sense
# Check for certain plural tag forms at end of items list, and apply
# them to all items if found
- m = re.search(r" [-‐‑‒–—―] (diminutives|Diminutives|letters|digits|"
- r"characters|symbols|tetragrams|letter names|names|"
- r"female names|male names|proper nouns|contractions|"
- r"nonstandard spellings|verbs|prepositions|postpositions|"
- r"interjections|Abbreviations|abbreviations|variants|"
- r"ordinals|nouns|phrases|adjectives|adverbs|"
- r"augmentatives|pejoratives|compound words|numerals|"
- r"Tally marks|surnames|modern nonstandard spellings)$",
- item)
+ m = re.search(
+ r" [-‐‑‒–—―] (diminutives|Diminutives|letters|digits|"
+ r"characters|symbols|tetragrams|letter names|names|"
+ r"female names|male names|proper nouns|contractions|"
+ r"nonstandard spellings|verbs|prepositions|postpositions|"
+ r"interjections|Abbreviations|abbreviations|variants|"
+ r"ordinals|nouns|phrases|adjectives|adverbs|"
+ r"augmentatives|pejoratives|compound words|numerals|"
+ r"Tally marks|surnames|modern nonstandard spellings)$",
+ item,
+ )
if m:
suffix = m.group(1)
if base_qualifier:
base_qualifier += ", " + suffix
else:
base_qualifier = suffix
- item = item[:m.start()]
+ item = item[: m.start()]
    # Certain linkage items have space-separated values. These are
# generated by, e.g., certain templates
@@ -443,17 +506,29 @@ def parse_linkage_item_text(wxr: Wtp,
# Item1 contains " or "
item2 = re.sub(r"\s*\([^)]*\)", "", item1)
item2 = re.sub(r"\s+", " ", item2)
- if ((lang not in head_final_bantu_langs or
- not re.search(head_final_bantu_re, item2)) and
- (lang not in head_final_other_langs or
- not re.search(head_final_other_re, item2)) and
- (not re.search(head_final_re, item2) or
- (item2[-1].isdigit() and
- lang not in head_final_numeric_langs)) and
- not re.search(r"\bor\b", wxr.wtp.title) and
- all(wxr.wtp.title not in x.split(" or ")
+ if (
+ (
+ lang not in head_final_bantu_langs
+ or not re.search(head_final_bantu_re, item2)
+ )
+ and (
+ lang not in head_final_other_langs
+ or not re.search(head_final_other_re, item2)
+ )
+ and (
+ not re.search(head_final_re, item2)
+ or (
+ item2[-1].isdigit()
+ and lang not in head_final_numeric_langs
+ )
+ )
+ and not re.search(r"\bor\b", wxr.wtp.title)
+ and all(
+ wxr.wtp.title not in x.split(" or ")
for x in split_at_comma_semi(item2)
- if " or " in x)):
+ if " or " in x
+ )
+ ):
# We can split this item. Split the non-cleaned version
# that still has any intervening parenthesized parts.
subitems.extend(split_at_comma_semi(item1, extra=[" or "]))
@@ -482,7 +557,7 @@ def parse_linkage_item_text(wxr: Wtp,
m = re.search(r"\s*\(“([^”]+)”\)", item1)
if m:
t = m.group(1)
- item1 = (item1[:m.start()] + item1[m.end():]).strip()
+ item1 = (item1[: m.start()] + item1[m.end() :]).strip()
cls = classify_desc(t)
if cls == "tags":
if qualifier:
@@ -494,20 +569,27 @@ def parse_linkage_item_text(wxr: Wtp,
        # Some Korean words use "word (alt, roman, “english”)" pattern
# See 滿/Korean
- m = re.match(r'([^(),;:]+) \(([^(),;:]+), ([^(),;:]+), '
- r'[“”"]([^”“"]+)[“”"]\)$', item1)
- if (m and
- classify_desc(m.group(1), no_unknown_starts=True) == "other" and
- classify_desc(m.group(2), no_unknown_starts=True) == "other"):
+ m = re.match(
+ r"([^(),;:]+) \(([^(),;:]+), ([^(),;:]+), "
+ r'[“”"]([^”“"]+)[“”"]\)$',
+ item1,
+ )
+ if (
+ m
+ and classify_desc(m.group(1), no_unknown_starts=True) == "other"
+ and classify_desc(m.group(2), no_unknown_starts=True) == "other"
+ ):
alt = m.group(2)
roman = m.group(3)
english = m.group(4)
item1 = m.group(1)
words = item1.split(" ")
- if (len(words) > 1 and
- words[0] in linkage_beginning_tags and
- words[0] != wxr.wtp.title):
+ if (
+ len(words) > 1
+ and words[0] in linkage_beginning_tags
+ and words[0] != wxr.wtp.title
+ ):
t = linkage_beginning_tags[words[0]]
item1 = " ".join(words[1:])
if qualifier:
@@ -543,8 +625,9 @@ def english_repl(m):
# sometimes both at the beginning and at the end.
# And sometimes even in the middle, as in e.g.
# wife/English/Translations/Yiddish
- while (not script_chars and
- (not sense or not re.search(script_chars_re, sense))):
+ while not script_chars and (
+ not sense or not re.search(script_chars_re, sense)
+ ):
par = None
nonfirst_par = False
if par is None:
@@ -552,16 +635,17 @@ def english_repl(m):
m = re.match(r"\((([^()]|\([^()]*\))*)\):?\s*", item1)
if m:
par = m.group(1)
- item1 = item1[m.end():]
+ item1 = item1[m.end() :]
else:
# Try to find a parenthesized part at the end or from the
# middle.
- m = re.search(r"\s+\((\d|\d\d|[^\d]([^()]|\([^()]*\))*)\)"
- r"(\.$)?",
- item1)
+ m = re.search(
+                r"\s+\((\d|\d\d|[^\d]([^()]|\([^()]*\))*)\)(\.$)?",
+ item1,
+ )
if m:
par = m.group(1)
- item1 = item1[:m.start()] + item1[m.end():]
+ item1 = item1[: m.start()] + item1[m.end() :]
nonfirst_par = True
if not par:
break
@@ -588,7 +672,7 @@ def english_repl(m):
qualifier = par[:idx]
else:
break
- par = par[idx + 1:].strip()
+ par = par[idx + 1 :].strip()
# Check for certain comma-separated tags combined
# with English text at the beginning or end of a
@@ -676,19 +760,22 @@ def english_repl(m):
d = pos_datas[idx]
gl = "; ".join(d.get("glosses", ()))
if not gl:
- wxr.wtp.debug("parenthesized number "
- "but the referenced sense has no "
- "gloss: {}".format(par),
- sortid="linkages/665")
+ wxr.wtp.debug(
+ "parenthesized number "
+ "but the referenced sense has no "
+ "gloss: {}".format(par),
+ sortid="linkages/665",
+ )
elif sense:
sense += "; " + gl
else:
sense = gl
else:
- wxr.wtp.debug("parenthesized number but there is "
- "no sense with such index: {}"
- .format(par),
- sortid="linkages/674")
+ wxr.wtp.debug(
+ "parenthesized number but there is "
+ "no sense with such index: {}".format(par),
+ sortid="linkages/674",
+ )
else:
if alt:
alt += "; " + par
@@ -706,8 +793,8 @@ def english_repl(m):
# Remove certain prefixes from linkages
m = re.match(linkage_remove_prefixes_re, item1)
if m:
- prefix = item1[:m.end()]
- item1 = item1[m.end():]
+ prefix = item1[: m.end()]
+ item1 = item1[m.end() :]
if prefix in linkage_remove_prefixes_tags:
if qualifier:
qualifier += ", " + linkage_remove_prefixes_tags[prefix]
@@ -720,13 +807,13 @@ def english_repl(m):
# Remove certain suffixes from linkages
m = re.search(linkage_remove_suffixes_re, item1)
if m:
- item1 = item1[:m.start()]
+ item1 = item1[: m.start()]
# Parse linkages with "value = english" syntax (e.g.,
# väittää/Finnish)
idx = item1.find(" = ")
if idx >= 0:
- eng = item1[idx + 3:]
+ eng = item1[idx + 3 :]
if classify_desc(eng, no_unknown_starts=True) == "english":
english = eng
item1 = item1[:idx]
@@ -736,25 +823,25 @@ def english_repl(m):
eng = item1[:idx]
if classify_desc(eng, no_unknown_starts=True) == "english":
english = eng
- item1 = item1[idx + 3:]
+ item1 = item1[idx + 3 :]
# Parse linkages with "value - english" syntax (e.g.,
# man/Faroese)
m = re.search(r" [-‐‑‒–—―] ", item1)
if m and "(" not in item1:
- suffix = item1[m.end():]
+ suffix = item1[m.end() :]
cls = classify_desc(suffix, no_unknown_starts=True)
if cls == "english":
# This case intentionally ignores old values from english
# (otherwise taxonomic lists fail)
english = suffix
- item1 = item1[:m.start()]
+ item1 = item1[: m.start()]
elif cls == "tags":
if qualifier:
qualifier += ", " + suffix
else:
qualifier = suffix
- item1 = item1[:m.start()]
+ item1 = item1[: m.start()]
# Parse certain tags at the end of the linked term (unless
# we are in a letters list)
@@ -768,7 +855,7 @@ def english_repl(m):
m = re.search(linkage_truncate_re, item1)
if m:
# suffix = item1[m.start():] # Currently ignored
- item1 = item1[:m.start()]
+ item1 = item1[: m.start()]
if not item1:
continue # Ignore empty link targets
if item1 == word:
@@ -794,9 +881,11 @@ def add(w, r):
# split as this is used when we have a different number
# of romanizations than written forms, and don't know
# which is which.
- if ((not w or "," not in w) and
- (not r or "," not in r) and
- not wxr.wtp.page_exists(w)):
+ if (
+ (not w or "," not in w)
+ and (not r or "," not in r)
+ and not wxr.wtp.page_exists(w)
+ ):
lst = w.split("/") if len(w) > 1 else [w]
if len(lst) == 1:
lst = w.split(" / ")
@@ -811,9 +900,15 @@ def add(w, r):
# Heuristically remove "." at the end of most linkages
# (some linkage lists end in a period, but we also have
# abbreviations that end with a period that should be kept)
- if (w.endswith(".") and not wxr.wtp.page_exists(w) and
- (wxr.wtp.page_exists(w[:-1]) or
- (len(w) >= 5) and "." not in w[:-1])):
+ if (
+ w.endswith(".")
+ and not wxr.wtp.page_exists(w)
+ and (
+ wxr.wtp.page_exists(w[:-1])
+ or (len(w) >= 5)
+ and "." not in w[:-1]
+ )
+ ):
w = w[:-1]
# If we have roman but not alt and the word is ASCII,
@@ -847,8 +942,9 @@ def add(w, r):
if alt and alt.strip() != w:
dt["alt"] = alt.strip()
if urls:
- dt["urls"] = [url.strip() for url in urls
- if url and isinstance(url, str)]
+ dt["urls"] = [
+ url.strip() for url in urls if url and isinstance(url, str)
+ ]
dt["word"] = w
for old in data.get(field, ()):
if dt == old:
@@ -870,9 +966,11 @@ def add(w, r):
# print("lang={} v={} script_chars={} item1={!r}"
# .format(wxr.wtp.section, v, script_chars, item1))
if v and script_chars:
- if (len(item1.split()) > 1 or
- len(list(re.finditer(unicode_dc_re, item1))) == 2 or
- (len(subitems) > 10 and v in ("Hiragana", "Katakana"))):
+ if (
+ len(item1.split()) > 1
+ or len(list(re.finditer(unicode_dc_re, item1))) == 2
+ or (len(subitems) > 10 and v in ("Hiragana", "Katakana"))
+ ):
if v == qualifier:
# if sense:
# sense += "; " + qualifier
@@ -881,9 +979,12 @@ def add(w, r):
qualifier = None
if re.search(r" (letters|digits|script)$", v):
qualifier = v # Also parse as qualifier
- elif re.search(r"Variations of letter |"
- r"Letters using |"
- r"Letters of the ", v):
+ elif re.search(
+ r"Variations of letter |"
+ r"Letters using |"
+ r"Letters of the ",
+ v,
+ ):
qualifier = "letter"
parts = item1.split(". ")
extra = ()
@@ -892,23 +993,28 @@ def add(w, r):
item1 = parts[0]
# Handle multi-character names for chars in language's
# alphabet, e.g., "Ny ny" in P/Hungarian.
- if (len(subitems) > 20 and len(item1.split()) == 2 and
- all(len(x) <= 3 for x in item1.split())):
- parts = list(m.group(0) for m in
- re.finditer(r"(\w[\u0300-\u036f]?)+|.",
- item1)
- if not m.group(0).isspace() and
- m.group(0) not in ("(", ")"))
+ if (
+ len(subitems) > 20
+ and len(item1.split()) == 2
+ and all(len(x) <= 3 for x in item1.split())
+ ):
+ parts = list(
+ m.group(0)
+ for m in re.finditer(r"(\w[\u0300-\u036f]?)+|.", item1)
+ if not m.group(0).isspace()
+ and m.group(0) not in ("(", ")")
+ )
else:
- parts = list(m.group(0) for m in
- re.finditer(r".[\u0300-\u036f]?",
- item1)
- if not m.group(0).isspace() and
- m.group(0) not in ("(", ")"))
+ parts = list(
+ m.group(0)
+ for m in re.finditer(r".[\u0300-\u036f]?", item1)
+ if not m.group(0).isspace()
+ and m.group(0) not in ("(", ")")
+ )
for e in extra:
idx = e.find(":")
if idx >= 0:
- e = e[idx + 1:].strip()
+ e = e[idx + 1 :].strip()
if e.endswith("."):
e = e[:-1]
parts.extend(e.split())
@@ -920,10 +1026,11 @@ def add(w, r):
rparts = None
if roman:
- rparts = list(m.group(0) for m in
- re.finditer(r".[\u0300-\u036f]",
- roman)
- if not m.group(0).isspace())
+ rparts = list(
+ m.group(0)
+ for m in re.finditer(r".[\u0300-\u036f]", roman)
+ if not m.group(0).isspace()
+ )
if len(rparts) != len(parts):
rparts = None
if not rparts:
diff --git a/src/wiktextract/page.py b/src/wiktextract/page.py
index bf3a7733a..f39e197a7 100644
--- a/src/wiktextract/page.py
+++ b/src/wiktextract/page.py
@@ -8,7 +8,18 @@
from typing import Any, Callable, Optional, Union
from mediawiki_langcodes import get_all_names, name_to_code
-from wikitextprocessor import NodeKind, WikiNode
+from wikitextprocessor import (
+ NodeKind,
+ WikiNode,
+)
+from wikitextprocessor.core import (
+ TemplateArgs,
+ TemplateFnCallable,
+ PostTemplateFnCallable,
+)
+from wikitextprocessor.parser import (
+ GeneralNode,
+)
from wiktextract.wxr_context import WiktextractContext
@@ -56,9 +67,9 @@ def is_panel_template(wxr: WiktextractContext, template_name: str) -> bool:
def recursively_extract(
- contents: Union[WikiNode, list[WikiNode]],
+ contents: Union[WikiNode, str, list[Union[str, WikiNode]]],
fn: Callable[[Union[WikiNode, list[WikiNode]]], bool],
-) -> tuple[list[WikiNode], list[WikiNode]]:
+) -> tuple[list[Union[str, WikiNode]], list[Union[str, WikiNode]]]:
"""Recursively extracts elements from contents for which ``fn`` returns
True. This returns two lists, the extracted elements and the remaining
content (with the extracted elements removed at each level). Only
@@ -311,9 +322,9 @@ def remove_duplicate_data(page_data: dict) -> None:
def clean_node(
wxr: WiktextractContext,
sense_data: Optional[Any],
- wikinode: Union[str, WikiNode, list[Union[str, WikiNode]]],
- template_fn: Optional[Callable[[str, dict], str]] = None,
- post_template_fn: Optional[Callable[[str, dict, str], str]] = None,
+ wikinode: GeneralNode,
+ template_fn: Optional[TemplateFnCallable] = None,
+ post_template_fn: Optional[PostTemplateFnCallable] = None,
collect_links: bool = False,
) -> str:
"""
diff --git a/src/wiktextract/type_utils.py b/src/wiktextract/type_utils.py
index 389b541e9..81a26f911 100644
--- a/src/wiktextract/type_utils.py
+++ b/src/wiktextract/type_utils.py
@@ -1,14 +1,170 @@
from typing import (
- Union,
+ Sequence,
+ TypedDict,
)
-WordData = dict[str, Union[
- str,
- int,
- list[str],
- list[list[str]],
- "WordData",
- list["WordData"]
- ]
- ]
+class AltOf(TypedDict, total=False):
+ word: str
+ extra: str
+
+
+class LinkageData(TypedDict, total=False):
+ alt: str
+ english: str
+ extra: str
+ qualifier: str
+ roman: str
+ ruby: list[Sequence[str]]
+ sense: str
+ source: str
+ tags: list[str]
+ taxonomic: str
+ topics: list[str]
+ urls: list[str]
+ word: str
+
+
+class ExampleData(TypedDict, total=False):
+ english: str
+ note: str
+ ref: str
+ roman: str
+ ruby: list[Sequence[str]]
+ text: str
+ type: str
+
+
+class FormOf(TypedDict, total=False):
+ word: str
+ extra: str
+ roman: str
+
+
+LinkData = list[Sequence[str]]
+
+
+class TemplateData(TypedDict, total=False):
+ args: dict[str, str]
+ expansion: str
+ name: str
+
+
+class DescendantData(TypedDict, total=False):
+ depth: int
+ tags: list[str]
+ templates: TemplateData
+ text: str
+
+
+class FormData(TypedDict, total=False):
+ form: str
+ head_nr: int
+ ipa: str
+ roman: str
+ ruby: list[Sequence[str]]
+ source: str
+ tags: list[str]
+ topics: list[str]
+
+
+SoundData = TypedDict(
+ "SoundData",
+ {
+ "audio": str,
+ "audio-ipa": str,
+ "enpr": str,
+ "form": str,
+ "homophone": str,
+ "ipa": str,
+ "mp3_url": str,
+ "note": str,
+ "ogg_url": str,
+ "other": str,
+ "rhymes": str,
+ "tags": list[str],
+ "text": str,
+ "topics": list[str],
+ "zh-pron": str,
+ },
+ total=False,
+)
+
+
+class TranslationData(TypedDict, total=False):
+ alt: str
+ code: str
+ english: str
+ lang: str
+ note: str
+ roman: str
+ sense: str
+ tags: list[str]
+ taxonomic: str
+ topics: list[str]
+ word: str
+
+
+class SenseData(TypedDict, total=False):
+ alt_of: list[AltOf]
+ antonyms: list[LinkageData]
+ categories: list[str]
+ compound_of: list[AltOf]
+ coordinate_terms: list[LinkageData]
+ examples: list[ExampleData]
+ form_of: list[FormOf]
+ glosses: list[str]
+ head_nr: int
+ holonyms: list[LinkageData]
+ hypernyms: list[LinkageData]
+ hyponyms: list[LinkageData]
+ instances: list[LinkageData]
+ links: list[LinkData]
+ meronyms: list[LinkageData]
+ qualifier: str
+ raw_glosses: list[str]
+ related: list[LinkageData]
+ senseid: list[str]
+ synonyms: list[LinkageData]
+ tags: list[str]
+ topics: list[str]
+ wikidata: list[str]
+ wikipedia: list[str]
+
+
+class WordData(TypedDict, total=False):
+ abbreviations: list[LinkageData]
+ alt_of: list[AltOf]
+ antonyms: list[LinkageData]
+ categories: list[str]
+ coordinate_terms: list[LinkageData]
+ derived: list[LinkageData]
+ descendants: list[DescendantData]
+ etymology_number: int
+ etymology_templates: list[TemplateData]
+ etymology_text: str
+ form_of: list[FormOf]
+ forms: list[FormData]
+ head_templates: list[TemplateData]
+ holonyms: list[LinkageData]
+ hyphenation: list[str]
+ hypernyms: list[LinkageData]
+ hyponyms: list[LinkageData]
+ inflection_templates: list[TemplateData]
+ instances: list[LinkageData]
+ lang: str
+ lang_code: str
+ meronyms: list[LinkageData]
+ original_title: str
+ pos: str
+ proverbs: list[LinkageData]
+ redirects: list[str]
+ related: list[LinkageData]
+ senses: list[SenseData]
+ sounds: list[SoundData]
+ synonyms: list[LinkageData]
+ translations: list[TranslationData]
+ troponyms: list[LinkageData]
+ wikidata: list[str]
+ wikipedia: list[str]
+ word: str
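
A minimal usage sketch of the new TypedDicts above (the concrete field
values here are illustrative assumptions, not taken from this patch):

    from wiktextract.type_utils import LinkageData, SenseData, WordData

    # Build a sense and a linkage record using only keys declared above;
    # total=False means every key is optional.
    sense: SenseData = {"glosses": ["an example gloss"], "tags": ["informal"]}
    synonym: LinkageData = {"word": "sample", "sense": "an example gloss"}

    # Assemble a word entry that references the sense and the synonym.
    entry: WordData = {
        "word": "example",
        "lang": "English",
        "lang_code": "en",
        "pos": "noun",
        "senses": [sense],
        "synonyms": [synonym],
    }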