diff --git a/src/wiktextract/clean.py b/src/wiktextract/clean.py index 7cb7e46da..1f078fefd 100644 --- a/src/wiktextract/clean.py +++ b/src/wiktextract/clean.py @@ -9,13 +9,12 @@ import re import html import unicodedata -from typing import ( - Callable, - Optional, - Union -) +from typing import Callable, Optional, Union from wikitextprocessor.common import MAGIC_FIRST, MAGIC_LAST -from wikitextprocessor.core import NamespaceDataEntry +from wikitextprocessor.core import ( + NamespaceDataEntry, + TemplateArgs, +) from .wxr_context import WiktextractContext ###################################################################### @@ -94,7 +93,7 @@ "ι": "ᶥ", "φ": "ᵠ", "χ": "ᵡ", - "∞": "\u2002᪲" # This is a KLUDGE + "∞": "\u2002᪲", # This is a KLUDGE } subscript_ht: dict[str, str] = { @@ -137,6 +136,7 @@ "χ": "ᵪ", } + def to_superscript(text: str) -> str: "Converts text to superscript." if not text: @@ -147,6 +147,7 @@ def to_superscript(text: str) -> str: return "^" + text return "^({})".format(text) + def to_subscript(text: str) -> str: """Converts text to subscript.""" if not text: @@ -157,10 +158,11 @@ def to_subscript(text: str) -> str: return "_" + text return "_({})".format(text) + def to_chem(text: str) -> str: """Converts text to chemical formula, making digits subscript.""" - return "".join(to_subscript(x) if x.isdigit() else x - for x in text) + return "".join(to_subscript(x) if x.isdigit() else x for x in text) + # Mapping from Latex names to Unicode characters/strings. This is the # default mapping (some cases are handled specially in the code). @@ -886,7 +888,6 @@ def to_chem(text: str) -> str: "zpipe": "⨠", "zproject": "⨡", "|": "‖", - # Accents XXX these really should be handled specially with diacritics # after argument "acute": "́", @@ -906,8 +907,6 @@ def to_chem(text: str) -> str: "overline": "◌̅", "tilde": "̃", "vec": "⃑", - - # Some ignored operators "bigl": "", "bigr": "", @@ -973,7 +972,7 @@ def to_chem(text: str) -> str: "z": "𝓏", } -mathfrak_map: dict[str, str]= { +mathfrak_map: dict[str, str] = { "A": "𝔄", "B": "𝔅", "C": "ℭ", @@ -1070,15 +1069,19 @@ def to_chem(text: str) -> str: "9": "𝟡", } + def mathcal_fn(text: str) -> str: return "".join(mathcal_map.get(x, x) for x in text) + def mathfrak_fn(text: str) -> str: return "".join(mathfrak_map.get(x, x) for x in text) + def mathbb_fn(text: str) -> str: return "".join(mathbb_map.get(x, x) for x in text) + def to_math(text: str) -> str: """Converts a mathematical formula to ASCII.""" # print("to_math: {!r}".format(text)) @@ -1088,22 +1091,25 @@ def expand(text: str) -> str: while True: orig = text # formatting with {:c} converts input into character - text = re.sub(r"[{:c}-{:c}]".format(MAGIC_FIRST, MAGIC_LAST), - lambda m: magic_vec[ord(m.group(0)) - MAGIC_FIRST], - text) + text = re.sub( + r"[{:c}-{:c}]".format(MAGIC_FIRST, MAGIC_LAST), + lambda m: magic_vec[ord(m.group(0)) - MAGIC_FIRST], + text, + ) if text == orig: break return text def recurse(text: str) -> str: - def math_magic(text: str, - left: str, - right: str, - fn: Callable[[str], str] + def math_magic( + text: str, left: str, right: str, fn: Callable[[str], str] ) -> str: regexp_str = r"{}([^{}{}]+){}".format( - re.escape(left), re.escape(left), - re.escape(right), re.escape(right)) + re.escape(left), + re.escape(left), + re.escape(right), + re.escape(right), + ) regexp = re.compile(regexp_str) def repl(m: re.Match) -> str: @@ -1150,8 +1156,11 @@ def expand_group(v: str) -> str: elif re.match(r"\\sqrt($|[0-9]|\b)", v): v = "√" elif 
re.match(r"\\(frac|binom)($|[0-9]|\b)", v): - m = re.match(r"\\(frac|binom)\s*(\\[a-zA-Z]+|\\.|.)\s*" - r"(\\[a-zA-Z]+|\\.|.)$", v) + m = re.match( + r"\\(frac|binom)\s*(\\[a-zA-Z]+|\\.|.)\s*" + r"(\\[a-zA-Z]+|\\.|.)$", + v, + ) if not m: print("MATH FRAC/BINOM ERROR: {!r}".format(v)) return v @@ -1198,31 +1207,37 @@ def expand_group(v: str) -> str: text = math_magic(text, "{", "}", recurse) if text == orig: break - for m in re.finditer(r"\s+|" - r"\\frac\s*(\\[a-zA-Z]+|\\.|.)\s*" - r"(\\dot\\(bigvee|cup|cap|lor|vee)|" - r"\\not\\(subset|supset|subseteq|supseteq|in|ni|" - r"preceq|succeq|vartrianglelefteq|" - r"vartrianglerighteq|trianglelefteq|" - r"trianglerighteq)|" - r"\\widehat\{=\}|\\widehat=|" - r"\\overset\{?\}\{=\}|" - r"\\overset\?=|" - r"\\overset\{\\operatorname\{def\}\}\{=\}|" - r"\\[a-zA-Z]+|\\.|.)|" - r"(\\(mathcal|mathfrak|mathbb|text|begin|end|pmod)" - r"\b\s*|" - r"\\sqrt\b(\[\d+\])?)?" - r"[_^]?(\\[a-zA-Z]+\s*|\\.|\w+|.)", text): + for m in re.finditer( + r"\s+|" + r"\\frac\s*(\\[a-zA-Z]+|\\.|.)\s*" + r"(\\dot\\(bigvee|cup|cap|lor|vee)|" + r"\\not\\(subset|supset|subseteq|supseteq|in|ni|" + r"preceq|succeq|vartrianglelefteq|" + r"vartrianglerighteq|trianglelefteq|" + r"trianglerighteq)|" + r"\\widehat\{=\}|\\widehat=|" + r"\\overset\{?\}\{=\}|" + r"\\overset\?=|" + r"\\overset\{\\operatorname\{def\}\}\{=\}|" + r"\\[a-zA-Z]+|\\.|.)|" + r"(\\(mathcal|mathfrak|mathbb|text|begin|end|pmod)" + r"\b\s*|" + r"\\sqrt\b(\[\d+\])?)?" + r"[_^]?(\\[a-zA-Z]+\s*|\\.|\w+|.)", + text, + ): v = m.group(0).strip() if not v: continue v = expand_group(v) if v: - if ((parts and parts[-1][-1].isalpha() and - v[0] in "0123456789") or - (parts and parts[-1][-1] in "0123456789" and - v[0] in "0123456789")): + if ( + parts and parts[-1][-1].isalpha() and v[0] in "0123456789" + ) or ( + parts + and parts[-1][-1] in "0123456789" + and v[0] in "0123456789" + ): v = " " + v parts.append(v) @@ -1237,7 +1252,7 @@ def expand_group(v: str) -> str: def bold_follows(parts: list[str], i: int) -> bool: """Checks if there is a bold (''') in parts after parts[i]. We allow intervening italics ('').""" - parts = parts[i + 1:] + parts = parts[i + 1 :] for p in parts: if not p.startswith("''"): continue @@ -1308,13 +1323,12 @@ def remove_italic_and_bold(text: str) -> str: continue new_text_parts.append(part) new_text_parts.append("\n") - new_text_parts = new_text_parts[:-1] # remove last \n + new_text_parts = new_text_parts[:-1] # remove last \n return "".join(new_text_parts) -def clean_value(wxr: WiktextractContext, - title: str, - no_strip=False, - no_html_strip=False + +def clean_value( + wxr: WiktextractContext, title: str, no_strip=False, no_html_strip=False ) -> str: """Cleans a title or value into a normal string. 
This should basically remove any Wikimedia formatting from it: HTML tags, templates, links, @@ -1334,17 +1348,18 @@ def repl_exturl(m: re.Match) -> str: break i += 1 return " ".join(args[i:]) + def repl_link(m: re.Match) -> str: if m.group(2) and m.group(2).lower() in ("file", "image"): return "" v = m.group(3).split("|") return clean_value(wxr, v[0], no_strip=True) + def repl_link_bars(m: re.Match) -> str: lnk = m.group(1) if re.match(r"(?si)(File|Image)\s*:", lnk): return "" - return clean_value(wxr, m.group(4) or m.group(2) or "", - no_strip=True) + return clean_value(wxr, m.group(4) or m.group(2) or "", no_strip=True) def repl_1_sup(m: re.Match) -> str: return to_superscript(clean_value(wxr, m.group(1))) @@ -1373,34 +1388,47 @@ def repl_1_syntaxhighlight(m: re.Match) -> str: # Remove references (...). title = re.sub(r"(?is)/]*?>\s*.*?", "", title) # Replace ... by stripped content without newlines - title = re.sub(r"(?is)]*?>(.*?)\s*", - lambda m: re.sub(r"\s+", " ", m.group(1)), - title) + title = re.sub( + r"(?is)]*?>(.*?)\s*", + lambda m: re.sub(r"\s+", " ", m.group(1)), + title, + ) # Replace
by comma space (it is used to express alternatives in some # declensions) title = re.sub(r"(?si)\s*\n*", "\n", title) # Remove divs with floatright class (generated e.g. by {{ja-kanji|...}}) - title = re.sub(r'(?si)]*?\bclass="[^"]*?\bfloatright\b[^>]*?>' - r'((|.)*?)|.)*?' - r'', - "", title) + title = re.sub( + r'(?si)]*?\bclass="[^"]*?\bfloatright\b[^>]*?>' + r"((|.)*?)|.)*?" + r"", + "", + title, + ) # Remove divs with float: attribute - title = re.sub(r'(?si)]*?\bstyle="[^"]*?\bfloat:[^>]*?>' - r'((|.)*?)|.)*?' - r'', - "", title) + title = re.sub( + r'(?si)]*?\bstyle="[^"]*?\bfloat:[^>]*?>' + r"((|.)*?)|.)*?" + r"", + "", + title, + ) # Remove with previewonly class (generated e.g. by {{taxlink|...}}) - title = re.sub(r'(?si)]*?\bclass="[^"<>]*?' - r'\bpreviewonly\b[^>]*?>' - r'.+?', - "", title) + title = re.sub( + r'(?si)]*?\bclass="[^"<>]*?' + r"\bpreviewonly\b[^>]*?>" + r".+?", + "", + title, + ) # Remove ... - title = re.sub(r'(?si)]*?\bclass="[^"]*?\berror\b[^>]*?>' - r'.+?', - "", title) + title = re.sub( + r'(?si)]*?\bclass="[^"]*?\berror\b[^>]*?>' + r".+?", + "", + title, + ) # Change
<div> and </div>
to newlines. Ditto for tr, li, table, dl, ul, ol - title = re.sub(r"(?si)]*>", - "\n", title) + title = re.sub(r"(?si)]*>", "\n", title) # Change
,
, and
into newlines; # these generate new rows/lines. title = re.sub(r"(?i)", "\n", title) @@ -1408,22 +1436,20 @@ def repl_1_syntaxhighlight(m: re.Match) -> str: title = re.sub(r"(?si)]*>", " ", title) # Change ... to ^ title = re.sub(r"(?si)]*>\s*", "", title) - title = re.sub(r"(?si)]*>(.*?)", - repl_1_sup, title) + title = re.sub(r"(?si)]*>(.*?)", repl_1_sup, title) # Change ... to _ title = re.sub(r"(?si)]*>\s*", "", title) - title = re.sub(r"(?si)]*>(.*?)", - repl_1_sub, title) + title = re.sub(r"(?si)]*>(.*?)", repl_1_sub, title) # Change ... using subscripts for digits - title = re.sub(r"(?si)]*>(.*?)", - repl_1_chem, title) + title = re.sub(r"(?si)]*>(.*?)", repl_1_chem, title) # Change ... using special formatting. - title = re.sub(r"(?si)]*>(.*?)", - repl_1_math, title) + title = re.sub(r"(?si)]*>(.*?)", repl_1_math, title) # Change ... using special formatting. - title = re.sub(r"(?si)]*>(.*?)" - r"", - repl_1_syntaxhighlight, title) + title = re.sub( + r"(?si)]*>(.*?)" r"", + repl_1_syntaxhighlight, + title, + ) # Remove any remaining HTML tags. if not no_html_strip: title = re.sub(r"(?s)<[/!a-zA-Z][^>]*>", "", title) @@ -1441,7 +1467,7 @@ def repl_1_syntaxhighlight(m: re.Match) -> str: category_ns_data: NamespaceDataEntry # XXX "Category" -> config variable for portability - category_ns_data = wxr.wtp.NAMESPACE_DATA.get("Category", {}) # type: ignore[typeddict-item] + category_ns_data = wxr.wtp.NAMESPACE_DATA.get("Category", {}) # type: ignore[typeddict-item] # Fail if we received empty dict from .get() category_ns_names = {category_ns_data["name"]} | set( category_ns_data["aliases"] @@ -1455,22 +1481,30 @@ def repl_1_syntaxhighlight(m: re.Match) -> str: "", title, ) - title = re.sub(r"(?s)\[\[\s*:?([^]|#<>]+?)\s*(#[^][|<>]*?)?\]\]", - repl_1, title) - title = re.sub(r"(?s)\[\[\s*(([a-zA-Z0-9]+)\s*:)?\s*([^][#|<>]+?)" - r"\s*(#[^][|]*?)?\|?\]\]", - repl_link, title) - title = re.sub(r"(?s)\[\[\s*([^][|<>]+?)\s*\|" - r"\s*(([^][|]|\[[^]]*\])+?)" - r"(\s*\|\s*(([^]|]|\[[^]]*\])+?))*\s*\]\]", - repl_link_bars, title) + title = re.sub( + r"(?s)\[\[\s*:?([^]|#<>]+?)\s*(#[^][|<>]*?)?\]\]", repl_1, title + ) + title = re.sub( + r"(?s)\[\[\s*(([a-zA-Z0-9]+)\s*:)?\s*([^][#|<>]+?)" + r"\s*(#[^][|]*?)?\|?\]\]", + repl_link, + title, + ) + title = re.sub( + r"(?s)\[\[\s*([^][|<>]+?)\s*\|" + r"\s*(([^][|]|\[[^]]*\])+?)" + r"(\s*\|\s*(([^]|]|\[[^]]*\])+?))*\s*\]\]", + repl_link_bars, + title, + ) if title == orig: break # Replace remaining HTML links by the URL. 
while True: orig = title - title = re.sub(r"\[\s*((https?:|mailto:)?//([^][]+?))\s*\]", - repl_exturl, title) + title = re.sub( + r"\[\s*((https?:|mailto:)?//([^][]+?))\s*\]", repl_exturl, title + ) if title == orig: break @@ -1508,14 +1542,16 @@ def repl_1_syntaxhighlight(m: re.Match) -> str: return title -def clean_template_args(wxr: WiktextractContext, - ht: dict[Union[int, str], str], # XXX -> "TemplateArgs" - no_strip=False +def clean_template_args( + wxr: WiktextractContext, ht: TemplateArgs, no_strip=False ) -> dict[str, str]: """Cleans all values in a template argument dictionary and returns the cleaned dictionary.""" assert isinstance(wxr, WiktextractContext) assert isinstance(ht, dict) - return {clean_value(wxr, str(k), no_html_strip=True): - clean_value(wxr, str(v), no_strip=no_strip, no_html_strip=True) - for k, v in ht.items()} + return { + clean_value(wxr, str(k), no_html_strip=True): clean_value( + wxr, str(v), no_strip=no_strip, no_html_strip=True + ) + for k, v in ht.items() + } diff --git a/src/wiktextract/extractor/en/page.py b/src/wiktextract/extractor/en/page.py index 00e924b46..23156cb44 100644 --- a/src/wiktextract/extractor/en/page.py +++ b/src/wiktextract/extractor/en/page.py @@ -11,14 +11,22 @@ from functools import partial from re import Pattern from typing import ( + TYPE_CHECKING, + Callable, Optional, Set, Union, + cast, ) from mediawiki_langcodes import get_all_names, name_to_code from wikitextprocessor import NodeKind, WikiNode -from wikitextprocessor.core import TemplateArgs +from wikitextprocessor.core import ( + TemplateArgs, + TemplateFnCallable, + PostTemplateFnCallable, +) +from wikitextprocessor.parser import GeneralNode from wiktextract.clean import clean_template_args from wiktextract.datautils import ( data_append, @@ -44,7 +52,11 @@ from wiktextract.parts_of_speech import PARTS_OF_SPEECH from wiktextract.tags import valid_tags from wiktextract.translations import parse_translation_item_text -from wiktextract.type_utils import WordData +from wiktextract.type_utils import ( + SenseData, + SoundData, + WordData, +) from wiktextract.wxr_context import WiktextractContext from ..ruby import extract_ruby, parse_ruby @@ -53,174 +65,177 @@ # Matches head tag HEAD_TAG_RE: Pattern = re.compile( - r"^(head|Han char|arabic-noun|arabic-noun-form|" - r"hangul-symbol|syllable-hangul)$|" + - r"^(latin|" + - "|".join(lang_code for lang_code, *_ in get_all_names("en")) + - r")-(" + - "|".join([ - "abbr", - "adj", - "adjective", - "adjective form", - "adjective-form", - "adv", - "adverb", - "affix", - "animal command", - "art", - "article", - "aux", - "bound pronoun", - "bound-pronoun", - "Buyla", - "card num", - "card-num", - "cardinal", - "chunom", - "classifier", - "clitic", - "cls", - "cmene", - "cmavo", - "colloq-verb", - "colverbform", - "combining form", - "combining-form", - "comparative", - "con", - "concord", - "conj", - "conjunction", - "conjug", - "cont", - "contr", - "converb", - "daybox", - "decl", - "decl noun", - "def", - "dem", - "det", - "determ", - "Deva", - "ending", - "entry", - "form", - "fuhivla", - "gerund", - "gismu", - "hanja", - "hantu", - "hanzi", - "head", - "ideophone", - "idiom", - "inf", - "indef", - "infixed pronoun", - "infixed-pronoun", - "infl", - "inflection", - "initialism", - "int", - "interfix", - "interj", - "interjection", - "jyut", - "latin", - "letter", - "locative", - "lujvo", - "monthbox", - "mutverb", - "name", - "nisba", - "nom", - "noun", - "noun form", - "noun-form", - "noun plural", - "noun-plural", - "nounprefix", - 
"num", - "number", - "numeral", - "ord", - "ordinal", - "par", - "part", - "part form", - "part-form", - "participle", - "particle", - "past", - "past neg", - "past-neg", - "past participle", - "past-participle", - "perfect participle", - "perfect-participle", - "personal pronoun", - "personal-pronoun", - "pref", - "prefix", - "phrase", - "pinyin", - "plural noun", - "plural-noun", - "pos", - "poss-noun", - "post", - "postp", - "postposition", - "PP", - "pp", - "ppron", - "pred", - "predicative", - "prep", - "prep phrase", - "prep-phrase", - "preposition", - "present participle", - "present-participle", - "pron", - "prondem", - "pronindef", - "pronoun", - "prop", - "proper noun", - "proper-noun", - "proper noun form", - "proper-noun form", - "proper noun-form", - "proper-noun-form", - "prov", - "proverb", - "prpn", - "prpr", - "punctuation mark", - "punctuation-mark", - "regnoun", - "rel", - "rom", - "romanji", - "root", - "sign", - "suff", - "suffix", - "syllable", - "symbol", - "verb", - "verb form", - "verb-form", - "verbal noun", - "verbal-noun", - "verbnec", - "vform", - ]) + - r")(-|/|\+|$)") + r"^(head|Han char|arabic-noun|arabic-noun-form|" + r"hangul-symbol|syllable-hangul)$|" + + r"^(latin|" + + "|".join(lang_code for lang_code, *_ in get_all_names("en")) + + r")-(" + + "|".join( + [ + "abbr", + "adj", + "adjective", + "adjective form", + "adjective-form", + "adv", + "adverb", + "affix", + "animal command", + "art", + "article", + "aux", + "bound pronoun", + "bound-pronoun", + "Buyla", + "card num", + "card-num", + "cardinal", + "chunom", + "classifier", + "clitic", + "cls", + "cmene", + "cmavo", + "colloq-verb", + "colverbform", + "combining form", + "combining-form", + "comparative", + "con", + "concord", + "conj", + "conjunction", + "conjug", + "cont", + "contr", + "converb", + "daybox", + "decl", + "decl noun", + "def", + "dem", + "det", + "determ", + "Deva", + "ending", + "entry", + "form", + "fuhivla", + "gerund", + "gismu", + "hanja", + "hantu", + "hanzi", + "head", + "ideophone", + "idiom", + "inf", + "indef", + "infixed pronoun", + "infixed-pronoun", + "infl", + "inflection", + "initialism", + "int", + "interfix", + "interj", + "interjection", + "jyut", + "latin", + "letter", + "locative", + "lujvo", + "monthbox", + "mutverb", + "name", + "nisba", + "nom", + "noun", + "noun form", + "noun-form", + "noun plural", + "noun-plural", + "nounprefix", + "num", + "number", + "numeral", + "ord", + "ordinal", + "par", + "part", + "part form", + "part-form", + "participle", + "particle", + "past", + "past neg", + "past-neg", + "past participle", + "past-participle", + "perfect participle", + "perfect-participle", + "personal pronoun", + "personal-pronoun", + "pref", + "prefix", + "phrase", + "pinyin", + "plural noun", + "plural-noun", + "pos", + "poss-noun", + "post", + "postp", + "postposition", + "PP", + "pp", + "ppron", + "pred", + "predicative", + "prep", + "prep phrase", + "prep-phrase", + "preposition", + "present participle", + "present-participle", + "pron", + "prondem", + "pronindef", + "pronoun", + "prop", + "proper noun", + "proper-noun", + "proper noun form", + "proper-noun form", + "proper noun-form", + "proper-noun-form", + "prov", + "proverb", + "prpn", + "prpr", + "punctuation mark", + "punctuation-mark", + "regnoun", + "rel", + "rom", + "romanji", + "root", + "sign", + "suff", + "suffix", + "syllable", + "symbol", + "verb", + "verb form", + "verb-form", + "verbal noun", + "verbal-noun", + "verbnec", + "vform", + ] + ) + + r")(-|/|\+|$)" +) FLOATING_TABLE_TEMPLATES: 
set[str] = { # az-suffix-form creates a style=floatright div that is otherwise @@ -439,8 +454,11 @@ "wtorw", } for x in PANEL_PREFIXES & wikipedia_templates: - print("WARNING: {!r} in both panel_templates and wikipedia_templates" - .format(x)) + print( + "WARNING: {!r} in both panel_templates and wikipedia_templates".format( + x + ) + ) # Mapping from a template name (without language prefix) for the main word # (e.g., fi-noun, fi-adj, en-verb) to permitted parts-of-speech in which @@ -482,8 +500,10 @@ for k, v in template_allowed_pos_map.items(): for x in v: if x not in PARTS_OF_SPEECH: - print("BAD PART OF SPEECH {!r} IN template_allowed_pos_map: {}={}" - "".format(x, k, v)) + print( + "BAD PART OF SPEECH {!r} IN template_allowed_pos_map: {}={}" + "".format(x, k, v) + ) assert False @@ -526,9 +546,10 @@ # Regexp for matching ignored etymology template names. This adds certain # prefixes to the names listed above. ignored_etymology_templates_re = re.compile( - r"^((cite-|R:|RQ:).*|" + - r"|".join(re.escape(x) for x in ignored_etymology_templates) + - r")$") + r"^((cite-|R:|RQ:).*|" + + r"|".join(re.escape(x) for x in ignored_etymology_templates) + + r")$" +) # Regexp for matching ignored descendants template names. Right now we just # copy the ignored etymology templates @@ -618,19 +639,38 @@ # Template name component to linkage section listing. Integer section means # default section, starting at that argument. -template_linkage_mappings: list[list[Union[str, int]]] = [ - ["syn", "synonyms"], - ["synonyms", "synonyms"], - ["ant", "antonyms"], - ["antonyms", "antonyms"], - ["hyp", "hyponyms"], - ["hyponyms", "hyponyms"], - ["der", "derived"], - ["derived terms", "derived"], - ["coordinate terms", "coordinate_terms"], - ["rel", "related"], - ["col", 2], -] +# XXX not used anymore, except for the first elements: moved to +# template_linkages +# template_linkage_mappings: list[list[Union[str, int]]] = [ +# ["syn", "synonyms"], +# ["synonyms", "synonyms"], +# ["ant", "antonyms"], +# ["antonyms", "antonyms"], +# ["hyp", "hyponyms"], +# ["hyponyms", "hyponyms"], +# ["der", "derived"], +# ["derived terms", "derived"], +# ["coordinate terms", "coordinate_terms"], +# ["rel", "related"], +# ["col", 2], +# ] + +# Template names, this was exctracted from template_linkage_mappings, +# because the code using template_linkage_mappings was actually not used +# (but not removed). +template_linkages: set[str] = { + "syn", + "synonyms", + "ant", + "antonyms", + "hyp", + "hyponyms", + "der", + "derived terms", + "coordinate terms", + "rel", + "col", +} # Maps template name used in a word sense to a linkage field that it adds. 
sense_linkage_templates: dict[str, str] = { @@ -655,11 +695,11 @@ def decode_html_entities(v: Union[str, int]) -> str: return html.unescape(v) -def parse_sense_linkage(wxr: - WiktextractContext, - data: WordData, - name: str, - ht: TemplateArgs, +def parse_sense_linkage( + wxr: WiktextractContext, + data: SenseData, + name: str, + ht: TemplateArgs, ) -> None: """Parses a linkage (synonym, etc) specified in a word sense.""" assert isinstance(wxr, WiktextractContext) @@ -670,13 +710,15 @@ def parse_sense_linkage(wxr: for i in range(2, 20): w = ht.get(i) or "" w = clean_node(wxr, data, w) - if w.startswith(ns_title_prefix_tuple(wxr, "Thesaurus")): - w = w[10:] + for alias in ns_title_prefix_tuple(wxr, "Thesaurus"): + if w.startswith(alias): + w = w[len(alias) :] + break if not w: break tags: list[str] = [] topics: list[str] = [] - english = None + english: Optional[str] = None # Try to find qualifiers for this synonym q = ht.get("q{}".format(i - 1)) if q: @@ -703,7 +745,7 @@ def parse_sense_linkage(wxr: alt = None m = re.search(r"\(([^)]+)\)$", w) if m: - w = w[:m.start()].strip() + w = w[: m.start()].strip() alt = m.group(1) dt = {"word": w} @@ -718,15 +760,15 @@ def parse_sense_linkage(wxr: data_append(data, field, dt) -def parse_language(wxr: WiktextractContext, - langnode: WikiNode, - language: str, - lang_code: str) -> list[WordData]: +def parse_language( + wxr: WiktextractContext, langnode: WikiNode, language: str, lang_code: str +) -> list[WordData]: """Iterates over the text of the page, returning words (parts-of-speech) defined on the page one at a time. (Individual word senses for the same part-of-speech are typically encoded in the same entry.)""" # imported here to avoid circular import from wiktextract.pronunciations import parse_pronunciation + assert isinstance(wxr, WiktextractContext) assert isinstance(langnode, WikiNode) assert isinstance(language, str) @@ -737,85 +779,110 @@ def parse_language(wxr: WiktextractContext, word = wxr.wtp.title unsupported_prefix = "Unsupported titles/" if word.startswith(unsupported_prefix): - w = word[len(unsupported_prefix):] + w = word[len(unsupported_prefix) :] if w in unsupported_title_map: word = unsupported_title_map[w] else: - wxr.wtp.error("Unimplemented unsupported title: {}".format(word), - sortid="page/870") + wxr.wtp.error( + "Unimplemented unsupported title: {}".format(word), + sortid="page/870", + ) word = w elif word.startswith("Reconstruction:"): - word = word[word.find("/") + 1:] + word = word[word.find("/") + 1 :] is_reconstruction = True - base_data = {"word": word, "lang": language, "lang_code": lang_code} + base_data: WordData = { + "word": word, + "lang": language, + "lang_code": lang_code, + } if is_reconstruction: data_append(base_data, "tags", "reconstruction") - sense_data = {} - pos_data = {} # For a current part-of-speech - etym_data = {} # For one etymology - pos_datas = [] - etym_datas = [] - page_datas = [] + sense_data: SenseData = {} + pos_data: WordData = {} # For a current part-of-speech + etym_data: WordData = {} # For one etymology + pos_datas: list[SenseData] = [] + etym_datas: list[WordData] = [] + page_datas: list[WordData] = [] have_etym = False - stack = [] + stack: list[str] = [] # names of items on the "stack" - def merge_base(data, base): + def merge_base(data: WordData, base: WordData) -> None: for k, v in base.items(): # Copy the value to ensure that we don't share lists or # dicts between structures (even nested ones). 
v = copy.deepcopy(v) if k not in data: # The list was copied above, so this will not create shared ref - data[k] = v + data[k] = v # type: ignore[literal-required] continue - if data[k] == v: + if data[k] == v: # type: ignore[literal-required] continue - if (isinstance(data[k], (list, tuple)) or - isinstance(v, (list, tuple))): - data[k] = list(data[k]) + list(v) - elif data[k] != v: - wxr.wtp.warning("conflicting values for {} in merge_base: " - "{!r} vs {!r}" - .format(k, data[k], v), - sortid="page/904") - - def complementary_pop(pron, key): + if ( + isinstance(data[k], (list, tuple)) # type: ignore[literal-required] + or isinstance( + v, + (list, tuple), # Should this be "and"? + ) + ): + data[k] = list(data[k]) + list(v) # type: ignore + elif data[k] != v: # type: ignore[literal-required] + wxr.wtp.warning( + "conflicting values for {} in merge_base: " + "{!r} vs {!r}".format(k, data[k], v), # type: ignore[literal-required] + sortid="page/904", + ) + + def complementary_pop(pron: SoundData, key: str) -> SoundData: """Remove unnecessary keys from dict values in a list comprehension...""" if key in pron: - pron.pop(key) + pron.pop(key) # type: ignore return pron # If the result has sounds, eliminate sounds that have a prefix that # does not match "word" or one of "forms" if "sounds" in data and "word" in data: accepted = [data["word"]] - accepted.extend(f["form"] for f in data.get("forms", ())) - data["sounds"] = list(complementary_pop(s, "pos") - for s in data["sounds"] - if "form" not in s or s["form"] in accepted) + accepted.extend(f["form"] for f in data.get("forms", dict())) + data["sounds"] = list( + s + for s in data["sounds"] + if "form" not in s or s["form"] in accepted + ) # If the result has sounds, eliminate sounds that have a pos that # does not match "pos" if "sounds" in data and "pos" in data: - data["sounds"] = list(s for s in data["sounds"] - if "pos" not in s or s["pos"] == data["pos"]) + data["sounds"] = list( + complementary_pop(s, "pos") + for s in data["sounds"] + # "pos" is not a field of SoundData, correctly, so we're + # removing it here. It's a kludge on a kludge on a kludge. + if "pos" not in s or s["pos"] == data["pos"] # type: ignore[typeddict-item] + ) - def push_sense(): + def push_sense() -> bool: """Starts collecting data for a new word sense. This returns True if a sense was added.""" nonlocal sense_data tags = sense_data.get("tags", ()) - if (not sense_data.get("glosses") and - "translation-hub" not in tags and - "no-gloss" not in tags): + if ( + not sense_data.get("glosses") + and "translation-hub" not in tags + and "no-gloss" not in tags + ): return False - if (("participle" in sense_data.get("tags", ()) or - "infinitive" in sense_data.get("tags", ())) and - "alt_of" not in sense_data and - "form_of" not in sense_data and - "etymology_text" in etym_data): + if ( + ( + "participle" in sense_data.get("tags", ()) + or "infinitive" in sense_data.get("tags", ()) + ) + and "alt_of" not in sense_data + and "form_of" not in sense_data + and "etymology_text" in etym_data + ): etym = etym_data["etymology_text"] etym = etym.split(". 
")[0] ret = parse_alt_or_inflection_of(wxr, etym, set()) @@ -829,28 +896,29 @@ def push_sense(): data_extend(sense_data, "alt_of", lst) data_extend(sense_data, "tags", tags) - if (not sense_data.get("glosses") and - "no-gloss" not in sense_data.get("tags", ())): + if not sense_data.get("glosses") and "no-gloss" not in sense_data.get( + "tags", () + ): data_append(sense_data, "tags", "no-gloss") pos_datas.append(sense_data) sense_data = {} return True - def push_pos(): + def push_pos() -> None: """Starts collecting data for a new part-of-speech.""" nonlocal pos_data nonlocal pos_datas push_sense() if wxr.wtp.subsection: - data = {"senses": pos_datas} + data: WordData = {"senses": pos_datas} merge_base(data, pos_data) etym_datas.append(data) pos_data = {} pos_datas = [] wxr.wtp.start_subsection(None) - def push_etym(): + def push_etym() -> None: """Starts collecting data for a new etymology.""" nonlocal etym_data nonlocal etym_datas @@ -863,7 +931,7 @@ def push_etym(): etym_data = {} etym_datas = [] - def select_data(): + def select_data() -> WordData: """Selects where to store data (pos or etym) based on whether we are inside a pos (part-of-speech).""" if wxr.wtp.subsection is not None: @@ -872,7 +940,9 @@ def select_data(): return base_data return etym_data - def head_post_template_fn(name, ht, expansion): + def head_post_template_fn( + name: str, ht: TemplateArgs, expansion: str + ) -> Optional[str]: """Handles special templates in the head section of a word. Head section is the text after part-of-speech subtitle and before word sense list. Typically it generates the bold line for the word, but @@ -934,15 +1004,15 @@ def head_post_template_fn(name, ht, expansion): return None - def parse_part_of_speech(posnode, pos): + def parse_part_of_speech(posnode: WikiNode, pos: str) -> None: """Parses the subsection for a part-of-speech under a language on a page.""" assert isinstance(posnode, WikiNode) assert isinstance(pos, str) # print("parse_part_of_speech", pos) pos_data["pos"] = pos - pre = [[]] # list of lists - lists = [[]] # list of lists + pre: list[list[Union[str, WikiNode]]] = [[]] # list of lists + lists: list[list[WikiNode]] = [[]] # list of lists first_para = True first_head_tmplt = True collecting_head = True @@ -965,13 +1035,13 @@ def parse_part_of_speech(posnode, pos): floaters, poschildren = recursively_extract( posnode.children, lambda x: ( - isinstance(x, WikiNode) and - x.kind == NodeKind.TEMPLATE and - x.largs[0][0] in FLOATING_TABLE_TEMPLATES - ) + isinstance(x, WikiNode) + and x.kind == NodeKind.TEMPLATE + and x.largs[0][0] in FLOATING_TABLE_TEMPLATES + ), ) tempnode = WikiNode(NodeKind.LEVEL5, 0) - tempnode.largs = ['Inflection'] + tempnode.largs = [["Inflection"]] tempnode.children = floaters parse_inflection(tempnode, "Floating Div", pos) # print(poschildren) @@ -981,12 +1051,12 @@ def parse_part_of_speech(posnode, pos): if not floaters: wxr.wtp.debug( "PoS section without contents", - sortid="en/page/1051/20230612" + sortid="en/page/1051/20230612", ) else: wxr.wtp.debug( "PoS section without contents except for a floating table", - sortid="en/page/1056/20230612" + sortid="en/page/1056/20230612", ) return @@ -1019,16 +1089,19 @@ def parse_part_of_speech(posnode, pos): elif collecting_head and kind == NodeKind.LINK: # We might collect relevant links as they are often pictures # relating to the word - if (len(node.largs[0]) >= 1 and - isinstance(node.largs[0][0], str)): - if node.largs[0][0].startswith(ns_title_prefix_tuple( - wxr, "Category")): + if len(node.largs[0]) >= 1 
and isinstance( + node.largs[0][0], str + ): + if node.largs[0][0].startswith( + ns_title_prefix_tuple(wxr, "Category") + ): # [[Category:...]] # We're at the end of the file, probably, so stop # here. Otherwise the head will get garbage. break - if node.largs[0][0].startswith(ns_title_prefix_tuple( - wxr, "File")): + if node.largs[0][0].startswith( + ns_title_prefix_tuple(wxr, "File") + ): # Skips file links continue start_of_paragraph = False @@ -1040,8 +1113,12 @@ def parse_part_of_speech(posnode, pos): lists.append([]) # Lists parallels pre collecting_head = True start_of_paragraph = True - elif (collecting_head and - node.sarg not in ("gallery", "ref", "cite", "caption")): + elif collecting_head and node.sarg not in ( + "gallery", + "ref", + "cite", + "caption", + ): start_of_paragraph = False pre[-1].append(node) else: @@ -1061,21 +1138,23 @@ def parse_part_of_speech(posnode, pos): # skip these templates; panel_templates is already used # to skip certain templates else, but it also applies to # head parsing quite well. - if is_panel_template(wxr, node.largs[0][0]): + # node.largs[0][0] should always be str, but can't type-check + # that. + if is_panel_template(wxr, node.largs[0][0]): # type: ignore[arg-type] continue # skip these templates # if node.largs[0][0] in skip_these_templates_in_head: - # first_head_tmplt = False # no first_head_tmplt at all - # start_of_paragraph = False - # continue + # first_head_tmplt = False # no first_head_tmplt at all + # start_of_paragraph = False + # continue if first_head_tmplt and pre[-1]: first_head_tmplt = False start_of_paragraph = False pre[-1].append(node) elif pre[-1] and start_of_paragraph: - pre.append([]) # Switch to the next head - lists.append([]) # lists parallel pre + pre.append([]) # Switch to the next head + lists.append([]) # lists parallel pre collecting_head = True start_of_paragraph = False pre[-1].append(node) @@ -1092,8 +1171,8 @@ def parse_part_of_speech(posnode, pos): # Clean up empty pairs, and fix messes with extra newlines that # separate templates that are followed by lists wiktextract issue #314 - cleaned_pre = [] - cleaned_lists = [] + cleaned_pre: list[list[Union[str, WikiNode]]] = [] + cleaned_lists: list[list[WikiNode]] = [] pairless_pre_index = None for pre1, ls in zip(pre, lists): @@ -1102,8 +1181,9 @@ def parse_part_of_speech(posnode, pos): if not pre1 and not ls: # skip [] + [] continue - if not ls and all((isinstance(x, str) and not x.strip()) - for x in pre1): + if not ls and all( + (isinstance(x, str) and not x.strip()) for x in pre1 + ): # skip ["\n", " "] + [] continue if ls and not pre1: @@ -1118,7 +1198,7 @@ def parse_part_of_speech(posnode, pos): lists = cleaned_lists there_are_many_heads = len(pre) > 1 - header_tags = [] + header_tags: list[str] = [] if not any(g for g in lists): process_gloss_without_list(poschildren, pos, pos_data, header_tags) @@ -1128,60 +1208,75 @@ def parse_part_of_speech(posnode, pos): # # don't have gloss list # # XXX add code here to filter out 'garbage', like text # # that isn't a head template or head. 
- # continue + # continue if all(not sl for sl in lists[i:]): if i == 0: if isinstance(node, str): - wxr.wtp.debug("first head without list of senses," - "string: '{}[...]', {}/{}".format( - node[:20], word, language), - sortid="page/1689/20221215") + wxr.wtp.debug( + "first head without list of senses," + "string: '{}[...]', {}/{}".format( + node[:20], word, language + ), + sortid="page/1689/20221215", + ) if isinstance(node, WikiNode): - if node.largs and node.largs[0][0] in ["Han char",]: + if node.largs and node.largs[0][0] in [ + "Han char", + ]: # just ignore these templates pass else: - wxr.wtp.debug("first head without " - "list of senses, " - "template node " - "{}, {}/{}".format( - node.largs, word, language), - sortid="page/1694/20221215") + wxr.wtp.debug( + "first head without " + "list of senses, " + "template node " + "{}, {}/{}".format( + node.largs, word, language + ), + sortid="page/1694/20221215", + ) else: - wxr.wtp.debug("first head without list of senses, " - "{}/{}".format( - word, language), - sortid="page/1700/20221215") + wxr.wtp.debug( + "first head without list of senses, " + "{}/{}".format(word, language), + sortid="page/1700/20221215", + ) # no break here so that the first head always # gets processed. else: if isinstance(node, str): - wxr.wtp.debug("later head without list of senses," - "string: '{}[...]', {}/{}".format( - node[:20], word, language), - sortid="page/1708/20221215") + wxr.wtp.debug( + "later head without list of senses," + "string: '{}[...]', {}/{}".format( + node[:20], word, language + ), + sortid="page/1708/20221215", + ) if isinstance(node, WikiNode): - wxr.wtp.debug("later head without list of senses," - "template node " - "{}, {}/{}".format( - node.sarg if node.sarg else node.largs, - word, language), - sortid="page/1713/20221215") + wxr.wtp.debug( + "later head without list of senses," + "template node " + "{}, {}/{}".format( + node.sarg if node.sarg else node.largs, + word, + language, + ), + sortid="page/1713/20221215", + ) else: - wxr.wtp.debug("later head without list of senses, " - "{}/{}".format( - word, language), - sortid="page/1719/20221215") + wxr.wtp.debug( + "later head without list of senses, " + "{}/{}".format(word, language), + sortid="page/1719/20221215", + ) break head_group = i + 1 if there_are_many_heads else None # print("parse_part_of_speech: {}: {}: pre={}" - # .format(wxr.wtp.section, wxr.wtp.subsection, pre1)) - process_gloss_header(pre1, - pos, - head_group, - pos_data, - header_tags) + # .format(wxr.wtp.section, wxr.wtp.subsection, pre1)) + process_gloss_header( + pre1, pos, head_group, pos_data, header_tags + ) for l in ls: # Parse each list associated with this head. for node in l.children: @@ -1194,10 +1289,10 @@ def parse_part_of_speech(posnode, pos): # the data is already pushed into a sub-gloss # downstream, unless the higher level has examples # that need to be put somewhere. - common_data = {"tags": list(header_tags)} + common_data: SenseData = {"tags": list(header_tags)} if head_group: common_data["head_nr"] = head_group - parse_sense_node(node, common_data, pos) + parse_sense_node(node, common_data, pos) # type: ignore[arg-type] # If there are no senses extracted, add a dummy sense. We want to # keep tags extracted from the head for the dummy sense. 
@@ -1211,7 +1306,7 @@ def process_gloss_header( header_nodes: list[Union[WikiNode, str]], pos_type: str, header_group: Optional[int], - pos_data: dict, + pos_data: WordData, header_tags: list[str], ) -> None: ruby = [] @@ -1223,10 +1318,14 @@ def process_gloss_header( exp.children, lambda x: isinstance(x, WikiNode) and x.kind == NodeKind.HTML - and x.sarg == "ruby" + and x.sarg == "ruby", ) if rub is not None: for r in rub: + if TYPE_CHECKING: + # we know the lambda above in recursively_extract + # returns only WikiNodes in rub + assert isinstance(r, WikiNode) rt = parse_ruby(wxr, r) if rt is not None: ruby.append(rt) @@ -1244,27 +1343,30 @@ def process_gloss_header( ruby=ruby, ) if "tags" in pos_data: - header_tags[:] = pos_data["tags"] - del pos_data["tags"] + # pos_data can get "tags" data from some source; type-checkers + # doesn't like it, so let's ignore it. + header_tags[:] = pos_data["tags"] # type: ignore[typeddict-item] + del pos_data["tags"] # type: ignore[typeddict-item] else: header_tags.clear() def process_gloss_without_list( nodes: list[Union[WikiNode, str]], pos_type: str, - pos_data: dict, + pos_data: WordData, header_tags: list[str], ) -> None: # gloss text might not inside a list - header_nodes = [] - gloss_nodes = [] + header_nodes: list[Union[str, WikiNode]] = [] + gloss_nodes: list[Union[str, WikiNode]] = [] for node in strip_nodes(nodes): if isinstance(node, WikiNode): if node.kind == NodeKind.TEMPLATE: template_name = node.largs[0][0] - if ( - template_name == "head" - or template_name.startswith(f"{lang_code}-") + if TYPE_CHECKING: + assert isinstance(template_name, str) + if template_name == "head" or template_name.startswith( + f"{lang_code}-" ): header_nodes.append(node) continue @@ -1281,7 +1383,11 @@ def process_gloss_without_list( gloss_nodes, pos_type, {"tags": list(header_tags)} ) - def parse_sense_node(node, sense_base, pos): + def parse_sense_node( + node: Union[str, WikiNode], # never receives str + sense_base: SenseData, + pos: str, + ) -> bool: """Recursively (depth first) parse LIST_ITEM nodes for sense data. Uses push_sense() to attempt adding data to pos_data in the scope of parse_language() when it reaches deep in the recursion. push_sense() @@ -1292,14 +1398,18 @@ def parse_sense_node(node, sense_base, pos): """ assert isinstance(sense_base, dict) # Added to every sense deeper in if not isinstance(node, WikiNode): - wxr.wtp.debug("{}: parse_sense_node called with" - "something that isn't a WikiNode".format(pos), - sortid="page/1287/20230119") + # This doesn't seem to ever happen in practice. + wxr.wtp.debug( + "{}: parse_sense_node called with" + "something that isn't a WikiNode".format(pos), + sortid="page/1287/20230119", + ) return False if node.kind != NodeKind.LIST_ITEM: - wxr.wtp.debug("{}: non-list-item inside list".format(pos), - sortid="page/1678") + wxr.wtp.debug( + "{}: non-list-item inside list".format(pos), sortid="page/1678" + ) return False if node.sarg == ":": @@ -1315,7 +1425,7 @@ def parse_sense_node(node, sense_base, pos): # added |= push_sense() or added |= parse_sense_node(...) to OR. added = False - gloss_template_args = set() + gloss_template_args: set[str] = set() # For LISTs and LIST_ITEMS, their argument is something like # "##" or "##:", and using that we can rudimentally determine @@ -1330,26 +1440,34 @@ def parse_sense_node(node, sense_base, pos): # of subglosses below this. The list's # argument ends with #, and its depth should # be bigger than parent node. 
- subentries = [x for x in children - if isinstance(x, WikiNode) and - x.kind == NodeKind.LIST and - x.sarg == current_depth + "#"] + subentries = [ + x + for x in children + if isinstance(x, WikiNode) + and x.kind == NodeKind.LIST + and x.sarg == current_depth + "#" + ] # sublists of examples and quotations. .sarg # does not end with "#". - others = [x for x in children - if isinstance(x, WikiNode) and - x.kind == NodeKind.LIST and - x.sarg != current_depth + "#"] + others = [ + x + for x in children + if isinstance(x, WikiNode) + and x.kind == NodeKind.LIST + and x.sarg != current_depth + "#" + ] # the actual contents of this particular node. # can be a gloss (or a template that expands into # many glosses which we can't easily pre-expand) # or could be an "outer gloss" with more specific # subglosses, or could be a qualfier for the subglosses. - contents = [x for x in children - if not isinstance(x, WikiNode) or - x.kind != NodeKind.LIST] + contents = [ + x + for x in children + if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST + ] # If this entry has sublists of entries, we should combine # gloss information from both the "outer" and sublist content. # Sometimes the outer gloss @@ -1371,28 +1489,29 @@ def parse_sense_node(node, sense_base, pos): # copy current node and modify it so it doesn't # loop infinitely. cropped_node = copy.copy(node) - cropped_node.children = [x for x in children - if not (isinstance(x, WikiNode) and - x.kind == NodeKind.LIST and - x.sarg == current_depth + "#")] - added |= parse_sense_node(cropped_node, - sense_base, - pos) + cropped_node.children = [ + x + for x in children + if not ( + isinstance(x, WikiNode) + and x.kind == NodeKind.LIST + and x.sarg == current_depth + "#" + ) + ] + added |= parse_sense_node(cropped_node, sense_base, pos) nonlocal sense_data # this kludge causes duplicated raw_ - # glosses data if this is not done; - # if the top-level (cropped_node) - # does not push_sense() properly or - # parse_sense_node() returns early, - # sense_data is not reset. This happens - # for example when you have a no-gloss - # string like "(intransitive)": - # no gloss, push_sense() returns early - # and sense_data has duplicate data with - # sense_base + # glosses data if this is not done; + # if the top-level (cropped_node) + # does not push_sense() properly or + # parse_sense_node() returns early, + # sense_data is not reset. 
This happens + # for example when you have a no-gloss + # string like "(intransitive)": + # no gloss, push_sense() returns early + # and sense_data has duplicate data with + # sense_base sense_data = {} - added |= parse_sense_node(slc[0], - sense_base, - pos) + added |= parse_sense_node(slc[0], sense_base, pos) return added return process_gloss_contents( @@ -1408,7 +1527,7 @@ def parse_sense_node(node, sense_base, pos): def process_gloss_contents( contents: list[Union[str, WikiNode]], pos: str, - sense_base: dict, + sense_base: SenseData, subentries: list[WikiNode] = [], others: list[WikiNode] = [], gloss_template_args: Set[str] = set(), @@ -1430,8 +1549,7 @@ def sense_template_fn( arg = clean_node(wxr, sense_base, ht.get(2, ())) if re.match(r"Q\d+$", arg): data_append(sense_base, "wikidata", arg) - data_append(sense_base, "senseid", - langid + ":" + arg) + data_append(sense_base, "senseid", langid + ":" + arg) if name in sense_linkage_templates: # print(f"SENSE_TEMPLATE_FN: {name}") parse_sense_linkage(wxr, sense_base, name, ht) @@ -1470,7 +1588,7 @@ def sense_template_fn( if is_gloss: wxr.wtp.warning( "Example template is used for gloss text", - sortid="extractor.en.page.sense_template_fn/1415" + sortid="extractor.en.page.sense_template_fn/1415", ) else: return "" @@ -1483,7 +1601,7 @@ def sense_template_fn( gloss_template_args.add(v) return None - def extract_link_texts(item): + def extract_link_texts(item: GeneralNode) -> None: """Recursively extracts link texts from the gloss source. This information is used to select whether to remove final "." from form_of/alt_of (e.g., ihm/Hunsrik).""" @@ -1504,8 +1622,11 @@ def extract_link_texts(item): return if item.kind == NodeKind.LINK: v = item.largs[-1] - if (isinstance(v, list) and len(v) == 1 and - isinstance(v[0], str)): + if ( + isinstance(v, list) + and len(v) == 1 + and isinstance(v[0], str) + ): gloss_template_args.add(v[0].strip()) for x in item.children: extract_link_texts(x) @@ -1514,11 +1635,16 @@ def extract_link_texts(item): # get the raw text of non-list contents of this node, and other stuff # like tag and category data added to sense_base + # cast = no-op type-setter for the type-checker + partial_template_fn = cast( + TemplateFnCallable, + partial(sense_template_fn, is_gloss=True), + ) rawgloss = clean_node( wxr, sense_base, contents, - template_fn=partial(sense_template_fn, is_gloss=True), + template_fn=partial_template_fn, collect_links=True, ) @@ -1542,7 +1668,7 @@ def extract_link_texts(item): strip_ends = [", particularly:"] for x in strip_ends: if rawgloss.endswith(x): - rawgloss = rawgloss[:-len(x)] + rawgloss = rawgloss[: -len(x)] break # The gloss could contain templates that produce more list items. @@ -1562,19 +1688,19 @@ def extract_link_texts(item): if rawgloss and rawgloss not in sense_base.get("raw_glosses", ()): data_append(sense_base, "raw_glosses", subglosses[1]) m = re.match(r"\(([^()]+)\):?\s*", rawgloss) - # ( ..\1.. ): ... or ( ..\1.. ) ... + # ( ..\1.. ): ... or ( ..\1.. ) ... 
if m: q = m.group(1) - rawgloss = rawgloss[m.end():].strip() + rawgloss = rawgloss[m.end() :].strip() parse_sense_qualifier(wxr, q, sense_base) if rawgloss == "A pejorative:": data_append(sense_base, "tags", "pejorative") - rawgloss = None + rawgloss = "" elif rawgloss == "Short forms.": data_append(sense_base, "tags", "abbreviation") - rawgloss = None + rawgloss = "" elif rawgloss == "Technical or specialized senses.": - rawgloss = None + rawgloss = "" if rawgloss: data_append(sense_base, "glosses", rawgloss) if rawgloss in ("A person:",): @@ -1583,15 +1709,20 @@ def extract_link_texts(item): # The main recursive call (except for the exceptions at the # start of this function). for sublist in subentries: - if not (isinstance(sublist, WikiNode) and - sublist.kind == NodeKind.LIST): - wxr.wtp.debug(f"'{repr(rawgloss[:20])}.' gloss has `subentries`" - f"with items that are not LISTs", - sortid="page/1511/20230119") + if not ( + isinstance(sublist, WikiNode) and sublist.kind == NodeKind.LIST + ): + wxr.wtp.debug( + f"'{repr(rawgloss[:20])}.' gloss has `subentries`" + f"with items that are not LISTs", + sortid="page/1511/20230119", + ) continue for item in sublist.children: - if not (isinstance(item, WikiNode) and - item.kind == NodeKind.LIST_ITEM): + if not ( + isinstance(item, WikiNode) + and item.kind == NodeKind.LIST_ITEM + ): continue # copy sense_base to prevent cross-contamination between # subglosses and other subglosses and superglosses @@ -1611,20 +1742,22 @@ def extract_link_texts(item): if added: if examples: # this higher-up gloss has examples that we do not want to skip - wxr.wtp.debug("'{}[...]' gloss has examples we want to keep, " - "but there are subglosses." - .format(repr(rawgloss[:30])), - sortid="page/1498/20230118") + wxr.wtp.debug( + "'{}[...]' gloss has examples we want to keep, " + "but there are subglosses.".format(repr(rawgloss[:30])), + sortid="page/1498/20230118", + ) else: return True # Some entries, e.g., "iacebam", have weird sentences in quotes # after the gloss, but these sentences don't seem to be intended # as glosses. Skip them. - subglosses = list(gl for gl in subglosses - if gl.strip() and - not re.match(r'\s*(\([^)]*\)\s*)?"[^"]*"\s*$', - gl)) + subglosses = list( + gl + for gl in subglosses + if gl.strip() and not re.match(r'\s*(\([^)]*\)\s*)?"[^"]*"\s*$', gl) + ) if len(subglosses) > 1 and "form_of" not in sense_base: gl = subglosses[0].strip() @@ -1633,8 +1766,7 @@ def extract_link_texts(item): parsed = parse_alt_or_inflection_of(wxr, gl, gloss_template_args) if parsed is not None: infl_tags, infl_dts = parsed - if (infl_dts and "form-of" in infl_tags and - len(infl_tags) == 1): + if infl_dts and "form-of" in infl_tags and len(infl_tags) == 1: # Interpret others as a particular form under # "inflection of" data_extend(sense_base, "tags", infl_tags) @@ -1677,13 +1809,13 @@ def extract_link_texts(item): data_extend(sense_data, k, v) else: assert k not in ("tags", "categories", "topics") - sense_data[k] = v + sense_data[k] = v # type:ignore[literal-required] # Parse the gloss for this particular sense m = re.match(r"^\((([^()]|\([^()]*\))*)\):?\s*", gloss) - # (...): ... or (...(...)...): ... + # (...): ... or (...(...)...): ... 
if m: parse_sense_qualifier(wxr, m.group(1), sense_data) - gloss = gloss[m.end():].strip() + gloss = gloss[m.end() :].strip() # Remove common suffix "[from 14th c.]" and similar gloss = re.sub(r"\s\[[^]]*\]\s*$", "", gloss) @@ -1691,12 +1823,15 @@ def extract_link_texts(item): # Check to make sure we don't have unhandled list items in gloss ofs = max(gloss.find("#"), gloss.find("* ")) if ofs > 10 and "(#)" not in gloss: - wxr.wtp.debug("gloss may contain unhandled list items: {}" - .format(gloss), - sortid="page/1412") + wxr.wtp.debug( + "gloss may contain unhandled list items: {}".format(gloss), + sortid="page/1412", + ) elif "\n" in gloss: - wxr.wtp.debug("gloss contains newline: {}".format(gloss), - sortid="page/1416") + wxr.wtp.debug( + "gloss contains newline: {}".format(gloss), + sortid="page/1416", + ) # Kludge, some glosses have a comma after initial qualifiers in # parentheses @@ -1706,7 +1841,7 @@ def extract_link_texts(item): if gloss.endswith(":"): gloss = gloss[:-1].strip() if gloss.startswith("N. of "): - gloss = "Name of " + gloss[6:] + gloss = "Name of " + gloss[6:] if gloss.startswith("†"): data_append(sense_data, "tags", "obsolete") gloss = gloss[1:] @@ -1729,16 +1864,19 @@ def extract_link_texts(item): if tag not in sense_tags: data_append(sense_data, "tags", tag) if countability_tags: - if ("countable" not in sense_tags and - "uncountable" not in sense_tags): + if ( + "countable" not in sense_tags + and "uncountable" not in sense_tags + ): data_extend(sense_data, "tags", countability_tags) # If outer gloss specifies a form-of ("inflection of", see # aquamarine/German), try to parse the inner glosses as # tags for an inflected form. if "form-of" in sense_base.get("tags", ()): - parsed = parse_alt_or_inflection_of(wxr, gloss, - gloss_template_args) + parsed = parse_alt_or_inflection_of( + wxr, gloss, gloss_template_args + ) if parsed is not None: infl_tags, infl_dts = parsed if not infl_dts and infl_tags: @@ -1758,18 +1896,23 @@ def extract_link_texts(item): split_glosses = [] for m in re.finditer(r"Abbreviation of ", gloss): if m.start() != position: - split_glosses.append(gloss[position: m.start()]) + split_glosses.append(gloss[position : m.start()]) position = m.start() split_glosses.append(gloss[position:]) for gloss in split_glosses: # Check if this gloss describes an alt-of or inflection-of - if (lang_code != "en" and " " not in gloss and distw([word], gloss) < 0.3): + if ( + lang_code != "en" + and " " not in gloss + and distw([word], gloss) < 0.3 + ): # Don't try to parse gloss if it is one word # that is close to the word itself for non-English words # (probable translations of a tag/form name) continue - parsed = parse_alt_or_inflection_of(wxr, gloss, - gloss_template_args) + parsed = parse_alt_or_inflection_of( + wxr, gloss, gloss_template_args + ) if parsed is None: continue tags, dts = parsed @@ -1797,7 +1940,7 @@ def extract_link_texts(item): data_append(sense_data, "form_of", dt) if len(sense_data) == 0: - if len(sense_base.get("tags")) == 0: + if len(sense_base.get("tags", [])) == 0: del sense_base["tags"] sense_data.update(sense_base) if push_sense(): @@ -1806,7 +1949,9 @@ def extract_link_texts(item): # print("PARSE_SENSE DONE:", pos_datas[-1]) return added - def parse_inflection(node, section, pos): + def parse_inflection( + node: WikiNode, section: str, pos: Optional[str] + ) -> None: """Parses inflection data (declension, conjugation) from the given page. 
This retrieves the actual inflection template parameters, which are very useful for applications that need @@ -1818,11 +1963,14 @@ def parse_inflection(node, section, pos): # print("parse_inflection:", node) if pos is None: - wxr.wtp.debug("inflection table outside part-of-speech", - sortid="page/1812") + wxr.wtp.debug( + "inflection table outside part-of-speech", sortid="page/1812" + ) return - def inflection_template_fn(name, ht): + def inflection_template_fn( + name: str, ht: TemplateArgs + ) -> Optional[str]: # print("decl_conj_template_fn", name, ht) if is_panel_template(wxr, name): return "" @@ -1830,8 +1978,11 @@ def inflection_template_fn(name, ht): # These are not to be captured as an exception to the # generic code below return None - m = re.search(r"-(conj|decl|ndecl|adecl|infl|conjugation|" - r"declension|inflection|mut|mutation)($|-)", name) + m = re.search( + r"-(conj|decl|ndecl|adecl|infl|conjugation|" + r"declension|inflection|mut|mutation)($|-)", + name, + ) if m: args_ht = clean_template_args(wxr, ht) dt = {"name": name, "args": args_ht} @@ -1844,7 +1995,7 @@ def inflection_template_fn(name, ht): text = wxr.wtp.node_to_wikitext(node.children) # Split text into separate sections for each to-level template - brace_matches = re.split("({{+|}}+)", text) # ["{{", "template", "}}"] + brace_matches = re.split("({{+|}}+)", text) # ["{{", "template", "}}"] template_sections = [] template_nesting = 0 # depth of SINGLE BRACES { { nesting } } # Because there is the possibility of triple curly braces @@ -1860,16 +2011,15 @@ def inflection_template_fn(name, ht): # print(text) # print(repr(brace_matches)) if len(brace_matches) > 1: - tsection = [] + tsection: list[str] = [] after_templates = False # kludge to keep any text - # before first template - # with the first template; - # otherwise, text - # goes with preceding template + # before first template + # with the first template; + # otherwise, text + # goes with preceding template for m in brace_matches: if m.startswith("{{"): - if (template_nesting == 0 and - after_templates): + if template_nesting == 0 and after_templates: template_sections.append(tsection) tsection = [] # start new section @@ -1879,12 +2029,13 @@ def inflection_template_fn(name, ht): elif m.startswith("}}"): template_nesting -= len(m) if template_nesting < 0: - wxr.wtp.error("Negatively nested braces, " - "couldn't split inflection templates, " - "{}/{} section {}" - .format(word, language, section), - sortid="page/1871") - template_sections = [] # use whole text + wxr.wtp.error( + "Negatively nested braces, " + "couldn't split inflection templates, " + "{}/{} section {}".format(word, language, section), + sortid="page/1871", + ) + template_sections = [] # use whole text break tsection.append(m) else: @@ -1904,16 +2055,20 @@ def inflection_template_fn(name, ht): for tsection in template_sections: texts.append("".join(tsection)) if template_nesting != 0: - wxr.wtp.error("Template nesting error: " - "template_nesting = {} " - "couldn't split inflection templates, " - "{}/{} section {}" - .format(template_nesting, word, language, section), - sortid="page/1896") + wxr.wtp.error( + "Template nesting error: " + "template_nesting = {} " + "couldn't split inflection templates, " + "{}/{} section {}".format( + template_nesting, word, language, section + ), + sortid="page/1896", + ) texts = [text] for text in texts: - tree = wxr.wtp.parse(text, expand_all=True, - template_fn=inflection_template_fn) + tree = wxr.wtp.parse( + text, expand_all=True, 
template_fn=inflection_template_fn + ) # Parse inflection tables from the section. The data is stored # under "forms". @@ -1924,12 +2079,20 @@ def inflection_template_fn(name, ht): template_name = m.group(1) tablecontext = TableContext(template_name) - parse_inflection_section(wxr, pos_data, - word, language, - pos, section, tree, - tablecontext=tablecontext) + parse_inflection_section( + wxr, + pos_data, + word, + language, + pos, + section, + tree, + tablecontext=tablecontext, + ) - def get_subpage_section(title, subtitle, seq): + def get_subpage_section( + title: str, subtitle: str, seq: Union[list[str], tuple[str, ...]] + ) -> Optional[Union[WikiNode, str]]: """Loads a subpage of the given page, and finds the section for the given language, part-of-speech, and section title. This is used for finding translations and other sections on subpages.""" @@ -1942,11 +2105,16 @@ def get_subpage_section(title, subtitle, seq): subpage_title = word + "/" + subtitle subpage_content = wxr.wtp.get_page_body(subpage_title, 0) if subpage_content is None: - wxr.wtp.error("/translations not found despite " - "{{see translation subpage|...}}", - sortid="page/1934") + wxr.wtp.error( + "/translations not found despite " + "{{see translation subpage|...}}", + sortid="page/1934", + ) + return None - def recurse(node, seq): + def recurse( + node: Union[str, WikiNode], seq: Union[list[str], tuple[str, ...]] + ) -> Optional[Union[str, WikiNode]]: # print(f"seq: {seq}") if not seq: return node @@ -1970,17 +2138,22 @@ def recurse(node, seq): subpage_content, pre_expand=True, additional_expand=ADDITIONAL_EXPAND_TEMPLATES, - do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES + do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES, ) assert tree.kind == NodeKind.ROOT ret = recurse(tree, seq) if ret is None: - wxr.wtp.debug("Failed to find subpage section {}/{} seq {}" - .format(title, subtitle, seq), - sortid="page/1963") + wxr.wtp.debug( + "Failed to find subpage section {}/{} seq {}".format( + title, subtitle, seq + ), + sortid="page/1963", + ) return ret - def parse_linkage(data, field, linkagenode): + def parse_linkage( + data: WordData, field: str, linkagenode: WikiNode + ) -> None: assert isinstance(data, dict) assert isinstance(field, str) assert isinstance(linkagenode, WikiNode) @@ -1995,7 +2168,11 @@ def parse_linkage(data, field, linkagenode): toplevel_text = [] next_navframe_sense = None # Used for "(sense):" before NavFrame - def parse_linkage_item(contents, field, sense): + def parse_linkage_item( + contents: list[Union[str, WikiNode]], + field: str, + sense: Optional[str] = None, + ): assert isinstance(contents, (list, tuple)) assert isinstance(field, str) assert sense is None or isinstance(sense, str) @@ -2003,11 +2180,13 @@ def parse_linkage_item(contents, field, sense): # print("PARSE_LINKAGE_ITEM: {} ({}): {}" # .format(field, sense, contents)) - parts = [] - ruby = [] - urls = [] + parts: list[str] = [] + ruby: list[tuple[str, str]] = [] + urls: list[str] = [] - def item_recurse(contents, italic=False): + def item_recurse( + contents: list[Union[str, WikiNode]], italic=False + ) -> None: assert isinstance(contents, (list, tuple)) nonlocal sense nonlocal ruby @@ -2022,24 +2201,34 @@ def item_recurse(contents, italic=False): # node.sarg if node.sarg else node.largs) if kind == NodeKind.LIST: if parts: + sense1: Optional[str] sense1 = clean_node(wxr, None, parts) if sense1.endswith(":"): sense1 = sense1[:-1].strip() if sense1.startswith("(") and sense1.endswith(")"): sense1 = sense1[1:-1].strip() - if 
sense1.lower() == wxr.config.OTHER_SUBTITLES["translations"]: + if ( + sense1.lower() + == wxr.config.OTHER_SUBTITLES["translations"] + ): sense1 = None # print("linkage item_recurse LIST sense1:", sense1) - parse_linkage_recurse(node.children, field, - sense=sense1 or sense) + parse_linkage_recurse( + node.children, field, sense=sense1 or sense + ) parts = [] else: parse_linkage_recurse(node.children, field, sense) - elif kind in (NodeKind.TABLE, NodeKind.TABLE_ROW, - NodeKind.TABLE_CELL): + elif kind in ( + NodeKind.TABLE, + NodeKind.TABLE_ROW, + NodeKind.TABLE_CELL, + ): parse_linkage_recurse(node.children, field, sense) - elif kind in (NodeKind.TABLE_HEADER_CELL, - NodeKind.TABLE_CAPTION): + elif kind in ( + NodeKind.TABLE_HEADER_CELL, + NodeKind.TABLE_CAPTION, + ): continue elif kind == NodeKind.HTML: classes = (node.attrs.get("class") or "").split() @@ -2065,37 +2254,42 @@ def item_recurse(contents, italic=False): elif kind == NodeKind.LINK: ignore = False if isinstance(node.largs[0][0], str): - v = node.largs[0][0].strip().lower() - if v.startswith(ns_title_prefix_tuple(wxr, - "Category", True) \ - + ns_title_prefix_tuple(wxr, - "File", True)): + v1 = node.largs[0][0].strip().lower() + if v1.startswith( + ns_title_prefix_tuple(wxr, "Category", True) + + ns_title_prefix_tuple(wxr, "File", True) + ): ignore = True if not ignore: v = node.largs[-1] - if (len(node.largs) == 1 and - len(v) > 0 and - isinstance(v[0], str) and - v[0][0] == ":"): - v = [v[0][1:]] + list(v[1:]) + if ( + len(node.largs) == 1 + and len(v) > 0 + and isinstance(v[0], str) + and v[0][0] == ":" + ): + v = [v[0][1:]] + list(v[1:]) # type:ignore item_recurse(v, italic=italic) elif kind == NodeKind.URL: if len(node.largs) < 2 and node.largs: # Naked url captured - urls.extend(node.largs[-1]) + urls.extend(node.largs[-1]) # type:ignore[arg-type] continue if len(node.largs) == 2: # Url from link with text - urls.append(node.largs[0][-1]) + urls.append(node.largs[0][-1]) # type:ignore[arg-type] # print(f"{node.largs=!r}") # print("linkage recurse URL {}".format(node)) item_recurse(node.largs[-1], italic=italic) elif kind in (NodeKind.PREFORMATTED, NodeKind.BOLD): item_recurse(node.children, italic=italic) else: - wxr.wtp.debug("linkage item_recurse unhandled {}: {}" - .format(node.kind, node), - sortid="page/2073") + wxr.wtp.debug( + "linkage item_recurse unhandled {}: {}".format( + node.kind, node + ), + sortid="page/2073", + ) # print("LINKAGE CONTENTS BEFORE ITEM_RECURSE: {!r}" # .format(contents)) @@ -2105,48 +2299,18 @@ def item_recurse(contents, italic=False): # print("CLEANED ITEM: {!r}".format(item)) # print(f"URLS {urls=!r}") - return parse_linkage_item_text(wxr, word, data, field, item, - sense, ruby, pos_datas, - is_reconstruction, urls) - - def parse_linkage_template(node): - nonlocal have_panel_template - # XXX remove this function but check how to handle the - # template_linkage_mappings - # print("LINKAGE TEMPLATE:", node) - - def linkage_template_fn(name, ht): - # print("LINKAGE_TEMPLATE_FN:", name, ht) - nonlocal field - nonlocal have_panel_template - if is_panel_template(wxr, name): - have_panel_template = True - return "" - for prefix, t in template_linkage_mappings: - if re.search(r"(^|[-/\s]){}($|\b|[0-9])".format(prefix), - name): - f = t if isinstance(t, str) else field - if (name.endswith("-top") or name.endswith("-bottom") or - name.endswith("-mid")): - field = f - return "" - i = t if isinstance(t, int) else 2 - while True: - v = ht.get(i, None) - if v is None: - break - v = clean_node(wxr, 
None, v) - parse_linkage_item(v, f) - i += 1 - return "" - # print("UNHANDLED LINKAGE TEMPLATE:", name, ht) - return None - - # Main body of parse_linkage_template() - text = wxr.wtp.node_to_wikitext(node) - parsed = wxr.wtp.parse(text, expand_all=True, - template_fn=linkage_template_fn) - parse_linkage_recurse(parsed.children, field, None) + return parse_linkage_item_text( + wxr, + word, + data, + field, + item, + sense, + ruby, + pos_datas, + is_reconstruction, + urls, + ) def parse_linkage_recurse(contents, field, sense): assert isinstance(contents, (list, tuple)) @@ -2177,9 +2341,12 @@ def parse_linkage_recurse(contents, field, sense): parse_linkage_recurse(node.children, field, sense) elif kind == NodeKind.TABLE_CELL: parse_linkage_item(node.children, field, sense) - elif kind in (NodeKind.TABLE_CAPTION, - NodeKind.TABLE_HEADER_CELL, - NodeKind.PREFORMATTED, NodeKind.BOLD): + elif kind in ( + NodeKind.TABLE_CAPTION, + NodeKind.TABLE_HEADER_CELL, + NodeKind.PREFORMATTED, + NodeKind.BOLD, + ): continue elif kind == NodeKind.HTML: # Recurse to process inside the HTML for most tags @@ -2196,16 +2363,18 @@ def parse_linkage_recurse(contents, field, sense): if sense1.endswith(":"): sense1 = sense1[:-1].strip() if sense and sense1: - wxr.wtp.debug("linkage qualifier-content on multiple " - "levels: {!r} and {!r}" - .format(sense, sense1), - sortid="page/2170") + wxr.wtp.debug( + "linkage qualifier-content on multiple " + "levels: {!r} and {!r}".format(sense, sense1), + sortid="page/2170", + ) parse_linkage_recurse(node.children, field, sense1) elif "NavFrame" in classes: # NavFrame uses previously assigned next_navframe_sense # (from a "(sense):" item) and clears it afterwards - parse_linkage_recurse(node.children, field, - sense or next_navframe_sense) + parse_linkage_recurse( + node.children, field, sense or next_navframe_sense + ) next_navframe_sense = None else: parse_linkage_recurse(node.children, field, sense) @@ -2222,9 +2391,12 @@ def parse_linkage_recurse(contents, field, sense): # initial value parse_linkage_recurse(node.largs[-1], field, sense) else: - wxr.wtp.debug("parse_linkage_recurse unhandled {}: {}" - .format(kind, node), - sortid="page/2196") + wxr.wtp.debug( + "parse_linkage_recurse unhandled {}: {}".format( + kind, node + ), + sortid="page/2196", + ) def linkage_template_fn1(name, ht): nonlocal have_panel_template @@ -2239,10 +2411,14 @@ def parse_zh_synonyms(parsed, data, hdrs, root_word): if isinstance(item, WikiNode): if item.kind == NodeKind.TABLE_ROW: cleaned = clean_node(wxr, None, item.children) - #print("cleaned:", repr(cleaned)) - if any(["Variety" in cleaned, - "Location" in cleaned, - "Words" in cleaned]): + # print("cleaned:", repr(cleaned)) + if any( + [ + "Variety" in cleaned, + "Location" in cleaned, + "Words" in cleaned, + ] + ): pass else: split = cleaned.split("\n") @@ -2268,11 +2444,15 @@ def parse_zh_synonyms(parsed, data, hdrs, root_word): if tag in zh_tag_lookup: tags.extend(zh_tag_lookup[tag]) else: - print(f"MISSING ZH SYNONYM TAG for root {root_word}, word {words}: {tag}") + print( + f"MISSING ZH SYNONYM TAG for root {root_word}, word {words}: {tag}" + ) sys.stdout.flush() for word in words: - data.append({"word": word.strip(), "tags": tags}) + data.append( + {"word": word.strip(), "tags": tags} + ) elif item.kind == NodeKind.HTML: cleaned = clean_node(wxr, None, item.children) if "Synonyms of" in cleaned: @@ -2288,10 +2468,14 @@ def parse_zh_synonyms_list(parsed, data, hdrs, root_word): if isinstance(item, WikiNode): if item.kind == 
NodeKind.LIST_ITEM: cleaned = clean_node(wxr, None, item.children) - #print("cleaned:", repr(cleaned)) - if any(["Variety" in cleaned, - "Location" in cleaned, - "Words" in cleaned]): + # print("cleaned:", repr(cleaned)) + if any( + [ + "Variety" in cleaned, + "Location" in cleaned, + "Words" in cleaned, + ] + ): pass else: cleaned = cleaned.replace("(", ",") @@ -2309,11 +2493,15 @@ def parse_zh_synonyms_list(parsed, data, hdrs, root_word): tags.append(tag) elif tag in zh_tag_lookup: tags.extend(zh_tag_lookup[tag]) - elif classify_desc(tag) == "romanization" \ - and roman is None: + elif ( + classify_desc(tag) == "romanization" + and roman is None + ): roman = tag else: - print(f"MISSING ZH SYNONYM TAG (possibly pinyin) - root {root_word}, word {words}: {tag}") + print( + f"MISSING ZH SYNONYM TAG (possibly pinyin) - root {root_word}, word {words}: {tag}" + ) sys.stdout.flush() for word in words: @@ -2328,9 +2516,13 @@ def parse_zh_synonyms_list(parsed, data, hdrs, root_word): if cleaned.find("Synonyms of") >= 0: cleaned = cleaned.replace("Synonyms of ", "") root_word = cleaned - parse_zh_synonyms_list(item.children, data, hdrs, root_word) + parse_zh_synonyms_list( + item.children, data, hdrs, root_word + ) else: - parse_zh_synonyms_list(item.children, data, hdrs, root_word) + parse_zh_synonyms_list( + item.children, data, hdrs, root_word + ) def contains_kind(children, nodekind): assert isinstance(children, list) @@ -2345,21 +2537,21 @@ def contains_kind(children, nodekind): # Main body of parse_linkage() text = wxr.wtp.node_to_wikitext(linkagenode.children) - parsed = wxr.wtp.parse(text, expand_all=True, - template_fn=linkage_template_fn1) + parsed = wxr.wtp.parse( + text, expand_all=True, template_fn=linkage_template_fn1 + ) if field == "synonyms" and lang_code == "zh": synonyms = [] if contains_kind(parsed.children, NodeKind.LIST): parse_zh_synonyms_list(parsed.children, synonyms, [], "") else: parse_zh_synonyms(parsed.children, synonyms, [], "") - #print(json.dumps(synonyms, indent=4, ensure_ascii=False)) + # print(json.dumps(synonyms, indent=4, ensure_ascii=False)) data_extend(data, "synonyms", synonyms) parse_linkage_recurse(parsed.children, field, None) if not data.get(field) and not have_panel_template: text = "".join(toplevel_text).strip() - if ("\n" not in text and "," in text and - text.count(",") > 3): + if "\n" not in text and "," in text and text.count(",") > 3: if not text.startswith("See "): parse_linkage_item([text], field, None) @@ -2388,8 +2580,10 @@ def parse_translation_item(contents, lang=None): # print("sense <- clean_node: ", sense) idx = sense.find("See also translations at") if idx > 0: - wxr.wtp.debug("Skipping translation see also: {}".format(sense), - sortid="page/2361") + wxr.wtp.debug( + "Skipping translation see also: {}".format(sense), + sortid="page/2361", + ) sense = sense[:idx].strip() if sense.endswith(":"): sense = sense[:-1].strip() @@ -2412,10 +2606,13 @@ def translation_item_template_fn(name, ht): code = ht.get(1) if code: if langcode and code != langcode: - wxr.wtp.debug("inconsistent language codes {} vs " - "{} in translation item: {!r} {}" - .format(langcode, code, name, ht), - sortid="page/2386") + wxr.wtp.debug( + "inconsistent language codes {} vs " + "{} in translation item: {!r} {}".format( + langcode, code, name, ht + ), + sortid="page/2386", + ) langcode = code tr = ht.get(2) if tr: @@ -2431,8 +2628,9 @@ def translation_item_template_fn(name, ht): langcode = code return None if name == "trans-see": - wxr.wtp.error("UNIMPLEMENTED trans-see 
template", - sortid="page/2405") + wxr.wtp.error( + "UNIMPLEMENTED trans-see template", sortid="page/2405" + ) return "" if name.endswith("-top"): return "" @@ -2440,28 +2638,41 @@ def translation_item_template_fn(name, ht): return "" if name.endswith("-mid"): return "" - #wxr.wtp.debug("UNHANDLED TRANSLATION ITEM TEMPLATE: {!r}" + # wxr.wtp.debug("UNHANDLED TRANSLATION ITEM TEMPLATE: {!r}" # .format(name), # sortid="page/2414") return None - sublists = list(x for x in contents - if isinstance(x, WikiNode) and - x.kind == NodeKind.LIST) - contents = list(x for x in contents - if not isinstance(x, WikiNode) or - x.kind != NodeKind.LIST) + sublists = list( + x + for x in contents + if isinstance(x, WikiNode) and x.kind == NodeKind.LIST + ) + contents = list( + x + for x in contents + if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST + ) - item = clean_node(wxr, data, contents, - template_fn=translation_item_template_fn) + item = clean_node( + wxr, data, contents, template_fn=translation_item_template_fn + ) # print(" TRANSLATION ITEM: {!r} [{}]".format(item, sense)) # Parse the translation item. if item: - lang = parse_translation_item_text(wxr, word, data, item, sense, - pos_datas, lang, langcode, - translations_from_template, - is_reconstruction) + lang = parse_translation_item_text( + wxr, + word, + data, + item, + sense, + pos_datas, + lang, + langcode, + translations_from_template, + is_reconstruction, + ) # Handle sublists. They are frequently used for different scripts # for the language and different variants of the language. We will @@ -2495,8 +2706,9 @@ def template_fn(name, ht): sense = None sub = ht.get(1, "") if sub: - m = re.match(r"\s*(([^:\d]*)\s*\d*)\s*:\s*([^:]*)\s*", - sub) + m = re.match( + r"\s*(([^:\d]*)\s*\d*)\s*:\s*([^:]*)\s*", sub + ) else: m = None etym = "" @@ -2507,51 +2719,83 @@ def template_fn(name, ht): etym = m.group(2) pos = m.group(3) if not sub: - wxr.wtp.debug("no part-of-speech in " - "{{see translation subpage|...}}, " - "defaulting to just wxr.wtp.section " - "(= language)", - sortid="page/2468") + wxr.wtp.debug( + "no part-of-speech in " + "{{see translation subpage|...}}, " + "defaulting to just wxr.wtp.section " + "(= language)", + sortid="page/2468", + ) # seq sent to get_subpage_section without sub and pos - seq = [language, wxr.config.OTHER_SUBTITLES["translations"]] - elif (m and etym.lower().strip() - in wxr.config.OTHER_SUBTITLES["etymology"] - and pos.lower() in wxr.config.POS_SUBTITLES): - seq = [language, - etym_numbered, - pos, - wxr.config.OTHER_SUBTITLES["translations"]] + seq = [ + language, + wxr.config.OTHER_SUBTITLES["translations"], + ] + elif ( + m + and etym.lower().strip() + in wxr.config.OTHER_SUBTITLES["etymology"] + and pos.lower() in wxr.config.POS_SUBTITLES + ): + seq = [ + language, + etym_numbered, + pos, + wxr.config.OTHER_SUBTITLES["translations"], + ] elif sub.lower() in wxr.config.POS_SUBTITLES: # seq with sub but not pos - seq = [language, - sub, - wxr.config.OTHER_SUBTITLES["translations"]] + seq = [ + language, + sub, + wxr.config.OTHER_SUBTITLES["translations"], + ] else: # seq with sub and pos pos = wxr.wtp.subsection if pos.lower() not in wxr.config.POS_SUBTITLES: - wxr.wtp.debug("unhandled see translation subpage: " - "language={} sub={} wxr.wtp.subsection={}" - .format(language, sub, wxr.wtp.subsection), - sortid="page/2478") - seq = [language, - sub, - pos, - wxr.config.OTHER_SUBTITLES["translations"]] + wxr.wtp.debug( + "unhandled see translation subpage: " + "language={} sub={} 
wxr.wtp.subsection={}".format( + language, sub, wxr.wtp.subsection + ), + sortid="page/2478", + ) + seq = [ + language, + sub, + pos, + wxr.config.OTHER_SUBTITLES["translations"], + ] subnode = get_subpage_section( - wxr.wtp.title, wxr.config.OTHER_SUBTITLES["translations"], seq) + wxr.wtp.title, + wxr.config.OTHER_SUBTITLES["translations"], + seq, + ) if subnode is not None: parse_translations(data, subnode) else: # Failed to find the normal subpage section seq = [wxr.config.OTHER_SUBTITLES["translations"]] subnode = get_subpage_section( - wxr.wtp.title, wxr.config.OTHER_SUBTITLES["translations"], seq) + wxr.wtp.title, + wxr.config.OTHER_SUBTITLES["translations"], + seq, + ) if subnode is not None: parse_translations(data, subnode) return "" - if name in ("c", "C", "categorize", "cat", "catlangname", - "topics", "top", "qualifier", "cln"): + if name in ( + "c", + "C", + "categorize", + "cat", + "catlangname", + "topics", + "top", + "qualifier", + "cln", + ): # These are expanded in the default way return None if name in ("trans-top",): @@ -2564,8 +2808,12 @@ def template_fn(name, ht): sense_parts = [] sense = None return None - if name in ("trans-bottom", "trans-mid", - "checktrans-mid", "checktrans-bottom"): + if name in ( + "trans-bottom", + "trans-mid", + "checktrans-mid", + "checktrans-bottom", + ): return None if name == "checktrans-top": sense_parts = [] @@ -2576,11 +2824,17 @@ def template_fn(name, ht): sense_parts = [] sense = None return "" - wxr.wtp.error("UNIMPLEMENTED parse_translation_template: {} {}" - .format(name, ht), - sortid="page/2517") + wxr.wtp.error( + "UNIMPLEMENTED parse_translation_template: {} {}".format( + name, ht + ), + sortid="page/2517", + ) return "" - wxr.wtp.expand(wxr.wtp.node_to_wikitext(node), template_fn=template_fn) + + wxr.wtp.expand( + wxr.wtp.node_to_wikitext(node), template_fn=template_fn + ) def parse_translation_recurse(xlatnode): nonlocal sense @@ -2590,9 +2844,11 @@ def parse_translation_recurse(xlatnode): if isinstance(node, str): if sense: if not node.isspace(): - wxr.wtp.debug("skipping string in the middle of " - "translations: {}".format(node), - sortid="page/2530") + wxr.wtp.debug( + "skipping string in the middle of " + "translations: {}".format(node), + sortid="page/2530", + ) continue # Add a part to the sense sense_parts.append(node) @@ -2616,8 +2872,11 @@ def parse_translation_recurse(xlatnode): pass elif kind == NodeKind.TEMPLATE: parse_translation_template(node) - elif kind in (NodeKind.TABLE, NodeKind.TABLE_ROW, - NodeKind.TABLE_CELL): + elif kind in ( + NodeKind.TABLE, + NodeKind.TABLE_ROW, + NodeKind.TABLE_CELL, + ): parse_translation_recurse(node) elif kind == NodeKind.HTML: if node.attrs.get("class") == "NavFrame": @@ -2636,8 +2895,7 @@ def parse_translation_recurse(xlatnode): elif kind in LEVEL_KINDS: # Sub-levels will be recursed elsewhere pass - elif kind in (NodeKind.ITALIC, - NodeKind.BOLD): + elif kind in (NodeKind.ITALIC, NodeKind.BOLD): parse_translation_recurse(node) elif kind == NodeKind.PREFORMATTED: print("parse_translation_recurse: PREFORMATTED:", node) @@ -2650,29 +2908,53 @@ def parse_translation_recurse(xlatnode): # handle them. 
Note: must be careful not to read other # links, particularly things like in "human being": # "a human being -- see [[man/translations]]" (group title) - if (isinstance(arg0, (list, tuple)) and - arg0 and - isinstance(arg0[0], str) and - arg0[0].endswith("/" + wxr.config.OTHER_SUBTITLES["translations"]) and - arg0[0][:-(1 + len(wxr.config.OTHER_SUBTITLES["translations"]))] == wxr.wtp.title): - wxr.wtp.debug("translations subpage link found on main " - "page instead " - "of normal {{see translation subpage|...}}", - sortid="page/2595") + if ( + isinstance(arg0, (list, tuple)) + and arg0 + and isinstance(arg0[0], str) + and arg0[0].endswith( + "/" + wxr.config.OTHER_SUBTITLES["translations"] + ) + and arg0[0][ + : -( + 1 + + len( + wxr.config.OTHER_SUBTITLES["translations"] + ) + ) + ] + == wxr.wtp.title + ): + wxr.wtp.debug( + "translations subpage link found on main " + "page instead " + "of normal {{see translation subpage|...}}", + sortid="page/2595", + ) sub = wxr.wtp.subsection if sub.lower() in wxr.config.POS_SUBTITLES: - seq = [language, sub, wxr.config.OTHER_SUBTITLES["translations"]] + seq = [ + language, + sub, + wxr.config.OTHER_SUBTITLES["translations"], + ] subnode = get_subpage_section( - wxr.wtp.title, wxr.config.OTHER_SUBTITLES["translations"], seq) + wxr.wtp.title, + wxr.config.OTHER_SUBTITLES["translations"], + seq, + ) if subnode is not None: parse_translations(data, subnode) else: - wxr.wtp.errors("/translations link outside " - "part-of-speech") + wxr.wtp.errors( + "/translations link outside " "part-of-speech" + ) - if (len(arg0) >= 1 and - isinstance(arg0[0], str) and - not arg0[0].lower().startswith("category:")): + if ( + len(arg0) >= 1 + and isinstance(arg0[0], str) + and not arg0[0].lower().startswith("category:") + ): for x in node.largs[-1]: if isinstance(x, str): sense_parts.append(x) @@ -2681,9 +2963,11 @@ def parse_translation_recurse(xlatnode): elif not sense: sense_parts.append(node) else: - wxr.wtp.debug("skipping text between translation items/senses: " - "{}".format(node), - sortid="page/2621") + wxr.wtp.debug( + "skipping text between translation items/senses: " + "{}".format(node), + sortid="page/2621", + ) # Main code of parse_translation(). We want ``sense`` to be assigned # regardless of recursion levels, and thus the code is structured @@ -2720,17 +3004,25 @@ def etym_post_template_fn(name, ht, expansion): if ignore_count == 0: ht = clean_template_args(wxr, ht) expansion = clean_node(wxr, None, expansion) - templates.append({"name": name, "args": ht, "expansion": expansion}) + templates.append( + {"name": name, "args": ht, "expansion": expansion} + ) return None # Remove any subsections - contents = list(x for x in node.children - if not isinstance(x, WikiNode) or - x.kind not in LEVEL_KINDS) + contents = list( + x + for x in node.children + if not isinstance(x, WikiNode) or x.kind not in LEVEL_KINDS + ) # Convert to text, also capturing templates using post_template_fn - text = clean_node(wxr, None, contents, - template_fn=etym_template_fn, - post_template_fn=etym_post_template_fn) + text = clean_node( + wxr, + None, + contents, + template_fn=etym_template_fn, + post_template_fn=etym_post_template_fn, + ) # Save the collected information. 
data["etymology_text"] = text data["etymology_templates"] = templates @@ -2804,20 +3096,23 @@ def desc_post_template_fn(name, ht, expansion): # same proto-language, then we tag this descendant entry with # "derived" is_derived = ( - is_proto_root_derived_section and - (name == "l" or name == "link") and - ("1" in ht and ht["1"] == lang_code) + is_proto_root_derived_section + and (name == "l" or name == "link") + and ("1" in ht and ht["1"] == lang_code) ) expansion = clean_node(wxr, None, expansion) - templates.append({ - "name": name, "args": ht, "expansion": expansion - }) + templates.append( + {"name": name, "args": ht, "expansion": expansion} + ) return None - text = clean_node(wxr, None, children, - template_fn=desc_template_fn, - post_template_fn=desc_post_template_fn - ) + text = clean_node( + wxr, + None, + children, + template_fn=desc_template_fn, + post_template_fn=desc_post_template_fn, + ) item_data["templates"] = templates item_data["text"] = text if is_derived: @@ -2837,11 +3132,15 @@ def get_sublist_index(list_item): def get_descendants(node): """Appends the data for every list item in every list in node - to descendants.""" + to descendants.""" for _, c in node_children(node): - if (c.kind == NodeKind.TEMPLATE and c.largs - and len(c.largs[0]) == 1 and isinstance(c.largs[0][0], str) - and c.largs[0][0] in unignored_non_list_templates): + if ( + c.kind == NodeKind.TEMPLATE + and c.largs + and len(c.largs[0]) == 1 + and isinstance(c.largs[0][0], str) + and c.largs[0][0] in unignored_non_list_templates + ): # Some Descendants sections have no wikitext list. Rather, # the list is entirely generated by a single template (see # e.g. the use of {{CJKV}} in Chinese entries). @@ -2914,40 +3213,48 @@ def skip_template_fn(name, ht): if node.kind not in LEVEL_KINDS: # XXX handle e.g. 
wikipedia links at the top of a language # XXX should at least capture "also" at top of page - if node.kind in (NodeKind.HLINE, NodeKind.LIST, - NodeKind.LIST_ITEM): + if node.kind in ( + NodeKind.HLINE, + NodeKind.LIST, + NodeKind.LIST_ITEM, + ): continue # print(" UNEXPECTED: {}".format(node)) # Clean the node to collect category links - clean_node(wxr, etym_data, node, - template_fn=skip_template_fn) + clean_node(wxr, etym_data, node, template_fn=skip_template_fn) continue - t = clean_node(wxr, etym_data, - node.sarg if node.sarg else node.largs) + t = clean_node( + wxr, etym_data, node.sarg if node.sarg else node.largs + ) t = t.lower() # XXX these counts were never implemented fully, and even this # gets discarded: Search STATISTICS_IMPLEMENTATION wxr.config.section_counts[t] += 1 # print("PROCESS_CHILDREN: T:", repr(t)) if t.startswith(tuple(wxr.config.OTHER_SUBTITLES["pronunciation"])): - if t.startswith(tuple( + if t.startswith( + tuple( pron_title + " " - for pron_title in - wxr.config.OTHER_SUBTITLES.get("pronunciation", []))): + for pron_title in wxr.config.OTHER_SUBTITLES.get( + "pronunciation", [] + ) + ) + ): # Pronunciation 1, etc, are used in Chinese Glyphs, # and each of them may have senses under Definition push_etym() wxr.wtp.start_subsection(None) if wxr.config.capture_pronunciation: data = select_data() - parse_pronunciation(wxr, - node, - data, - etym_data, - have_etym, - base_data, - lang_code, - ) + parse_pronunciation( + wxr, + node, + data, + etym_data, + have_etym, + base_data, + lang_code, + ) elif t.startswith(tuple(wxr.config.OTHER_SUBTITLES["etymology"])): push_etym() wxr.wtp.start_subsection(None) @@ -2963,11 +3270,13 @@ def skip_template_fn(name, ht): data = select_data() parse_descendants(data, node) elif ( - t in wxr.config.OTHER_SUBTITLES.get( + t + in wxr.config.OTHER_SUBTITLES.get( "proto_root_derived_sections", [] ) - and pos == "root" and is_reconstruction and - wxr.config.capture_descendants + and pos == "root" + and is_reconstruction + and wxr.config.capture_descendants ): data = select_data() parse_descendants(data, node, True) @@ -2989,17 +3298,20 @@ def skip_template_fn(name, ht): pos = dt["pos"] wxr.wtp.start_subsection(t) if "debug" in dt: - wxr.wtp.debug("{} in section {}" - .format(dt["debug"], t), - sortid="page/2755") + wxr.wtp.debug( + "{} in section {}".format(dt["debug"], t), + sortid="page/2755", + ) if "warning" in dt: - wxr.wtp.warning("{} in section {}" - .format(dt["warning"], t), - sortid="page/2759") + wxr.wtp.warning( + "{} in section {}".format(dt["warning"], t), + sortid="page/2759", + ) if "error" in dt: - wxr.wtp.error("{} in section {}" - .format(dt["error"], t), - sortid="page/2763") + wxr.wtp.error( + "{} in section {}".format(dt["error"], t), + sortid="page/2763", + ) # Parse word senses for the part-of-speech parse_part_of_speech(node, pos) if "tags" in dt: @@ -3056,10 +3368,10 @@ def usex_template_fn(name, ht): usex_type = "example" elif name in quotation_templates: usex_type = "quotation" - for prefix, t in template_linkage_mappings: - if re.search(r"(^|[-/\s]){}($|\b|[0-9])" - .format(prefix), - name): + for prefix in template_linkages: + if re.search( + r"(^|[-/\s]){}($|\b|[0-9])".format(prefix), name + ): return "" return None @@ -3068,23 +3380,32 @@ def usex_template_fn(name, ht): contents = item.children if lang_code == "ja": # print(contents) - if (contents and isinstance(contents, str) and - re.match(r"\s*$", contents[0])): + if ( + contents + and isinstance(contents, str) + and re.match(r"\s*$", contents[0]) + 
): contents = contents[1:] - exp = wxr.wtp.parse(wxr.wtp.node_to_wikitext(contents), - # post_template_fn=head_post_template_fn, - expand_all=True) + exp = wxr.wtp.parse( + wxr.wtp.node_to_wikitext(contents), + # post_template_fn=head_post_template_fn, + expand_all=True, + ) rub, rest = extract_ruby(wxr, exp.children) if rub: for r in rub: ruby.append(r) contents = rest - subtext = clean_node(wxr, sense_base, contents, - template_fn=usex_template_fn) - subtext = re.sub(r"\s*\(please add an English " - r"translation of this " - r"(example|usage example|quote)\)", - "", subtext).strip() + subtext = clean_node( + wxr, sense_base, contents, template_fn=usex_template_fn + ) + subtext = re.sub( + r"\s*\(please add an English " + r"translation of this " + r"(example|usage example|quote)\)", + "", + subtext, + ).strip() subtext = re.sub(r"\^\([^)]*\)", "", subtext) subtext = re.sub(r"\s*[―—]+$", "", subtext) # print("subtext:", repr(subtext)) @@ -3093,17 +3414,21 @@ def usex_template_fn(name, ht): # print(lines) lines = list(re.sub(r"^[#:*]*", "", x).strip() for x in lines) - lines = list(x for x in lines - if not re.match( - r"(Synonyms: |Antonyms: |Hyponyms: |" - r"Synonym: |Antonym: |Hyponym: |" - r"Hypernyms: |Derived terms: |" - r"Related terms: |" - r"Hypernym: |Derived term: |" - r"Coordinate terms:|" - r"Related term: |" - r"For more quotations using )", - x)) + lines = list( + x + for x in lines + if not re.match( + r"(Synonyms: |Antonyms: |Hyponyms: |" + r"Synonym: |Antonym: |Hyponym: |" + r"Hypernyms: |Derived terms: |" + r"Related terms: |" + r"Hypernym: |Derived term: |" + r"Coordinate terms:|" + r"Related term: |" + r"For more quotations using )", + x, + ) + ) tr = "" ref = "" roman = "" @@ -3112,26 +3437,28 @@ def usex_template_fn(name, ht): # print(classify_desc(line)) if len(lines) == 1 and lang_code != "en": parts = re.split(r"\s*[―—]+\s*", lines[0]) - if (len(parts) == 2 and - classify_desc(parts[1]) == "english"): + if len(parts) == 2 and classify_desc(parts[1]) == "english": lines = [parts[0].strip()] tr = parts[1].strip() - elif (len(parts) == 3 and - classify_desc(parts[1]) in ("romanization", - "english") and - classify_desc(parts[2]) == "english"): + elif ( + len(parts) == 3 + and classify_desc(parts[1]) + in ("romanization", "english") + and classify_desc(parts[2]) == "english" + ): lines = [parts[0].strip()] roman = parts[1].strip() tr = parts[2].strip() else: parts = re.split(r"\s+-\s+", lines[0]) - if (len(parts) == 2 and - classify_desc(parts[1]) == "english"): + if ( + len(parts) == 2 + and classify_desc(parts[1]) == "english" + ): lines = [parts[0].strip()] tr = parts[1].strip() elif len(lines) > 1: - if any(re.search(r"[]\d:)]\s*$", x) - for x in lines[:-1]): + if any(re.search(r"[]\d:)]\s*$", x) for x in lines[:-1]): ref = [] for i in range(len(lines)): if re.match(r"^[#*]*:+(\s*$|\s+)", lines[i]): @@ -3140,13 +3467,17 @@ def usex_template_fn(name, ht): if re.search(r"[]\d:)]\s*$", lines[i]): break ref = " ".join(ref) - lines = lines[i + 1:] - if (lang_code != "en" and len(lines) >= 2 and - classify_desc(lines[-1]) == "english"): + lines = lines[i + 1 :] + if ( + lang_code != "en" + and len(lines) >= 2 + and classify_desc(lines[-1]) == "english" + ): i = len(lines) - 1 - while (i > 1 and - classify_desc(lines[i - 1]) - == "english"): + while ( + i > 1 + and classify_desc(lines[i - 1]) == "english" + ): i -= 1 tr = "\n".join(lines[i:]) lines = lines[:i] @@ -3155,8 +3486,7 @@ def usex_template_fn(name, ht): roman = lines[-1].strip() lines = lines[:-1] - elif 
(lang_code == "en" and - re.match(r"^[#*]*:+", lines[1])): + elif lang_code == "en" and re.match(r"^[#*]*:+", lines[1]): ref = lines[0] lines = lines[1:] elif lang_code != "en" and len(lines) == 2: @@ -3168,9 +3498,13 @@ def usex_template_fn(name, ht): elif cls1 == "english" and cls2 != "english": tr = lines[0] lines = [lines[1]] - elif (re.match(r"^[#*]*:+", lines[1]) and - classify_desc(re.sub(r"^[#*:]+\s*", "", - lines[1])) == "english"): + elif ( + re.match(r"^[#*]*:+", lines[1]) + and classify_desc( + re.sub(r"^[#*:]+\s*", "", lines[1]) + ) + == "english" + ): tr = re.sub(r"^[#*:]+\s*", "", lines[1]) lines = [lines[0]] elif cls1 == "english" and cls2 == "english": @@ -3179,20 +3513,27 @@ def usex_template_fn(name, ht): # non-English, as that seems more common. tr = lines[1] lines = [lines[0]] - elif (usex_type != "quotation" and - lang_code != "en" and - len(lines) == 3): + elif ( + usex_type != "quotation" + and lang_code != "en" + and len(lines) == 3 + ): cls1 = classify_desc(lines[0]) cls2 = classify_desc(lines[1]) cls3 = classify_desc(lines[2]) - if (cls3 == "english" and - cls2 in ["english", "romanization"] and - cls1 != "english"): + if ( + cls3 == "english" + and cls2 in ["english", "romanization"] + and cls1 != "english" + ): tr = lines[2].strip() roman = lines[1].strip() lines = [lines[0].strip()] - elif (usex_type == "quotation" and - lang_code != "en" and len(lines) > 2): + elif ( + usex_type == "quotation" + and lang_code != "en" + and len(lines) > 2 + ): # for x in lines: # print(" LINE: {}: {}" # .format(classify_desc(x), x)) @@ -3202,9 +3543,10 @@ def usex_template_fn(name, ht): cls1 = classify_desc(lines[-1]) if cls1 == "english": i = len(lines) - 1 - while (i > 1 and - classify_desc(lines[i - 1]) - == "english"): + while ( + i > 1 + and classify_desc(lines[i - 1]) == "english" + ): i -= 1 tr = "\n".join(lines[i:]) lines = lines[:i] @@ -3215,10 +3557,13 @@ def usex_template_fn(name, ht): tr = re.sub(r"[ \t\r]+", " ", tr).strip() tr = re.sub(r"\[\s*…\s*\]", "[…]", tr) ref = re.sub(r"^[#*:]+\s*", "", ref) - ref = re.sub(r", (volume |number |page )?“?" - r"\(please specify ([^)]|\(s\))*\)”?|" - ", text here$", - "", ref) + ref = re.sub( + r", (volume |number |page )?“?" + r"\(please specify ([^)]|\(s\))*\)”?|" + ", text here$", + "", + ref, + ) ref = re.sub(r"\[\s*…\s*\]", "[…]", ref) lines = list(re.sub(r"^[#*:]+\s*", "", x) for x in lines) subtext = "\n".join(x for x in lines if x) @@ -3226,30 +3571,41 @@ def usex_template_fn(name, ht): m = re.search(r"([.!?])\s+\(([^)]+)\)\s*$", subtext) if m and classify_desc(m.group(2)) == "english": tr = m.group(2) - subtext = subtext[:m.start()] + m.group(1) + subtext = subtext[: m.start()] + m.group(1) elif lines: parts = re.split(r"\s*[―—]+\s*", lines[0]) - if (len(parts) == 2 and - classify_desc(parts[1]) == "english"): + if ( + len(parts) == 2 + and classify_desc(parts[1]) == "english" + ): subtext = parts[0].strip() tr = parts[1].strip() - subtext = re.sub(r'^[“"`]([^“"`”\']*)[”"\']$', r"\1", - subtext) - subtext = re.sub(r"(please add an English translation of " - r"this (quote|usage example))", - "", subtext) - subtext = re.sub(r"\s*→New International Version " - "translation$", - "", subtext) # e.g. pis/Tok Pisin (Bible) + subtext = re.sub(r'^[“"`]([^“"`”\']*)[”"\']$', r"\1", subtext) + subtext = re.sub( + r"(please add an English translation of " + r"this (quote|usage example))", + "", + subtext, + ) + subtext = re.sub( + r"\s*→New International Version " "translation$", + "", + subtext, + ) # e.g. 
pis/Tok Pisin (Bible) subtext = re.sub(r"[ \t\r]+", " ", subtext).strip() subtext = re.sub(r"\[\s*…\s*\]", "[…]", subtext) note = None m = re.match(r"^\(([^)]*)\):\s+", subtext) - if (m is not None and lang_code != "en" and - (m.group(1).startswith("with ") or - classify_desc(m.group(1)) == "english")): + if ( + m is not None + and lang_code != "en" + and ( + m.group(1).startswith("with ") + or classify_desc(m.group(1)) == "english" + ) + ): note = m.group(1) - subtext = subtext[m.end():] + subtext = subtext[m.end() :] ref = re.sub(r"\s*\(→ISBN\)", "", ref) ref = re.sub(r",\s*→ISBN", "", ref) ref = ref.strip() @@ -3278,7 +3634,6 @@ def usex_template_fn(name, ht): return examples - # Main code of parse_language() # Process the section stack.append(language) @@ -3358,9 +3713,10 @@ def top_template_fn(name, ht): if arg.startswith("Q") or arg.startswith("Lexeme:L"): data_append(data, "wikidata", arg) return "" - wxr.wtp.debug("UNIMPLEMENTED top-level template: {} {}" - .format(name, ht), - sortid="page/2870") + wxr.wtp.debug( + "UNIMPLEMENTED top-level template: {} {}".format(name, ht), + sortid="page/2870", + ) return "" clean_node(wxr, None, [node], template_fn=top_template_fn) @@ -3373,9 +3729,9 @@ def fix_subtitle_hierarchy(wxr: WiktextractContext, text: str) -> str: # Known lowercase PoS names are in part_of_speech_map # Known lowercase linkage section names are in linkage_map - old = re.split(r"(?m)^(==+)[ \t]*([^= \t]([^=\n]|=[^=])*?)" - r"[ \t]*(==+)[ \t]*$", - text) + old = re.split( + r"(?m)^(==+)[ \t]*([^= \t]([^=\n]|=[^=])*?)" r"[ \t]*(==+)[ \t]*$", text + ) parts = [] npar = 4 # Number of parentheses in above expression @@ -3389,22 +3745,29 @@ def fix_subtitle_hierarchy(wxr: WiktextractContext, text: str) -> str: level = len(left) part = old[i + npar] if level != len(right): - wxr.wtp.debug("subtitle has unbalanced levels: " - "{!r} has {} on the left and {} on the right" - .format(title, left, right), - sortid="page/2904") + wxr.wtp.debug( + "subtitle has unbalanced levels: " + "{!r} has {} on the left and {} on the right".format( + title, left, right + ), + sortid="page/2904", + ) lc = title.lower() if name_to_code(title, "en") != "": if level > 2: - wxr.wtp.debug("subtitle has language name {} at level {}" - .format(title, level), - sortid="page/2911") + wxr.wtp.debug( + "subtitle has language name {} at level {}".format( + title, level + ), + sortid="page/2911", + ) level = 2 elif lc.startswith(tuple(wxr.config.OTHER_SUBTITLES["etymology"])): if level > 3: - wxr.wtp.debug("etymology section {} at level {}" - .format(title, level), - sortid="page/2917") + wxr.wtp.debug( + "etymology section {} at level {}".format(title, level), + sortid="page/2917", + ) level = 3 elif lc.startswith(tuple(wxr.config.OTHER_SUBTITLES["pronunciation"])): level = 3 @@ -3473,7 +3836,7 @@ def parse_page( text, pre_expand=True, additional_expand=ADDITIONAL_EXPAND_TEMPLATES, - do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES + do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES, ) # from wikitextprocessor.parser import print_tree # print("PAGE PARSE:", print_tree(tree)) @@ -3521,7 +3884,7 @@ def parse_page( if "lang" not in data: wxr.wtp.debug( "internal error -- no lang in data: {}".format(data), - sortid="page/3034" + sortid="page/3034", ) continue for k, v in top_data.items(): @@ -3552,16 +3915,26 @@ def parse_page( if not conjs: continue cpos = dt.get("pos") - if (pos == cpos or - (pos, cpos) in (("noun", "adj"), - ("noun", "name"), - ("name", "noun"), - ("name", "adj"), - ("adj", "noun"), - ("adj", "name")) 
or - (pos == "adj" and cpos == "verb" and - any("participle" in s.get("tags", ()) - for s in dt.get("senses", ())))): + if ( + pos == cpos + or (pos, cpos) + in ( + ("noun", "adj"), + ("noun", "name"), + ("name", "noun"), + ("name", "adj"), + ("adj", "noun"), + ("adj", "name"), + ) + or ( + pos == "adj" + and cpos == "verb" + and any( + "participle" in s.get("tags", ()) + for s in dt.get("senses", ()) + ) + ) + ): data["conjugation"] = list(conjs) # Copy list! break # Add topics from the last sense of a language to its other senses, @@ -3579,13 +3952,14 @@ def parse_page( for x in ret: if x["word"] != word: if word.startswith("Unsupported titles/"): - wxr.wtp.debug(f"UNSUPPORTED TITLE: '{word}' -> '{x['word']}'", - sortid="20231101/3578page.py" - ) + wxr.wtp.debug( + f"UNSUPPORTED TITLE: '{word}' -> '{x['word']}'", + sortid="20231101/3578page.py", + ) else: - wxr.wtp.debug(f"DIFFERENT ORIGINAL TITLE: '{word}' " - f"-> '{x['word']}'", - sortid="20231101/3582page.py" - ) + wxr.wtp.debug( + f"DIFFERENT ORIGINAL TITLE: '{word}' " f"-> '{x['word']}'", + sortid="20231101/3582page.py", + ) x["original_title"] = word return ret diff --git a/src/wiktextract/extractor/ruby.py b/src/wiktextract/extractor/ruby.py index 43e2ee38f..1a287758c 100644 --- a/src/wiktextract/extractor/ruby.py +++ b/src/wiktextract/extractor/ruby.py @@ -1,8 +1,12 @@ from typing import List, Optional, Tuple, Union from wikitextprocessor import NodeKind, WikiNode -from wikitextprocessor.parser import HTMLNode, LevelNode, TemplateNode - +from wikitextprocessor.parser import ( + GeneralNode, + HTMLNode, + LevelNode, + TemplateNode, +) from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext @@ -13,8 +17,9 @@ def parse_ruby( """Parse a HTML 'ruby' node for a kanji part and a furigana (ruby) part, and return a tuple containing those. Discard the rp-element's parentheses, we don't do anything with them.""" - ruby_nodes = [] - furi_nodes = [] + ruby_nodes: list[Union[str, WikiNode]] = [] + furi_nodes: list[Union[str, WikiNode]] = [] # furi_nodes is technically + # just list[WikiNode], but this appeases the type-checker for clean_node() for child in node.children: if ( not isinstance(child, WikiNode) @@ -31,14 +36,14 @@ def parse_ruby( # element with an empty something (apparently, seeing as how this # works), leaving no trace of the broken ruby element in the final # HTML source of the page! 
- return + return None return ruby_kanji, furigana def extract_ruby( wxr: WiktextractContext, - contents: Union[WikiNode, List[Union[WikiNode, str]]], -) -> Tuple[List[Tuple[str]], List[Union[WikiNode, str]]]: + contents: GeneralNode, +) -> tuple[list[tuple[str, str]], list[Union[WikiNode, str]]]: # If contents is a list, process each element separately extracted = [] new_contents = [] @@ -69,7 +74,7 @@ def extract_ruby( }: # Process args and children if kind != NodeKind.LINK: - new_node = LevelNode(new_node.loc) + new_node = LevelNode(kind, new_node.loc) new_args = [] for arg in contents.largs: e1, c1 = extract_ruby(wxr, arg) diff --git a/src/wiktextract/linkages.py b/src/wiktextract/linkages.py index 5efbbea42..0de6f427d 100644 --- a/src/wiktextract/linkages.py +++ b/src/wiktextract/linkages.py @@ -8,21 +8,33 @@ from wikitextprocessor import Wtp from typing import Dict, List, Union, Optional from .datautils import split_at_comma_semi, data_append -from .form_descriptions import (classify_desc, parse_head_final_tags, - parse_sense_qualifier, - head_final_bantu_langs, head_final_bantu_re, - head_final_other_langs, head_final_other_re, - head_final_numeric_langs, head_final_re) +from .form_descriptions import ( + classify_desc, + parse_head_final_tags, + parse_sense_qualifier, + head_final_bantu_langs, + head_final_bantu_re, + head_final_other_langs, + head_final_other_re, + head_final_numeric_langs, + head_final_re, +) from .tags import linkage_beginning_tags +from .type_utils import WordData # Linkage will be ignored if it matches this regexp before splitting linkage_pre_split_ignore_re = re.compile( - r"^(" + "|".join(re.escape(x) for x in [ - "For more variations, see ", - "Signal flag:", - "Semaphore:", - ]) + - r")") + r"^(" + + "|".join( + re.escape(x) + for x in [ + "For more variations, see ", + "Signal flag:", + "Semaphore:", + ] + ) + + r")" +) # Linkage will be ignored if it has one of these prefixes linkage_ignore_prefixes = [ @@ -63,31 +75,40 @@ # Linkage will be ignored if it matches this regexp linkage_ignore_re = re.compile( - r"^(" + "|".join(re.escape(x) for x in linkage_ignore_whole) + - r")$|^(" + "|".join(re.escape(x) for x in linkage_ignore_prefixes) + - r")|(" + "|".join(re.escape(x) for x in linkage_ignore_suffixes) + - r")$") + r"^(" + + "|".join(re.escape(x) for x in linkage_ignore_whole) + + r")$|^(" + + "|".join(re.escape(x) for x in linkage_ignore_prefixes) + + r")|(" + + "|".join(re.escape(x) for x in linkage_ignore_suffixes) + + r")$" +) # These prefixes will be removed from linkages, leaving the rest. This is # considered separately for each linkage in a list. 
linkage_remove_prefixes_re = re.compile( - r"^(" + - r"|".join(re.escape(x) for x in [ - ":", - "see Thesaurus:", - "See Thesaurus:", - "see also Thesaurus:", - "See also Thesaurus:", - "see also ", - "See also ", - "see ", - "See ", - "from ", - "abbreviation of ", - "ISO 639-1 code ", - "ISO 639-3 code ", - "Thesaurus:"]) + - ")") + r"^(" + + r"|".join( + re.escape(x) + for x in [ + ":", + "see Thesaurus:", + "See Thesaurus:", + "see also Thesaurus:", + "See also Thesaurus:", + "see also ", + "See also ", + "see ", + "See ", + "from ", + "abbreviation of ", + "ISO 639-1 code ", + "ISO 639-3 code ", + "Thesaurus:", + ] + ) + + ")" +) # When removing prefix from linkage, this dictionary can be used to map # the removed prefix to a space-separated list of tags to add @@ -101,17 +122,22 @@ r"(\s+on (Wikispecies|Wikimedia Commons|" r"[A-Z]\w+ Wiktionary|[A-Z]\w+ Wikipedia)\.?|" r"\s*[-–] Pre-reform orthography.*)" - r"$") + r"$" +) # Ignore linkage parenthesized sections that contain one of these strings linkage_paren_ignore_contains_re = re.compile( - r"\b(" + - "|".join(re.escape(x) for x in [ - "from Etymology", - "used as", - "usage notes", - ]) + - ")([, ]|$)") + r"\b(" + + "|".join( + re.escape(x) + for x in [ + "from Etymology", + "used as", + "usage notes", + ] + ) + + ")([, ]|$)" +) taxonomic_ending_map = { "superkingdoms": "superkingdom", @@ -133,7 +159,9 @@ taxonomic_ending_map[v] = v # Also add singular -> singular taxonomic_ending_re = re.compile( r"\s+[-‐‑‒–—]\s+({})$".format( - "|".join(re.escape(x) for x in taxonomic_ending_map))) + "|".join(re.escape(x) for x in taxonomic_ending_map) + ) +) # Exceptional splits for linkages. This can be used to fix particular linkages # that are not handled correctly by the default code. This can also be used @@ -146,10 +174,14 @@ # Truncate linkage word if it matches any of these strings linkage_truncate_re = re.compile( - "|".join(re.escape(x) for x in [ - " and its derived terms", - " UTF-16 0x214C", - ])) + "|".join( + re.escape(x) + for x in [ + " and its derived terms", + " UTF-16 0x214C", + ] + ) +) # Regexp for identifying special linkages containing lists of letters, digits, # or characters @@ -161,39 +193,47 @@ r" digits)(;|$)|" r"(^|; )(Letters using |Letters of the |" r"Variations of letter )|" - r"^(Hiragana|Katakana)$") + r"^(Hiragana|Katakana)$" +) # Matches an unicode character including any combining diacritics (even if # separate characters) -unicode_dc_re = re.compile(r"\w[{}]|.".format( - "".join(chr(x) for x in range(0, 0x110000) - if unicodedata.category(chr(x)) == "Mn"))) - - -def parse_linkage_item_text(wxr: Wtp, - word: str, - data: Dict[str, Union[list, str, dict]], - field: str, - item: str, - sense: Optional[str], - ruby: list, - pos_datas: list, - is_reconstruction: bool, - urls: Optional[List[str]] = None - ) -> Optional[str]: +unicode_dc_re = re.compile( + r"\w[{}]|.".format( + "".join( + chr(x) + for x in range(0, 0x110000) + if unicodedata.category(chr(x)) == "Mn" + ) + ) +) + + +def parse_linkage_item_text( + wxr: WiktextractContext, + word: str, + data: WordData, + field: str, + item: str, + sense: Optional[str], + ruby: list, + pos_datas: list, + is_reconstruction: bool, + urls: Optional[List[str]] = None, +) -> Optional[str]: """Parses a linkage item once it has been converted to a string. This may add one or more linkages to ``data`` under ``field``. 
This returns None or a string that contains thats that should be applied to additional linkages (commonly used in tables for Asian characters).""" assert isinstance(wxr, WiktextractContext) - assert isinstance(word, str) # Main word (derived from page title) + assert isinstance(word, str) # Main word (derived from page title) assert isinstance(data, dict) # Parsed linkages are stored here under field assert isinstance(field, str) # The field under which to store linkage - assert isinstance(item, str) # The string to parse + assert isinstance(item, str) # The string to parse assert sense is None or isinstance(sense, str) - assert isinstance(ruby, list) # Captured ruby (hiragana/katakana) or "" + assert isinstance(ruby, list) # Captured ruby (hiragana/katakana) or "" assert isinstance(pos_datas, list) # List of senses (containing "glosses") - assert urls is None or isinstance(urls, list) # Captured urls + assert urls is None or isinstance(urls, list) # Captured urls assert is_reconstruction in (True, False) item = item.replace("()", "") @@ -229,7 +269,7 @@ def parse_linkage_item_text(wxr: Wtp, # Replace occurrences of ~ in the item by the page title safetitle = wxr.wtp.title.replace("\\", "\\\\") - item = item.replace(" ~ ", " " + safetitle + " ") + item = item.replace(" ~ ", " " + safetitle + " ") item = re.sub(r"^~ ", safetitle + " ", item) item = re.sub(r" ~$", " " + safetitle, item) @@ -239,7 +279,7 @@ def parse_linkage_item_text(wxr: Wtp, m = re.search(taxonomic_ending_re, item) if m: base_english = taxonomic_ending_map[m.group(1)] - item = item[:m.start()] + item = item[: m.start()] # Some Korean and Japanese words use "word (romanized): english" pattern # Sometimes the parenthesized part contains comma-separated alt and roman. @@ -248,13 +288,17 @@ def parse_linkage_item_text(wxr: Wtp, rom = m.group(2) eng = m.group(3) rest = m.group(1) - if (classify_desc(rest, no_unknown_starts=True) == "other" and - classify_desc(eng, no_unknown_starts=True) == "english"): + if ( + classify_desc(rest, no_unknown_starts=True) == "other" + and classify_desc(eng, no_unknown_starts=True) == "english" + ): item = rest base_roman = rom lst = base_roman.split(", ") - if (len(lst) == 2 and - classify_desc(lst[0], no_unknown_starts=True) == "other"): + if ( + len(lst) == 2 + and classify_desc(lst[0], no_unknown_starts=True) == "other" + ): base_alt = lst[0] base_roman = lst[1] if base_english: @@ -265,9 +309,10 @@ def parse_linkage_item_text(wxr: Wtp, # Many words have tags or similar descriptions in the beginning # followed by a colon and one or more linkages (e.g., # panetella/Finnish) - m = (re.match(r"^\((([^():]|\([^()]*\))+)\): ([^:]*)$", item) or - re.match(r"^([a-zA-Z][-'a-zA-Z0-9 ]*" - r"(\([^()]+\)[-'a-zA-Z0-9 ]*)*): ([^:]*)$", item)) + m = re.match(r"^\((([^():]|\([^()]*\))+)\): ([^:]*)$", item) or re.match( + r"^([a-zA-Z][-'a-zA-Z0-9 ]*" r"(\([^()]+\)[-'a-zA-Z0-9 ]*)*): ([^:]*)$", + item, + ) if m: desc = m.group(1) rest = m.group(len(m.groups())) @@ -326,12 +371,22 @@ def parse_linkage_item_text(wxr: Wtp, e1 = wxr.wtp.page_exists(desc) e2 = wxr.wtp.page_exists(rest) if cls != "tags": - if (cls2 == "tags" or - (e1 and not e1) or - (e1 and e2 and cls2 == "english" and - cls in ("other", "romanization")) or - (not e1 and not e2 and cls2 == "english" and - cls in ("other", "romanization"))): + if ( + cls2 == "tags" + or (e1 and not e1) + or ( + e1 + and e2 + and cls2 == "english" + and cls in ("other", "romanization") + ) + or ( + not e1 + and not e2 + and cls2 == "english" + and cls in ("other", 
"romanization") + ) + ): desc, rest = rest, desc # Looks like swapped syntax cls = cls2 if re.search(linkage_paren_ignore_contains_re, desc): @@ -364,48 +419,56 @@ def parse_linkage_item_text(wxr: Wtp, d = pos_datas[idx] gl = "; ".join(d.get("glosses", ())) if not gl: - wxr.wtp.debug("parenthesized numeric linkage prefix, " - "but the referenced sense has no gloss: " - "{}".format(desc), - sortid="linkages/355") + wxr.wtp.debug( + "parenthesized numeric linkage prefix, " + "but the referenced sense has no gloss: " + "{}".format(desc), + sortid="linkages/355", + ) elif sense: sense += "; " + gl else: sense = gl item = rest else: - wxr.wtp.debug("parenthesized numeric linkage prefix, " - "but there is no sense with such index: {}" - .format(desc), - sortid="linkages/365") + wxr.wtp.debug( + "parenthesized numeric linkage prefix, " + "but there is no sense with such index: {}".format(desc), + sortid="linkages/365", + ) item = rest else: - wxr.wtp.debug("unrecognized linkage prefix: {} desc={} rest={} " - "cls={} cls2={} e1={} e2={}" - .format(item, desc, rest, cls, cls2, e1, e2), - sortid="linkages/371") + wxr.wtp.debug( + "unrecognized linkage prefix: {} desc={} rest={} " + "cls={} cls2={} e1={} e2={}".format( + item, desc, rest, cls, cls2, e1, e2 + ), + sortid="linkages/371", + ) item = rest base_sense = sense # Check for certain plural tag forms at end of items list, and apply # them to all items if found - m = re.search(r" [-‐‑‒–—―] (diminutives|Diminutives|letters|digits|" - r"characters|symbols|tetragrams|letter names|names|" - r"female names|male names|proper nouns|contractions|" - r"nonstandard spellings|verbs|prepositions|postpositions|" - r"interjections|Abbreviations|abbreviations|variants|" - r"ordinals|nouns|phrases|adjectives|adverbs|" - r"augmentatives|pejoratives|compound words|numerals|" - r"Tally marks|surnames|modern nonstandard spellings)$", - item) + m = re.search( + r" [-‐‑‒–—―] (diminutives|Diminutives|letters|digits|" + r"characters|symbols|tetragrams|letter names|names|" + r"female names|male names|proper nouns|contractions|" + r"nonstandard spellings|verbs|prepositions|postpositions|" + r"interjections|Abbreviations|abbreviations|variants|" + r"ordinals|nouns|phrases|adjectives|adverbs|" + r"augmentatives|pejoratives|compound words|numerals|" + r"Tally marks|surnames|modern nonstandard spellings)$", + item, + ) if m: suffix = m.group(1) if base_qualifier: base_qualifier += ", " + suffix else: base_qualifier = suffix - item = item[:m.start()] + item = item[: m.start()] # Certain linkage items have space-separated valus. 
These are # generated by, e.g., certain templates @@ -443,17 +506,29 @@ def parse_linkage_item_text(wxr: Wtp, # Item1 contains " or " item2 = re.sub(r"\s*\([^)]*\)", "", item1) item2 = re.sub(r"\s+", " ", item2) - if ((lang not in head_final_bantu_langs or - not re.search(head_final_bantu_re, item2)) and - (lang not in head_final_other_langs or - not re.search(head_final_other_re, item2)) and - (not re.search(head_final_re, item2) or - (item2[-1].isdigit() and - lang not in head_final_numeric_langs)) and - not re.search(r"\bor\b", wxr.wtp.title) and - all(wxr.wtp.title not in x.split(" or ") + if ( + ( + lang not in head_final_bantu_langs + or not re.search(head_final_bantu_re, item2) + ) + and ( + lang not in head_final_other_langs + or not re.search(head_final_other_re, item2) + ) + and ( + not re.search(head_final_re, item2) + or ( + item2[-1].isdigit() + and lang not in head_final_numeric_langs + ) + ) + and not re.search(r"\bor\b", wxr.wtp.title) + and all( + wxr.wtp.title not in x.split(" or ") for x in split_at_comma_semi(item2) - if " or " in x)): + if " or " in x + ) + ): # We can split this item. Split the non-cleaned version # that still has any intervening parenthesized parts. subitems.extend(split_at_comma_semi(item1, extra=[" or "])) @@ -482,7 +557,7 @@ def parse_linkage_item_text(wxr: Wtp, m = re.search(r"\s*\(“([^”]+)”\)", item1) if m: t = m.group(1) - item1 = (item1[:m.start()] + item1[m.end():]).strip() + item1 = (item1[: m.start()] + item1[m.end() :]).strip() cls = classify_desc(t) if cls == "tags": if qualifier: @@ -494,20 +569,27 @@ def parse_linkage_item_text(wxr: Wtp, # Some Korean words use "word (alt, oman, “english”) pattern # See 滿/Korean - m = re.match(r'([^(),;:]+) \(([^(),;:]+), ([^(),;:]+), ' - r'[“”"]([^”“"]+)[“”"]\)$', item1) - if (m and - classify_desc(m.group(1), no_unknown_starts=True) == "other" and - classify_desc(m.group(2), no_unknown_starts=True) == "other"): + m = re.match( + r"([^(),;:]+) \(([^(),;:]+), ([^(),;:]+), " + r'[“”"]([^”“"]+)[“”"]\)$', + item1, + ) + if ( + m + and classify_desc(m.group(1), no_unknown_starts=True) == "other" + and classify_desc(m.group(2), no_unknown_starts=True) == "other" + ): alt = m.group(2) roman = m.group(3) english = m.group(4) item1 = m.group(1) words = item1.split(" ") - if (len(words) > 1 and - words[0] in linkage_beginning_tags and - words[0] != wxr.wtp.title): + if ( + len(words) > 1 + and words[0] in linkage_beginning_tags + and words[0] != wxr.wtp.title + ): t = linkage_beginning_tags[words[0]] item1 = " ".join(words[1:]) if qualifier: @@ -543,8 +625,9 @@ def english_repl(m): # sometimes both at the beginning and at the end. # And sometimes even in the middle, as in e.g. # wife/English/Translations/Yiddish - while (not script_chars and - (not sense or not re.search(script_chars_re, sense))): + while not script_chars and ( + not sense or not re.search(script_chars_re, sense) + ): par = None nonfirst_par = False if par is None: @@ -552,16 +635,17 @@ def english_repl(m): m = re.match(r"\((([^()]|\([^()]*\))*)\):?\s*", item1) if m: par = m.group(1) - item1 = item1[m.end():] + item1 = item1[m.end() :] else: # Try to find a parenthesized part at the end or from the # middle. 
- m = re.search(r"\s+\((\d|\d\d|[^\d]([^()]|\([^()]*\))*)\)" - r"(\.$)?", - item1) + m = re.search( + r"\s+\((\d|\d\d|[^\d]([^()]|\([^()]*\))*)\)" r"(\.$)?", + item1, + ) if m: par = m.group(1) - item1 = item1[:m.start()] + item1[m.end():] + item1 = item1[: m.start()] + item1[m.end() :] nonfirst_par = True if not par: break @@ -588,7 +672,7 @@ def english_repl(m): qualifier = par[:idx] else: break - par = par[idx + 1:].strip() + par = par[idx + 1 :].strip() # Check for certain comma-separated tags combined # with English text at the beginning or end of a @@ -676,19 +760,22 @@ def english_repl(m): d = pos_datas[idx] gl = "; ".join(d.get("glosses", ())) if not gl: - wxr.wtp.debug("parenthesized number " - "but the referenced sense has no " - "gloss: {}".format(par), - sortid="linkages/665") + wxr.wtp.debug( + "parenthesized number " + "but the referenced sense has no " + "gloss: {}".format(par), + sortid="linkages/665", + ) elif sense: sense += "; " + gl else: sense = gl else: - wxr.wtp.debug("parenthesized number but there is " - "no sense with such index: {}" - .format(par), - sortid="linkages/674") + wxr.wtp.debug( + "parenthesized number but there is " + "no sense with such index: {}".format(par), + sortid="linkages/674", + ) else: if alt: alt += "; " + par @@ -706,8 +793,8 @@ def english_repl(m): # Remove certain prefixes from linkages m = re.match(linkage_remove_prefixes_re, item1) if m: - prefix = item1[:m.end()] - item1 = item1[m.end():] + prefix = item1[: m.end()] + item1 = item1[m.end() :] if prefix in linkage_remove_prefixes_tags: if qualifier: qualifier += ", " + linkage_remove_prefixes_tags[prefix] @@ -720,13 +807,13 @@ def english_repl(m): # Remove certain suffixes from linkages m = re.search(linkage_remove_suffixes_re, item1) if m: - item1 = item1[:m.start()] + item1 = item1[: m.start()] # Parse linkages with "value = english" syntax (e.g., # väittää/Finnish) idx = item1.find(" = ") if idx >= 0: - eng = item1[idx + 3:] + eng = item1[idx + 3 :] if classify_desc(eng, no_unknown_starts=True) == "english": english = eng item1 = item1[:idx] @@ -736,25 +823,25 @@ def english_repl(m): eng = item1[:idx] if classify_desc(eng, no_unknown_starts=True) == "english": english = eng - item1 = item1[idx + 3:] + item1 = item1[idx + 3 :] # Parse linkages with "value - english" syntax (e.g., # man/Faroese) m = re.search(r" [-‐‑‒–—―] ", item1) if m and "(" not in item1: - suffix = item1[m.end():] + suffix = item1[m.end() :] cls = classify_desc(suffix, no_unknown_starts=True) if cls == "english": # This case intentionally ignores old values from english # (otherwise taxonomic lists fail) english = suffix - item1 = item1[:m.start()] + item1 = item1[: m.start()] elif cls == "tags": if qualifier: qualifier += ", " + suffix else: qualifier = suffix - item1 = item1[:m.start()] + item1 = item1[: m.start()] # Parse certain tags at the end of the linked term (unless # we are in a letters list) @@ -768,7 +855,7 @@ def english_repl(m): m = re.search(linkage_truncate_re, item1) if m: # suffix = item1[m.start():] # Currently ignored - item1 = item1[:m.start()] + item1 = item1[: m.start()] if not item1: continue # Ignore empty link targets if item1 == word: @@ -794,9 +881,11 @@ def add(w, r): # split as this is used when we have a different number # of romanizations than written forms, and don't know # which is which. 
- if ((not w or "," not in w) and - (not r or "," not in r) and - not wxr.wtp.page_exists(w)): + if ( + (not w or "," not in w) + and (not r or "," not in r) + and not wxr.wtp.page_exists(w) + ): lst = w.split("/") if len(w) > 1 else [w] if len(lst) == 1: lst = w.split(" / ") @@ -811,9 +900,15 @@ def add(w, r): # Heuristically remove "." at the end of most linkages # (some linkage lists end in a period, but we also have # abbreviations that end with a period that should be kept) - if (w.endswith(".") and not wxr.wtp.page_exists(w) and - (wxr.wtp.page_exists(w[:-1]) or - (len(w) >= 5) and "." not in w[:-1])): + if ( + w.endswith(".") + and not wxr.wtp.page_exists(w) + and ( + wxr.wtp.page_exists(w[:-1]) + or (len(w) >= 5) + and "." not in w[:-1] + ) + ): w = w[:-1] # If we have roman but not alt and the word is ASCII, @@ -847,8 +942,9 @@ def add(w, r): if alt and alt.strip() != w: dt["alt"] = alt.strip() if urls: - dt["urls"] = [url.strip() for url in urls - if url and isinstance(url, str)] + dt["urls"] = [ + url.strip() for url in urls if url and isinstance(url, str) + ] dt["word"] = w for old in data.get(field, ()): if dt == old: @@ -870,9 +966,11 @@ def add(w, r): # print("lang={} v={} script_chars={} item1={!r}" # .format(wxr.wtp.section, v, script_chars, item1)) if v and script_chars: - if (len(item1.split()) > 1 or - len(list(re.finditer(unicode_dc_re, item1))) == 2 or - (len(subitems) > 10 and v in ("Hiragana", "Katakana"))): + if ( + len(item1.split()) > 1 + or len(list(re.finditer(unicode_dc_re, item1))) == 2 + or (len(subitems) > 10 and v in ("Hiragana", "Katakana")) + ): if v == qualifier: # if sense: # sense += "; " + qualifier @@ -881,9 +979,12 @@ def add(w, r): qualifier = None if re.search(r" (letters|digits|script)$", v): qualifier = v # Also parse as qualifier - elif re.search(r"Variations of letter |" - r"Letters using |" - r"Letters of the ", v): + elif re.search( + r"Variations of letter |" + r"Letters using |" + r"Letters of the ", + v, + ): qualifier = "letter" parts = item1.split(". ") extra = () @@ -892,23 +993,28 @@ def add(w, r): item1 = parts[0] # Handle multi-character names for chars in language's # alphabet, e.g., "Ny ny" in P/Hungarian. 
- if (len(subitems) > 20 and len(item1.split()) == 2 and - all(len(x) <= 3 for x in item1.split())): - parts = list(m.group(0) for m in - re.finditer(r"(\w[\u0300-\u036f]?)+|.", - item1) - if not m.group(0).isspace() and - m.group(0) not in ("(", ")")) + if ( + len(subitems) > 20 + and len(item1.split()) == 2 + and all(len(x) <= 3 for x in item1.split()) + ): + parts = list( + m.group(0) + for m in re.finditer(r"(\w[\u0300-\u036f]?)+|.", item1) + if not m.group(0).isspace() + and m.group(0) not in ("(", ")") + ) else: - parts = list(m.group(0) for m in - re.finditer(r".[\u0300-\u036f]?", - item1) - if not m.group(0).isspace() and - m.group(0) not in ("(", ")")) + parts = list( + m.group(0) + for m in re.finditer(r".[\u0300-\u036f]?", item1) + if not m.group(0).isspace() + and m.group(0) not in ("(", ")") + ) for e in extra: idx = e.find(":") if idx >= 0: - e = e[idx + 1:].strip() + e = e[idx + 1 :].strip() if e.endswith("."): e = e[:-1] parts.extend(e.split()) @@ -920,10 +1026,11 @@ def add(w, r): rparts = None if roman: - rparts = list(m.group(0) for m in - re.finditer(r".[\u0300-\u036f]", - roman) - if not m.group(0).isspace()) + rparts = list( + m.group(0) + for m in re.finditer(r".[\u0300-\u036f]", roman) + if not m.group(0).isspace() + ) if len(rparts) != len(parts): rparts = None if not rparts: diff --git a/src/wiktextract/page.py b/src/wiktextract/page.py index bf3a7733a..f39e197a7 100644 --- a/src/wiktextract/page.py +++ b/src/wiktextract/page.py @@ -8,7 +8,18 @@ from typing import Any, Callable, Optional, Union from mediawiki_langcodes import get_all_names, name_to_code -from wikitextprocessor import NodeKind, WikiNode +from wikitextprocessor import ( + NodeKind, + WikiNode, +) +from wikitextprocessor.core import ( + TemplateArgs, + TemplateFnCallable, + PostTemplateFnCallable, +) +from wikitextprocessor.parser import ( + GeneralNode, +) from wiktextract.wxr_context import WiktextractContext @@ -56,9 +67,9 @@ def is_panel_template(wxr: WiktextractContext, template_name: str) -> bool: def recursively_extract( - contents: Union[WikiNode, list[WikiNode]], + contents: Union[WikiNode, str, list[Union[str, WikiNode]]], fn: Callable[[Union[WikiNode, list[WikiNode]]], bool], -) -> tuple[list[WikiNode], list[WikiNode]]: +) -> tuple[list[Union[str, WikiNode]], list[Union[str, WikiNode]]]: """Recursively extracts elements from contents for which ``fn`` returns True. This returns two lists, the extracted elements and the remaining content (with the extracted elements removed at each level). 
Only @@ -311,9 +322,9 @@ def remove_duplicate_data(page_data: dict) -> None: def clean_node( wxr: WiktextractContext, sense_data: Optional[Any], - wikinode: Union[str, WikiNode, list[Union[str, WikiNode]]], - template_fn: Optional[Callable[[str, dict], str]] = None, - post_template_fn: Optional[Callable[[str, dict, str], str]] = None, + wikinode: GeneralNode, + template_fn: Optional[TemplateFnCallable] = None, + post_template_fn: Optional[PostTemplateFnCallable] = None, collect_links: bool = False, ) -> str: """ diff --git a/src/wiktextract/type_utils.py b/src/wiktextract/type_utils.py index 389b541e9..81a26f911 100644 --- a/src/wiktextract/type_utils.py +++ b/src/wiktextract/type_utils.py @@ -1,14 +1,170 @@ from typing import ( - Union, + Sequence, + TypedDict, ) -WordData = dict[str, Union[ - str, - int, - list[str], - list[list[str]], - "WordData", - list["WordData"] - ] - ] +class AltOf(TypedDict, total=False): + word: str + extra: str + + +class LinkageData(TypedDict, total=False): + alt: str + english: str + extra: str + qualifier: str + roman: str + ruby: list[Sequence[str]] + sense: str + source: str + tags: list[str] + taxonomic: str + topics: list[str] + urls: list[str] + word: str + + +class ExampleData(TypedDict, total=False): + english: str + note: str + ref: str + roman: str + ruby: list[Sequence[str]] + text: str + type: str + + +class FormOf(TypedDict, total=False): + word: str + extra: str + roman: str + + +LinkData = list[Sequence[str]] + + +class TemplateData(TypedDict, total=False): + args: dict[str, str] + expansion: str + name: str + + +class DescendantData(TypedDict, total=False): + depth: int + tags: list[str] + templates: TemplateData + text: str + + +class FormData(TypedDict, total=False): + form: str + head_nr: int + ipa: str + roman: str + ruby: list[Sequence[str]] + source: str + tags: list[str] + topics: list[str] + + +SoundData = TypedDict( + "SoundData", + { + "audio": str, + "audio-ipa": str, + "enpr": str, + "form": str, + "homophone": str, + "ipa": str, + "mp3_url": str, + "note": str, + "ogg_url": str, + "other": str, + "rhymes": str, + "tags": list[str], + "text": str, + "topics": list[str], + "zh-pron": str, + }, + total=False, +) + + +class TranslationData(TypedDict, total=False): + alt: str + code: str + english: str + lang: str + note: str + roman: str + sense: str + tags: list[str] + taxonomic: str + topics: list[str] + word: str + + +class SenseData(TypedDict, total=False): + alt_of: list[AltOf] + antonyms: list[LinkageData] + categories: list[str] + compound_of: list[AltOf] + coordinate_terms: list[LinkageData] + examples: list[ExampleData] + form_of: list[FormOf] + glosses: list[str] + head_nr: int + holonyms: list[LinkageData] + hypernyms: list[LinkageData] + hyponyms: list[LinkageData] + instances: list[LinkageData] + links: list[LinkData] + meronyms: list[LinkageData] + qualifier: str + raw_glosses: list[str] + related: list[LinkageData] + senseid: list[str] + synonyms: list[LinkageData] + tags: list[str] + topics: list[str] + wikidata: list[str] + wikipedia: list[str] + + +class WordData(TypedDict, total=False): + abbreviations: list[LinkageData] + alt_of: list[AltOf] + antonyms: list[LinkageData] + categories: list[str] + coordinate_terms: list[LinkageData] + derived: list[LinkageData] + descendants: list[DescendantData] + etymology_number: int + etymology_templates: list[TemplateData] + etymology_text: str + form_of: list[FormOf] + forms: list[FormData] + head_templates: list[TemplateData] + holonyms: list[LinkageData] + hyphenation: 
list[str]
+    hypernyms: list[LinkageData]
+    hyponyms: list[LinkageData]
+    inflection_templates: list[TemplateData]
+    instances: list[LinkageData]
+    lang: str
+    lang_code: str
+    meronyms: list[LinkageData]
+    original_title: str
+    pos: str
+    proverbs: list[LinkageData]
+    redirects: list[str]
+    related: list[LinkageData]
+    senses: list[SenseData]
+    sounds: list[SoundData]
+    synonyms: list[LinkageData]
+    translations: list[TranslationData]
+    troponyms: list[LinkageData]
+    wikidata: list[str]
+    wikipedia: list[str]
+    word: str
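
The type_utils.py hunk above replaces the old recursive WordData alias with explicit TypedDict definitions. As a quick illustration of what the new types buy, here is a minimal sketch (not part of the patch) of how an entry could be built so that a static checker validates the keys; the module path wiktextract.type_utils comes from the diff, while the example data is invented purely for illustration.

# Minimal sketch, assuming wiktextract.type_utils as defined in the diff.
from wiktextract.type_utils import LinkageData, SenseData, WordData


def make_entry() -> WordData:
    # Invented example content ("kissa" and its synonym), not from the patch.
    synonym: LinkageData = {"word": "katti", "tags": ["colloquial"]}
    sense: SenseData = {"glosses": ["cat"], "synonyms": [synonym]}
    entry: WordData = {
        "word": "kissa",
        "lang": "Finnish",
        "lang_code": "fi",
        "pos": "noun",
        "senses": [sense],
    }
    # Every TypedDict here uses total=False, so all keys are optional, but a
    # checker such as mypy still rejects unknown keys and wrong value types
    # (e.g. "glosses": "cat" instead of a list[str]).
    return entry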
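
The linkages.py hunks earlier in the patch are pure reformatting of existing heuristics, such as the "value = english" and "value - english" splitting used for items like väittää/Finnish and man/Faroese. The standalone sketch below (again, not part of the patch) restates that splitting logic in simplified form; classify_desc and the surrounding qualifier/tag handling from the real code are replaced here by a caller-supplied looks_english predicate, which is an assumption made only for this example.

import re
from typing import Callable, Optional


def split_linkage_gloss(
    item: str, looks_english: Callable[[str], bool]
) -> tuple[str, Optional[str]]:
    # Simplified rendering of the "term = english" / "term - english"
    # handling in parse_linkage_item_text; the real code also checks the
    # reverse "english = term" order and may treat the suffix as tags.
    idx = item.find(" = ")
    if idx >= 0 and looks_english(item[idx + 3 :]):
        return item[:idx], item[idx + 3 :]
    m = re.search(r" [-‐‑‒–—―] ", item)
    if m and "(" not in item and looks_english(item[m.end() :]):
        return item[: m.start()], item[m.end() :]
    return item, None


# Example: split_linkage_gloss("väittää = to claim", lambda s: True)
# returns ("väittää", "to claim").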