diff --git a/pyproject.toml b/pyproject.toml index 9b0bbe936..ef42ff744 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -82,3 +82,11 @@ select = [ "I", # isort "W", # pycodestyle warning ] + +[tool.mypy] +mypy_path = "typestubs" +python_version = "3.9" + +[[tool.mypy.overrides]] +module = "importlib_resources.*" +ignore_missing_imports = true diff --git a/src/wiktextract/categories.py b/src/wiktextract/categories.py index 52658d0cc..1cc9efe56 100644 --- a/src/wiktextract/categories.py +++ b/src/wiktextract/categories.py @@ -2,6 +2,13 @@ # # Copyright (c) 2021 Tatu Ylonen. See file LICENSE and https://ylonen.org +from wikitextprocessor.core import NamespaceDataEntry +from typing import ( + Any, + Optional, + TypedDict, + Union, +) from wiktextract.wxr_context import WiktextractContext from .page import clean_node @@ -65,16 +72,39 @@ return export """ -def extract_categories(wxr: WiktextractContext): +CategoryEntry = TypedDict( + "CategoryEntry", + { + "name": str, + "desc": str, + "clean_desc": str, + "children": list[str], + "sort": list[str], + }, + total=False, +) + +CategoryReturn = TypedDict( + "CategoryReturn", + { + "roots": list[str], + "nodes": dict[str, CategoryEntry], + }, + total=False, +) + +def extract_categories(wxr: WiktextractContext) -> CategoryReturn: """Extracts the category tree from Wiktionary.""" - module_ns = wxr.wtp.NAMESPACE_DATA.get("Module", {}) + module_ns: Optional[NamespaceDataEntry] = wxr.wtp.NAMESPACE_DATA.get( + "Module", None) + assert module_ns is not None module_ns_local_name = module_ns.get("name") module_ns_id = module_ns.get("id") wxr.wtp.add_page(f"{module_ns_local_name}:wiktextract cat tree", module_ns_id, LUA_CODE, model="Scribunto") wxr.wtp.start_page("Wiktextract category tree extraction") rawdata = wxr.wtp.expand("{{#invoke:wiktextract cat tree|main}}") - ht = {} + ht: dict[str, CategoryEntry] = {} for line in rawdata.split("\n"): if not line: continue @@ -97,7 +127,7 @@ def extract_categories(wxr: WiktextractContext): 
parent_name_lc = parent_name.lower() parent_sort = parts[i + 1] if parent_name_lc not in ht: - p = {"name": parent_name} + p: CategoryEntry = {"name": parent_name} ht[parent_name_lc] = p else: p = ht[parent_name_lc] @@ -109,10 +139,10 @@ def extract_categories(wxr: WiktextractContext): p["sort"] = [] p["sort"].append(parent_sort) - seen = set() - is_child = set() + seen: set[str] = set() + is_child: set[str] = set() - def recurse(name): + def recurse(name: str) -> None: if name in seen: return seen.add(name) @@ -125,8 +155,8 @@ def recurse(name): for child in v.get("children", ()): is_child.add(child.lower()) - notseen = set(x.lower() for x in ht.keys()) - seen - is_child - notseen = list(ht[x]["name"] for x in sorted(notseen)) + notseen_set = set(x.lower() for x in ht.keys()) - seen - is_child + notseen = list(ht[x]["name"] for x in sorted(notseen_set)) #if notseen: # print("NOT SEEN:", "; ".join(notseen)) @@ -137,7 +167,7 @@ def recurse(name): roots = ["Fundamental"] roots.extend(notseen) - ret = {"roots": roots, "nodes": ht} + ret: CategoryReturn = {"roots": roots, "nodes": ht} # import json # print(json.dumps(ret, sort_keys=True, indent=2)) return ret diff --git a/src/wiktextract/clean.py b/src/wiktextract/clean.py index b5c37ff62..6ba82173e 100644 --- a/src/wiktextract/clean.py +++ b/src/wiktextract/clean.py @@ -9,14 +9,20 @@ import re import html import unicodedata +from typing import ( + Callable, + Optional, + Union +) from wikitextprocessor.common import MAGIC_FIRST, MAGIC_LAST +from wikitextprocessor.core import NamespaceDataEntry from .wxr_context import WiktextractContext ###################################################################### # Cleaning values into plain text. 
###################################################################### -superscript_ht = { +superscript_ht: dict[str, str] = { "0": "⁰", "1": "¹", "2": "²", @@ -91,7 +97,7 @@ "∞": "\u2002᪲" # This is a KLUDGE } -subscript_ht = { +subscript_ht: dict[str, str] = { "0": "₀", "1": "₁", "2": "₂", @@ -131,7 +137,7 @@ "χ": "ᵪ", } -def to_superscript(text): +def to_superscript(text: str) -> str: "Converts text to superscript." if not text: return "" @@ -141,7 +147,7 @@ def to_superscript(text): return "^" + text return "^({})".format(text) -def to_subscript(text): +def to_subscript(text: str) -> str: """Converts text to subscript.""" if not text: return "" @@ -151,14 +157,14 @@ def to_subscript(text): return "_" + text return "_({})".format(text) -def to_chem(text): +def to_chem(text: str) -> str: """Converts text to chemical formula, making digits subscript.""" return "".join(to_subscript(x) if x.isdigit() else x for x in text) # Mapping from Latex names to Unicode characters/strings. This is the # default mapping (some cases are handled specially in the code). -math_map = { +math_map: dict[str, str] = { # XXX should probably change greek characters to non-slanted ones? 
"AC": "∿", "APLcomment": "⍝", @@ -912,7 +918,7 @@ def to_chem(text): "mathrm": "", } -mathcal_map = { +mathcal_map: dict[str, str] = { "A": "𝒜", "B": "ℬ", "C": "𝒞", @@ -967,7 +973,7 @@ def to_chem(text): "z": "𝓏", } -mathfrak_map = { +mathfrak_map: dict[str, str]= { "A": "𝔄", "B": "𝔅", "C": "ℭ", @@ -994,7 +1000,7 @@ def to_chem(text): "Z": "ℨ", } -mathbb_map = { +mathbb_map: dict[str, str] = { "A": "𝔸", "B": "𝔹", "C": "ℂ", @@ -1064,23 +1070,24 @@ def to_chem(text): "9": "𝟡", } -def mathcal_fn(text): +def mathcal_fn(text: str) -> str: return "".join(mathcal_map.get(x, x) for x in text) -def mathfrak_fn(text): +def mathfrak_fn(text: str) -> str: return "".join(mathfrak_map.get(x, x) for x in text) -def mathbb_fn(text): +def mathbb_fn(text: str) -> str: return "".join(mathbb_map.get(x, x) for x in text) -def to_math(text): +def to_math(text: str) -> str: """Converts a mathematical formula to ASCII.""" # print("to_math: {!r}".format(text)) - magic_vec = [] + magic_vec: list[str] = [] - def expand(text): + def expand(text: str) -> str: while True: orig = text + # formatting with {:c} converts input into character text = re.sub(r"[{:c}-{:c}]".format(MAGIC_FIRST, MAGIC_LAST), lambda m: magic_vec[ord(m.group(0)) - MAGIC_FIRST], text) @@ -1088,14 +1095,18 @@ def expand(text): break return text - def recurse(text): - def math_magic(text, left, right, fn): - regexp = r"{}([^{}{}]+){}".format( + def recurse(text: str) -> str: + def math_magic(text: str, + left: str, + right: str, + fn: Callable[[str], str] + ) -> str: + regexp_str = r"{}([^{}{}]+){}".format( re.escape(left), re.escape(left), re.escape(right), re.escape(right)) - regexp = re.compile(regexp) + regexp = re.compile(regexp_str) - def repl(m): + def repl(m: re.Match) -> str: magic = chr(MAGIC_FIRST + len(magic_vec)) t = fn(m.group(1)).strip() magic_vec.append(t) @@ -1108,8 +1119,8 @@ def repl(m): break return text - def expand_group(v): - fn = None + def expand_group(v: str) -> str: + fn: Optional[Callable[[str], 
str]] = None if re.match(r"\\mathcal\b", v): fn = mathcal_fn v = v[8:].strip() @@ -1181,7 +1192,7 @@ def expand_group(v): v = expand(v) return v - parts = [] + parts: list[str] = [] while True: orig = text text = math_magic(text, "{", "}", recurse) @@ -1223,7 +1234,7 @@ def expand_group(v): return text -def bold_follows(parts, i): +def bold_follows(parts: list[str], i: int) -> bool: """Checks if there is a bold (''') in parts after parts[i]. We allow intervening italics ('').""" parts = parts[i + 1:] @@ -1235,7 +1246,7 @@ def bold_follows(parts, i): return False -def remove_italic_and_bold(text): +def remove_italic_and_bold(text: str) -> str: """Based on token_iter in wikitextprocessor""" assert isinstance(text, str) lines = re.split(r"(\n+)", text) # Lines and separators @@ -1300,7 +1311,11 @@ def remove_italic_and_bold(text): new_text_parts = new_text_parts[:-1] # remove last \n return "".join(new_text_parts) -def clean_value(wxr, title, no_strip=False, no_html_strip=False): +def clean_value(wxr: WiktextractContext, + title: str, + no_strip=False, + no_html_strip=False +) -> str: """Cleans a title or value into a normal string. This should basically remove any Wikimedia formatting from it: HTML tags, templates, links, emphasis, etc. 
This will also merge multiple whitespaces into one @@ -1308,9 +1323,10 @@ def clean_value(wxr, title, no_strip=False, no_html_strip=False): assert isinstance(wxr, WiktextractContext) assert isinstance(title, str) - def repl_1(m): + def repl_1(m: re.Match) -> str: return clean_value(wxr, m.group(1), no_strip=True) - def repl_exturl(m): + + def repl_exturl(m: re.Match) -> str: args = re.split(r"\s+", m.group(1)) i = 0 while i < len(args) - 1: @@ -1318,33 +1334,33 @@ def repl_exturl(m): break i += 1 return " ".join(args[i:]) - def repl_link(m): + def repl_link(m: re.Match) -> str: if m.group(2) and m.group(2).lower() in ("file", "image"): return "" v = m.group(3).split("|") return clean_value(wxr, v[0], no_strip=True) - def repl_link_bars(m): + def repl_link_bars(m: re.Match) -> str: lnk = m.group(1) if re.match(r"(?si)(File|Image)\s*:", lnk): return "" return clean_value(wxr, m.group(4) or m.group(2) or "", no_strip=True) - def repl_1_sup(m): + def repl_1_sup(m: re.Match) -> str: return to_superscript(clean_value(wxr, m.group(1))) - def repl_1_sub(m): + def repl_1_sub(m: re.Match) -> str: return to_subscript(clean_value(wxr, m.group(1))) - def repl_1_chem(m): + def repl_1_chem(m: re.Match) -> str: return to_chem(clean_value(wxr, m.group(1))) - def repl_1_math(m): + def repl_1_math(m: re.Match) -> str: v = to_math(m.group(1)) # print("to_math:", ascii(v)) return v - def repl_1_syntaxhighlight(m): + def repl_1_syntaxhighlight(m: re.Match) -> str: # Content is preformatted return "\n" + m.group(1).strip() + "\n" @@ -1423,9 +1439,12 @@ def repl_1_syntaxhighlight(m): title = re.sub(r"\[//[^]\s]+\s+edit\s*\]", "", title) # Replace links by their text - category_ns_data = wxr.wtp.NAMESPACE_DATA.get("Category", {}) - category_ns_names = {category_ns_data.get("name")} | set( - category_ns_data.get("aliases") + category_ns_data: Optional[NamespaceDataEntry] + # XXX "Category" -> config variable for portability + category_ns_data = wxr.wtp.NAMESPACE_DATA.get("Category", None) + 
assert category_ns_data is not None + category_ns_names = {category_ns_data["name"]} | set( + category_ns_data["aliases"] ) category_names_pattern = rf"(?:{'|'.join(category_ns_names)})" while True: @@ -1489,7 +1508,10 @@ def repl_1_syntaxhighlight(m): return title -def clean_template_args(wxr, ht, no_strip=False): +def clean_template_args(wxr: WiktextractContext, + ht: dict[Union[int, str], str], # XXX -> "TemplateArgs" + no_strip=False +) -> dict[str, str]: """Cleans all values in a template argument dictionary and returns the cleaned dictionary.""" assert isinstance(wxr, WiktextractContext) diff --git a/src/wiktextract/config.py b/src/wiktextract/config.py index 801df8bf6..8652ac78e 100644 --- a/src/wiktextract/config.py +++ b/src/wiktextract/config.py @@ -6,15 +6,33 @@ import collections import json import sys -from typing import Callable, Optional +from typing import ( + Callable, + Iterable, + Optional, + TypedDict, + Union, +) -from wikitextprocessor.core import CollatedErrorReturnData +from wikitextprocessor.core import ErrorMessageData, CollatedErrorReturnData if sys.version_info < (3, 10): from importlib_resources import files else: from importlib.resources import files +SoundFileRedirects = dict[str, str] + +POSSubtitleData = TypedDict( + "POSSubtitleData", + { + "pos": str, + "debug": str, + "tags": list[str], + }, + total=False, +) + class WiktionaryConfig: """This class holds configuration data for Wiktionary parsing.""" @@ -54,19 +72,19 @@ class WiktionaryConfig: def __init__( self, - dump_file_lang_code="en", - capture_language_codes={"en", "mul"}, - capture_translations=True, - capture_pronunciation=True, - capture_linkages=True, - capture_compounds=True, - capture_redirects=True, - capture_examples=True, - capture_etymologies=True, - capture_inflections=True, - capture_descendants=True, - verbose=False, - expand_tables=False, + dump_file_lang_code: str = "en", + capture_language_codes: Optional[Iterable[str]] = {"en", "mul"}, + 
capture_translations = True, + capture_pronunciation = True, + capture_linkages = True, + capture_compounds = True, + capture_redirects = True, + capture_examples = True, + capture_etymologies = True, + capture_inflections = True, + capture_descendants = True, + verbose = False, + expand_tables = False, ): if capture_language_codes is not None: assert isinstance(capture_language_codes, (list, tuple, set)) @@ -101,13 +119,19 @@ def __init__( self.section_counts: dict[str, int] = collections.defaultdict(int) # Some fields related to errors # The word currently being processed. - self.word = None - self.errors = [] - self.warnings = [] - self.debugs = [] - self.redirects = {} + self.word: Optional[str] = None + self.errors: list[ErrorMessageData] = [] + self.warnings: list[ErrorMessageData] = [] + self.debugs: list[ErrorMessageData] = [] + self.redirects: SoundFileRedirects = {} self.data_folder = files("wiktextract") / "data" / dump_file_lang_code + self.POS_SUBTITLES: Optional[dict[str, POSSubtitleData]] = None + self.POS_TYPES: Optional[set[str]] = None + self.LINKAGE_SUBTITLES: Optional[dict[str, str]] = None + self.OTHER_SUBTITLES: Optional[dict[str, Union[str, list[str]]]] = None + # set the above four in the function below self.init_subtitles() + self.ZH_PRON_TAGS: Optional[list[str]] = None self.set_attr_from_json("ZH_PRON_TAGS", "zh_pron_tags.json") self.analyze_templates = True # find templates that need pre-expand self.extract_thesaurus_pages = True @@ -149,13 +173,14 @@ def set_attr_from_json( def init_subtitles(self) -> None: self.set_attr_from_json("LINKAGE_SUBTITLES", "linkage_subtitles.json") self.set_attr_from_json("POS_SUBTITLES", "pos_subtitles.json") - self.POS_TYPES = set(x["pos"] for x in self.POS_SUBTITLES.values()) - for k, v in self.POS_SUBTITLES.items(): - if "tags" in v: - assert isinstance(v["tags"], (list, tuple)) + if self.POS_SUBTITLES is not None: + self.POS_TYPES = set(x["pos"] for x in self.POS_SUBTITLES.values()) + for k, v in 
self.POS_SUBTITLES.items(): + if "tags" in v: + assert isinstance(v["tags"], (list, tuple)) self.set_attr_from_json("OTHER_SUBTITLES", "other_subtitles.json") - def load_edition_settings(self): + def load_edition_settings(self) -> None: file_path = self.data_folder / "config.json" if file_path.exists(): with file_path.open(encoding="utf-8") as f: