From 003aab59d0e46eb186e0ac592dcd3b809ac5d73f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kristian=20J=C3=A4rventaus?= Date: Thu, 4 Jan 2024 09:10:27 +0200 Subject: [PATCH] Temp commit --- src/wiktextract/extractor/en/page.py | 11 ++--- src/wiktextract/type_utils.py | 60 +++++++++++++++++++++++----- 2 files changed, 57 insertions(+), 14 deletions(-) diff --git a/src/wiktextract/extractor/en/page.py b/src/wiktextract/extractor/en/page.py index 2a0a0408a..d7d15e60e 100644 --- a/src/wiktextract/extractor/en/page.py +++ b/src/wiktextract/extractor/en/page.py @@ -681,7 +681,7 @@ def parse_sense_linkage( w = clean_node(wxr, data, w) for alias in ns_title_prefix_tuple(wxr, "Thesaurus"): if w.startswith(alias): - w = w[len(alias):] + w = w[len(alias) :] break if not w: break @@ -795,8 +795,7 @@ def merge_base(data: WordData, base: WordData) -> None: sortid="page/904", ) - def complementary_pop(pron: WordData, key: str - ) -> WordData: + def complementary_pop(pron: WordData, key: str) -> WordData: """Remove unnecessary keys from dict values in a list comprehension...""" if key in pron: @@ -807,7 +806,9 @@ def complementary_pop(pron: WordData, key: str # does not match "word" or one of "forms" if "sounds" in data and "word" in data: accepted = [data["word"]] - accepted.extend(f["form"] for f in data.get("forms", dict())) # type:ignore + accepted.extend( + f["form"] for f in data.get("forms", dict()) # type:ignore + ) data["sounds"] = list( s for s in data["sounds"] # type:ignore @@ -822,7 +823,7 @@ def complementary_pop(pron: WordData, key: str if "pos" not in s or s["pos"] == data["pos"] # type:ignore ) - def push_sense(): + def push_sense() -> bool: """Starts collecting data for a new word sense. This returns True if a sense was added.""" nonlocal sense_data diff --git a/src/wiktextract/type_utils.py b/src/wiktextract/type_utils.py index 389b541e9..4f02fe8fb 100644 --- a/src/wiktextract/type_utils.py +++ b/src/wiktextract/type_utils.py @@ -1,14 +1,56 @@ from typing import ( + TypedDict, Union, ) -WordData = dict[str, Union[ - str, - int, - list[str], - list[list[str]], - "WordData", - list["WordData"] - ] - ] +# WordData = dict[str, Union[ +# str, +# int, +# list[str], +# list[list[str]], +# "WordData", +# list["WordData"] +# ] +# ] + +class WordData(TypedDict, total=False): + word: str + etymology_number: int + etymology_text: str + hyphenation: str + lang: str + lang_code: str + original_title: str + pos: str + source: str + + categories: list[str] + redirects: list[str] + topics: list[str] + wikidata: list[WikidataReference] + wikipedia: list[WikipediaReference] + + abbreviations: list[Abbreviation] + alt_of: list[Alt] + antonym: list[Antonym] + coordinate_terms: list[CoordinateTerm] + derived: list[Derived] + descendants: list[Descendant] + etymology_templates: list[EtymologyTemplate] + form_of: list[OriginalForm] + forms: list[Form] + head_templates: list[HeadTemplate] + holonyms: list[Holonym] + hypernyms: list[Hypernym] + hyponyms: list[Hyponym] + inflection_templates: list[InflectionTemplate] + instances: list[Instance] + meronyms: list[Meronym] + proverbs: list[Proverb] + related: list[Related] + senses: list[Sense] + sounds: list[Sound] + synonyms: list[Synonym] + translations: list[Translation] + troponyms: list[Troponym]