From 029c80da437ed2318ef27b1e122ec81cb48bc490 Mon Sep 17 00:00:00 2001
From: xxyzz
Date: Thu, 18 Jan 2024 10:08:09 +0800
Subject: [PATCH 1/2] Remove Optional types in German pydantic models

---
 pyproject.toml                                |   1 -
 src/wiktextract/extractor/de/example.py       |  13 +-
 src/wiktextract/extractor/de/gloss.py         |   1 -
 src/wiktextract/extractor/de/linkage.py       |   1 -
 src/wiktextract/extractor/de/models.py        | 165 ++++++++----------
 src/wiktextract/extractor/de/page.py          |   1 -
 src/wiktextract/extractor/de/pronunciation.py |   1 -
 src/wiktextract/extractor/de/translation.py   |   1 -
 tests/test_de_example.py                      |  23 +--
 tests/test_de_gloss.py                        |   4 +-
 tests/test_de_linkages.py                     |   1 -
 tests/test_de_page.py                         |   1 -
 tests/test_de_pronunciation.py                |   1 -
 tests/test_de_translation.py                  |   1 -
 tools/generate_schema.py                      |   4 +
 15 files changed, 94 insertions(+), 125 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index ef42ff744..181f303dd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -39,7 +39,6 @@ dependencies = [
 
 [project.optional-dependencies]
 dev = [
-    "black",
     "coverage[toml]",
     "mypy",
     "ruff",
diff --git a/src/wiktextract/extractor/de/example.py b/src/wiktextract/extractor/de/example.py
index 67b261e6d..63d8e9a51 100644
--- a/src/wiktextract/extractor/de/example.py
+++ b/src/wiktextract/extractor/de/example.py
@@ -2,8 +2,7 @@
 
 from wikitextprocessor import NodeKind, WikiNode
 from wikitextprocessor.parser import LevelNode
-
-from wiktextract.extractor.de.models import Example, Reference, WordEntry
+from wiktextract.extractor.de.models import Example, WordEntry
 from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid
 from wiktextract.page import clean_node
 from wiktextract.wxr_context import WiktextractContext
@@ -80,9 +79,7 @@ def extract_examples(
 def extract_reference(
     wxr: WiktextractContext, example_data: Example, ref_node: WikiNode
 ):
-    reference_data = Reference()
-
-    reference_data.raw_ref = clean_node(wxr, {}, ref_node.children)
+    example_data.raw_ref = clean_node(wxr, {}, ref_node.children)
 
     template_nodes = list(ref_node.find_child(NodeKind.TEMPLATE))
 
@@ -100,9 +97,9 @@
         for key, value in template_node.template_parameters.items():
             if isinstance(key, str):
                 key_english = REF_KEY_MAP.get(key.lower(), key.lower())
-                if key_english in reference_data.model_fields:
+                if key_english in example_data.model_fields:
                     setattr(
-                        reference_data, key_english, clean_node(wxr, {}, value)
+                        example_data, key_english, clean_node(wxr, {}, value)
                     )
                 else:
                     wxr.wtp.debug(
@@ -112,5 +109,3 @@
 
     # XXX: Treat other templates as well.
     # E.g. https://de.wiktionary.org/wiki/Vorlage:Ref-OWID
-
-    example_data.ref = reference_data
diff --git a/src/wiktextract/extractor/de/gloss.py b/src/wiktextract/extractor/de/gloss.py
index 1e9e5e2ad..7e5d67d91 100644
--- a/src/wiktextract/extractor/de/gloss.py
+++ b/src/wiktextract/extractor/de/gloss.py
@@ -3,7 +3,6 @@
 
 from wikitextprocessor import NodeKind, WikiNode
 from wikitextprocessor.parser import LevelNode
-
 from wiktextract.extractor.de.models import Sense, WordEntry
 from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid
 from wiktextract.page import clean_node
diff --git a/src/wiktextract/extractor/de/linkage.py b/src/wiktextract/extractor/de/linkage.py
index 36b4804e8..8434b5fdf 100644
--- a/src/wiktextract/extractor/de/linkage.py
+++ b/src/wiktextract/extractor/de/linkage.py
@@ -2,7 +2,6 @@
 
 from wikitextprocessor import NodeKind, WikiNode
 from wikitextprocessor.parser import LevelNode
-
 from wiktextract.extractor.de.models import Linkage, WordEntry
 from wiktextract.extractor.share import split_senseids
 from wiktextract.page import clean_node
diff --git a/src/wiktextract/extractor/de/models.py b/src/wiktextract/extractor/de/models.py
index f6ecaa6b5..a5ebfbcca 100644
--- a/src/wiktextract/extractor/de/models.py
+++ b/src/wiktextract/extractor/de/models.py
@@ -1,10 +1,13 @@
-from typing import Optional
-
 from pydantic import BaseModel, ConfigDict, Field
 
 
 class BaseModelWrap(BaseModel):
-    model_config = ConfigDict(validate_assignment=True, extra="forbid")
+    model_config = ConfigDict(
+        extra="forbid",
+        strict=True,
+        validate_assignment=True,
+        validate_default=True,
+    )
 
 
 class Linkage(BaseModelWrap):
@@ -12,22 +15,20 @@ class Linkage(BaseModelWrap):
 
 
 class Translation(BaseModelWrap):
-    sense: Optional[str] = Field(
-        default=None, description="A gloss of the sense being translated"
+    sense: str = Field(
+        default="", description="A gloss of the sense being translated"
     )
-    word: Optional[str] = Field(default=None, description="Translation term")
-    lang_code: Optional[str] = Field(
-        default=None,
+    word: str = Field(default="", description="Translation term")
+    lang_code: str = Field(
+        default="",
         description="Wiktionary language code of the translation term",
     )
-    lang: Optional[str] = Field(
-        default=None, description="Localized language name"
-    )
-    uncertain: Optional[bool] = Field(
+    lang: str = Field(default="", description="Localized language name")
+    uncertain: bool = Field(
         default=False, description="Translation marked as uncertain"
     )
-    roman: Optional[str] = Field(
-        default=None, description="Transliteration to Roman characters"
+    roman: str = Field(
+        default="", description="Transliteration to Roman characters"
     )
     # senseids: list[str] = Field(
     #     default=[],
@@ -38,63 +39,49 @@ class Translation(BaseModelWrap):
         description="Tags specifying the translated term, usually gender information",
     )
     notes: list[str] = Field(default=[], description="A list of notes")
-    roman: Optional[str] = Field(
-        default=None, description="Transliteration in roman characters"
+    roman: str = Field(
+        default="", description="Transliteration in roman characters"
     )
 
 
-class Reference(BaseModelWrap):
-    raw_ref: str = Field(default=None, description="Raw reference string")
-    url: Optional[str] = Field(
-        default=None, description="A web link. Not necessarily well-formated."
- ) - author: Optional[str] = Field(default=None, description="Author's name") - - title: Optional[str] = Field( - default=None, description="Title of the reference" - ) - title_complement: Optional[str] = Field( - default=None, description="Complement to the title" - ) - pages: Optional[str] = Field(default=None, description="Page numbers") - year: Optional[str] = Field(default=None, description="Year of publication") - publisher: Optional[str] = Field(default=None, description="Published by") - editor: Optional[str] = Field(default=None, description="Editor") - translator: Optional[str] = Field(default=None, description="Translator") - collection: Optional[str] = Field( - default=None, +class Example(BaseModelWrap): + text: str = Field(default="", description="Example usage sentence") + # translation: Optional[str] = Field( + # default=None, description="Spanish translation of the example sentence" + # ) + raw_ref: str = Field(default="", description="Raw reference string") + url: str = Field( + default="", description="A web link. Not necessarily well-formated." + ) + author: str = Field(default="", description="Author's name") + title: str = Field(default="", description="Title of the reference") + title_complement: str = Field( + default="", description="Complement to the title" + ) + pages: str = Field(default="", description="Page numbers") + year: str = Field(default="", description="Year of publication") + publisher: str = Field(default="", description="Published by") + editor: str = Field(default="", description="Editor") + translator: str = Field(default="", description="Translator") + collection: str = Field( + default="", description="Name of collection that reference was published in", ) - volume: Optional[str] = Field(default=None, description="Volume number") - comment: Optional[str] = Field( - default=None, description="Comment on the reference" - ) - day: Optional[str] = Field(default=None, description="Day of publication") - month: Optional[str] = Field( - default=None, description="Month of publication" - ) - accessdate: Optional[str] = Field( - default=None, description="Date of access of online reference" + volume: str = Field(default="", description="Volume number") + comment: str = Field(default="", description="Comment on the reference") + day: str = Field(default="", description="Day of publication") + month: str = Field(default="", description="Month of publication") + accessdate: str = Field( + default="", description="Date of access of online reference" ) - - date: Optional[str] = Field(default=None, description="Date of publication") - number: Optional[str] = Field(default=None, description="Issue number") + date: str = Field(default="", description="Date of publication") + number: str = Field(default="", description="Issue number") # journal: Optional[str] = Field(default=None, description="Name of journal") # chapter: Optional[str] = Field(default=None, description="Chapter name") - place: Optional[str] = Field( - default=None, description="Place of publication" - ) + place: str = Field(default="", description="Place of publication") # editor: Optional[str] = Field(default=None, description="Editor") - edition: Optional[str] = Field(default=None, description="Edition number") - isbn: Optional[str] = Field(default=None, description="ISBN number") - - -class Example(BaseModelWrap): - text: str = Field(default=None, description="Example usage sentence") - # translation: Optional[str] = Field( - # default=None, description="Spanish translation of the example 
sentence" - # ) - ref: Optional["Reference"] = Field(default=None, description="") + edition: str = Field(default="", description="Edition number") + isbn: str = Field(default="", description="ISBN number") class Sense(BaseModelWrap): @@ -120,19 +107,19 @@ class Sense(BaseModelWrap): # subsenses: list["Sense"] = Field( # default=[], description="List of subsenses" # ) - senseid: Optional[str] = Field( - default=None, description="Sense number used in Wiktionary" + senseid: str = Field( + default="", description="Sense number used in Wiktionary" ) - translations: Optional[list[Translation]] = [] - antonyms: Optional[list[Linkage]] = [] - derived: Optional[list[Linkage]] = [] - hyponyms: Optional[list[Linkage]] = [] - hypernyms: Optional[list[Linkage]] = [] - holonyms: Optional[list[Linkage]] = [] - expressions: Optional[list[Linkage]] = [] - coordinate_terms: Optional[list[Linkage]] = [] - proverbs: Optional[list[Linkage]] = [] - synonyms: Optional[list[Linkage]] = [] + translations: list[Translation] = [] + antonyms: list[Linkage] = [] + derived: list[Linkage] = [] + hyponyms: list[Linkage] = [] + hypernyms: list[Linkage] = [] + holonyms: list[Linkage] = [] + expressions: list[Linkage] = [] + coordinate_terms: list[Linkage] = [] + proverbs: list[Linkage] = [] + synonyms: list[Linkage] = [] class Sound(BaseModelWrap): @@ -161,18 +148,18 @@ class Sound(BaseModelWrap): tags: list[str] = Field( default=[], description="Specifying the variant of the pronunciation" ) - pass class WordEntry(BaseModelWrap): """ - WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract. + WordEntry is a dictionary containing lexical information of a single word + extracted from Wiktionary with wiktextract. """ model_config = ConfigDict(title="German Wiktionary") word: str = Field(description="word string") - pos: str = Field(default=None, description="Part of speech type") + pos: str = Field(default="", description="Part of speech type") # pos_title: str = Field(default=None, description="Original POS title") lang_code: str = Field( description="Wiktionary language code", examples=["es"] @@ -180,19 +167,19 @@ class WordEntry(BaseModelWrap): lang: str = Field( description="Localized language name of the word", examples=["español"] ) - senses: Optional[list[Sense]] = [] + senses: list[Sense] = [] # categories: list[str] = Field( # default=[], # description="list of non-disambiguated categories for the word", # ) - translations: Optional[list[Translation]] = [] - sounds: Optional[list[Sound]] = [] - antonyms: Optional[list[Linkage]] = [] - derived: Optional[list[Linkage]] = [] - hyponyms: Optional[list[Linkage]] = [] - hypernyms: Optional[list[Linkage]] = [] - holonyms: Optional[list[Linkage]] = [] - expressions: Optional[list[Linkage]] = [] - coordinate_terms: Optional[list[Linkage]] = [] - proverbs: Optional[list[Linkage]] = [] - synonyms: Optional[list[Linkage]] = [] + translations: list[Translation] = [] + sounds: list[Sound] = [] + antonyms: list[Linkage] = [] + derived: list[Linkage] = [] + hyponyms: list[Linkage] = [] + hypernyms: list[Linkage] = [] + holonyms: list[Linkage] = [] + expressions: list[Linkage] = [] + coordinate_terms: list[Linkage] = [] + proverbs: list[Linkage] = [] + synonyms: list[Linkage] = [] diff --git a/src/wiktextract/extractor/de/page.py b/src/wiktextract/extractor/de/page.py index eda765405..2a6071112 100644 --- a/src/wiktextract/extractor/de/page.py +++ b/src/wiktextract/extractor/de/page.py @@ -5,7 +5,6 @@ from 
mediawiki_langcodes import name_to_code from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import LevelNode - from wiktextract.extractor.de.models import WordEntry from wiktextract.wxr_context import WiktextractContext diff --git a/src/wiktextract/extractor/de/pronunciation.py b/src/wiktextract/extractor/de/pronunciation.py index 43563fa29..7c3e09195 100644 --- a/src/wiktextract/extractor/de/pronunciation.py +++ b/src/wiktextract/extractor/de/pronunciation.py @@ -3,7 +3,6 @@ from mediawiki_langcodes import code_to_name from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import LevelNode - from wiktextract.extractor.de.models import Sound, WordEntry from wiktextract.extractor.share import create_audio_url_dict from wiktextract.page import clean_node diff --git a/src/wiktextract/extractor/de/translation.py b/src/wiktextract/extractor/de/translation.py index cf58373f7..e0710a25d 100644 --- a/src/wiktextract/extractor/de/translation.py +++ b/src/wiktextract/extractor/de/translation.py @@ -5,7 +5,6 @@ from mediawiki_langcodes import code_to_name from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import TemplateNode - from wiktextract.extractor.de.models import Translation, WordEntry from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext diff --git a/tests/test_de_example.py b/tests/test_de_example.py index a95667f0a..3ba757f8e 100644 --- a/tests/test_de_example.py +++ b/tests/test_de_example.py @@ -1,7 +1,6 @@ import unittest from wikitextprocessor import Wtp - from wiktextract.config import WiktionaryConfig from wiktextract.extractor.de.example import extract_examples, extract_reference from wiktextract.extractor.de.models import Example, Sense, WordEntry @@ -72,7 +71,7 @@ def test_de_extract_example_with_reference(self): "examples": [ { "text": "example1", - "ref": {"raw_ref": "ref1A"}, + "raw_ref": "ref1A", }, ], "senseid": "1", @@ -95,15 +94,13 @@ def test_de_extract_reference_from_literatur_template(self): self.assertEqual( example_data.model_dump(exclude_defaults=True), { - "ref": { - "raw_ref": "Expanded template, Seite 273. ISBN 978-3-89029-459-9.", - "title": "Viva Warszawa", - "author": "Steffen Möller", - "title_complement": "Polen für Fortgeschrittene", - "publisher": "Piper", - "place": "München/Berlin", - "year": "2015", - } + "raw_ref": "Expanded template, Seite 273. 
ISBN 978-3-89029-459-9.", + "title": "Viva Warszawa", + "author": "Steffen Möller", + "title_complement": "Polen für Fortgeschrittene", + "publisher": "Piper", + "place": "München/Berlin", + "year": "2015", }, ) @@ -124,8 +121,6 @@ def test_de_extract_reference_from_templates_without_named_args(self): self.assertEqual( example_data.model_dump(exclude_defaults=True), { - "ref": { - "raw_ref": "Expanded template", - } + "raw_ref": "Expanded template", }, ) diff --git a/tests/test_de_gloss.py b/tests/test_de_gloss.py index 6eeb264fa..39cdcee82 100644 --- a/tests/test_de_gloss.py +++ b/tests/test_de_gloss.py @@ -1,9 +1,7 @@ import unittest -from collections import defaultdict from unittest.mock import patch from wikitextprocessor import Wtp - from wiktextract.config import WiktionaryConfig from wiktextract.extractor.de.gloss import ( extract_glosses, @@ -265,7 +263,7 @@ def test_de_extract_tags_from_gloss_text(self): "input": "Drama von Samuel Beckett: Menschliche Existenz in der Endphase des Verfalls und der vergeblichen Suche nach einem Ausweg", "expected_tags": None, "expected_gloss": "Drama von Samuel Beckett: Menschliche Existenz in der Endphase des Verfalls und der vergeblichen Suche nach einem Ausweg", - } + }, # Add more test cases as needed ] for case in test_cases: diff --git a/tests/test_de_linkages.py b/tests/test_de_linkages.py index 7de216833..e74d92b4a 100644 --- a/tests/test_de_linkages.py +++ b/tests/test_de_linkages.py @@ -1,7 +1,6 @@ import unittest from wikitextprocessor import Wtp - from wiktextract.config import WiktionaryConfig from wiktextract.extractor.de.linkage import extract_linkages from wiktextract.extractor.de.models import Sense, WordEntry diff --git a/tests/test_de_page.py b/tests/test_de_page.py index 77e11e7a1..925214e5b 100644 --- a/tests/test_de_page.py +++ b/tests/test_de_page.py @@ -3,7 +3,6 @@ import unittest from wikitextprocessor import Wtp - from wiktextract.config import WiktionaryConfig from wiktextract.extractor.de.models import WordEntry from wiktextract.extractor.de.page import parse_page, parse_section diff --git a/tests/test_de_pronunciation.py b/tests/test_de_pronunciation.py index 19047dc8b..773d9b30d 100644 --- a/tests/test_de_pronunciation.py +++ b/tests/test_de_pronunciation.py @@ -1,7 +1,6 @@ import unittest from wikitextprocessor import Wtp - from wiktextract.config import WiktionaryConfig from wiktextract.extractor.de.models import Sound from wiktextract.extractor.de.pronunciation import ( diff --git a/tests/test_de_translation.py b/tests/test_de_translation.py index 21e204939..badd6f7ed 100644 --- a/tests/test_de_translation.py +++ b/tests/test_de_translation.py @@ -1,7 +1,6 @@ import unittest from wikitextprocessor import Wtp - from wiktextract.config import WiktionaryConfig from wiktextract.extractor.de.models import Sense, Translation, WordEntry from wiktextract.extractor.de.translation import ( diff --git a/tools/generate_schema.py b/tools/generate_schema.py index e224406f2..3fd0a084e 100644 --- a/tools/generate_schema.py +++ b/tools/generate_schema.py @@ -27,6 +27,10 @@ def main() -> None: model_schema[ "$schema" ] = "https://json-schema.org/draft/2020-12/schema" + if "description" in model_schema: + model_schema["description"] = model_schema[ + "description" + ].replace("\n", " ") with (output_path / f"{lang_code}.json").open( "w", encoding="utf-8" ) as f: From fb1f63b44e16334d0fa8269440c85479e7bbcf7c Mon Sep 17 00:00:00 2001 From: xxyzz Date: Thu, 18 Jan 2024 13:29:46 +0800 Subject: [PATCH 2/2] Remove Optional types in Spanish 
pydantic models --- src/wiktextract/extractor/es/example.py | 52 +++--- src/wiktextract/extractor/es/gloss.py | 3 +- src/wiktextract/extractor/es/models.py | 150 ++++++++---------- src/wiktextract/extractor/es/page.py | 7 +- src/wiktextract/extractor/es/pronunciation.py | 6 +- tests/test_es_etymology.py | 1 - tests/test_es_example.py | 28 ++-- tests/test_es_gloss.py | 1 - tests/test_es_page.py | 1 - tests/test_es_pronunciation.py | 1 - tests/test_es_translation.py | 1 - 11 files changed, 106 insertions(+), 145 deletions(-) diff --git a/src/wiktextract/extractor/es/example.py b/src/wiktextract/extractor/es/example.py index b9996d945..82f680884 100644 --- a/src/wiktextract/extractor/es/example.py +++ b/src/wiktextract/extractor/es/example.py @@ -1,9 +1,9 @@ import re -from typing import Optional, Tuple, Union +from typing import Optional, Union from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import WikiNodeChildrenList -from wiktextract.extractor.es.models import Example, Reference, Sense +from wiktextract.extractor.es.models import Example, Sense from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext @@ -24,7 +24,7 @@ def clean_text_and_url_from_text_nodes( wxr: WiktextractContext, nodes: WikiNodeChildrenList -) -> Tuple[str, Optional[str]]: +) -> tuple[str, Optional[str]]: if not nodes: return "", None @@ -45,7 +45,7 @@ def clean_text_and_url_from_text_nodes( return text, url -def add_template_params_to_reference( +def add_template_params_to_example( wxr: WiktextractContext, params: Optional[ dict[ @@ -53,15 +53,15 @@ def add_template_params_to_reference( Union[str, WikiNode, list[Union[str, WikiNode]]], ] ], - reference: Reference, + example: Example, ): for key in params.keys(): if isinstance(key, int): continue ref_key = EXAMPLE_TEMPLATE_KEY_MAPPING.get(key, key) - if ref_key in reference.model_fields: - setattr(reference, ref_key, clean_node(wxr, {}, params.get(key))) + if ref_key in example.model_fields: + setattr(example, ref_key, clean_node(wxr, {}, params.get(key))) else: wxr.wtp.debug( f"Unknown key {key} in example template {params}", @@ -73,7 +73,6 @@ def process_example_template( wxr: WiktextractContext, sense_data: Sense, template_node: WikiNode, - reference: Reference, ): params = template_node.template_parameters text_nodes = params.get(1) @@ -86,13 +85,13 @@ def process_example_template( example = Example(text=text) - if url: - example.ref = Reference(url=url) + if url is not None: + example.url = url if template_node.template_name == "ejemplo_y_trad": example.translation = clean_node(wxr, {}, params.get(2)) - add_template_params_to_reference(wxr, params, reference) + add_template_params_to_example(wxr, params, example) sense_data.examples.append(example) @@ -104,15 +103,15 @@ def extract_example( ): rest: WikiNodeChildrenList = [] - reference = Reference() for node in nodes: if isinstance(node, WikiNode) and node.kind == NodeKind.TEMPLATE: if node.template_name in ["ejemplo", "ejemplo_y_trad"]: - process_example_template(wxr, sense_data, node, reference) + process_example_template(wxr, sense_data, node) else: rest.append(node) elif isinstance(node, WikiNode) and node.kind == NodeKind.URL: - reference.url = clean_node(wxr, {}, node) + if len(sense_data.examples) > 0: + sense_data.examples[-1].url = clean_node(wxr, {}, node) else: rest.append(node) @@ -125,9 +124,6 @@ def extract_example( sortid="extractor/es/example/extract_example/87", ) - if sense_data.examples and 
reference.model_dump(exclude_defaults=True): - sense_data.examples[-1].ref = reference - def process_example_list( wxr: WiktextractContext, @@ -145,34 +141,28 @@ def process_example_list( text, url = clean_text_and_url_from_text_nodes(wxr, text_nodes) - if not text: + if len(text) == 0: continue example = Example(text=text) - if url: - example.ref = Reference(url=url) + if url is not None: + example.url = url for template_node in template_nodes: - reference = Reference() if template_node.template_name == "cita libro": - add_template_params_to_reference( - wxr, template_node.template_parameters, reference + add_template_params_to_example( + wxr, template_node.template_parameters, example ) - if reference.model_dump(exclude_defaults=True): - example.ref = reference sense_data.examples.append(example) # If no example was found in sublists, assume example is in list_item.children directly. if not sense_data.examples: text, url = clean_text_and_url_from_text_nodes(wxr, list_item.children) - text = re.sub(r"^(Ejemplos?:?)", "", text).strip() - - if not text: + if len(text) == 0: return example = Example(text=text) - if url: - example.ref = Reference(url=url) - + if url is not None: + example.url = url sense_data.examples.append(example) diff --git a/src/wiktextract/extractor/es/gloss.py b/src/wiktextract/extractor/es/gloss.py index b8bb567c4..a8dfb649f 100644 --- a/src/wiktextract/extractor/es/gloss.py +++ b/src/wiktextract/extractor/es/gloss.py @@ -2,7 +2,6 @@ from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import WikiNodeChildrenList - from wiktextract.extractor.es.models import Sense, WordEntry from wiktextract.extractor.es.sense_data import process_sense_data_list from wiktextract.page import clean_node @@ -15,7 +14,7 @@ def extract_gloss( list_node: WikiNode, ) -> None: for list_item in list_node.find_child(NodeKind.LIST_ITEM): - gloss_data = Sense(glosses=[]) + gloss_data = Sense() definition: WikiNodeChildrenList = [] other: WikiNodeChildrenList = [] diff --git a/src/wiktextract/extractor/es/models.py b/src/wiktextract/extractor/es/models.py index bc5104a37..afcb27954 100644 --- a/src/wiktextract/extractor/es/models.py +++ b/src/wiktextract/extractor/es/models.py @@ -1,17 +1,20 @@ -from typing import Optional - from pydantic import BaseModel, ConfigDict, Field class BaseModelWrap(BaseModel): - model_config = ConfigDict(validate_assignment=True, extra="forbid") + model_config = ConfigDict( + extra="forbid", + strict=True, + validate_assignment=True, + validate_default=True, + ) class Linkage(BaseModelWrap): word: str - note: Optional[str] = Field(default=None) - alternative_spelling: Optional[str] = Field( - default=None, description="Alternative spelling of the word" + note: str = "" + alternative_spelling: str = Field( + default="", description="Alternative spelling of the word" ) @@ -29,15 +32,15 @@ class Translation(BaseModelWrap): description="Tags specifying the translated term, usually gender information", ) notes: list[str] = Field(default=[], description="A list of notes") - roman: Optional[str] = Field( - default=None, description="Transliteration in roman characters" + roman: str = Field( + default="", description="Transliteration in roman characters" ) class EtymologyTemplate(BaseModelWrap): name: str = Field(default="", description="Template's name.") - args: Optional[dict[str, str]] = Field( - default=None, description="Arguments given to the template, if any." 
+ args: dict[str, str] = Field( + default={}, description="Arguments given to the template, if any." ) expansion: str = Field( default="", @@ -45,39 +48,28 @@ class EtymologyTemplate(BaseModelWrap): ) -class Reference(BaseModelWrap): - url: Optional[str] = Field(default=None, description="A web link") - first_name: Optional[str] = Field( - default=None, description="Author's first name" - ) - last_name: Optional[str] = Field( - default=None, description="Author's last name" - ) - title: Optional[str] = Field( - default=None, description="Title of the reference" - ) - pages: Optional[str] = Field(default=None, description="Page numbers") - year: Optional[str] = Field(default=None, description="Year of publication") - date: Optional[str] = Field(default=None, description="Date of publication") - journal: Optional[str] = Field(default=None, description="Name of journal") - chapter: Optional[str] = Field(default=None, description="Chapter name") - place: Optional[str] = Field( - default=None, description="Place of publication" - ) - editor: Optional[str] = Field(default=None, description="Editor") - - class Example(BaseModelWrap): text: str = Field(description="Example usage sentence") - translation: Optional[str] = Field( - default=None, description="Spanish translation of the example sentence" - ) - ref: Optional["Reference"] = Field(default=None, description="") + translation: str = Field( + default="", description="Spanish translation of the example sentence" + ) + url: str = Field(default="", description="A web link") + first_name: str = Field(default="", description="Author's first name") + last_name: str = Field(default="", description="Author's last name") + title: str = Field(default="", description="Title of the reference") + pages: str = Field(default="", description="Page numbers") + year: str = Field(default="", description="Year of publication") + date: str = Field(default="", description="Date of publication") + journal: str = Field(default="", description="Name of journal") + chapter: str = Field(default="", description="Chapter name") + place: str = Field(default="", description="Place of publication") + editor: str = Field(default="", description="Editor") class Sense(BaseModelWrap): glosses: list[str] = Field( - description="list of gloss strings for the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging." + default=[], + description="list of gloss strings for the word sense (usually only one). 
This has been cleaned, and should be straightforward text with no tagging.", ) tags: list[str] = Field( default=[], @@ -93,37 +85,35 @@ class Sense(BaseModelWrap): # subsenses: list["Sense"] = Field( # default=[], description="List of subsenses" # ) - senseid: Optional[str] = Field( - default=None, description="Sense number used in Wiktionary" + senseid: str = Field( + default="", description="Sense number used in Wiktionary" ) - antonyms: Optional[list[Linkage]] = [] - compounds: Optional[list[Linkage]] = [] - derived: Optional[list[Linkage]] = [] - hyponyms: Optional[list[Linkage]] = [] - hypernyms: Optional[list[Linkage]] = [] - idioms: Optional[list[Linkage]] = [] - meronyms: Optional[list[Linkage]] = [] - related: Optional[list[Linkage]] = [] - synonyms: Optional[list[Linkage]] = [] + antonyms: list[Linkage] = [] + compounds: list[Linkage] = [] + derived: list[Linkage] = [] + hyponyms: list[Linkage] = [] + hypernyms: list[Linkage] = [] + idioms: list[Linkage] = [] + meronyms: list[Linkage] = [] + related: list[Linkage] = [] + synonyms: list[Linkage] = [] class Spelling(BaseModelWrap): - alternative: Optional[str] = Field( - default=None, description="Alternative spelling with same pronunciation" + alternative: str = Field( + default="", description="Alternative spelling with same pronunciation" ) - note: Optional[str] = Field( - default=None, description="Note regarding alternative spelling" + note: str = Field( + default="", description="Note regarding alternative spelling" ) - same_pronunciation: Optional[bool] = Field( - default=None, + same_pronunciation: bool = Field( + default="", description="Whether the alternative spelling has the same pronunciation as the default spelling", ) class Sound(BaseModelWrap): - ipa: str = Field( - default="", description="International Phonetic Alphabet" - ) + ipa: str = Field(default="", description="International Phonetic Alphabet") phonetic_transcription: str = Field( default="", description="Phonetic transcription, less exact than IPA." ) @@ -135,9 +125,7 @@ class Sound(BaseModelWrap): roman: str = Field( default="", description="Translitaration to Roman characters" ) - syllabic: str = Field( - default="", description="Syllabic transcription" - ) + syllabic: str = Field(default="", description="Syllabic transcription") tags: list[str] = Field( default=[], description="Specifying the variant of the pronunciation" ) @@ -151,39 +139,39 @@ class WordEntry(BaseModelWrap): model_config = ConfigDict(title="Spanish Wiktionary") word: str = Field(description="word string") - pos: str = Field(default=None, description="Part of speech type") - pos_title: str = Field(default=None, description="Original POS title") + pos: str = Field(default="", description="Part of speech type") + pos_title: str = Field(default="", description="Original POS title") lang_code: str = Field( description="Wiktionary language code", examples=["es"] ) lang: str = Field( description="Localized language name of the word", examples=["español"] ) - senses: Optional[list[Sense]] = [] + senses: list[Sense] = [] categories: list[str] = Field( default=[], description="list of non-disambiguated categories for the word", ) - sounds: Optional[list[Sound]] = [] - spellings: Optional[list[Spelling]] = [] - translations: Optional[list[Translation]] = [] - etymology_text: Optional[str] = Field( - default=None, description="Etymology section as cleaned text." 
+ sounds: list[Sound] = [] + spellings: list[Spelling] = [] + translations: list[Translation] = [] + etymology_text: str = Field( + default="", description="Etymology section as cleaned text." ) - etymology_templates: Optional[list[EtymologyTemplate]] = Field( - default=None, + etymology_templates: list[EtymologyTemplate] = Field( + default=[], description="Templates and their arguments and expansions from the etymology section.", ) - etymology_number: Optional[int] = Field( - default=None, + etymology_number: int = Field( + default=0, description="For words with multiple numbered etymologies, this contains the number of the etymology under which this entry appeared.", ) - antonyms: Optional[list[Linkage]] = [] - compounds: Optional[list[Linkage]] = [] - derived: Optional[list[Linkage]] = [] - hyponyms: Optional[list[Linkage]] = [] - hypernyms: Optional[list[Linkage]] = [] - idioms: Optional[list[Linkage]] = [] - meronyms: Optional[list[Linkage]] = [] - related: Optional[list[Linkage]] = [] - synonyms: Optional[list[Linkage]] = [] + antonyms: list[Linkage] = [] + compounds: list[Linkage] = [] + derived: list[Linkage] = [] + hyponyms: list[Linkage] = [] + hypernyms: list[Linkage] = [] + idioms: list[Linkage] = [] + meronyms: list[Linkage] = [] + related: list[Linkage] = [] + synonyms: list[Linkage] = [] diff --git a/src/wiktextract/extractor/es/page.py b/src/wiktextract/extractor/es/page.py index 6602cc38a..4ee7872e5 100644 --- a/src/wiktextract/extractor/es/page.py +++ b/src/wiktextract/extractor/es/page.py @@ -4,7 +4,6 @@ from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import WikiNodeChildrenList - from wiktextract.extractor.es.etymology import process_etymology_block from wiktextract.extractor.es.example import extract_example from wiktextract.extractor.es.gloss import extract_gloss @@ -194,10 +193,8 @@ def process_pos_block( """ child_nodes = list(pos_level_node.filter_empty_str_child()) - - sense_children: WikiNodeChildrenList = ( - [] - ) # All non-gloss nodes that add additional information to a sense + # All non-gloss nodes that add additional information to a sense + sense_children: WikiNodeChildrenList = [] for child in child_nodes: if ( diff --git a/src/wiktextract/extractor/es/pronunciation.py b/src/wiktextract/extractor/es/pronunciation.py index 7d62eb945..64ede513c 100644 --- a/src/wiktextract/extractor/es/pronunciation.py +++ b/src/wiktextract/extractor/es/pronunciation.py @@ -126,8 +126,8 @@ def process_pron_graf_template( main_sound = variations[0] for key in main_sound.model_fields_set: - # because "tags" is a field that is never 'set' (just appended to) - # it apparently doesn't appear in mode_fields_set. + # because "tags" is a field that is never 'set' (just appended to) + # it apparently doesn't appear in mode_fields_set. 
for i, other_variation in variations.items(): if i == 0: continue @@ -139,7 +139,7 @@ def process_pron_graf_template( continue if not other_variation.tags: other_variation.tags = main_sound.tags.copy() - + for sound in variations.values(): if len(sound.model_dump(exclude_defaults=True)) > 0: sound_data.append(sound) diff --git a/tests/test_es_etymology.py b/tests/test_es_etymology.py index dbe69b371..7527f1438 100644 --- a/tests/test_es_etymology.py +++ b/tests/test_es_etymology.py @@ -1,7 +1,6 @@ import unittest from wikitextprocessor import Wtp - from wiktextract.config import WiktionaryConfig from wiktextract.extractor.es.etymology import process_etymology_block from wiktextract.extractor.es.models import WordEntry diff --git a/tests/test_es_example.py b/tests/test_es_example.py index e81a71d6c..eaf922b46 100644 --- a/tests/test_es_example.py +++ b/tests/test_es_example.py @@ -36,9 +36,7 @@ def test_es_extract_example(self): "expected": [ { "text": "Nos gusta lo oscuro, y por eso triunfa la Necroporra, sea ético o no", - "ref": { - "url": "https://www.menzig.es/a/necroporra-fantamorto-porra-famosos-muertos/" - }, + "url": "https://www.menzig.es/a/necroporra-fantamorto-porra-famosos-muertos/", } ], }, @@ -48,9 +46,7 @@ def test_es_extract_example(self): "expected": [ { "text": """Papel: más viejo que Matusalén, pero graduado "cum laude" en eficacia publicitaria""", - "ref": { - "url": "https://www.marketingdirecto.com/marketing-general/publicidad/papel-mas-viejo-matusalen-pero-graduado-cum-laude-eficacia-publicitaria" - }, + "url": "https://www.marketingdirecto.com/marketing-general/publicidad/papel-mas-viejo-matusalen-pero-graduado-cum-laude-eficacia-publicitaria", } ], }, @@ -60,14 +56,12 @@ def test_es_extract_example(self): "expected": [ { "text": "Era persona inteligente, culta, que me permitía zapotear los libros y me hacía comentarios sobre ellos y sus autores", - "ref": { - "title": "Memorias intelectuales", - "first_name": "Jaime", - "last_name": "Jaramillo Uribe", - "pages": "19", - "url": "https://books.google.com.co/books?id=X9MSAQAAIAAJ&q=zapotear", - "year": "2007", - }, + "title": "Memorias intelectuales", + "first_name": "Jaime", + "last_name": "Jaramillo Uribe", + "pages": "19", + "url": "https://books.google.com.co/books?id=X9MSAQAAIAAJ&q=zapotear", + "year": "2007", } ], }, @@ -119,10 +113,8 @@ def test_es_process_example_list(self): "expected": [ { "text": "«Apoyado contra el quicio de la puerta, adivina, de pronto, a su marido.»", - "ref": { - "first_name": "María Luisa", - "last_name": "Bombal", - }, + "first_name": "María Luisa", + "last_name": "Bombal", } ], }, diff --git a/tests/test_es_gloss.py b/tests/test_es_gloss.py index 5105821ad..6aff6cfe3 100644 --- a/tests/test_es_gloss.py +++ b/tests/test_es_gloss.py @@ -2,7 +2,6 @@ from typing import List from wikitextprocessor import Wtp - from wiktextract.config import WiktionaryConfig from wiktextract.extractor.es.gloss import extract_gloss from wiktextract.extractor.es.models import WordEntry diff --git a/tests/test_es_page.py b/tests/test_es_page.py index 9e98a043b..fb8125501 100644 --- a/tests/test_es_page.py +++ b/tests/test_es_page.py @@ -1,7 +1,6 @@ import unittest from wikitextprocessor import Wtp - from wiktextract.config import WiktionaryConfig from wiktextract.extractor.es.models import WordEntry from wiktextract.extractor.es.page import parse_entries diff --git a/tests/test_es_pronunciation.py b/tests/test_es_pronunciation.py index 20f5167ed..f12d622da 100644 --- a/tests/test_es_pronunciation.py +++ 
b/tests/test_es_pronunciation.py
@@ -1,7 +1,6 @@
 import unittest
 
 from wikitextprocessor import Wtp
-
 from wiktextract.config import WiktionaryConfig
 from wiktextract.extractor.es.models import WordEntry
 from wiktextract.extractor.es.pronunciation import (
diff --git a/tests/test_es_translation.py b/tests/test_es_translation.py
index a902e95dc..49c499824 100644
--- a/tests/test_es_translation.py
+++ b/tests/test_es_translation.py
@@ -1,7 +1,6 @@
 import unittest
 
 from wikitextprocessor import Wtp
-
 from wiktextract.config import WiktionaryConfig
 from wiktextract.extractor.es.models import WordEntry
 from wiktextract.extractor.es.translation import extract_translation
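
A minimal, hypothetical sketch (not part of either commit; the model name and
fields below are illustrative only, assuming pydantic v2) of the pattern both
patches apply: Optional[...] = None fields become plain str / list fields with
empty defaults, the shared BaseModelWrap config switches on strict validation,
and model_dump(exclude_defaults=True), as used in the tests above, still keeps
unset fields out of the extracted JSON.

from pydantic import BaseModel, ConfigDict, Field


class ExampleSketch(BaseModel):
    # mirrors BaseModelWrap in de/models.py and es/models.py after this change
    model_config = ConfigDict(
        extra="forbid",
        strict=True,
        validate_assignment=True,
        validate_default=True,
    )
    # empty-string / empty-list defaults replace Optional[...] = None
    text: str = Field(default="", description="Example usage sentence")
    raw_ref: str = Field(default="", description="Raw reference string")
    tags: list[str] = []


example = ExampleSketch(text="example1")
# Fields still equal to their defaults are dropped from the dump, so the
# output stays as sparse as it was with None defaults.
print(example.model_dump(exclude_defaults=True))  # {'text': 'example1'}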