From 4c5503fd9aa907427b1d41e22be6e897a968fddf Mon Sep 17 00:00:00 2001 From: Empiriker <till.ueberfries@gmail.com> Date: Tue, 5 Dec 2023 15:53:36 +0100 Subject: [PATCH] Add pydantic and json_schema to German extractor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France. --- json_schema/de.json | 932 ++++++++++++++++++ src/wiktextract/extractor/de/example.py | 62 +- src/wiktextract/extractor/de/gloss.py | 59 +- src/wiktextract/extractor/de/linkage.py | 39 +- src/wiktextract/extractor/de/models.py | 196 ++++ src/wiktextract/extractor/de/page.py | 84 +- src/wiktextract/extractor/de/pronunciation.py | 78 +- src/wiktextract/extractor/de/translation.py | 72 +- tests/test_de_example.py | 92 +- tests/test_de_gloss.py | 151 +-- tests/test_de_linkages.py | 111 +-- tests/test_de_page.py | 36 +- tests/test_de_pronunciation.py | 46 +- tests/test_de_translation.py | 220 +++-- tests/test_desc.py | 1 + 15 files changed, 1707 insertions(+), 472 deletions(-) create mode 100644 json_schema/de.json create mode 100644 src/wiktextract/extractor/de/models.py diff --git a/json_schema/de.json b/json_schema/de.json new file mode 100644 index 00000000..a5d645d4 --- /dev/null +++ b/json_schema/de.json @@ -0,0 +1,932 @@ +{ + "$defs": { + "Example": { + "additionalProperties": false, + "properties": { + "ref": { + "anyOf": [ + { + "$ref": "#/$defs/Reference" + }, + { + "type": "null" + } + ], + "default": null, + "description": "" + }, + "text": { + "default": null, + "description": "Example usage sentence", + "title": "Text", + "type": "string" + } + }, + "title": "Example", + "type": "object" + }, + "Reference": { + "additionalProperties": false, + "properties": { + "accessdate": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Date of access of online reference", + "title": "Accessdate" + }, + "author": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Author's name", + "title": "Author" + }, + "collection": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Name of collection that reference was published in", + "title": "Collection" + }, + "comment": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Comment on the reference", + "title": "Comment" + }, + "date": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Date of publication", + "title": "Date" + }, + "day": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Day of publication", + "title": "Day" + }, + "edition": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Edition number", + "title": "Edition" + }, + "editor": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Editor", + "title": "Editor" + }, + "isbn": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "ISBN number", + "title": "Isbn" + }, + "month": { + "anyOf": [ + { + "type": 
"string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Month of publication", + "title": "Month" + }, + "number": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Issue number", + "title": "Number" + }, + "pages": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Page numbers", + "title": "Pages" + }, + "place": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Place of publication", + "title": "Place" + }, + "publisher": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Published by", + "title": "Publisher" + }, + "raw_ref": { + "default": null, + "description": "Raw reference string", + "title": "Raw Ref", + "type": "string" + }, + "title": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Title of the reference", + "title": "Title" + }, + "title_complement": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Complement to the title", + "title": "Title Complement" + }, + "translator": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Translator", + "title": "Translator" + }, + "url": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "A web link. Not necessarily well-formated.", + "title": "Url" + }, + "volume": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Volume number", + "title": "Volume" + }, + "year": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Year of publication", + "title": "Year" + } + }, + "title": "Reference", + "type": "object" + }, + "Sense": { + "additionalProperties": false, + "properties": { + "antonyms": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Antonyms" + }, + "categories": { + "default": [], + "description": "list of sense-disambiguated category names extracted from (a subset) of the Category links on the page", + "items": { + "type": "string" + }, + "title": "Categories", + "type": "array" + }, + "coordinate_terms": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Coordinate Terms" + }, + "derived": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Derived" + }, + "examples": { + "default": [], + "description": "List of examples", + "items": { + "$ref": "#/$defs/Example" + }, + "title": "Examples", + "type": "array" + }, + "expressions": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Expressions" + }, + "glosses": { + "default": [], + "description": "list of gloss strings for the word sense (usually only one). 
This has been cleaned, and should be straightforward text with no tagging.", + "items": { + "type": "string" + }, + "title": "Glosses", + "type": "array" + }, + "holonyms": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Holonyms" + }, + "hypernyms": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Hypernyms" + }, + "hyponyms": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Hyponyms" + }, + "proverbs": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Proverbs" + }, + "raw_glosses": { + "default": [], + "description": "list of uncleaned raw glosses for the word sense (usually only one).", + "items": { + "type": "string" + }, + "title": "Raw Glosses", + "type": "array" + }, + "senseid": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Sense number used in Wiktionary", + "title": "Senseid" + }, + "synonyms": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Synonyms" + }, + "tags": { + "default": [], + "description": "list of tags for the word sense", + "items": { + "type": "string" + }, + "title": "Tags", + "type": "array" + }, + "translations": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/Translation" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Translations" + } + }, + "title": "Sense", + "type": "object" + }, + "Sound": { + "additionalProperties": false, + "properties": { + "audio": { + "default": [], + "description": "Audio file name", + "items": { + "type": "string" + }, + "title": "Audio", + "type": "array" + }, + "flac_url": { + "default": [], + "items": { + "type": "string" + }, + "title": "Flac Url", + "type": "array" + }, + "ipa": { + "default": [], + "description": "International Phonetic Alphabet", + "items": { + "type": "string" + }, + "title": "Ipa", + "type": "array" + }, + "lang_code": { + "default": [], + "description": "Wiktionary language code", + "items": { + "type": "string" + }, + "title": "Lang Code", + "type": "array" + }, + "lang_name": { + "default": [], + "description": "Localized language name", + "items": { + "type": "string" + }, + "title": "Lang Name", + "type": "array" + }, + "mp3_url": { + "default": [], + "items": { + "type": "string" + }, + "title": "Mp3 Url", + "type": "array" + }, + "oga_url": { + "default": [], + "items": { + "type": "string" + }, + "title": "Oga Url", + "type": "array" + }, + "ogg_url": { + "default": [], + "items": { + "type": "string" + }, + "title": "Ogg Url", + "type": "array" + }, + "tags": { + "default": [], + "description": "Specifying the variant of the pronunciation", + "items": { + "type": "string" + }, + "title": "Tags", + "type": "array" + }, + "wav_url": { + "default": [], + "items": { + "type": "string" + }, + "title": "Wav Url", + "type": "array" + } + }, + "title": "Sound", + "type": "object" + }, + "Translation": { + "additionalProperties": false, + "properties": { + "lang_code": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + 
"description": "Wiktionary language code of the translation term", + "title": "Lang Code" + }, + "lang_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Localized language name", + "title": "Lang Name" + }, + "notes": { + "default": [], + "description": "A list of notes", + "items": { + "type": "string" + }, + "title": "Notes", + "type": "array" + }, + "roman": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Transliteration in roman characters", + "title": "Roman" + }, + "sense": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "A gloss of the sense being translated", + "title": "Sense" + }, + "tags": { + "default": [], + "description": "Tags specifying the translated term, usually gender information", + "items": { + "type": "string" + }, + "title": "Tags", + "type": "array" + }, + "uncertain": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "null" + } + ], + "default": false, + "description": "Translation marked as uncertain", + "title": "Uncertain" + }, + "word": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Translation term", + "title": "Word" + } + }, + "title": "Translation", + "type": "object" + } + }, + "$id": "https://kaikki.org/de.json", + "$schema": "https://json-schema.org/draft/2020-12/schema", + "additionalProperties": false, + "description": "WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract.", + "properties": { + "antonyms": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Antonyms" + }, + "coordinate_terms": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Coordinate Terms" + }, + "derived": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Derived" + }, + "expressions": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Expressions" + }, + "holonyms": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Holonyms" + }, + "hypernyms": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Hypernyms" + }, + "hyponyms": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Hyponyms" + }, + "lang_code": { + "description": "Wiktionary language code", + "examples": [ + "es" + ], + "title": "Lang Code", + "type": "string" + }, + "lang_name": { + "description": "Localized language name of the word", + "examples": [ + "español" + ], + "title": "Lang Name", + "type": "string" + }, + "pos": { + "default": null, + "description": "Part of speech type", + "title": "Pos", + "type": "string" + }, + "proverbs": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Proverbs" + }, + "senses": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/Sense" + }, + "type": "array" + }, + { + "type": 
"null" + } + ], + "default": [], + "title": "Senses" + }, + "sounds": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/Sound" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Sounds" + }, + "synonyms": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Synonyms" + }, + "translations": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/Translation" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Translations" + }, + "word": { + "description": "word string", + "title": "Word", + "type": "string" + } + }, + "required": [ + "word", + "lang_code", + "lang_name" + ], + "title": "German Wiktionary", + "type": "object" +} \ No newline at end of file diff --git a/src/wiktextract/extractor/de/example.py b/src/wiktextract/extractor/de/example.py index 9c7e247f..67b261e6 100644 --- a/src/wiktextract/extractor/de/example.py +++ b/src/wiktextract/extractor/de/example.py @@ -1,21 +1,48 @@ -from collections import defaultdict -from typing import Dict, List +import copy from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import LevelNode + +from wiktextract.extractor.de.models import Example, Reference, WordEntry from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext +REF_KEY_MAP = { + "autor": "author", + "a": "author", + "titel": "title", + "titelerg": "title_complement", + "auflage": "edition", + "verlag": "publisher", + "ort": "place", + "jahr": "year", + "seiten": "pages", + "isbn": "isbn", + "übersetzer": "translator", + "herausgeber": "editor", + "sammelwerk": "collection", + "werk": "collection", + "band": "volume", + "kommentar": "comment", + "online": "url", + "tag": "day", + "monat": "month", + "zugriff": "accessdate", + "nummer": "number", + "datum": "date", + "hrsg": "editor", +} + def extract_examples( wxr: WiktextractContext, - page_data: List[Dict], + word_entry: WordEntry, level_node: LevelNode, ) -> None: for list_node in level_node.find_child(NodeKind.LIST): for list_item_node in list_node.find_child(NodeKind.LIST_ITEM): - example_data = defaultdict(str) + example_data = Example() ref_nodes = find_and_remove_child( list_item_node, @@ -30,12 +57,12 @@ def extract_examples( senseid, example_text = match_senseid(example_text) if example_text: - example_data["text"] = example_text + example_data.text = example_text if senseid: - for sense in page_data[-1]["senses"]: - if sense["senseid"] == senseid: - sense["examples"].append(example_data) + for sense in word_entry.senses: + if sense.senseid == senseid: + sense.examples.append(copy.deepcopy(example_data)) else: if example_data: @@ -51,11 +78,11 @@ def extract_examples( def extract_reference( - wxr: WiktextractContext, example_data: Dict[str, str], ref_node: WikiNode + wxr: WiktextractContext, example_data: Example, ref_node: WikiNode ): - reference_data = defaultdict() + reference_data = Reference() - reference_data["raw_ref"] = clean_node(wxr, {}, ref_node.children) + reference_data.raw_ref = clean_node(wxr, {}, ref_node.children) template_nodes = list(ref_node.find_child(NodeKind.TEMPLATE)) @@ -72,9 +99,18 @@ def extract_reference( # https://de.wiktionary.org/wiki/Vorlage:Literatur for key, value in template_node.template_parameters.items(): if isinstance(key, str): - reference_data[key.lower()] = clean_node(wxr, {}, value) + 
key_english = REF_KEY_MAP.get(key.lower(), key.lower()) + if key_english in reference_data.model_fields: + setattr( + reference_data, key_english, clean_node(wxr, {}, value) + ) + else: + wxr.wtp.debug( + f"Unexpected key in reference: {key_english}", + sortid="extractor/de/examples/extract_examples/77", + ) # XXX: Treat other templates as well. # E.g. https://de.wiktionary.org/wiki/Vorlage:Ref-OWID - example_data["ref"] = reference_data + example_data.ref = reference_data diff --git a/src/wiktextract/extractor/de/gloss.py b/src/wiktextract/extractor/de/gloss.py index ad9183f3..53a11351 100644 --- a/src/wiktextract/extractor/de/gloss.py +++ b/src/wiktextract/extractor/de/gloss.py @@ -1,9 +1,10 @@ +import copy import re -from collections import defaultdict -from typing import Dict, List from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import LevelNode + +from wiktextract.extractor.de.models import Sense, WordEntry from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext @@ -11,25 +12,25 @@ def extract_glosses( wxr: WiktextractContext, - page_data: List[Dict], + word_entry: WordEntry, level_node: LevelNode, ) -> None: for list_node in level_node.find_child(NodeKind.LIST): - process_gloss_list_item(wxr, page_data, list_node) + process_gloss_list_item(wxr, word_entry, list_node) for non_list_node in level_node.invert_find_child(NodeKind.LIST): wxr.wtp.debug( - f"Found unexpected non-list node in pronunciation section: {non_list_node}", - sortid="extractor/de/pronunciation/extract_pronunciation/64", + f"Found unexpected non-list node in gloss section: {non_list_node}", + sortid="extractor/de/gloss/extract_gloss/24", ) def process_gloss_list_item( wxr: WiktextractContext, - page_data: List[Dict], + word_entry: WordEntry, list_node: WikiNode, parent_senseid: str = "", - parent_gloss_data: defaultdict(list) = None, + parent_gloss_data: Sense = None, ) -> None: for list_item_node in list_node.find_child(NodeKind.LIST_ITEM): item_type = list_item_node.sarg @@ -48,10 +49,10 @@ def process_gloss_list_item( ): continue - gloss_data = ( - defaultdict(list) + sense_data = ( + Sense() if parent_gloss_data is None - else parent_gloss_data.copy() + else copy.deepcopy(parent_gloss_data) ) # Extract sub-glosses for later processing @@ -60,11 +61,11 @@ def process_gloss_list_item( ) raw_gloss = clean_node(wxr, {}, list_item_node.children) - gloss_data["raw_glosses"] = [raw_gloss] + sense_data.raw_glosses = [raw_gloss] - process_K_template(wxr, gloss_data, list_item_node) + process_K_template(wxr, sense_data, list_item_node) - gloss_text = clean_node(wxr, gloss_data, list_item_node.children) + gloss_text = clean_node(wxr, sense_data, list_item_node.children) senseid, gloss_text = match_senseid(gloss_text) @@ -74,7 +75,7 @@ def process_gloss_list_item( if senseid[0].isnumeric() else parent_senseid + senseid ) - gloss_data["senseid"] = senseid + sense_data.senseid = senseid else: wxr.wtp.debug( f"Failed to extract sense number from gloss node: {list_item_node}", @@ -82,19 +83,19 @@ def process_gloss_list_item( ) # XXX: Extract tags from nodes instead using Italic and Template - gloss_text = extract_tags_from_gloss_text(gloss_data, gloss_text) + gloss_text = extract_tags_from_gloss_text(sense_data, gloss_text) if gloss_text or not sub_glosses_list_nodes: - gloss_data["glosses"] = [gloss_text] - page_data[-1]["senses"].append(gloss_data) + sense_data.glosses = 
[gloss_text] + word_entry.senses.append(sense_data) for sub_list_node in sub_glosses_list_nodes: process_gloss_list_item( wxr, - page_data, + word_entry, sub_list_node, senseid, - gloss_data if not gloss_text else None, + sense_data if not gloss_text else None, ) else: @@ -105,7 +106,7 @@ def process_gloss_list_item( continue -def handle_sense_modifier(wxr, list_item_node): +def handle_sense_modifier(wxr: WiktextractContext, list_item_node: WikiNode): wxr.wtp.debug( f"Skipped a sense modifier in gloss list: {list_item_node}", sortid="extractor/de/glosses/extract_glosses/19", @@ -117,14 +118,16 @@ def handle_sense_modifier(wxr, list_item_node): def process_K_template( wxr: WiktextractContext, - gloss_data: defaultdict(list), + sense_data: Sense, list_item_node: NodeKind.LIST_ITEM, ) -> None: for template_node in list_item_node.find_child(NodeKind.TEMPLATE): if template_node.template_name == "K": - text = clean_node(wxr, gloss_data, template_node).removesuffix(":") + categories = {"categories": []} + text = clean_node(wxr, categories, template_node).removesuffix(":") + sense_data.categories.extend(categories["categories"]) tags = re.split(r";|,", text) - gloss_data["tags"] = [t.strip() for t in tags] + sense_data.tags = [t.strip() for t in tags] # Prepositional and case information is sometimes only expanded to # category links and not present in cleaned node. We still want it @@ -133,7 +136,7 @@ def process_K_template( case = template_node.template_parameters.get("Kas") category = (prep if prep else "") + (" + " + case if case else "") if category: - gloss_data["tags"].append(category) + sense_data.tags.append(category) # XXX: Investigate better ways to handle free text in K template ft = template_node.template_parameters.get("ft") @@ -149,16 +152,14 @@ def process_K_template( ] -def extract_tags_from_gloss_text( - gloss_data: defaultdict(list), gloss_text: str -) -> None: +def extract_tags_from_gloss_text(sense_data: Sense, gloss_text: str) -> str: parts = gloss_text.split(":", 1) if len(parts) > 1: tags_part = parts[0].strip() categories = [c.strip() for c in re.split(",", tags_part)] if all(c.isalnum() for c in categories): - gloss_data["tags"].extend(categories) + sense_data.tags.extend(categories) return parts[1].strip() return gloss_text diff --git a/src/wiktextract/extractor/de/linkage.py b/src/wiktextract/extractor/de/linkage.py index 10de977e..d15ff47d 100644 --- a/src/wiktextract/extractor/de/linkage.py +++ b/src/wiktextract/extractor/de/linkage.py @@ -1,15 +1,16 @@ import re -from typing import Dict, List from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import LevelNode + +from wiktextract.extractor.de.models import WordEntry from wiktextract.extractor.de.utils import split_senseids from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext def extract_linkages( - wxr: WiktextractContext, page_data: List[Dict], level_node: LevelNode + wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode ): linkage_type = wxr.config.LINKAGE_SUBTITLES.get(level_node.largs[0][0]) for list_node in level_node.find_child(NodeKind.LIST): @@ -25,7 +26,7 @@ def extract_linkages( ) # Extract links - linkages = [] + linkages: list[str] = [] if linkage_type == "expressions": for child in list_item.children: if isinstance(child, str) and contains_dash(child): @@ -43,15 +44,33 @@ def extract_linkages( process_link(wxr, linkages, link) # Add links to the page data - if len(page_data[-1]["senses"]) == 1: - 
page_data[-1]["senses"][0][linkage_type].extend(linkages) + if len(word_entry.senses) == 1: + if linkage_type in word_entry.senses[0].model_fields: + getattr(word_entry.senses[0], linkage_type).extend(linkages) + else: + wxr.wtp.debug( + f"Linkage type {linkage_type} not in sense model fields", + sortid="extractor/de/linkages/extract_linkages/54}", + ) elif len(senseids) > 0: for senseid in senseids: - for sense in page_data[-1]["senses"]: - if sense["senseid"] == senseid: - sense[linkage_type].extend(linkages) + for sense in word_entry.senses: + if sense.senseid == senseid: + if linkage_type in sense.model_fields: + getattr(sense, linkage_type).extend(linkages) + else: + wxr.wtp.debug( + f"Linkage type {linkage_type} not in sense model fields", + sortid="extractor/de/linkages/extract_linkages/54}", + ) else: - page_data[-1][linkage_type].extend(linkages) + if linkage_type in word_entry.model_fields: + getattr(word_entry, linkage_type).extend(linkages) + else: + wxr.wtp.debug( + f"Linkage type {linkage_type} not in entry model fields", + sortid="extractor/de/linkages/extract_linkages/54}", + ) # Check for potentially missed data for non_link in list_item.invert_find_child(NodeKind.LINK): @@ -72,7 +91,7 @@ def extract_linkages( def process_link( - wxr: WiktextractContext, semantic_links: List[str], link: WikiNode + wxr: WiktextractContext, semantic_links: list[str], link: WikiNode ): clean_link = clean_node(wxr, {}, link) if clean_link.startswith("Verzeichnis:"): diff --git a/src/wiktextract/extractor/de/models.py b/src/wiktextract/extractor/de/models.py new file mode 100644 index 00000000..137442b3 --- /dev/null +++ b/src/wiktextract/extractor/de/models.py @@ -0,0 +1,196 @@ +from typing import Optional + +from pydantic import BaseModel, ConfigDict, Field + + +class BaseModelWrap(BaseModel): + model_config = ConfigDict(validate_assignment=True, extra="forbid") + + +class Translation(BaseModelWrap): + sense: Optional[str] = Field( + default=None, description="A gloss of the sense being translated" + ) + word: Optional[str] = Field(default=None, description="Translation term") + lang_code: Optional[str] = Field( + default=None, + description="Wiktionary language code of the translation term", + ) + lang_name: Optional[str] = Field( + default=None, description="Localized language name" + ) + uncertain: Optional[bool] = Field( + default=False, description="Translation marked as uncertain" + ) + roman: Optional[str] = Field( + default=None, description="Transliteration to Roman characters" + ) + # senseids: list[str] = Field( + # default=[], + # description="List of senseids where this translation applies", + # ) + tags: list[str] = Field( + default=[], + description="Tags specifying the translated term, usually gender information", + ) + notes: list[str] = Field(default=[], description="A list of notes") + roman: Optional[str] = Field( + default=None, description="Transliteration in roman characters" + ) + + +class Reference(BaseModelWrap): + raw_ref: str = Field(default=None, description="Raw reference string") + url: Optional[str] = Field( + default=None, description="A web link. Not necessarily well-formated." 
+ ) + author: Optional[str] = Field(default=None, description="Author's name") + + title: Optional[str] = Field( + default=None, description="Title of the reference" + ) + title_complement: Optional[str] = Field( + default=None, description="Complement to the title" + ) + pages: Optional[str] = Field(default=None, description="Page numbers") + year: Optional[str] = Field(default=None, description="Year of publication") + publisher: Optional[str] = Field(default=None, description="Published by") + editor: Optional[str] = Field(default=None, description="Editor") + translator: Optional[str] = Field(default=None, description="Translator") + collection: Optional[str] = Field( + default=None, + description="Name of collection that reference was published in", + ) + volume: Optional[str] = Field(default=None, description="Volume number") + comment: Optional[str] = Field( + default=None, description="Comment on the reference" + ) + day: Optional[str] = Field(default=None, description="Day of publication") + month: Optional[str] = Field( + default=None, description="Month of publication" + ) + accessdate: Optional[str] = Field( + default=None, description="Date of access of online reference" + ) + + date: Optional[str] = Field(default=None, description="Date of publication") + number: Optional[str] = Field(default=None, description="Issue number") + # journal: Optional[str] = Field(default=None, description="Name of journal") + # chapter: Optional[str] = Field(default=None, description="Chapter name") + place: Optional[str] = Field( + default=None, description="Place of publication" + ) + # editor: Optional[str] = Field(default=None, description="Editor") + edition: Optional[str] = Field(default=None, description="Edition number") + isbn: Optional[str] = Field(default=None, description="ISBN number") + + +class Example(BaseModelWrap): + text: str = Field(default=None, description="Example usage sentence") + # translation: Optional[str] = Field( + # default=None, description="German translation of the example sentence" + # ) + ref: Optional["Reference"] = Field(default=None, description="") + + +class Sense(BaseModelWrap): + glosses: list[str] = Field( + default=[], + description="list of gloss strings for the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging.", + ) + raw_glosses: list[str] = Field( + default=[], + description="list of uncleaned raw glosses for the word sense (usually only one).", + ) + tags: list[str] = Field( + default=[], + description="list of tags for the word sense", + ) + categories: list[str] = Field( + default=[], + description="list of sense-disambiguated category names extracted from (a subset) of the Category links on the page", + ) + examples: list["Example"] = Field( + default=[], description="List of examples" + ) + # subsenses: list["Sense"] = Field( + # default=[], description="List of subsenses" + # ) + senseid: Optional[str] = Field( + default=None, description="Sense number used in Wiktionary" + ) + translations: Optional[list[Translation]] = [] + antonyms: Optional[list[str]] = [] + derived: Optional[list[str]] = [] + hyponyms: Optional[list[str]] = [] + hypernyms: Optional[list[str]] = [] + holonyms: Optional[list[str]] = [] + expressions: Optional[list[str]] = [] + coordinate_terms: Optional[list[str]] = [] + proverbs: Optional[list[str]] = [] + synonyms: Optional[list[str]] = [] + + +class Sound(BaseModelWrap): + ipa: list[str] = Field( + default=[], description="International Phonetic Alphabet" + ) + # phonetic_transcription: list[str] = Field( + # default=[], description="Phonetic transcription, less exact than IPA." + # ) + audio: list[str] = Field(default=[], description="Audio file name") + wav_url: list[str] = Field(default=[]) + ogg_url: list[str] = Field(default=[]) + mp3_url: list[str] = Field(default=[]) + oga_url: list[str] = Field(default=[]) + flac_url: list[str] = Field(default=[]) + lang_code: list[str] = Field( + default=[], description="Wiktionary language code" + ) + lang_name: list[str] = Field( + default=[], description="Localized language name" + ) + # roman: list[str] = Field( + # default=[], description="Transliteration to Roman characters" + # ) + # syllabic: list[str] = Field( + # default=[], description="Syllabic transcription" + # ) + tags: list[str] = Field( + default=[], description="Specifying the variant of the pronunciation" + ) + + +class WordEntry(BaseModelWrap): + """ + WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract. 
+ """ + + model_config = ConfigDict(title="German Wiktionary") + + word: str = Field(description="word string") + pos: str = Field(default=None, description="Part of speech type") + # pos_title: str = Field(default=None, description="Original POS title") + lang_code: str = Field( + description="Wiktionary language code", examples=["es"] + ) + lang_name: str = Field( + description="Localized language name of the word", examples=["español"] + ) + senses: Optional[list[Sense]] = [] + # categories: list[str] = Field( + # default=[], + # description="list of non-disambiguated categories for the word", + # ) + translations: Optional[list[Translation]] = [] + sounds: Optional[list[Sound]] = [] + antonyms: Optional[list[str]] = [] + derived: Optional[list[str]] = [] + hyponyms: Optional[list[str]] = [] + hypernyms: Optional[list[str]] = [] + holonyms: Optional[list[str]] = [] + expressions: Optional[list[str]] = [] + coordinate_terms: Optional[list[str]] = [] + proverbs: Optional[list[str]] = [] + synonyms: Optional[list[str]] = [] diff --git a/src/wiktextract/extractor/de/page.py b/src/wiktextract/extractor/de/page.py index 8b276de2..52993e1c 100644 --- a/src/wiktextract/extractor/de/page.py +++ b/src/wiktextract/extractor/de/page.py @@ -1,12 +1,12 @@ import copy import logging -from collections import defaultdict -from typing import Dict, List, Union +from typing import Union from mediawiki_langcodes import name_to_code from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import LevelNode -from wiktextract.datautils import append_base_data + +from wiktextract.extractor.de.models import WordEntry from wiktextract.wxr_context import WiktextractContext from .example import extract_examples @@ -30,56 +30,70 @@ def parse_section( wxr: WiktextractContext, - page_data: List[Dict], - base_data: Dict, - level_node: Union[WikiNode, List[Union[WikiNode, str]]], + page_data: list[WordEntry], + base_data: WordEntry, + level_node_or_children: Union[WikiNode, list[Union[WikiNode, str]]], ) -> None: # Page structure: https://de.wiktionary.org/wiki/Hilfe:Formatvorlage - - if isinstance(level_node, list): - for x in level_node: + if isinstance(level_node_or_children, list): + for x in level_node_or_children: parse_section(wxr, page_data, base_data, x) return - elif not isinstance(level_node, WikiNode): - if not isinstance(level_node, str) or not level_node.strip() == "": + elif not isinstance(level_node_or_children, WikiNode): + if ( + not isinstance(level_node_or_children, str) + or not level_node_or_children.strip() == "" + ): wxr.wtp.debug( - f"Unexpected node type in parse_section: {level_node}", + f"Unexpected node type in parse_section: {level_node_or_children}", sortid="extractor/de/page/parse_section/31", ) return # Level 3 headings are used to start POS sections like # === {{Wortart|Verb|Deutsch}} === - elif level_node.kind == NodeKind.LEVEL3: - for template_node in level_node.find_content(NodeKind.TEMPLATE): + elif level_node_or_children.kind == NodeKind.LEVEL3: + for template_node in level_node_or_children.find_content( + NodeKind.TEMPLATE + ): # German Wiktionary uses a `Wortart` template to define the POS if template_node.template_name == "Wortart": process_pos_section( - wxr, page_data, base_data, level_node, template_node + wxr, + page_data, + base_data, + level_node_or_children, + template_node, ) return # Level 4 headings were introduced by overriding the default templates. # See overrides/de.json for details. 
- elif level_node.kind == NodeKind.LEVEL4: - section_name = level_node.largs[0][0] + elif level_node_or_children.kind == NodeKind.LEVEL4: + section_name = level_node_or_children.largs[0][0] wxr.wtp.start_subsection(section_name) + if not len(page_data) > 0: + wxr.wtp.debug( + f"Reached section without extracting some page data first: {level_node_or_children}", + sortid="extractor/de/page/parse_section/55", + ) + return if section_name == "Bedeutungen": - extract_glosses(wxr, page_data, level_node) + extract_glosses(wxr, page_data[-1], level_node_or_children) elif wxr.config.capture_pronunciation and section_name == "Aussprache": - extract_pronunciation(wxr, page_data, level_node) + extract_pronunciation(wxr, page_data[-1], level_node_or_children) elif wxr.config.capture_examples and section_name == "Beispiele": - extract_examples(wxr, page_data, level_node) + extract_examples(wxr, page_data[-1], level_node_or_children) elif ( wxr.config.capture_translations and section_name == "Übersetzungen" ): - extract_translation(wxr, page_data, level_node) + extract_translation(wxr, page_data[-1], level_node_or_children) elif ( wxr.config.capture_linkages and section_name in wxr.config.LINKAGE_SUBTITLES ): - extract_linkages(wxr, page_data, level_node) + extract_linkages(wxr, page_data[-1], level_node_or_children) FORM_POS = { @@ -103,8 +117,8 @@ def parse_section( def process_pos_section( wxr: WiktextractContext, - page_data: List[Dict], - base_data: Dict, + page_data: list[WordEntry], + base_data: WordEntry, level_node: LevelNode, pos_template_node: WikiNode, ) -> None: @@ -127,10 +141,10 @@ def process_pos_section( return pos = pos_type["pos"] - wxr.wtp.start_section(page_data[-1]["lang_code"] + "_" + pos) + base_data.pos = pos + page_data.append(copy.deepcopy(base_data)) - base_data["pos"] = pos - append_base_data(page_data, "pos", pos, base_data) + wxr.wtp.start_section(page_data[-1].lang_code + "_" + pos) # There might be other templates in the level node that define grammatical # features other than the POS. Extract them here. 
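# A minimal, self-contained sketch of the copy-and-append pattern that
# process_pos_section uses above: one entry per POS section is seeded from a
# deep copy of the language-level base data. "Entry" is a stand-in for the
# real WordEntry model, and the POS values are illustrative only.
import copy
from typing import Optional

from pydantic import BaseModel, ConfigDict


class Entry(BaseModel):
    model_config = ConfigDict(validate_assignment=True, extra="forbid")

    word: str
    lang_code: str
    pos: Optional[str] = None


base_data = Entry(word="Beispiel", lang_code="de")  # shared per-language data
page_data: list[Entry] = []
for pos in ("noun", "verb"):  # one new entry per POS section on the page
    entry = copy.deepcopy(base_data)  # deep copy so entries never share state
    entry.pos = pos  # checked by pydantic because validate_assignment=True
    page_data.append(entry)

assert [e.pos for e in page_data] == ["noun", "verb"]
# exclude_defaults=True drops unset fields, mirroring the final model_dump
# call in parse_page below.
assert page_data[0].model_dump(exclude_defaults=True) == {
    "word": "Beispiel",
    "lang_code": "de",
    "pos": "noun",
}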
@@ -239,7 +253,7 @@ def process_pos_section( def parse_page( wxr: WiktextractContext, page_title: str, page_text: str -) -> List[Dict[str, str]]: +) -> list[dict[str, any]]: if wxr.config.verbose: logging.info(f"Parsing page: {page_title}") @@ -254,7 +268,7 @@ def parse_page( additional_expand=ADDITIONAL_EXPAND_TEMPLATES, ) - page_data = [] + page_data: list[WordEntry] = [] for level2_node in tree.find_child(NodeKind.LEVEL2): for subtitle_template in level2_node.find_content(NodeKind.TEMPLATE): # The language sections are marked with @@ -275,15 +289,9 @@ def parse_page( ): continue - base_data = defaultdict( - list, - { - "lang": lang_name, - "lang_code": lang_code, - "word": wxr.wtp.title, - }, + base_data = WordEntry( + lang_name=lang_name, lang_code=lang_code, word=wxr.wtp.title ) - page_data.append(copy.deepcopy(base_data)) parse_section(wxr, page_data, base_data, level2_node.children) - return page_data + return [d.model_dump(exclude_defaults=True) for d in page_data] diff --git a/src/wiktextract/extractor/de/pronunciation.py b/src/wiktextract/extractor/de/pronunciation.py index 2fb63e6d..545d016f 100644 --- a/src/wiktextract/extractor/de/pronunciation.py +++ b/src/wiktextract/extractor/de/pronunciation.py @@ -1,9 +1,10 @@ -from collections import defaultdict -from typing import Dict, List, Union +from typing import Union from mediawiki_langcodes import code_to_name from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import LevelNode + +from wiktextract.extractor.de.models import Sound, WordEntry from wiktextract.extractor.share import create_audio_url_dict from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext @@ -11,11 +12,11 @@ def extract_pronunciation( wxr: WiktextractContext, - page_data: List[Dict], + word_entry: WordEntry, level_node: LevelNode, ): for list_node in level_node.find_child(NodeKind.LIST): - sound_data = [defaultdict(list)] + sound_data: list[Sound] = [Sound()] for not_list_item_node in list_node.invert_find_child( NodeKind.LIST_ITEM @@ -44,7 +45,7 @@ def extract_pronunciation( if head_template.template_name == "IPA": process_ipa(wxr, sound_data, rest) elif head_template.template_name == "Hörbeispiele": - sound_data.append(defaultdict(list)) + sound_data.append(Sound()) process_hoerbeispiele(wxr, sound_data, rest) elif head_template.template_name == "Reime": process_rhymes(wxr, sound_data, rest) @@ -55,9 +56,13 @@ def extract_pronunciation( ) # Remove empty entries - sound_data = [entry for entry in sound_data if entry != {}] + sound_data = [ + entry + for entry in sound_data + if entry.model_dump(exclude_defaults=True) != {} + ] if len(sound_data) > 0: - page_data[-1]["sounds"].extend(sound_data) + word_entry.sounds.extend(sound_data) for non_list_node in level_node.invert_find_child(NodeKind.LIST): wxr.wtp.debug( @@ -68,16 +73,16 @@ def extract_pronunciation( def process_ipa( wxr: WiktextractContext, - sound_data: List[Dict], - nodes: List[Union[WikiNode, str]], + sound_data: list[Sound], + nodes: list[Union[WikiNode, str]], ): for node in nodes: if is_template_node_with_name(node, "Lautschrift"): process_lautschrift_template(wxr, sound_data, node) elif is_tag_node(node): - append_tag(wxr, sound_data, node) + append_tag(wxr, sound_data[-1], node) elif is_new_sound_data_entry_sep(node): - sound_data.append(defaultdict(list)) + sound_data.append(Sound()) else: wxr.wtp.debug( f"Found unexpected non-Lautschrift node in IPA section: {node}", @@ -86,7 +91,7 @@ def process_ipa( def 
process_lautschrift_template( - wxr: WiktextractContext, sound_data: List[Dict], node + wxr: WiktextractContext, sound_data: list[Sound], node: WikiNode ): template_parameters = node.template_parameters @@ -94,29 +99,30 @@ def process_lautschrift_template( lang_code = template_parameters.get("spr") if lang_code: - language = code_to_name(lang_code, "de") + lang_name = code_to_name(lang_code, "de") add_sound_data_without_appending_to_existing_properties( + wxr, sound_data, { "ipa": [ipa], "lang_code": lang_code, - "language": language, + "lang_name": lang_name, }, ) else: - sound_data[-1]["ipa"].append(ipa) + sound_data[-1].ipa.append(ipa) def process_hoerbeispiele( - wxr: WiktextractContext, sound_data: List[Dict], nodes: List[WikiNode] + wxr: WiktextractContext, sound_data: list[Sound], nodes: list[WikiNode] ): for node in nodes: if is_template_node_with_name(node, "Audio"): process_audio_template(wxr, sound_data, node) elif is_tag_node(node): - append_tag(wxr, sound_data, node) + append_tag(wxr, sound_data[-1], node) elif is_new_sound_data_entry_sep(node): - sound_data.append(defaultdict(list)) + sound_data.append(Sound()) else: wxr.wtp.debug( f"Found unexpected node in Hoerbeispiele section: {node}", @@ -125,17 +131,17 @@ def process_hoerbeispiele( def process_audio_template( - wxr: WiktextractContext, sound_data: List[Dict], node + wxr: WiktextractContext, sound_data: list[Sound], node: WikiNode ): audio_file = node.template_parameters.get(1) if audio_file: add_sound_data_without_appending_to_existing_properties( - sound_data, create_audio_url_dict(audio_file) + wxr, sound_data, create_audio_url_dict(audio_file) ) def process_rhymes( - wxr: WiktextractContext, sound_data: List[Dict], nodes: List[WikiNode] + wxr: WiktextractContext, sound_data: list[Sound], nodes: list[WikiNode] ): # XXX: Extract rhymes from the referenced rhymes page pass @@ -150,18 +156,30 @@ def is_template_node_with_name(node: Union[WikiNode, str], template_name: str): def add_sound_data_without_appending_to_existing_properties( - sound_data: List[Dict], - new_sound_data: Dict, + wxr: WiktextractContext, + sound_data: list[Sound], + new_sound_data: dict, ): """Creates a new IPA data entry if properties exist in previous entry.""" - if any([key in sound_data[-1] for key in new_sound_data.keys()]): - sound_data.append(defaultdict(list)) + if any( + [ + key in sound_data[-1].model_dump(exclude_defaults=True) + for key in new_sound_data.keys() + ] + ): + sound_data.append(Sound()) for key, value in new_sound_data.items(): - if isinstance(value, str): - sound_data[-1][key] = value + if key in sound_data[-1].model_fields: + if isinstance(value, str): + getattr(sound_data[-1], key).append(value) + else: + getattr(sound_data[-1], key).extend(value) else: - sound_data[-1][key].extend(value) + wxr.wtp.debug( + f"Unexpected key {key} for Sound", + sortid="extractor/de/pronunciation/add_sound_data_without_appending_to_existing_properties/167", + ) def is_tag_node(node: Union[WikiNode, str]): @@ -171,10 +189,10 @@ ... ] -def append_tag(wxr: WiktextractContext, sound_data: Dict, node: WikiNode): +def append_tag(wxr: WiktextractContext, sound_data: Sound, node: WikiNode): tag = clean_node(wxr, {}, node).strip() if tag: - sound_data[-1]["tags"].append(tag) + sound_data.tags.append(tag) def is_new_sound_data_entry_sep(node: Union[WikiNode, str]): diff --git a/src/wiktextract/extractor/de/translation.py b/src/wiktextract/extractor/de/translation.py index 77e598e1..4f9c744b 100644 --- 
a/src/wiktextract/extractor/de/translation.py +++ b/src/wiktextract/extractor/de/translation.py @@ -1,16 +1,18 @@ +import copy import re -from collections import defaultdict -from typing import Dict, List, Union +from typing import Union from mediawiki_langcodes import code_to_name from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import TemplateNode + +from wiktextract.extractor.de.models import Translation, WordEntry from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext def extract_translation( - wxr: WiktextractContext, page_data: List[Dict], level_node: WikiNode + wxr: WiktextractContext, word_entry: WordEntry, level_node: WikiNode ) -> None: for level_node_child in level_node.filter_empty_str_child(): if not ( @@ -24,7 +26,7 @@ ) else: sense_translations = [] - base_translation_data = defaultdict(list) + base_translation_data = Translation() senseid = level_node_child.template_parameters.get(1) if senseid == None: # XXX: Sense-disambiguate where senseids are in Ü-Liste (ca. 0.03% of pages), e.g.: @@ -48,7 +50,7 @@ # """ continue - base_translation_data["sense"] = clean_node(wxr, {}, sense_text) + base_translation_data.sense = clean_node(wxr, {}, sense_text) translation_list = level_node_child.template_parameters.get( "Ü-Liste" ) @@ -69,9 +71,9 @@ matched_senseid = False if senseid: - for sense in page_data[-1]["senses"]: - if sense["senseid"] == senseid.strip(): - sense["translations"].extend(sense_translations) + for sense in word_entry.senses: + if sense.senseid == senseid.strip(): + sense.translations.extend(sense_translations) matched_senseid = True if not matched_senseid: @@ -79,14 +81,14 @@ f"Unknown senseid: {senseid}.", sortid="extractor/de/translation/extract_translation/65", ) - page_data[-1]["translations"].extend(sense_translations) + word_entry.translations.extend(sense_translations) def process_translation_list( wxr: WiktextractContext, - sense_translations: List[Dict], - base_translation_data: Dict[str, List], - translation_list: List[Union[WikiNode, str]], + sense_translations: list[Translation], + base_translation_data: Translation, + translation_list: list[Union[WikiNode, str]], ): modifiers = [] for node in translation_list: @@ -94,23 +96,23 @@ modifiers.append(node) else: - translation_data = base_translation_data.copy() + translation_data = copy.deepcopy(base_translation_data) process_modifiers( wxr, sense_translations, translation_data, modifiers ) lang_code = node.template_parameters.get(1) - translation_data["code"] = lang_code - translation_data["lang"] = code_to_name(lang_code, "de") - if translation_data["lang"] == "": + translation_data.lang_code = lang_code + translation_data.lang_name = code_to_name(lang_code, "de") + if translation_data.lang_name == "": wxr.wtp.debug( - f"Unknown language code: {translation_data['lang']}", + f"Unknown language code: {translation_data.lang_name}", sortid="extractor/de/translation/process_translation_list/70", ) if node.template_name[-1] == "?": - translation_data["uncertain"] = True + translation_data.uncertain = True - translation_data["word"] = clean_node( + translation_data.word = clean_node( wxr, {}, node.template_parameters.get(2) ) @@ -122,7 +124,7 @@ sense_translations.append(translation_data) # Process modifiers at the end of the list - process_modifiers(wxr, sense_translations, defaultdict, 
modifiers) + process_modifiers(wxr, sense_translations, Translation(), modifiers) def is_translation_template(node: any) -> bool: @@ -135,7 +137,7 @@ def is_translation_template(node: any) -> bool: def process_Ü_template( wxr: WiktextractContext, - translation_data: Dict[str, Union[str, List, bool]], + translation_data: Translation, template_node: TemplateNode, ): overwrite_word( @@ -145,19 +147,19 @@ def process_Ü_template( def process_Üt_template( wxr: WiktextractContext, - translation_data: Dict[str, Union[str, List, bool]], + translation_data: Translation, template_node: TemplateNode, ): transcription = template_node.template_parameters.get(3) if transcription: - translation_data["roman"] = clean_node(wxr, {}, transcription) + translation_data.roman = clean_node(wxr, {}, transcription) # Look for automatic transcription else: cleaned_node = clean_node(wxr, {}, template_node) match = re.search(r"\(([^)]+?)\^\☆\)", cleaned_node) if match: - translation_data["roman"] = match.group(1) + translation_data.roman = match.group(1) overwrite_word( wxr, translation_data, template_node.template_parameters.get(4) @@ -166,45 +168,47 @@ def overwrite_word( wxr: WiktextractContext, - translation_data: Dict[str, Union[str, List, bool]], - nodes: Union[List[Union[WikiNode, str]], WikiNode, str, None], + translation_data: Translation, + nodes: Union[list[Union[WikiNode, str]], WikiNode, str, None], ): if nodes == None: return overwrite_word = clean_node(wxr, {}, nodes).strip() if overwrite_word: - translation_data["word"] = overwrite_word + translation_data.word = overwrite_word def process_modifiers( wxr: WiktextractContext, - sense_translations: List[Dict], - translation_data: Dict[str, Union[str, List, bool]], + sense_translations: list[Translation], + base_translation_data: Translation, modifiers, ): + if not modifiers: + return # Get rid of the "*" and language template nodes that start each translation for i, elem in enumerate(modifiers): if isinstance(elem, str) and "*" in elem: del modifiers[i:] break - clean_text = clean_node(wxr, {}, modifiers).strip() if clean_text: tags = re.split(r";|,|\(|\)|:", clean_text) tags = [tag.strip() for tag in tags if tag.strip()] if tags: if clean_text.endswith(":"): - translation_data["tags"].extend(tags) + base_translation_data.tags.extend(tags) elif sense_translations: - sense_translations[-1]["tags"].extend(tags) + sense_translations[-1].tags.extend(tags) + # Reset modifiers modifiers.clear() def process_dialect_table( wxr: WiktextractContext, - base_translation_data: Dict[str, Union[str, List, bool]], - dialect_table: List[Union[WikiNode, str]], + base_translation_data: Translation, + dialect_table: list[Union[WikiNode, str]], ): wxr.wtp.debug("Dialect table not implemented yet.", sortid="TODO") # XXX: Extract dialect information (ca. 
0.12% of pages), e.g.: diff --git a/tests/test_de_example.py b/tests/test_de_example.py index 3a40b3c5..29a903f6 100644 --- a/tests/test_de_example.py +++ b/tests/test_de_example.py @@ -1,10 +1,10 @@ import unittest -from collections import defaultdict from wikitextprocessor import Wtp from wiktextract.config import WiktionaryConfig from wiktextract.extractor.de.example import extract_examples, extract_reference +from wiktextract.extractor.de.models import Example, Sense, WordEntry from wiktextract.wxr_context import WiktextractContext @@ -19,38 +19,37 @@ def setUp(self) -> None: def tearDown(self) -> None: self.wxr.wtp.close_db_conn() + def get_default_page_data(self) -> list[WordEntry]: + return [WordEntry(word="Beispiel", lang_code="de", lang_name="Deutsch")] + def test_de_extract_examples(self): self.wxr.wtp.start_page("") root = self.wxr.wtp.parse( ":[1] example1A \n:[1] example1B\n:[2] example2\n:[3] example3" ) - page_data = [defaultdict(list)] - page_data[-1]["senses"] = [ - defaultdict(list, {"senseid": "1"}), - defaultdict(list, {"senseid": "2"}), - ] + page_data = self.get_default_page_data() + page_data[-1].senses = [Sense(senseid="1"), Sense(senseid="2")] - extract_examples(self.wxr, page_data, root) + extract_examples(self.wxr, page_data[-1], root) + senses = [ + s.model_dump(exclude_defaults=True) for s in page_data[-1].senses + ] self.assertEqual( - page_data, + senses, [ { - "senses": [ - { - "examples": [ - {"text": "example1A"}, - {"text": "example1B"}, - ], - "senseid": "1", - }, - { - "examples": [{"text": "example2"}], - "senseid": "2", - }, - ] - } + "examples": [ + {"text": "example1A"}, + {"text": "example1B"}, + ], + "senseid": "1", + }, + { + "examples": [{"text": "example2"}], + "senseid": "2", + }, ], ) @@ -58,29 +57,26 @@ def test_de_extract_example_with_reference(self): self.wxr.wtp.start_page("") root = self.wxr.wtp.parse(":[1] example1 <ref>ref1A</ref>") - page_data = [defaultdict(list)] - page_data[-1]["senses"] = [ - defaultdict(list, {"senseid": "1"}), - ] + page_data = self.get_default_page_data() + page_data[-1].senses = [Sense(senseid="1")] - extract_examples(self.wxr, page_data, root) + extract_examples(self.wxr, page_data[-1], root) + senses = [ + s.model_dump(exclude_defaults=True) for s in page_data[-1].senses + ] self.assertEqual( - page_data, + senses, [ { - "senses": [ + "examples": [ { - "examples": [ - { - "text": "example1", - "ref": {"raw_ref": "ref1A"}, - }, - ], - "senseid": "1", + "text": "example1", + "ref": {"raw_ref": "ref1A"}, }, - ] - } + ], + "senseid": "1", + }, ], ) @@ -92,21 +88,21 @@ def test_de_extract_reference_from_literatur_template(self): "<ref>{{Literatur|Autor=Steffen Möller|Titel=Viva Warszawa|TitelErg=Polen für Fortgeschrittene|Verlag=Piper|Ort=München/Berlin|Jahr=2015}}, Seite 273. ISBN 978-3-89029-459-9.</ref>" ) - example_data = defaultdict(str) + example_data = Example() extract_reference(self.wxr, example_data, root.children[0]) self.assertEqual( - example_data, + example_data.model_dump(exclude_defaults=True), { "ref": { "raw_ref": "Expanded template, Seite 273. 
ISBN 978-3-89029-459-9.", + "title": "Viva Warszawa", + "author": "Steffen Möller", + "title_complement": "Polen für Fortgeschrittene", + "publisher": "Piper", + "place": "München/Berlin", + "year": "2015", } }, ) @@ -121,12 +117,12 @@ def test_de_extract_reference_from_templates_without_named_args(self): "<ref>{{Ref-OWID|Sprichwörter|401781|Schlechte Beispiele verderben gute Sitten.}}</ref>" ) - example_data = defaultdict(str) + example_data = Example() extract_reference(self.wxr, example_data, root.children[0]) self.assertEqual( - example_data, + example_data.model_dump(exclude_defaults=True), { "ref": { "raw_ref": "Expanded template", diff --git a/tests/test_de_gloss.py b/tests/test_de_gloss.py index af1a43c9..58a95b6d 100644 --- a/tests/test_de_gloss.py +++ b/tests/test_de_gloss.py @@ -5,12 +5,11 @@ from wikitextprocessor import Wtp from wiktextract.config import WiktionaryConfig -from wiktextract.extractor.de.gloss import ( - extract_glosses, - extract_tags_from_gloss_text, - process_K_template, -) -from wiktextract.thesaurus import close_thesaurus_db +from wiktextract.extractor.de.gloss import (extract_glosses, + extract_tags_from_gloss_text, + process_K_template) +from wiktextract.extractor.de.models import Sense, WordEntry from wiktextract.wxr_context import WiktextractContext @@ -26,31 +25,34 @@ def setUp(self) -> None: def tearDown(self) -> None: self.wxr.wtp.close_db_conn() + def get_default_word_entry(self): + return WordEntry(lang_code="de", lang_name="Deutsch", word="Beispiel") + def test_de_extract_glosses(self): self.wxr.wtp.start_page("") root = self.wxr.wtp.parse(":[1] gloss1 \n:[2] gloss2") - page_data = [defaultdict(list)] + word_entry = self.get_default_word_entry() + + extract_glosses(self.wxr, word_entry, root) - extract_glosses(self.wxr, page_data, root) + senses = [ + s.model_dump(exclude_defaults=True) for s in word_entry.senses + ] self.assertEqual( - page_data, + senses, [ { - "senses": [ - { - "glosses": ["gloss1"], - "raw_glosses": ["[1] gloss1"], - "senseid": "1", - }, - { - "glosses": ["gloss2"], - "raw_glosses": ["[2] gloss2"], - "senseid": "2", - }, - ] - } + "glosses": ["gloss1"], + "raw_glosses": ["[1] gloss1"], + "senseid": "1", + }, + { + "glosses": ["gloss2"], + "raw_glosses": ["[2] gloss2"], + "senseid": "2", + }, ], ) @@ -60,32 +62,32 @@ def test_de_extract_glosses_with_subglosses(self): ":[1] gloss1\n::[a] subglossA\n::[b] subglossB" ) - page_data = [defaultdict(list)] + word_entry = self.get_default_word_entry() - extract_glosses(self.wxr, page_data, root) + extract_glosses(self.wxr, word_entry, root) + + senses = [ + s.model_dump(exclude_defaults=True) for s in word_entry.senses + ] self.assertEqual( - page_data, + senses, [ { - "senses": [ - { - "glosses": ["gloss1"], - "raw_glosses": ["[1] gloss1"], - "senseid": "1", - }, - { - "glosses": ["subglossA"], - "raw_glosses": ["[a] subglossA"], - "senseid": "1a", - }, - { - "glosses": ["subglossB"], - "raw_glosses": ["[b] subglossB"], - "senseid": "1b", - }, - ] - } + "glosses": ["gloss1"], + "raw_glosses": ["[1] gloss1"], + "senseid": "1", + }, + { + "glosses": ["subglossA"], + "raw_glosses": ["[a] subglossA"], + "senseid": "1a", + }, + { + "glosses": ["subglossB"], + "raw_glosses": ["[b] subglossB"], + "senseid": "1b", + }, ], ) @@ -96,28 +98,29 @@ def 
test_de_extract_glosses_with_only_subglosses(self): ":[1] {{K|tag}}\n::[a] subglossA\n::[1b] subglossB" ) - page_data = [defaultdict(list)] + word_entry = self.get_default_word_entry() + + extract_glosses(self.wxr, word_entry, root) + + senses = [ + s.model_dump(exclude_defaults=True) for s in word_entry.senses + ] - extract_glosses(self.wxr, page_data, root) self.assertEqual( - page_data, + senses, [ { - "senses": [ - { - "tags": ["tag"], - "glosses": ["subglossA"], - "raw_glosses": ["[a] subglossA"], - "senseid": "1a", - }, - { - "tags": ["tag"], - "glosses": ["subglossB"], - "raw_glosses": ["[1b] subglossB"], - "senseid": "1b", - }, - ] - } + "tags": ["tag"], + "glosses": ["subglossA"], + "raw_glosses": ["[a] subglossA"], + "senseid": "1a", + }, + { + "tags": ["tag"], + "glosses": ["subglossB"], + "raw_glosses": ["[1b] subglossB"], + "senseid": "1b", + }, ], ) @@ -126,14 +129,14 @@ def test_process_K_template_removes_K_template_nodes(self): self.wxr.wtp.start_page("") root = self.wxr.wtp.parse("{{K|tag1|tag2}} gloss1") - gloss_data = defaultdict(list) + sense_data = Sense() self.assertEqual(len(root.children), 2) - process_K_template(self.wxr, gloss_data, root) + process_K_template(self.wxr, sense_data, root) self.assertEqual( - gloss_data, + sense_data.model_dump(exclude_defaults=True), { "tags": ["tag1", "tag2"], }, @@ -225,7 +228,7 @@ def test_process_K_template(self): for case in test_cases: with self.subTest(case=case): - gloss_data = defaultdict(list) + sense_data = Sense() self.wxr.wtp.start_page("") @@ -235,9 +238,9 @@ def test_process_K_template(self): "wiktextract.extractor.de.gloss.clean_node", self.get_mock(case["mock_return"]), ): - process_K_template(self.wxr, gloss_data, root) + process_K_template(self.wxr, sense_data, root) self.assertEqual( - gloss_data["tags"], + sense_data.tags, case["expected_tags"], ) @@ -265,19 +268,19 @@ def test_de_extract_tags_from_gloss_text(self): ] for case in test_cases: with self.subTest(case=case): - gloss_data = defaultdict(list) + sense_data = Sense() gloss_text = extract_tags_from_gloss_text( - gloss_data, case["input"] + sense_data, case["input"] ) if case["expected_tags"] is None: - self.assertEqual(gloss_data, {}) + self.assertEqual( + sense_data.model_dump(exclude_defaults=True), {} + ) else: self.assertEqual( - gloss_data, - { - "tags": case["expected_tags"], - }, + sense_data.tags, + case["expected_tags"], ) self.assertEqual(gloss_text, case["expected_gloss"]) diff --git a/tests/test_de_linkages.py b/tests/test_de_linkages.py index 8f73cc6c..70a73a31 100644 --- a/tests/test_de_linkages.py +++ b/tests/test_de_linkages.py @@ -1,10 +1,10 @@ import unittest -from collections import defaultdict from wikitextprocessor import Wtp from wiktextract.config import WiktionaryConfig from wiktextract.extractor.de.linkage import extract_linkages +from wiktextract.extractor.de.models import Sense, WordEntry from wiktextract.wxr_context import WiktextractContext @@ -19,86 +19,66 @@ def setUp(self) -> None: def tearDown(self) -> None: self.wxr.wtp.close_db_conn() + def get_default_word_entry(self) -> WordEntry: + return WordEntry(word="Beispiel", lang_code="de", lang_name="Deutsch") + def test_de_extract_linkages(self): test_cases = [ # https://de.wiktionary.org/wiki/Beispiel # Extracts linkages and places them in the correct sense. 
{ "input": "==== Sinnverwandte Wörter ====\n:[1] [[Beleg]], [[Exempel]]\n:[2] [[Muster]], [[Vorbild]]", - "page_data": [ - defaultdict( - list, + "senses": [Sense(senseid="1"), Sense(senseid="2")], + "expected": { + "senses": [ + { + "senseid": "1", + "coordinate_terms": ["Beleg", "Exempel"], + }, { - "senses": [ - defaultdict(list, {"senseid": "1"}), - defaultdict(list, {"senseid": "2"}), - ] + "senseid": "2", + "coordinate_terms": ["Muster", "Vorbild"], }, - ) - ], - "expected": [ - { - "senses": [ - { - "senseid": "1", - "coordinate_terms": ["Beleg", "Exempel"], - }, - { - "senseid": "2", - "coordinate_terms": ["Muster", "Vorbild"], - }, - ] - } - ], + ] + }, }, # https://de.wiktionary.org/wiki/Beispiel # Cleans explanatory text from expressions. { "input": "====Redewendungen====\n:[[ein gutes Beispiel geben|ein gutes ''Beispiel'' geben]] – als [[Vorbild]] zur [[Nachahmung]] [[dienen]]/[[herausfordern]]", - "page_data": [defaultdict(list)], - "expected": [ - { - "expressions": ["ein gutes Beispiel geben"], - "senses": [], - }, - ], + "senses": [Sense()], + "expected": { + "senses": [ + { + "expressions": ["ein gutes Beispiel geben"], + } + ] + }, }, # Always places relations in first sense if just one sense. { "input": "====Synonyme====\n:[[Synonym1]]", - "page_data": [ - defaultdict( - list, {"senses": [defaultdict(list, {"senseid": "1"})]} - ) - ], - "expected": [ - { - "senses": [{"senseid": "1", "synonyms": ["Synonym1"]}], - }, - ], + "senses": [Sense(senseid="1")], + "expected": { + "senses": [{"senseid": "1", "synonyms": ["Synonym1"]}], + }, }, # https://de.wiktionary.org/wiki/Kokospalme # Ignores modifiers of relations and all other text. { "input": "====Synonyme====\n:[1] [[Kokosnusspalme]], ''wissenschaftlich:'' [[Cocos nucifera]]", - "page_data": [ - defaultdict( - list, {"senses": [defaultdict(list, {"senseid": "1"})]} - ) - ], - "expected": [ - { - "senses": [ - { - "senseid": "1", - "synonyms": [ - "Kokosnusspalme", - "Cocos nucifera", - ], - } - ], - }, - ], + "senses": [Sense(senseid="1")], + "expected": { + "senses": [ + { + "senseid": "1", + "synonyms": [ + "Kokosnusspalme", + "Cocos nucifera", + ], + } + ], + }, }, ] @@ -107,6 +87,15 @@ def test_de_extract_linkages(self): self.wxr.wtp.start_page("") root = self.wxr.wtp.parse(case["input"]) - extract_linkages(self.wxr, case["page_data"], root.children[0]) + word_entry = self.get_default_word_entry() + word_entry.senses = case["senses"] + + extract_linkages(self.wxr, word_entry, root.children[0]) - self.assertEqual(case["page_data"], case["expected"]) + self.assertEqual( + word_entry.model_dump( + exclude_defaults=True, + exclude={"word", "lang_code", "lang_name"}, + ), + case["expected"], + ) diff --git a/tests/test_de_page.py b/tests/test_de_page.py index 3401145a..33f0e64f 100644 --- a/tests/test_de_page.py +++ b/tests/test_de_page.py @@ -1,11 +1,11 @@ # Tests for parsing a page from the German Wiktionary import unittest -from collections import defaultdict from wikitextprocessor import Wtp from wiktextract.config import WiktionaryConfig +from wiktextract.extractor.de.models import WordEntry from wiktextract.extractor.de.page import parse_page, parse_section from wiktextract.wxr_context import WiktextractContext @@ -28,22 +28,27 @@ def setUp(self): def tearDown(self) -> None: self.wxr.wtp.close_db_conn() + def get_default_base_data(self): + return WordEntry(lang_code="de", lang_name="Deutsch", word="Beispiel") + def test_de_parse_page(self): self.wxr.wtp.add_page("Vorlage:Sprache", 10, "") + 
self.wxr.wtp.add_page("Vorlage:Wortart", 10, "") lst = parse_page( self.wxr, "Beispiel", - """ -== Beispiel ({{Sprache|Deutsch}}) == + """== Beispiel ({{Sprache|Deutsch}}) == +=== {{Wortart|Substantiv|Deutsch}} === """, ) self.assertEqual( lst, [ { - "lang": "Deutsch", + "lang_name": "Deutsch", "lang_code": "de", "word": "Beispiel", + "pos": "noun", } ], ) @@ -52,22 +57,24 @@ def test_de_parse_page_skipping_head_templates(self): self.wxr.wtp.add_page("Vorlage:Wort der Woche", 10, "") self.wxr.wtp.add_page("Vorlage:Siehe auch", 10, "") self.wxr.wtp.add_page("Vorlage:Sprache", 10, "") + self.wxr.wtp.add_page("Vorlage:Wortart", 10, "") lst = parse_page( self.wxr, "Beispiel", - """ -{{Wort der Woche|46|2020}} + """{{Wort der Woche|46|2020}} {{Siehe auch|[[cát]]}} == Beispiel ({{Sprache|Deutsch}}) == +=== {{Wortart|Substantiv|Deutsch}} === """, ) self.assertEqual( lst, [ { - "lang": "Deutsch", + "lang_name": "Deutsch", "lang_code": "de", "word": "Beispiel", + "pos": "noun", } ], ) @@ -86,15 +93,18 @@ def test_de_parse_section(self): pre_expand=True, ) - base_data = defaultdict(list, {"lang_code": "de"}) - page_data = [defaultdict(list, {"lang_code": "de"})] + base_data = self.get_default_base_data() + page_data = [] parse_section(self.wxr, page_data, base_data, root.children) + pages = [p.model_dump(exclude_defaults=True) for p in page_data] self.assertEqual( - page_data, + pages, [ { + "word": "Beispiel", "lang_code": "de", + "lang_name": "Deutsch", "pos": "adj", "senses": [ { @@ -105,8 +115,10 @@ def test_de_parse_section(self): ], }, { + "word": "Beispiel", "lang_code": "de", "pos": "adv", + "lang_name": "Deutsch", "senses": [ { "glosses": ["gloss1"], @@ -116,8 +128,10 @@ def test_de_parse_section(self): ], }, { + "word": "Beispiel", "lang_code": "de", "pos": "verb", + "lang_name": "Deutsch", "senses": [ { "glosses": ["gloss2"], @@ -127,8 +141,10 @@ def test_de_parse_section(self): ], }, { + "word": "Beispiel", "lang_code": "de", "pos": "noun", + "lang_name": "Deutsch", "senses": [ { "glosses": ["gloss3"], diff --git a/tests/test_de_pronunciation.py b/tests/test_de_pronunciation.py index cc9419a8..ccf288de 100644 --- a/tests/test_de_pronunciation.py +++ b/tests/test_de_pronunciation.py @@ -1,13 +1,11 @@ import unittest -from collections import defaultdict from wikitextprocessor import Wtp from wiktextract.config import WiktionaryConfig -from wiktextract.extractor.de.pronunciation import ( - process_hoerbeispiele, - process_ipa, -) +from wiktextract.extractor.de.models import Sound +from wiktextract.extractor.de.pronunciation import (process_hoerbeispiele, + process_ipa) from wiktextract.wxr_context import WiktextractContext @@ -35,14 +33,22 @@ def test_de_process_ipa(self): { "input": "{{Lautschrift|ipa1|spr=de}}", "expected": [ - {"ipa": ["ipa1"], "language": "Deutsch", "lang_code": "de"} + { + "ipa": ["ipa1"], + "lang_name": ["Deutsch"], + "lang_code": ["de"], + } ], }, { "input": "{{Lautschrift|ipa1}} {{Lautschrift|ipa2}}{{Lautschrift|ipa3|spr=de}}", "expected": [ {"ipa": ["ipa1", "ipa2"]}, - {"ipa": ["ipa3"], "language": "Deutsch", "lang_code": "de"}, + { + "ipa": ["ipa3"], + "lang_name": ["Deutsch"], + "lang_code": ["de"], + }, ], }, { @@ -62,13 +68,16 @@ def test_de_process_ipa(self): root = self.wxr.wtp.parse(case["input"]) - sound_data = [defaultdict(list)] + sound_data = [Sound()] process_ipa( self.wxr, sound_data, list(root.filter_empty_str_child()) ) - self.assertEqual(sound_data, case["expected"]) + sounds = [ + s.model_dump(exclude_defaults=True) for s in sound_data + ] + 
self.assertEqual(sounds, case["expected"]) def test_de_process_hoerbeispiele(self): # https://de.wiktionary.org/wiki/Beispiel @@ -80,7 +89,7 @@ def test_de_process_hoerbeispiele(self): "input": "{{Audio|" + filename1 + "}}", "expected": [ { - "audio": filename1, + "audio": [filename1], "mp3_url": None, # None indicates we don't care about the exact value "ogg_url": None, } @@ -94,12 +103,12 @@ def test_de_process_hoerbeispiele(self): + "}}", "expected": [ { - "audio": filename1, + "audio": [filename1], "mp3_url": None, "ogg_url": None, }, { - "audio": filename2, + "audio": [filename2], "ogg_url": None, "mp3_url": None, "wav_url": None, @@ -114,13 +123,13 @@ def test_de_process_hoerbeispiele(self): + "}}", "expected": [ { - "audio": filename1, + "audio": [filename1], "mp3_url": None, "ogg_url": None, "tags": ["tag1"], }, { - "audio": filename2, + "audio": [filename2], "mp3_url": None, "ogg_url": None, "wav_url": None, @@ -138,15 +147,16 @@ def test_de_process_hoerbeispiele(self): root = self.wxr.wtp.parse(case["input"]) - sound_data = [defaultdict(list)] + sound_data = [Sound()] process_hoerbeispiele( self.wxr, sound_data, list(root.filter_empty_str_child()) ) - self.assertSoundDataMatchesExpected( - sound_data, case["expected"] - ) + sounds = [ + s.model_dump(exclude_defaults=True) for s in sound_data + ] + self.assertSoundDataMatchesExpected(sounds, case["expected"]) def assertSoundDataMatchesExpected(self, sound_data, expected): self.assertEqual( diff --git a/tests/test_de_translation.py b/tests/test_de_translation.py index 7ddf5418..5bd65ed3 100644 --- a/tests/test_de_translation.py +++ b/tests/test_de_translation.py @@ -1,13 +1,11 @@ import unittest -from collections import defaultdict from wikitextprocessor import Wtp from wiktextract.config import WiktionaryConfig -from wiktextract.extractor.de.translation import ( - extract_translation, - process_translation_list, -) +from wiktextract.extractor.de.models import Sense, Translation, WordEntry +from wiktextract.extractor.de.translation import (extract_translation, + process_translation_list) from wiktextract.wxr_context import WiktextractContext @@ -22,85 +20,70 @@ def setUp(self) -> None: def tearDown(self) -> None: self.wxr.wtp.close_db_conn() + def get_default_word_entry(self): + return WordEntry(word="Beispiel", lang_code="de", lang_name="Deutsch") + def test_de_extract_translation(self): test_cases = [ # Adds sense data to correct sense { "input": "{{Ü-Tabelle|1|G=Beispiel|Ü-Liste=*{{en}}: {{Ü|en|example}}}}", - "page_data": [ - defaultdict( - list, {"senses": [defaultdict(list, {"senseid": "1"})]} - ) - ], - "expected": [ - { - "senses": [ - { - "senseid": "1", - "translations": [ - { - "sense": "Beispiel", - "code": "en", - "lang": "Englisch", - "word": "example", - } - ], - } - ] - } - ], + "senses": [Sense(senseid="1")], + "expected": { + "senses": [ + { + "senseid": "1", + "translations": [ + { + "sense": "Beispiel", + "lang_code": "en", + "lang_name": "Englisch", + "word": "example", + } + ], + } + ] + }, }, # Adds sense data to page_data root if no senseid is given { "input": "{{Ü-Tabelle||G=Beispiel|Ü-Liste=*{{en}}: {{Ü|en|example}}}}", - "page_data": [ - defaultdict( - list, {"senses": [defaultdict(list, {"senseid": "1"})]} - ) - ], - "expected": [ - { - "senses": [ - { - "senseid": "1", - } - ], - "translations": [ - { - "sense": "Beispiel", - "code": "en", - "lang": "Englisch", - "word": "example", - } - ], - } - ], + "senses": [Sense(senseid="1")], + "expected": { + "senses": [ + { + "senseid": "1", + } + ], + 
"translations": [ + { + "sense": "Beispiel", + "lang_code": "en", + "lang_name": "Englisch", + "word": "example", + } + ], + }, }, # Adds sense data to page_data root if senseid could not be matched { "input": "{{Ü-Tabelle|2|G=Beispiel|Ü-Liste=*{{en}}: {{Ü|en|example}}}}", - "page_data": [ - defaultdict( - list, {"senses": [defaultdict(list, {"senseid": "1"})]} - ) - ], - "expected": [ - { - "senses": [ - { - "senseid": "1", - } - ], - "translations": [ - { - "sense": "Beispiel", - "code": "en", - "lang": "Englisch", - "word": "example", - } - ], - } - ], + "senses": [Sense(senseid="1")], + "expected": { + "senses": [ + { + "senseid": "1", + } + ], + "translations": [ + { + "sense": "Beispiel", + "lang_code": "en", + "lang_name": "Englisch", + "word": "example", + } + ], + }, }, ] @@ -109,11 +92,18 @@ def test_de_extract_translation(self): self.wxr.wtp.start_page("") root = self.wxr.wtp.parse(case["input"]) - page_data = case["page_data"] + word_entry = self.get_default_word_entry() + word_entry.senses = case.get("senses", []) - extract_translation(self.wxr, page_data, root) + extract_translation(self.wxr, word_entry, root) - self.assertEqual(page_data, case["expected"]) + self.assertEqual( + word_entry.model_dump( + exclude_defaults=True, + exclude={"word", "lang_code", "lang_name"}, + ), + case["expected"], + ) def test_de_process_translation_list(self): test_cases = [ @@ -122,7 +112,11 @@ def test_de_process_translation_list(self): { "input": "{{Ü-Tabelle|||Ü-Liste=\n*{{en}}: {{Ü|en|example}}}}", "expected_sense_translations": [ - {"code": "en", "lang": "Englisch", "word": "example"} + { + "lang_code": "en", + "lang_name": "Englisch", + "word": "example", + } ], }, # https://de.wiktionary.org/wiki/Beispiel @@ -131,8 +125,8 @@ def test_de_process_translation_list(self): "input": "{{Ü-Tabelle|||Ü-Liste=\n*{{hy}}: {{Üt|hy|օրինակ|orinak}}}}", "expected_sense_translations": [ { - "code": "hy", - "lang": "Armenisch", + "lang_code": "hy", + "lang_name": "Armenisch", "word": "օրինակ", "roman": "orinak", } @@ -145,8 +139,8 @@ def test_de_process_translation_list(self): "input": "{{Ü-Tabelle|||Ü-Liste=\n*{{ru}}: {{Üt|ru|пример}}}}", "expected_sense_translations": [ { - "code": "ru", - "lang": "Russisch", + "lang_code": "ru", + "lang_name": "Russisch", "word": "пример", "roman": "primer", } @@ -159,8 +153,8 @@ def test_de_process_translation_list(self): "input": "{{Ü-Tabelle|||Ü-Liste=\n*{{ar}}: {{Üt?|ar|عريضة|}}}}", "expected_sense_translations": [ { - "code": "ar", - "lang": "Arabisch", + "lang_code": "ar", + "lang_name": "Arabisch", "word": "عريضة", "uncertain": True, } @@ -178,7 +172,7 @@ def test_de_process_translation_list(self): root = self.wxr.wtp.parse(case["input"]) sense_translations = [] - base_translation_data = defaultdict(list) + base_translation_data = Translation() translation_list = root.children[0].template_parameters.get( "Ü-Liste" @@ -190,8 +184,12 @@ def test_de_process_translation_list(self): base_translation_data, translation_list, ) + translations = [ + t.model_dump(exclude_defaults=True) + for t in sense_translations + ] self.assertEqual( - sense_translations, case["expected_sense_translations"] + translations, case["expected_sense_translations"] ) def test_de_process_translation_list_with_modifiers(self): @@ -199,12 +197,16 @@ def test_de_process_translation_list_with_modifiers(self): # https://de.wiktionary.org/wiki/Beispiel # Modifying the following translation { - "input": "{{Ü-Tabelle|||Ü-Liste=\n*{{en}}: {{Ü|en|instance}}, ''Vorbild:'' {{Ü|en|model}}}}", + "input": 
"{{Ü-Tabelle|||Ü-Liste=\n*{{en}}: {{Ü|en|instance}}, '''Vorbild:''' {{Ü|en|model}}}}", "expected_sense_translations": [ - {"code": "en", "lang": "Englisch", "word": "instance"}, { - "code": "en", - "lang": "Englisch", + "lang_code": "en", + "lang_name": "Englisch", + "word": "instance", + }, + { + "lang_code": "en", + "lang_name": "Englisch", "word": "model", "tags": ["Vorbild"], }, @@ -217,8 +219,8 @@ def test_de_process_translation_list_with_modifiers(self): "input": "{{Ü-Tabelle|||Ü-Liste=\n**{{fr}}: {{Ü|fr|exemple}} {{m}}}}", "expected_sense_translations": [ { - "code": "fr", - "lang": "Französisch", + "lang_code": "fr", + "lang_name": "Französisch", "word": "exemple", "tags": ["m"], } @@ -231,20 +233,20 @@ def test_de_process_translation_list_with_modifiers(self): "input": "{{Ü-Tabelle|||Ü-Liste=\n*{{la}}: {{Ü|la|crus}} {{f}}, {{Ü|la|camba}} (vulgärlateinisch) {{f}}, {{Ü|la|gamba}} (vulgärlateinisch) {{f}}}}", "expected_sense_translations": [ { - "code": "la", - "lang": "Latein", + "lang_code": "la", + "lang_name": "Latein", "word": "crus", "tags": ["f"], }, { - "code": "la", - "lang": "Latein", + "lang_code": "la", + "lang_name": "Latein", "word": "camba", "tags": ["vulgärlateinisch", "f"], }, { - "code": "la", - "lang": "Latein", + "lang_code": "la", + "lang_name": "Latein", "word": "gamba", "tags": ["vulgärlateinisch", "f"], }, @@ -259,31 +261,31 @@ def test_de_process_translation_list_with_modifiers(self): "input": "{{Ü-Tabelle|||Ü-Liste=\n*{{en}}: [1] {{Ü|en|subscription}}; [1a] {{Ü|en|dues}}, {{Ü|en|membership fee}}; [1, 2] {{Ü|en|contribution}}; [3] {{Ü|en|article}}}}", "expected_sense_translations": [ { - "code": "en", - "lang": "Englisch", + "lang_code": "en", + "lang_name": "Englisch", "word": "subscription", "tags": ["[1a]"], }, { - "code": "en", - "lang": "Englisch", + "lang_code": "en", + "lang_name": "Englisch", "word": "dues", }, { - "code": "en", - "lang": "Englisch", + "lang_code": "en", + "lang_name": "Englisch", "word": "membership fee", "tags": ["[1", "2]"], }, { - "code": "en", - "lang": "Englisch", + "lang_code": "en", + "lang_name": "Englisch", "word": "contribution", "tags": ["[3]"], }, { - "code": "en", - "lang": "Englisch", + "lang_code": "en", + "lang_name": "Englisch", "word": "article", }, ], @@ -300,7 +302,7 @@ def test_de_process_translation_list_with_modifiers(self): root = self.wxr.wtp.parse(case["input"]) sense_translations = [] - base_translation_data = defaultdict(list) + base_translation_data = Translation() translation_list = root.children[0].template_parameters.get( "Ü-Liste" @@ -312,6 +314,10 @@ def test_de_process_translation_list_with_modifiers(self): base_translation_data, translation_list, ) + translations = [ + t.model_dump(exclude_defaults=True) + for t in sense_translations + ] self.assertEqual( - sense_translations, case["expected_sense_translations"] + translations, case["expected_sense_translations"] ) diff --git a/tests/test_desc.py b/tests/test_desc.py index e12640b6..99c79182 100644 --- a/tests/test_desc.py +++ b/tests/test_desc.py @@ -1,6 +1,7 @@ import unittest from wikitextprocessor import Wtp + from wiktextract.config import WiktionaryConfig from wiktextract.datautils import split_at_comma_semi from wiktextract.thesaurus import close_thesaurus_db