diff --git a/json_schema/fr.json b/json_schema/fr.json deleted file mode 100644 index 80567ca9b..000000000 --- a/json_schema/fr.json +++ /dev/null @@ -1,338 +0,0 @@ -{ - "$schema": "https://json-schema.org/draft/2020-12/schema", - "$id": "https://kaikki.org/fr.json", - "title": "French Wiktionary", - "description": "JSON schema of the French Wiktionary extractor", - "type": "object", - "properties": { - "lang_name": { - "description": "Localized language name of the word", - "type": "string" - }, - "lang_code": { - "description": "Wiktionary language code", - "type": "string" - }, - "word": { - "description": "word string", - "type": "string" - }, - "pos": { - "description": "Part of speech type", - "type": "string" - }, - "pos_title": { - "description": "Original POS title for matching etymology texts", - "type": "string" - }, - "etymology_texts": { - "description": "Etymology list", - "type": "array", - "items": { - "type": "string" - } - }, - "senses": { - "description": "Sense list", - "type": "array", - "items": { - "$ref": "#/$defs/sense" - } - }, - "forms": { - "description": "Inflection forms list", - "type": "array", - "items": { - "$ref": "#/$defs/form" - } - }, - "sounds": { - "type": "array", - "items": { - "$ref": "#/$defs/sound" - } - }, - "translations": { - "type": "array", - "items": { - "$ref": "#/$defs/translation" - } - }, - "synonyms": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "hyponyms": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "hypernyms": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "holonyms": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "meronyms": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "derived": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "troponyms": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "paronyms": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "related": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "abbreviation": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "proverbs": { - "type": "array", - "items": { - "$ref": "#/$defs/linkage" - } - }, - "title": { - "description": "Redirect page source title", - "type": "string" - }, - "redirect": { - "description": "Redirect page target title", - "type": "string" - }, - "categories": { - "type": "array", - "items": { - "type": "string" - } - }, - "notes": { - "type": "array", - "items": { - "type": "string" - } - } - }, - "$defs": { - "sense": { - "type": "object", - "properties": { - "glosses": { - "type": "array", - "items": { - "type": "string" - } - }, - "tags": { - "type": "array", - "items": { - "type": "string" - } - }, - "categories": { - "type": "array", - "items": { - "type": "string" - } - }, - "examples": { - "type": "array", - "items": { - "$ref": "#/$defs/example" - } - } - } - }, - "example": { - "type": "object", - "properties": { - "text": { - "description": "Example usage sentence", - "type": "string" - }, - "translation": { - "description": "French translation of the example sentence", - "type": "string" - }, - "roman": { - "description": "Romanization of the example sentence", - "type": "string" - }, - "ref": { - "description": "Source of the sentence, like book title and page number", - "type": "string" - }, - "type": { - "description": "This value is 'quotation' if 'source' exists", - "type": "string", - 
"enum": [ - "example", - "quotation" - ] - } - } - }, - "form": { - "type": "object", - "properties": { - "form": { - "type": "string" - }, - "tags": { - "type": "array", - "items": { - "type": "string" - } - }, - "ipas": { - "description": "has more than one ipa", - "type": "array", - "items": { - "type": "string" - } - }, - "ipa": { - "description": "only has one ipa", - "type": "string" - }, - "source": { - "description": "form line template name", - "type": "string" - } - } - }, - "sound": { - "type": "object", - "properties": { - "zh-pron": { - "description": "Chinese word pronunciation", - "type": "string" - }, - "ipa": { - "description": "International Phonetic Alphabet", - "type": "string" - }, - "audio": { - "description": "Audio file name", - "type": "string" - }, - "wav_url": { - "type": "string" - }, - "ogg_url": { - "type": "string" - }, - "mp3_url": { - "type": "string" - } - } - }, - "translation": { - "type": "object", - "properties": { - "lang_code": { - "description": "Wiktionary language code of the translation term", - "type": "string" - }, - "lang_name": { - "description": "Translation language name", - "type": "string" - }, - "word": { - "description": "Translation term", - "type": "string" - }, - "sense": { - "description": "Translation gloss", - "type": "string" - }, - "tags": { - "type": "array", - "items": { - "type": "string" - } - }, - "roman": { - "type": "string" - }, - "traditional_writing": { - "description": "Alternative writting for Chinese, Korean and Mongolian", - "type": "string" - } - } - }, - "linkage": { - "type": "object", - "properties": { - "word": { - "type": "string" - }, - "tags": { - "type": "array", - "items": { - "type": "string" - } - }, - "roman": { - "type": "string" - }, - "alt": { - "description": "ALternative form", - "type": "string" - }, - "translation": { - "description": "French translation", - "type": "string" - }, - "sense": { - "description": "Definition of the word", - "type": "string" - }, - "sense_index": { - "description": "Number of the definition, start from 1", - "type": "integer" - }, - "lang_name": { - "description": "Localized language name of the word, for the 'Dérivés dans d’autres langues' section", - "type": "string" - }, - "lang_code": { - "description": "Wiktionary language code, for the 'Dérivés dans d’autres langues' section", - "type": "string" - } - } - } - } -} diff --git a/src/wiktextract/extractor/fr/etymology.py b/src/wiktextract/extractor/fr/etymology.py index 09b979ecb..26ca0afd8 100644 --- a/src/wiktextract/extractor/fr/etymology.py +++ b/src/wiktextract/extractor/fr/etymology.py @@ -1,17 +1,19 @@ from collections import defaultdict -from typing import Dict, List, Optional, Tuple, Union +from typing import Optional, Union from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import TemplateNode from wiktextract.page import LEVEL_KINDS, clean_node from wiktextract.wxr_context import WiktextractContext -EtymologyData = Dict[str, List[str]] +from .models import WordEntry + +EtymologyData = dict[str, list[str]] def extract_etymology( wxr: WiktextractContext, - nodes: List[Union[WikiNode, str]], + nodes: list[Union[WikiNode, str]], ) -> Optional[EtymologyData]: etymology_dict: EtymologyData = defaultdict(list) level_node_index = len(nodes) @@ -62,7 +64,7 @@ def extract_etymology( def find_pos_in_etymology_list( wxr: WiktextractContext, list_item_node: WikiNode -) -> Optional[Tuple[str, str]]: +) -> Optional[tuple[str, str]]: """ Return tuple of POS title and etymology text if the 
passed list item node starts with italic POS node or POS template, otherwise return None. @@ -96,26 +98,27 @@ def find_pos_in_etymology_list( def insert_etymology_data( - lang_code: str, page_data: List[Dict], etymology_data: EtymologyData + lang_code: str, page_data: list[WordEntry], etymology_data: EtymologyData ) -> None: """ Insert list of etymology data extracted from the level 3 node to each sense dictionary that matches the language and POS. """ - sense_dict = {} # group by pos title + sense_dict = defaultdict(list) # group by pos title for sense_data in page_data: - if sense_data.get("lang_code") == lang_code: - sense_dict[sense_data.get("pos_title")] = sense_data + if sense_data.lang_code == lang_code: + sense_dict[sense_data.pos_title].append(sense_data) for pos_title, etymology_texts in etymology_data.items(): if pos_title is None: # add to all sense dictionaries - for sense_data in sense_dict.values(): - sense_data["etymology_texts"] = etymology_texts + for sense_data_list in sense_dict.values(): + for sense_data in sense_data_list: + sense_data.etymology_texts = etymology_texts elif pos_title in sense_dict: - sense_dict[pos_title]["etymology_texts"] = etymology_texts + for sense_data in sense_dict[pos_title]: + sense_data.etymology_texts = etymology_texts elif pos_title.removesuffix(" 1") in sense_dict: # an index number is added in the etymology section but not added in # POS title - sense_dict[pos_title.removesuffix(" 1")][ - "etymology_texts" - ] = etymology_texts + for sense_data in sense_dict[pos_title.removesuffix(" 1")]: + sense_data.etymology_texts = etymology_texts diff --git a/src/wiktextract/extractor/fr/form_line.py b/src/wiktextract/extractor/fr/form_line.py index 097d4039e..e315e0007 100644 --- a/src/wiktextract/extractor/fr/form_line.py +++ b/src/wiktextract/extractor/fr/form_line.py @@ -1,18 +1,18 @@ -from collections import defaultdict -from typing import Dict, List, Union +from typing import Union from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import HTMLNode, TemplateNode from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext +from .models import Form, Sound, WordEntry from .pronunciation import PRON_TEMPLATES, process_pron_template def extract_form_line( wxr: WiktextractContext, - page_data: List[Dict], - nodes: List[Union[WikiNode, str]], + page_data: list[WordEntry], + nodes: list[Union[WikiNode, str]], ) -> None: """ Ligne de forme @@ -27,9 +27,7 @@ def extract_form_line( if node.template_name in PRON_TEMPLATES: ipa_text = process_pron_template(wxr, node) if len(ipa_text) > 0: - page_data[-1]["sounds"].append( - defaultdict(list, {"ipa": ipa_text}) - ) + page_data[-1].sounds.append(Sound(ipa=ipa_text)) elif node.template_name == "équiv-pour": process_equiv_pour_template(wxr, node, page_data) elif node.template_name.startswith("zh-mot"): @@ -42,18 +40,18 @@ def extract_form_line( tag.startswith("(") and tag.endswith(")") and pre_template_name in PRON_TEMPLATES - and len(page_data[-1].get("sounds", [])) > 0 + and len(page_data[-1].sounds) > 0 ): # it's the location of the previous IPA template - page_data[-1]["sounds"][-1]["tags"].append(tag.strip("()")) + page_data[-1].sounds[-1].tags.append(tag.strip("()")) elif len(tag.strip("()")) > 0: - page_data[-1]["tags"].append(tag.strip("()")) + page_data[-1].tags.append(tag.strip("()")) pre_template_name = node.template_name def process_equiv_pour_template( - wxr: WiktextractContext, node: TemplateNode, page_data: List[Dict] + wxr:
WiktextractContext, node: TemplateNode, page_data: list[WordEntry] ) -> None: # equivalent form: https://fr.wiktionary.org/wiki/Modèle:équiv-pour expanded_node = wxr.wtp.parse( @@ -64,20 +62,20 @@ def process_equiv_pour_template( if child.kind == NodeKind.ITALIC: form_tag = clean_node(wxr, None, child).strip("() ") elif isinstance(child, HTMLNode) and child.tag == "bdi": - form_data = { - "form": clean_node(wxr, None, child), - "source": "form line template 'équiv-pour'", - } + form_data = Form( + form=clean_node(wxr, None, child), + source="form line template 'équiv-pour'", + ) if len(form_tag) > 0: - form_data["tags"] = [form_tag] - if len(form_data["form"]) > 0: - page_data[-1]["forms"].append(form_data) + form_data.tags = [form_tag] + if len(form_data.form) > 0: + page_data[-1].forms.append(form_data) def process_zh_mot_template( wxr: WiktextractContext, node: TemplateNode, - page_data: List[Dict], + page_data: list[WordEntry], ) -> None: # Chinese form line template: zh-mot, zh-mot-s, zh-mot-t # https://fr.wiktionary.org/wiki/Modèle:zh-mot @@ -88,30 +86,29 @@ def process_zh_mot_template( ) for template_node in node.find_child(NodeKind.TEMPLATE): if template_node.template_name.lower() == "lang": - page_data[-1]["sounds"].append( - { - "zh-pron": clean_node(wxr, None, template_node), - "tags": ["Pinyin"], - } + page_data[-1].sounds.append( + Sound( + zh_pron=clean_node(wxr, None, template_node), + tags=["Pinyin"], + ) ) elif template_node.template_name in ("pron", "prononciation"): - page_data[-1]["sounds"].append( - {"ipa": clean_node(wxr, None, template_node)} + page_data[-1].sounds.append( + Sound(ipa=clean_node(wxr, None, template_node)) ) def process_ja_mot_template( wxr: WiktextractContext, template_node: TemplateNode, - page_data: List[Dict], + page_data: list[WordEntry], ) -> None: # Japanese form line template: https://fr.wiktionary.org/wiki/Modèle:ja-mot expanded_node = wxr.wtp.parse( wxr.wtp.node_to_wikitext(template_node), expand_all=True ) existing_forms = { - existing_form.get("form") - for existing_form in page_data[-1].get("forms", []) + existing_form.form for existing_form in page_data[-1].forms } for index, node in expanded_node.find_html("span", with_index=True): # the first span tag is the word, the second is Hepburn romanization @@ -120,7 +117,7 @@ def process_ja_mot_template( if form_text not in existing_forms: # avoid adding duplicated form data extracted from # inflection table before the form line - page_data[-1]["forms"].append( - {"form": form_text, "tags": ["romanization"]} + page_data[-1].forms.append( + Form(form=form_text, tags=["romanization"]) ) break diff --git a/src/wiktextract/extractor/fr/gloss.py b/src/wiktextract/extractor/fr/gloss.py index c63a1abbd..8b69d5b97 100644 --- a/src/wiktextract/extractor/fr/gloss.py +++ b/src/wiktextract/extractor/fr/gloss.py @@ -1,17 +1,18 @@ from collections import defaultdict -from typing import Dict, List from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import TemplateNode from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext +from .models import Example, Sense, WordEntry + def extract_gloss( wxr: WiktextractContext, - page_data: List[Dict], + page_data: list[WordEntry], list_node: WikiNode, - parent_glosses: List[str] = [], + parent_glosses: list[str] = [], ) -> None: for list_item_node in list_node.find_child(NodeKind.LIST_ITEM): gloss_nodes = list( @@ -19,7 +20,7 @@ def extract_gloss( NodeKind.LIST, include_empty_str=True ) ) - gloss_data = 
defaultdict(list) + gloss_data = Sense() gloss_start = 0 # process modifier, theme templates before gloss text # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Précisions_de_sens @@ -33,9 +34,9 @@ gloss_start = index + 1 tag = expanded_text.strip("() \n") if len(tag) > 0: - gloss_data["tags"].append(tag) + gloss_data.tags.append(tag) if "categories" in categories_data: - gloss_data["categories"].extend( + gloss_data.categories.extend( categories_data["categories"] ) @@ -54,7 +55,7 @@ and isinstance(gloss_nodes[index + 1], str) and gloss_nodes[index + 1].strip() == ")" ): - gloss_data["tags"].append(clean_node(wxr, None, node)) + gloss_data.tags.append(clean_node(wxr, None, node)) tag_indexes |= {index - 1, index, index + 1} continue @@ -64,12 +65,12 @@ if index not in tag_indexes ] gloss_text = clean_node(wxr, gloss_data, gloss_only_nodes) - gloss_data["glosses"] = parent_glosses + [gloss_text] - page_data[-1]["senses"].append(gloss_data) + gloss_data.glosses = parent_glosses + [gloss_text] + page_data[-1].senses.append(gloss_data) for nest_gloss_list in list_item_node.find_child(NodeKind.LIST): if nest_gloss_list.sarg.endswith("#"): extract_gloss( - wxr, page_data, nest_gloss_list, gloss_data["glosses"] + wxr, page_data, nest_gloss_list, gloss_data.glosses ) elif nest_gloss_list.sarg.endswith("*"): extract_examples(wxr, gloss_data, nest_gloss_list) @@ -77,7 +78,7 @@ def extract_examples( wxr: WiktextractContext, - gloss_data: Dict, + gloss_data: Sense, example_list_node: WikiNode, ) -> None: for example_node in example_list_node.find_child(NodeKind.LIST_ITEM): @@ -102,18 +103,17 @@ for node in example_node_children if node != source_template ] - example_data = {"type": "example"} - example_data["text"] = clean_node(wxr, None, example_nodes) + example_data = Example() + example_data.text = clean_node(wxr, None, example_nodes) if source_template is not None: - example_data["ref"] = clean_node( - wxr, None, source_template - ).strip("— ()") - example_data["type"] = "quotation" - gloss_data["examples"].append(example_data) + example_data.ref = clean_node(wxr, None, source_template).strip( + "— ()" + ) + gloss_data.examples.append(example_data) def process_exemple_template( - wxr: WiktextractContext, node: TemplateNode, gloss_data: Dict + wxr: WiktextractContext, node: TemplateNode, gloss_data: Sense ) -> None: # https://fr.wiktionary.org/wiki/Modèle:exemple # https://fr.wiktionary.org/wiki/Modèle:ja-exemple @@ -132,15 +132,11 @@ node.template_parameters.get(3, node.template_parameters.get("tr", "")), ) source = clean_node(wxr, None, node.template_parameters.get("source", "")) - example_data = {"type": "example"} - if len(text) > 0: - example_data["text"] = clean_node(wxr, None, text) - if len(translation) > 0: - example_data["translation"] = clean_node(wxr, None, translation) - if len(transcription) > 0: - example_data["roman"] = clean_node(wxr, None, transcription) - if len(source) > 0: - example_data["ref"] = clean_node(wxr, None, source) - example_data["type"] = "quotation" - if "text" in example_data: - gloss_data["examples"].append(example_data) + example_data = Example( + text=clean_node(wxr, None, text), + translation=clean_node(wxr, None, translation), + roman=clean_node(wxr, None, transcription), + ref=clean_node(wxr, None, source), + ) + if len(example_data.text) > 0: + gloss_data.examples.append(example_data) diff --git
a/src/wiktextract/extractor/fr/inflection.py b/src/wiktextract/extractor/fr/inflection.py index a7817fc5e..f21d81f75 100644 --- a/src/wiktextract/extractor/fr/inflection.py +++ b/src/wiktextract/extractor/fr/inflection.py @@ -1,19 +1,17 @@ -from collections import defaultdict -from copy import deepcopy from dataclasses import dataclass -from typing import Dict, List from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import TemplateNode from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext -from .pronunciation import insert_ipa, is_ipa_text +from .models import Form, WordEntry +from .pronunciation import is_ipa_text def extract_inflection( wxr: WiktextractContext, - page_data: List[Dict], + page_data: list[WordEntry], template_node: TemplateNode, ) -> None: # inflection templates @@ -47,7 +45,7 @@ class ColspanHeader: def process_inflection_table( wxr: WiktextractContext, - page_data: List[Dict], + page_data: list[WordEntry], node: WikiNode, ) -> None: expanded_node = wxr.wtp.parse( @@ -92,7 +90,7 @@ column_cell_index = 0 for column_num, table_cell in enumerate(table_row_nodes): - form_data = defaultdict(list) + form_data = Form() if isinstance(table_cell, WikiNode): if table_cell.kind == NodeKind.TABLE_HEADER_CELL: if any( @@ -140,38 +138,57 @@ if is_ipa_text(table_cell_line): insert_ipa(form_data, table_cell_line) elif ( - table_cell_line != page_data[-1].get("word") + table_cell_line != page_data[-1].word and table_cell_line not in IGNORE_TABLE_CELL ): - if "form" not in form_data: - form_data["form"] = table_cell_line + if "form" not in form_data.model_fields_set: + form_data.form = table_cell_line else: - form_data["form"] += " " + table_cell_line + form_data.form += " " + table_cell_line for colspan_header in colspan_headers: if ( column_cell_index >= colspan_header.index and column_cell_index < colspan_header.index + colspan_header.span ): - form_data["tags"].append(colspan_header.text) + form_data.tags.append(colspan_header.text) if ( "colspan" not in table_cell.attrs and len(column_headers) > column_cell_index and column_headers[column_cell_index].lower() not in IGNORE_TABLE_HEADERS ): - form_data["tags"].append( - column_headers[column_cell_index] - ) + form_data.tags.append(column_headers[column_cell_index]) if len(row_headers) > 0: - form_data["tags"].extend(row_headers) - if "form" in form_data: - for form in form_data["form"].split(" ou "): - new_form_data = deepcopy(form_data) - new_form_data["form"] = form - page_data[-1]["forms"].append(new_form_data) + form_data.tags.extend(row_headers) + if "form" in form_data.model_fields_set: + for form in form_data.form.split(" ou "): + new_form_data = form_data.model_copy(deep=True) + new_form_data.form = form + page_data[-1].forms.append(new_form_data) colspan_text = table_cell.attrs.get("colspan", "1") if colspan_text.isdigit(): column_cell_index += int(colspan_text) + + +def split_ipa(text: str) -> list[str]: + # break IPA text if it contains "ou" (or) + if " ou " in text: + # two ipa texts in the same line: "en-conj-rég" template + return text.split(" ou ") + if text.startswith("ou "): + return [text.removeprefix("ou ")] + if text.endswith(" Prononciation ?\\"): + # inflection table templates use an edit link when the ipa data is + # missing, and the link usually ends with " Prononciation ?"
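+ # in that case there is no usable IPA, so return an empty list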
+ return "" + return [text] + + +def insert_ipa(form: Form, ipa_text: str) -> None: + ipa_data = split_ipa(ipa_text) + if len(ipa_data) == 0: + return + form.ipas.extend(ipa_data) diff --git a/src/wiktextract/extractor/fr/linkage.py b/src/wiktextract/extractor/fr/linkage.py index c1b7a3fd6..81fdd981a 100644 --- a/src/wiktextract/extractor/fr/linkage.py +++ b/src/wiktextract/extractor/fr/linkage.py @@ -1,17 +1,15 @@ -from collections import defaultdict -from typing import Union - from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import TemplateNode from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext from ..share import split_tag_text +from .models import Linkage, WordEntry def extract_linkage( wxr: WiktextractContext, - page_data: list[dict], + page_data: list[WordEntry], level_node: WikiNode, section_type: str, ) -> None: @@ -28,7 +26,7 @@ def extract_linkage( def process_derives_autres_list( wxr: WiktextractContext, - page_data: list[dict], + page_data: list[WordEntry], level_node: WikiNode, ): # drrive to other languages list @@ -41,18 +39,14 @@ def process_derives_autres_list( lang_name = clean_node(wxr, None, template_node) elif template_node.template_name == "lien": word = clean_node(wxr, None, template_node) - page_data[-1]["derived"].append( - { - "lang_code": lang_code, - "lang_name": lang_name, - "word": word, - } + page_data[-1].derived.append( + Linkage(lang_code=lang_code, lang_name=lang_name, word=word) ) def process_linkage_list( wxr: WiktextractContext, - page_data: list[dict], + page_data: list[WordEntry], level_node: WikiNode, linkage_type: str, ) -> None: @@ -76,20 +70,20 @@ def process_linkage_list( sense_index = int(sense_index_text) continue - linkage_data = defaultdict(list) + linkage_data = Linkage() if len(sense_text) > 0: - linkage_data["sense"] = sense_text + linkage_data.sense = sense_text if sense_index != 0: - linkage_data["sense_index"] = sense_index + linkage_data.sense_index = sense_index pending_tag = "" for index, child_node in enumerate( # remove nested lists template_or_list_node.invert_find_child(NodeKind.LIST) ): - if index == 0 or "word" not in linkage_data: + if index == 0 or "word" not in linkage_data.model_fields_set: if isinstance(child_node, TemplateNode): process_linkage_template(wxr, child_node, linkage_data) else: - linkage_data["word"] = clean_node(wxr, None, child_node) + linkage_data.word = clean_node(wxr, None, child_node) else: tag_text = ( child_node @@ -108,8 +102,9 @@ def process_linkage_list( pending_tag = "" elif tag_text.strip() in {",", "/"}: # list item has more than one word - page_data[-1][linkage_type].append(linkage_data) - linkage_data = defaultdict(list) + pre_data = getattr(page_data[-1], linkage_type) + pre_data.append(linkage_data) + linkage_data = Linkage() continue elif len(pending_tag) > 0: pending_tag += tag_text @@ -117,18 +112,19 @@ def process_linkage_list( for tag in split_tag_text(tag_text): if tag.startswith("— "): - linkage_data["translation"] = tag.removeprefix("— ") + linkage_data.translation = tag.removeprefix("— ") elif len(tag) > 0: - linkage_data["tags"].append(tag) + linkage_data.tags.append(tag) - if "word" in linkage_data: - page_data[-1][linkage_type].append(linkage_data) + if "word" in linkage_data.model_fields_set: + pre_data = getattr(page_data[-1], linkage_type) + pre_data.append(linkage_data) def process_linkage_template( wxr: WiktextractContext, node: TemplateNode, - linkage_data: dict[str, Union[str, list[str]]], + 
linkage_data: Linkage, ) -> None: if node.template_name == "lien": process_lien_template(wxr, node, linkage_data) @@ -139,7 +135,7 @@ def process_lien_template( wxr: WiktextractContext, node: TemplateNode, - linkage_data: dict[str, Union[str, list[str]]], + linkage_data: Linkage, ) -> None: # link word template: https://fr.wiktionary.org/wiki/Modèle:lien word = clean_node( wxr, None, node.template_parameters.get("dif", node.template_parameters.get(1)), ) - linkage_data["word"] = word + linkage_data.word = word if "tr" in node.template_parameters: - linkage_data["roman"] = clean_node( + linkage_data.roman = clean_node( wxr, None, node.template_parameters.get("tr") ) if "sens" in node.template_parameters: - linkage_data["translation"] = clean_node( + linkage_data.translation = clean_node( wxr, None, node.template_parameters.get("sens") ) @@ -161,17 +157,15 @@ def process_zh_lien_template( wxr: WiktextractContext, node: TemplateNode, - linkage_data: dict[str, Union[str, list[str]]], + linkage_data: Linkage, ) -> None: # https://fr.wiktionary.org/wiki/Modèle:zh-lien - linkage_data["word"] = clean_node( - wxr, None, node.template_parameters.get(1) - ) - linkage_data["roman"] = clean_node( + linkage_data.word = clean_node(wxr, None, node.template_parameters.get(1)) + linkage_data.roman = clean_node( wxr, None, node.template_parameters.get(2) ) # pinyin traditional_form = clean_node( wxr, None, node.template_parameters.get(3, "") ) if len(traditional_form) > 0: - linkage_data["alt"] = traditional_form + linkage_data.alt = traditional_form diff --git a/src/wiktextract/extractor/fr/models.py b/src/wiktextract/extractor/fr/models.py new file mode 100644 index 000000000..7a25d4b0d --- /dev/null +++ b/src/wiktextract/extractor/fr/models.py @@ -0,0 +1,109 @@ +from pydantic import BaseModel, ConfigDict, Field + + +class FrenchBaseModel(BaseModel): + model_config = ConfigDict( + extra="ignore", + strict=True, + validate_assignment=True, + validate_default=True, + ) + + +class Example(FrenchBaseModel): + text: str = Field("", description="Example usage sentence") + translation: str = Field( + "", description="French translation of the example sentence" + ) + roman: str = Field("", description="Romanization of the example sentence") + ref: str = Field( + "", + description="Source of the sentence, like book title and page number", + ) + + +class Form(FrenchBaseModel): + form: str = "" + tags: list[str] = [] + ipas: list[str] = [] + source: str = Field("", description="Form line template name") + + +class Sound(FrenchBaseModel): + zh_pron: str = Field("", description="Chinese word pronunciation") + ipa: str = Field("", description="International Phonetic Alphabet") + audio: str = Field("", description="Audio file name") + wav_url: str = "" + oga_url: str = "" + ogg_url: str = "" + mp3_url: str = "" + opus_url: str = "" + tags: list[str] = [] + + +class Translation(FrenchBaseModel): + lang_code: str = Field( + "", description="Wiktionary language code of the translation term" + ) + lang_name: str = Field("", description="Translation language name") + word: str = Field("", description="Translation term") + sense: str = Field("", description="Translation gloss") + tags: list[str] = [] + roman: str = "" + traditional_writing: str = Field( + "", description="Alternative writing for Chinese, Korean and Mongolian" + ) + + +class Linkage(FrenchBaseModel): + word: str = "" + tags: list[str] = [] + roman: str = "" +
alt: str = Field("", description="Alternative form") + translation: str = Field("", description="French translation") + sense: str = Field("", description="Definition of the word") + sense_index: int = Field( + 0, ge=0, description="Number of the definition, starting from 1" + ) + lang_name: str = Field("", description="Localized language name") + lang_code: str = Field("", description="Wiktionary language code") + + +class Sense(FrenchBaseModel): + glosses: list[str] = [] + tags: list[str] = [] + categories: list[str] = [] + examples: list[Example] = [] + + +class WordEntry(FrenchBaseModel): + model_config = ConfigDict(title="French Wiktionary") + + word: str = Field(description="Word string") + lang_code: str = Field(description="Wiktionary language code") + lang_name: str = Field(description="Localized language name") + pos: str = Field("", description="Part of speech type") + pos_title: str = Field( + "", description="Original POS title for matching etymology texts" + ) + etymology_texts: list[str] = Field([], description="Etymology list") + senses: list[Sense] = Field([], description="Sense list") + forms: list[Form] = Field([], description="Inflection forms list") + sounds: list[Sound] = [] + translations: list[Translation] = [] + synonyms: list[Linkage] = [] + hyponyms: list[Linkage] = [] + hypernyms: list[Linkage] = [] + holonyms: list[Linkage] = [] + meronyms: list[Linkage] = [] + derived: list[Linkage] = [] + troponyms: list[Linkage] = [] + paronyms: list[Linkage] = [] + related: list[Linkage] = [] + abbreviation: list[Linkage] = [] + proverbs: list[Linkage] = [] + title: str = Field("", description="Redirect page source title") + redirect: str = Field("", description="Redirect page target title") + categories: list[str] = [] + notes: list[str] = [] + tags: list[str] = [] diff --git a/src/wiktextract/extractor/fr/note.py b/src/wiktextract/extractor/fr/note.py index b6c559712..e2b4a1633 100644 --- a/src/wiktextract/extractor/fr/note.py +++ b/src/wiktextract/extractor/fr/note.py @@ -1,14 +1,14 @@ -from typing import Any, Dict, List - from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import TemplateNode from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext +from .models import WordEntry + def extract_note( wxr: WiktextractContext, - page_data: List[Dict[str, Any]], + page_data: list[WordEntry], level_node: WikiNode, ) -> None: # Save paragraph and list item texts to a list of strings.
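+ # list items become separate notes; paragraph nodes are buffered until a newline-terminated string ends the paragraph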
@@ -25,20 +25,20 @@ wxr, page_data[-1], list_item_node.children ) if len(note_text) > 0: - page_data[-1]["notes"].append(note_text) + page_data[-1].notes.append(note_text) continue note_paragraph_nodes.append(child) if isinstance(child, str) and child.endswith("\n"): note_text = clean_node(wxr, page_data[-1], note_paragraph_nodes) if len(note_text) > 0: - page_data[-1]["notes"].append(note_text) + page_data[-1].notes.append(note_text) note_paragraph_nodes.clear() def process_note_template( wxr: WiktextractContext, - page_data: List[Dict[str, Any]], + page_data: list[WordEntry], template_node: TemplateNode, ) -> None: expaned_template = wxr.wtp.parse( diff --git a/src/wiktextract/extractor/fr/page.py b/src/wiktextract/extractor/fr/page.py index 9c22d47e2..27704a00d 100644 --- a/src/wiktextract/extractor/fr/page.py +++ b/src/wiktextract/extractor/fr/page.py @@ -1,7 +1,5 @@ -import copy import logging -from collections import defaultdict -from typing import Dict, List, Optional +from typing import Any, Optional from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import TemplateNode @@ -13,6 +11,7 @@ from .gloss import extract_gloss, process_exemple_template from .inflection import extract_inflection from .linkage import extract_linkage +from .models import WordEntry from .note import extract_note from .pronunciation import extract_pronunciation from .translation import extract_translation @@ -32,10 +31,10 @@ def parse_section( wxr: WiktextractContext, - page_data: List[Dict], - base_data: Dict, + page_data: list[WordEntry], + base_data: WordEntry, level_node: WikiNode, -) -> Optional[List[EtymologyData]]: +) -> Optional[list[EtymologyData]]: # Page structure: https://fr.wiktionary.org/wiki/Wiktionnaire:Structure_des_pages for level_node_template in level_node.find_content(NodeKind.TEMPLATE): if level_node_template.template_name == "S": @@ -78,14 +77,14 @@ and section_type in wxr.config.LINKAGE_SUBTITLES ): if len(page_data) == 0: - page_data.append(copy.deepcopy(base_data)) + page_data.append(base_data.model_copy(deep=True)) extract_linkage( wxr, page_data, level_node, section_type, ) - if page_data[-1].keys() == base_data.keys(): + if page_data[-1] == base_data: page_data.pop() # no data was added elif ( wxr.config.capture_translations @@ -100,41 +99,41 @@ pass elif section_type in wxr.config.OTHER_SUBTITLES["notes"]: if len(page_data) == 0: - page_data.append(copy.deepcopy(base_data)) + page_data.append(base_data.model_copy(deep=True)) extract_note(wxr, page_data, level_node) - if page_data[-1].keys() == base_data.keys(): + if page_data[-1] == base_data: page_data.pop() # no data was added def process_pos_block( wxr: WiktextractContext, - page_data: List[Dict], - base_data: Dict, + page_data: list[WordEntry], + base_data: WordEntry, pos_title_node: TemplateNode, pos_argument: str, pos_title: str, ): pos_type = wxr.config.POS_SUBTITLES[pos_argument]["pos"] - if len(page_data) == 0 or "pos" in page_data[-1]: - page_data.append(copy.deepcopy(base_data)) - page_data[-1]["pos"] = pos_type - page_data[-1]["pos_title"] = pos_title + if len(page_data) == 0 or "pos" in page_data[-1].model_fields_set: + page_data.append(base_data.model_copy(deep=True)) + page_data[-1].pos = pos_type + page_data[-1].pos_title = pos_title child_nodes = list(pos_title_node.filter_empty_str_child()) form_line_start = 0 # Ligne de forme gloss_start = len(child_nodes) - lang_code = page_data[-1].get("lang_code") + lang_code =
page_data[-1].lang_code for index, child in enumerate(child_nodes): if isinstance(child, WikiNode): if child.kind == NodeKind.TEMPLATE: template_name = child.template_name if ( template_name.endswith("-exemple") - and len(page_data[-1].get("senses", [])) > 0 + and len(page_data[-1].senses) > 0 ): # zh-exemple and ja-exemple expand to list thus are not the # child of gloss list item. process_exemple_template( - wxr, child, page_data[-1]["senses"][-1] + wxr, child, page_data[-1].senses[-1] ) elif template_name.startswith(("zh-mot", "ja-mot")): # skip form line templates @@ -155,7 +154,7 @@ def process_pos_block( def parse_page( wxr: WiktextractContext, page_title: str, page_text: str -) -> List[Dict[str, str]]: +) -> list[dict[str, Any]]: if wxr.config.verbose: logging.info(f"Parsing page: {page_title}") @@ -170,23 +169,27 @@ def parse_page( additional_expand=ADDITIONAL_EXPAND_TEMPLATES, ) - page_data = [] + page_data: list[WordEntry] = [] for level2_node in tree.find_child(NodeKind.LEVEL2): for subtitle_template in level2_node.find_content(NodeKind.TEMPLATE): # https://fr.wiktionary.org/wiki/Modèle:langue # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_langues if subtitle_template.template_name == "langue": - base_data = defaultdict(list, {"word": wxr.wtp.title}) + categories = {} lang_code = subtitle_template.template_parameters.get(1) if ( wxr.config.capture_language_codes is not None and lang_code not in wxr.config.capture_language_codes ): continue - lang_name = clean_node(wxr, base_data, subtitle_template) + lang_name = clean_node(wxr, categories, subtitle_template) wxr.wtp.start_section(lang_name) - base_data["lang_name"] = lang_name - base_data["lang_code"] = lang_code + base_data = WordEntry( + word=wxr.wtp.title, + lang_code=lang_code, + lang_name=lang_name, + categories=categories.get("categories", []), + ) etymology_data: Optional[EtymologyData] = None for level3_node in level2_node.find_child(NodeKind.LEVEL3): new_etymology_data = parse_section( @@ -198,4 +201,4 @@ def parse_page( if etymology_data is not None: insert_etymology_data(lang_code, page_data, etymology_data) - return page_data + return [m.model_dump(exclude_defaults=True) for m in page_data] diff --git a/src/wiktextract/extractor/fr/pronunciation.py b/src/wiktextract/extractor/fr/pronunciation.py index 319ef5aed..61e934d28 100644 --- a/src/wiktextract/extractor/fr/pronunciation.py +++ b/src/wiktextract/extractor/fr/pronunciation.py @@ -1,45 +1,40 @@ -from collections import defaultdict -from copy import deepcopy -from typing import Dict, List, Union - from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import TemplateNode from wiktextract.extractor.share import create_audio_url_dict from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext +from .models import Sound, WordEntry + def extract_pronunciation( wxr: WiktextractContext, - page_data: List[Dict], + page_data: list[WordEntry], level_node: WikiNode, - base_data: Dict[str, str], + base_data: WordEntry, ) -> None: sound_data = [] - lang_code = base_data.get("lang_code") + lang_code = base_data.lang_code for list_node in level_node.find_child(NodeKind.LIST): for list_item_node in list_node.find_child(NodeKind.LIST_ITEM): sound_data.extend( - process_pron_list_item( - wxr, list_item_node, defaultdict(list), lang_code - ) + process_pron_list_item(wxr, list_item_node, Sound(), lang_code) ) if len(sound_data) == 0: return if len(page_data) == 0: - page_data.append(deepcopy(base_data)) + 
page_data.append(base_data.model_copy(deep=True)) if level_node.kind == NodeKind.LEVEL3: # Add extracted sound data to all sense dictionaries that have the same # language code when the prononciation subtitle is a level 3 title node. # Otherwise only add to the last one. - lang_code = page_data[-1].get("lang_code") for sense_data in page_data: - if sense_data.get("lang_code") == lang_code: - sense_data["sounds"].extend(sound_data) + if sense_data.lang_code == lang_code: + sense_data.sounds.extend(sound_data) else: - page_data[-1]["sounds"].extend(sound_data) + page_data[-1].sounds.extend(sound_data) PRON_TEMPLATES = frozenset( @@ -59,56 +54,57 @@ def extract_pronunciation( def process_pron_list_item( wxr: WiktextractContext, list_item_node: WikiNode, - sound_data: Dict[str, Union[str, List[str]]], + sound_data: Sound, lang_code: str, -) -> List[Dict[str, Union[str, List[str]]]]: - pron_key = "zh-pron" if lang_code == "zh" else "ipa" +) -> list[Sound]: + pron_key = "zh_pron" if lang_code == "zh" else "ipa" for template_node in list_item_node.find_child(NodeKind.TEMPLATE): if template_node.template_name in PRON_TEMPLATES: pron_text = process_pron_template(wxr, template_node) if len(pron_text) > 0: - sound_data[pron_key] = pron_text + setattr(sound_data, pron_key, pron_text) elif template_node.template_name in {"écouter", "audio", "pron-rég"}: process_ecouter_template(wxr, template_node, sound_data) else: sound_tag = clean_node(wxr, None, template_node) if sound_tag.startswith("(") and sound_tag.endswith(")"): sound_tag = sound_tag.strip("()") - sound_data["tags"].append(sound_tag) + sound_data.tags.append(sound_tag) if list_item_node.contain_node(NodeKind.LIST): returned_data = [] for bold_node in list_item_node.find_child(NodeKind.BOLD): - sound_data["tags"].append(clean_node(wxr, None, bold_node)) + sound_data.tags.append(clean_node(wxr, None, bold_node)) for nest_list_item in list_item_node.find_child_recursively( NodeKind.LIST_ITEM ): - new_sound_data = deepcopy(sound_data) + new_sound_data = sound_data.model_copy(deep=True) process_pron_list_item( wxr, nest_list_item, new_sound_data, lang_code ) - if pron_key in new_sound_data: + if pron_key in new_sound_data.model_fields_set: returned_data.append(new_sound_data) return returned_data - elif len(sound_data) > 0: - if pron_key not in sound_data: + elif len(sound_data.model_dump(exclude_defaults=True)) > 0: + if pron_key not in sound_data.model_fields_set: for child in list_item_node.filter_empty_str_child(): if isinstance(child, str): if child.strip().startswith(": "): # IPA text after "language : " - sound_data[pron_key] = ( - child.strip().removeprefix(": ").strip() + setattr( + sound_data, + pron_key, + child.strip().removeprefix(": ").strip(), ) elif len(child.strip()) > 0 and child.strip() != ":": # language text before ":" - sound_data["tags"].append(child.strip()) + sound_data.tags.append(child.strip()) - if pron_key in sound_data or "audio" in sound_data: + if len({pron_key, "audio"} & sound_data.model_fields_set) > 0: return [sound_data] - return [] @@ -129,7 +125,7 @@ def process_pron_template( def process_ecouter_template( wxr: WiktextractContext, template_node: TemplateNode, - sound_data: Dict[str, Union[str, List[str]]], + sound_data: Sound, ) -> None: # sound file template: https://fr.wiktionary.org/wiki/Modèle:écouter location = clean_node( @@ -148,11 +144,19 @@ def process_ecouter_template( wxr, None, template_node.template_parameters.get("audio", "") ) if len(location) > 0: - sound_data["tags"].append(location) + 
sound_data.tags.append(location) if len(ipa) > 0: - sound_data["ipa"] = ipa + sound_data.ipa = ipa if len(audio_file) > 0: - sound_data.update(create_audio_url_dict(audio_file)) + audio_data = create_audio_url_dict(audio_file) + for key, value in audio_data.items(): + if key in sound_data.model_fields: + setattr(sound_data, key, value) + else: + wxr.wtp.debug( + f"{key=} not defined in Sound", + sortid="fr.pronunciation/156", + ) def is_ipa_text(text: str) -> bool: @@ -164,43 +168,3 @@ def is_ipa_text(text: str) -> bool: # ipa text in a new line return True return False - - -def split_ipa(text: str) -> Union[List[str], str]: - # break IPA text if it contains "ou"(or) - if " ou " in text: - # two ipa texts in the same line: "en-conj-rég" template - return text.split(" ou ") - if text.startswith("ou "): - return text.removeprefix("ou ") - if text.endswith(" Prononciation ?\\"): - # inflection table templates use a edit link when the ipa data is - # missing, and the link usually ends with " Prononciation ?" - return "" - return text - - -def insert_ipa( - target_dict: Dict[str, Union[str, List[str]]], ipa_text: str -) -> None: - # insert IPA text to a dictionary, and merge values of the key "ipa" and - # "ipas", `target_dict` is created by `defaultdict(list)`. - ipa_data = split_ipa(ipa_text) - if len(ipa_data) == 0: - return - - if isinstance(ipa_data, str): - if "ipas" in target_dict: - target_dict["ipas"].append(ipa_data) - elif "ipa" in target_dict: - target_dict["ipas"].append(target_dict["ipa"]) - target_dict["ipas"].append(ipa_data) - del target_dict["ipa"] - else: - target_dict["ipa"] = ipa_data - elif isinstance(ipa_data, list): - if "ipa" in target_dict: - target_dict["ipas"].append(target_dict["ipa"]) - del target_dict["ipa"] - - target_dict["ipas"].extend(ipa_data) diff --git a/src/wiktextract/extractor/fr/translation.py b/src/wiktextract/extractor/fr/translation.py index c5dd112f9..484b8fa07 100644 --- a/src/wiktextract/extractor/fr/translation.py +++ b/src/wiktextract/extractor/fr/translation.py @@ -1,16 +1,17 @@ -from collections import defaultdict -from typing import Dict, List, Union +from typing import Optional from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import TemplateNode from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext +from .models import Translation, WordEntry + def extract_translation( - wxr: WiktextractContext, page_data: List[Dict], level_node: WikiNode + wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode ) -> None: - base_translation_data = defaultdict(list) + base_translation_data = Translation() for level_node_child in level_node.filter_empty_str_child(): if isinstance(level_node_child, WikiNode): if level_node_child.kind == NodeKind.TEMPLATE: @@ -42,8 +43,8 @@ def extract_translation( def process_italic_node( wxr: WiktextractContext, italic_node: WikiNode, - previous_node: Union[WikiNode, None], - page_data: List[Dict], + previous_node: Optional[WikiNode], + page_data: list[WordEntry], ) -> None: # add italic text after a "trad" template as a tag tag = clean_node(wxr, None, italic_node) @@ -53,16 +54,16 @@ def process_italic_node( and previous_node is not None and previous_node.kind == NodeKind.TEMPLATE and previous_node.template_name.startswith("trad") - and len(page_data[-1].get("translations", [])) > 0 + and len(page_data[-1].translations) > 0 ): - page_data[-1]["translations"][-1]["tags"].append(tag.strip("()")) + 
page_data[-1].translations[-1].tags.append(tag.strip("()")) def process_translation_templates( wxr: WiktextractContext, template_node: TemplateNode, - page_data: List[Dict], - base_translation_data: Dict[str, str], + page_data: list[WordEntry], + base_translation_data: Translation, ) -> None: if template_node.template_name == "trad-fin": # ignore translation end template @@ -73,13 +74,13 @@ def process_translation_templates( if sense_parameter is not None: sense_text = clean_node(wxr, None, sense_parameter) if len(sense_text) > 0: - base_translation_data["sense"] = sense_text + base_translation_data.sense = sense_text elif template_node.template_name == "T": # Translation language: https://fr.wiktionary.org/wiki/Modèle:T - base_translation_data[ - "lang_code" - ] = template_node.template_parameters.get(1) - base_translation_data["lang_name"] = clean_node( + base_translation_data.lang_code = template_node.template_parameters.get( + 1 + ) + base_translation_data.lang_name = clean_node( wxr, page_data[-1], template_node ) elif template_node.template_name.startswith("trad"): @@ -104,22 +105,22 @@ def process_translation_templates( translation_traditional_writing = clean_node( wxr, None, template_node.template_parameters.get("tradi", "") ) - translation_data = base_translation_data.copy() - translation_data["word"] = translation_term + translation_data = base_translation_data.model_copy(deep=True) + translation_data.word = translation_term if len(translation_roman) > 0: - translation_data["roman"] = translation_roman + translation_data.roman = translation_roman if len(translation_traditional_writing) > 0: - translation_data[ - "traditional_writing" - ] = translation_traditional_writing + translation_data.traditional_writing = ( + translation_traditional_writing + ) if 3 in template_node.template_parameters: expaned_node = wxr.wtp.parse( wxr.wtp.node_to_wikitext(template_node), expand_all=True ) for gender_node in expaned_node.find_child(NodeKind.ITALIC): - translation_data["tags"] = [clean_node(wxr, None, gender_node)] + translation_data.tags = [clean_node(wxr, None, gender_node)] break - page_data[-1]["translations"].append(translation_data) - elif len(page_data[-1].get("translations", [])) > 0: + page_data[-1].translations.append(translation_data) + elif len(page_data[-1].translations) > 0: tag = clean_node(wxr, None, template_node).strip("()") - page_data[-1]["translations"][-1]["tags"].append(tag) + page_data[-1].translations[-1].tags.append(tag) diff --git a/tests/test_fr_etymology.py b/tests/test_fr_etymology.py index 3b49b1ce4..a31e8b7b5 100644 --- a/tests/test_fr_etymology.py +++ b/tests/test_fr_etymology.py @@ -1,5 +1,4 @@ -import unittest -from collections import defaultdict +from unittest import TestCase from wikitextprocessor import Wtp from wiktextract.config import WiktionaryConfig @@ -7,10 +6,11 @@ extract_etymology, insert_etymology_data, ) +from wiktextract.extractor.fr.models import WordEntry from wiktextract.wxr_context import WiktextractContext -class TestEtymology(unittest.TestCase): +class TestEtymology(TestCase): def setUp(self) -> None: self.wxr = WiktextractContext( Wtp(lang_code="fr"), WiktionaryConfig(dump_file_lang_code="fr") @@ -51,21 +51,29 @@ def test_list_etymologies(self): }, ) page_data = [ - defaultdict( - list, - {"lang_code": "fr", "pos": "noun", "pos_title": "Nom commun 1"}, + WordEntry( + word="test", + lang_code="fr", + lang_name="Français", + pos="noun", + pos_title="Nom commun 1", ), - defaultdict( - list, - {"lang_code": "fr", "pos": "noun", 
"pos_title": "Nom commun 2"}, + WordEntry( + word="test", + lang_code="fr", + lang_name="Français", + pos="noun", + pos_title="Nom commun 2", ), ] insert_etymology_data("fr", page_data, etymology_data) self.assertEqual( - page_data, + [d.model_dump(exclude_defaults=True) for d in page_data], [ { + "word": "test", "lang_code": "fr", + "lang_name": "Français", "pos": "noun", "pos_title": "Nom commun 1", "etymology_texts": [ @@ -74,7 +82,9 @@ def test_list_etymologies(self): ], }, { + "word": "test", "lang_code": "fr", + "lang_name": "Français", "pos": "noun", "pos_title": "Nom commun 2", "etymology_texts": [ @@ -106,25 +116,36 @@ def test_indent_etymology_with_pos_template(self): }, ) page_data = [ - defaultdict( - list, - {"lang_code": "fr", "pos": "noun", "pos_title": "Nom commun 1"}, + WordEntry( + word="test", + lang_code="fr", + lang_name="Français", + pos="noun", + pos_title="Nom commun 1", ), - defaultdict( - list, - {"lang_code": "fr", "pos": "noun", "pos_title": "Nom commun 2"}, + WordEntry( + word="test", + lang_code="fr", + lang_name="Français", + pos="noun", + pos_title="Nom commun 2", ), - defaultdict( - list, - {"lang_code": "fr", "pos": "intj", "pos_title": "Interjection"}, + WordEntry( + word="test", + lang_code="fr", + lang_name="Français", + pos="intj", + pos_title="Interjection", ), ] insert_etymology_data("fr", page_data, etymology_data) self.assertEqual( - page_data, + [d.model_dump(exclude_defaults=True) for d in page_data], [ { + "word": "test", "lang_code": "fr", + "lang_name": "Français", "pos": "noun", "pos_title": "Nom commun 1", "etymology_texts": [ @@ -132,7 +153,9 @@ def test_indent_etymology_with_pos_template(self): ], }, { + "word": "test", "lang_code": "fr", + "lang_name": "Français", "pos": "noun", "pos_title": "Nom commun 2", "etymology_texts": [ @@ -140,7 +163,9 @@ def test_indent_etymology_with_pos_template(self): ], }, { + "word": "test", "lang_code": "fr", + "lang_name": "Français", "pos": "intj", "pos_title": "Interjection", "etymology_texts": [ diff --git a/tests/test_fr_form_line.py b/tests/test_fr_form_line.py index 5f2c59b75..0f702e4f8 100644 --- a/tests/test_fr_form_line.py +++ b/tests/test_fr_form_line.py @@ -1,5 +1,4 @@ -import unittest -from collections import defaultdict +from unittest import TestCase from unittest.mock import patch from wikitextprocessor import Wtp @@ -8,10 +7,11 @@ extract_form_line, process_zh_mot_template, ) +from wiktextract.extractor.fr.models import WordEntry from wiktextract.wxr_context import WiktextractContext -class TestFormLine(unittest.TestCase): +class TestFormLine(TestCase): def setUp(self) -> None: self.wxr = WiktextractContext( Wtp(lang_code="fr"), WiktionaryConfig(dump_file_lang_code="fr") @@ -20,50 +20,50 @@ def setUp(self) -> None: def tearDown(self) -> None: self.wxr.wtp.close_db_conn() - @patch( - "wiktextract.extractor.fr.pronunciation.clean_node", - return_value="/lə nɔ̃/", - ) - def test_ipa(self, mock_clean_node): - self.wxr.wtp.start_page("") - root = self.wxr.wtp.parse("'''le nom''' {{pron|lə nɔ̃|fr}}") - page_data = [defaultdict(list)] + def test_ipa(self): + self.wxr.wtp.start_page("bonjour") + self.wxr.wtp.add_page("Modèle:pron", 10, "\\bɔ̃.ʒuʁ\\") + root = self.wxr.wtp.parse("'''bonjour''' {{pron|bɔ̃.ʒuʁ|fr}}") + page_data = [ + WordEntry(word="bonjour", lang_code="fr", lang_name="Français") + ] extract_form_line(self.wxr, page_data, root.children) - self.assertEqual(page_data, [{"sounds": [{"ipa": "/lə nɔ̃/"}]}]) + self.assertEqual( + [d.model_dump(exclude_defaults=True) for d in 
            page_data[-1].sounds],
+            [{"ipa": "\\bɔ̃.ʒuʁ\\"}],
+        )

-    @patch(
-        "wiktextract.extractor.fr.form_line.clean_node", return_value="masculin"
-    )
-    def test_gender(self, mock_clean_node):
-        self.wxr.wtp.start_page("")
-        root = self.wxr.wtp.parse("'''le nom''' {{m}}")
-        page_data = [defaultdict(list)]
+    def test_gender(self):
+        self.wxr.wtp.start_page("bonjour")
+        self.wxr.wtp.add_page("Modèle:m", 10, "masculin")
+        root = self.wxr.wtp.parse("'''bonjour''' {{m}}")
+        page_data = [
+            WordEntry(word="bonjour", lang_code="fr", lang_name="Français")
+        ]
         extract_form_line(self.wxr, page_data, root.children)
-        self.assertEqual(page_data, [{"tags": ["masculin"]}])
+        self.assertEqual(page_data[-1].tags, ["masculin"])

     def test_zh_mot(self):
-        self.wxr.wtp.start_page("")
+        self.wxr.wtp.start_page("马")
         self.wxr.wtp.add_page("Modèle:zh-mot", 10, body="{{lang}} {{pron}}")
         self.wxr.wtp.add_page("Modèle:lang", 10, body="mǎ")
         self.wxr.wtp.add_page("Modèle:pron", 10, body="\\ma̠˨˩˦\\")
         root = self.wxr.wtp.parse("{{zh-mot|马|mǎ}}")
-        page_data = [defaultdict(list)]
+        page_data = [
+            WordEntry(word="test", lang_code="fr", lang_name="Français")
+        ]
         process_zh_mot_template(self.wxr, root.children[0], page_data)
         self.assertEqual(
-            page_data,
+            [d.model_dump(exclude_defaults=True) for d in page_data[-1].sounds],
             [
-                {
-                    "sounds": [
-                        {"tags": ["Pinyin"], "zh-pron": "mǎ"},
-                        {"ipa": "\\ma̠˨˩˦\\"},
-                    ]
-                }
+                {"tags": ["Pinyin"], "zh_pron": "mǎ"},
+                {"ipa": "\\ma̠˨˩˦\\"},
             ],
         )

     def test_ipa_location_tag(self):
         # https://fr.wiktionary.org/wiki/basket-ball
-        self.wxr.wtp.start_page("")
+        self.wxr.wtp.start_page("basket-ball")
         self.wxr.wtp.add_page("Modèle:pron", 10, body="{{{1}}}")
         self.wxr.wtp.add_page("Modèle:FR", 10, body="(France)")
         self.wxr.wtp.add_page("Modèle:CA", 10, body="(Canada)")
@@ -71,35 +71,40 @@ def test_ipa_location_tag(self):
         root = self.wxr.wtp.parse(
             "{{pron|bas.kɛt.bol|fr}} {{FR|nocat=1}} ''ou'' {{pron|bas.kɛt.bɔl|fr}} {{FR|nocat=1}} ''ou'' {{pron|bas.kɛt.bɑl|fr}} {{CA|nocat=1}} {{m}}"
         )
-        page_data = [defaultdict(list)]
+        page_data = [
+            WordEntry(word="basket-ball", lang_code="fr", lang_name="Français")
+        ]
         extract_form_line(self.wxr, page_data, root.children)
         self.assertEqual(
-            page_data,
-            [
-                {
-                    "tags": ["masculin"],
-                    "sounds": [
-                        {"ipa": "bas.kɛt.bol", "tags": ["France"]},
-                        {"ipa": "bas.kɛt.bɔl", "tags": ["France"]},
-                        {"ipa": "bas.kɛt.bɑl", "tags": ["Canada"]},
-                    ],
-                }
-            ],
+            page_data[-1].model_dump(exclude_defaults=True),
+            {
+                "word": "basket-ball",
+                "lang_code": "fr",
+                "lang_name": "Français",
+                "tags": ["masculin"],
+                "sounds": [
+                    {"ipa": "bas.kɛt.bol", "tags": ["France"]},
+                    {"ipa": "bas.kɛt.bɔl", "tags": ["France"]},
+                    {"ipa": "bas.kɛt.bɑl", "tags": ["Canada"]},
+                ],
+            },
         )

     def test_template_in_pron_argument(self):
-        # https://fr.wiktionary.org/wiki/minéral argileux
+        # https://fr.wiktionary.org/wiki/minéral_argileux
         self.wxr.wtp.start_page("")
         self.wxr.wtp.add_page("Modèle:pron", 10, body="{{{1}}}")
         self.wxr.wtp.add_page("Modèle:liaison", 10, body="‿")
         root = self.wxr.wtp.parse(
             "'''minéral argileux''' {{pron|mi.ne.ʁa.l{{liaison|fr}}aʁ.ʒi.lø|fr}}"
         )
-        page_data = [defaultdict(list)]
+        page_data = [
+            WordEntry(word="test", lang_code="fr", lang_name="Français")
+        ]
         extract_form_line(self.wxr, page_data, root.children)
         self.assertEqual(
-            page_data,
-            [{"sounds": [{"ipa": "mi.ne.ʁa.l‿aʁ.ʒi.lø"}]}],
+            page_data[-1].sounds[0].model_dump(exclude_defaults=True),
+            {"ipa": "mi.ne.ʁa.l‿aʁ.ʒi.lø"},
         )

     @patch(
@@ -112,40 +117,37 @@ def test_equiv_pour_template(self, mock_node_to_wikitext):
         root = self.wxr.wtp.parse(
             "{{équiv-pour|un homme|auteur|2egenre=une personne non-binaire|2egenre1=autaire|2egenre2=auteurice|2egenre3=auteur·ice|lang=fr}}"
         )
-        page_data = [defaultdict(list)]
+        page_data = [
+            WordEntry(word="autrice", lang_code="fr", lang_name="Français")
+        ]
         extract_form_line(self.wxr, page_data, root.children)
         self.assertEqual(
-            page_data,
-            [
-                {
-                    "forms": [
-                        {
-                            "form": "auteur",
-                            "tags": ["pour un homme, on dit"],
-                            "source": "form line template 'équiv-pour'",
-                        },
-                        {
-                            "form": "autaire",
-                            "tags": [
-                                "pour une personne non-binaire, on peut dire"
-                            ],
-                            "source": "form line template 'équiv-pour'",
-                        },
-                        {
-                            "form": "auteurice",
-                            "tags": [
-                                "pour une personne non-binaire, on peut dire"
-                            ],
-                            "source": "form line template 'équiv-pour'",
-                        },
-                        {
-                            "form": "auteur·ice",
-                            "tags": [
-                                "pour une personne non-binaire, on peut dire"
-                            ],
-                            "source": "form line template 'équiv-pour'",
-                        },
-                    ]
-                }
-            ],
+            page_data[-1].model_dump(exclude_defaults=True),
+            {
+                "word": "autrice",
+                "lang_code": "fr",
+                "lang_name": "Français",
+                "forms": [
+                    {
+                        "form": "auteur",
+                        "tags": ["pour un homme, on dit"],
+                        "source": "form line template 'équiv-pour'",
+                    },
+                    {
+                        "form": "autaire",
+                        "tags": ["pour une personne non-binaire, on peut dire"],
+                        "source": "form line template 'équiv-pour'",
+                    },
+                    {
+                        "form": "auteurice",
+                        "tags": ["pour une personne non-binaire, on peut dire"],
+                        "source": "form line template 'équiv-pour'",
+                    },
+                    {
+                        "form": "auteur·ice",
+                        "tags": ["pour une personne non-binaire, on peut dire"],
+                        "source": "form line template 'équiv-pour'",
+                    },
+                ],
+            },
         )
diff --git a/tests/test_fr_gloss.py b/tests/test_fr_gloss.py
index dbcfa7c96..573f20f0c 100644
--- a/tests/test_fr_gloss.py
+++ b/tests/test_fr_gloss.py
@@ -1,15 +1,15 @@
-import unittest
-from collections import defaultdict
+from unittest import TestCase
 from unittest.mock import patch

 from wikitextprocessor import Page, Wtp

 from wiktextract.config import WiktionaryConfig
 from wiktextract.extractor.fr.gloss import extract_gloss
+from wiktextract.extractor.fr.models import WordEntry
 from wiktextract.extractor.fr.page import process_pos_block
 from wiktextract.wxr_context import WiktextractContext


-class TestFrGloss(unittest.TestCase):
+class TestFrGloss(TestCase):
     def setUp(self) -> None:
         self.wxr = WiktextractContext(
             Wtp(lang_code="fr"), WiktionaryConfig(dump_file_lang_code="fr")
@@ -29,22 +29,18 @@ def tearDown(self) -> None:
     def test_theme_templates(self, mock_get_page):
         self.wxr.wtp.start_page("")
         root = self.wxr.wtp.parse("# {{sportifs|fr}} gloss.\n#* example")
-        page_data = [defaultdict(list)]
+        page_data = [
+            WordEntry(word="test", lang_code="fr", lang_name="Français")
+        ]
         extract_gloss(self.wxr, page_data, root.children[0])
         self.assertEqual(
-            page_data,
+            [d.model_dump(exclude_defaults=True) for d in page_data[-1].senses],
             [
                 {
-                    "senses": [
-                        {
-                            "glosses": ["gloss."],
-                            "tags": ["Sport"],
-                            "categories": ["Sportifs en français"],
-                            "examples": [
-                                {"text": "example", "type": "example"}
-                            ],
-                        }
-                    ]
+                    "glosses": ["gloss."],
+                    "tags": ["Sport"],
+                    "categories": ["Sportifs en français"],
+                    "examples": [{"text": "example"}],
                 }
             ],
         )
@@ -54,26 +50,23 @@ def test_example_template(self):
         root = self.wxr.wtp.parse(
             "# gloss.\n#* {{exemple|text|translation|roman|source=source}}"
         )
-        page_data = [defaultdict(list)]
+        page_data = [
+            WordEntry(word="test", lang_code="fr", lang_name="Français")
+        ]
         extract_gloss(self.wxr, page_data, root.children[0])
         self.assertEqual(
-            page_data,
+            [d.model_dump(exclude_defaults=True) for d in page_data[-1].senses],
             [
                 {
-                    "senses": [
+                    "glosses": ["gloss."],
+                    "examples": [
                         {
-                            "glosses": ["gloss."],
-                            "examples": [
-                                {
-                                    "text": "text",
-                                    "translation": "translation",
-                                    "roman": "roman",
-                                    "ref": "source",
-                                    "type": "quotation",
-                                }
-                            ],
+                            "text": "text",
+                            "translation": "translation",
+                            "roman": "roman",
+                            "ref": "source",
                         }
-                    ]
+                    ],
                 }
             ],
         )
@@ -87,31 +80,28 @@ def test_example_source_template(self, mock_node_to_html):
         root = self.wxr.wtp.parse(
             "# gloss.\n#* example {{source|source_title}}"
         )
-        page_data = [defaultdict(list)]
+        page_data = [
+            WordEntry(word="test", lang_code="fr", lang_name="Français")
+        ]
         extract_gloss(self.wxr, page_data, root.children[0])
         self.assertEqual(
-            page_data,
+            [d.model_dump(exclude_defaults=True) for d in page_data[-1].senses],
             [
                 {
-                    "senses": [
+                    "glosses": ["gloss."],
+                    "examples": [
                         {
-                            "glosses": ["gloss."],
-                            "examples": [
-                                {
-                                    "text": "example",
-                                    "ref": "source_title",
-                                    "type": "quotation",
-                                }
-                            ],
+                            "text": "example",
+                            "ref": "source_title",
                         }
-                    ]
+                    ],
                 }
             ],
         )

     def test_zh_exemple_template(self):
         # https://fr.wiktionary.org/wiki/马
-        self.wxr.wtp.start_page("")
+        self.wxr.wtp.start_page("马")
         root = self.wxr.wtp.parse(
             "=== {{S|nom|zh}} ===\n# Cheval.\n{{zh-exemple|这匹'''马'''很大。|Ce cheval est grand.|Zhè pǐ '''mǎ''' hěn dà.
⠌⠢⠆ ⠏⠊⠄ ⠍⠔⠄ ⠓⠴⠄ ⠙⠔⠆⠐⠆}}"
         )
@@ -119,32 +109,32 @@ def test_zh_exemple_template(self):
         process_pos_block(
             self.wxr,
             page_data,
-            defaultdict(list),
+            WordEntry(word="马", lang_code="zh", lang_name="Chinois"),
             root.children[0],
             "nom",
             "Nom commun",
         )
         self.assertEqual(
-            page_data,
-            [
-                {
-                    "pos": "noun",
-                    "pos_title": "Nom commun",
-                    "senses": [
-                        {
-                            "glosses": ["Cheval."],
-                            "examples": [
-                                {
-                                    "text": "这匹马很大。",
-                                    "translation": "Ce cheval est grand.",
-                                    "roman": "Zhè pǐ mǎ hěn dà.\n⠌⠢⠆ ⠏⠊⠄ ⠍⠔⠄ ⠓⠴⠄ ⠙⠔⠆⠐⠆",
-                                    "type": "example",
-                                }
-                            ],
-                        }
-                    ],
-                }
-            ],
+            page_data[-1].model_dump(exclude_defaults=True),
+            {
+                "word": "马",
+                "lang_code": "zh",
+                "lang_name": "Chinois",
+                "pos": "noun",
+                "pos_title": "Nom commun",
+                "senses": [
+                    {
+                        "glosses": ["Cheval."],
+                        "examples": [
+                            {
+                                "text": "这匹马很大。",
+                                "translation": "Ce cheval est grand.",
+                                "roman": "Zhè pǐ mǎ hěn dà.\n⠌⠢⠆ ⠏⠊⠄ ⠍⠔⠄ ⠓⠴⠄ ⠙⠔⠆⠐⠆",
+                            }
+                        ],
+                    }
+                ],
+            },
         )

     def test_variante_de(self):
@@ -161,18 +151,16 @@
         root = self.wxr.wtp.parse(
             "# {{désuet|en}} {{sports|en}} {{indénombrable|en}} {{variante de|basketball|en}}."
         )
-        page_data = [defaultdict(list)]
+        page_data = [
+            WordEntry(word="test", lang_code="fr", lang_name="Français")
+        ]
         extract_gloss(self.wxr, page_data, root.children[0])
         self.assertEqual(
-            page_data,
+            [d.model_dump(exclude_defaults=True) for d in page_data[-1].senses],
             [
                 {
-                    "senses": [
-                        {
-                            "glosses": ["Variante de basketball."],
-                            "tags": ["Désuet", "Sport", "Indénombrable"],
-                        }
-                    ]
+                    "glosses": ["Variante de basketball."],
+                    "tags": ["Désuet", "Sport", "Indénombrable"],
                 }
             ],
         )
@@ -183,17 +171,13 @@ def test_italic_tag(self):
         root = self.wxr.wtp.parse(
             "# (''localement'') [[bassin#Nom_commun|Bassin]], [[lavoir#Nom_commun|lavoir]]."
         )
-        page_data = [defaultdict(list)]
+        page_data = [
+            WordEntry(word="test", lang_code="fr", lang_name="Français")
+        ]
         extract_gloss(self.wxr, page_data, root.children[0])
         self.assertEqual(
-            page_data,
-            [
-                {
-                    "senses": [
-                        {"glosses": ["Bassin, lavoir."], "tags": ["localement"]}
-                    ]
-                }
-            ],
+            [d.model_dump(exclude_defaults=True) for d in page_data[-1].senses],
+            [{"glosses": ["Bassin, lavoir."], "tags": ["localement"]}],
         )

     def test_not_italic_tag(self):
@@ -202,18 +186,16 @@
         root = self.wxr.wtp.parse(
             "# [[oiseau|Oiseau]] aquatique de taille moyenne du genre ''[[Rhynchops]]''."
         )
-        page_data = [defaultdict(list)]
+        page_data = [
+            WordEntry(word="test", lang_code="fr", lang_name="Français")
+        ]
         extract_gloss(self.wxr, page_data, root.children[0])
         self.assertEqual(
-            page_data,
+            [d.model_dump(exclude_defaults=True) for d in page_data[-1].senses],
             [
                 {
-                    "senses": [
-                        {
-                            "glosses": [
-                                "Oiseau aquatique de taille moyenne du genre Rhynchops."
-                            ]
-                        }
+                    "glosses": [
+                        "Oiseau aquatique de taille moyenne du genre Rhynchops."
                     ]
                 }
             ],
@@ -224,11 +206,13 @@ def test_preserve_space_between_tags(self):
         # the space between italic node and the link node should be preserved
         self.wxr.wtp.start_page("becs-en-ciseaux")
         root = self.wxr.wtp.parse("# ''Pluriel de'' [[bec-en-ciseaux]].")
-        page_data = [defaultdict(list)]
+        page_data = [
+            WordEntry(word="test", lang_code="fr", lang_name="Français")
+        ]
         extract_gloss(self.wxr, page_data, root.children[0])
         self.assertEqual(
-            page_data,
-            [{"senses": [{"glosses": ["Pluriel de bec-en-ciseaux."]}]}],
+            [d.model_dump(exclude_defaults=True) for d in page_data[-1].senses],
+            [{"glosses": ["Pluriel de bec-en-ciseaux."]}],
         )

     @patch(
@@ -241,18 +225,16 @@ def test_template_is_not_tag(self, mock_get_page):
         root = self.wxr.wtp.parse(
             "# {{lien|autrice|fr|dif=Autrice}}, [[celle]] qui est à l’[[origine]] de [[quelque chose]]."
         )
-        page_data = [defaultdict(list)]
+        page_data = [
+            WordEntry(word="test", lang_code="fr", lang_name="Français")
+        ]
         extract_gloss(self.wxr, page_data, root.children[0])
         self.assertEqual(
-            page_data,
+            [d.model_dump(exclude_defaults=True) for d in page_data[-1].senses],
             [
                 {
-                    "senses": [
-                        {
-                            "glosses": [
-                                "Autrice, celle qui est à l’origine de quelque chose."
-                            ]
-                        }
+                    "glosses": [
+                        "Autrice, celle qui est à l’origine de quelque chose."
                     ]
                 }
             ],
@@ -268,16 +250,17 @@ def test_nest_gloss(self):
 ##* nest example
 """
         )
-        page_data = [defaultdict(list)]
+        page_data = [
+            WordEntry(word="test", lang_code="fr", lang_name="Français")
+        ]
         extract_gloss(self.wxr, page_data, root.children[0])
         self.assertEqual(
-            page_data[-1]["senses"],
+            [d.model_dump(exclude_defaults=True) for d in page_data[-1].senses],
             [
                 {
                     "examples": [
                         {
                             "text": "example 1",
-                            "type": "example",
                         }
                     ],
                     "glosses": [
@@ -288,7 +271,6 @@
                     "examples": [
                         {
                             "text": "nest example",
-                            "type": "example",
                         }
                     ],
                     "glosses": [
diff --git a/tests/test_fr_inflection.py b/tests/test_fr_inflection.py
index 3a1ed2bc7..20e66eb8a 100644
--- a/tests/test_fr_inflection.py
+++ b/tests/test_fr_inflection.py
@@ -1,15 +1,15 @@
-import unittest
-from collections import defaultdict
+from unittest import TestCase
 from unittest.mock import patch

 from wikitextprocessor import Wtp
 from wikitextprocessor.parser import TemplateNode

 from wiktextract.config import WiktionaryConfig
 from wiktextract.extractor.fr.inflection import extract_inflection
+from wiktextract.extractor.fr.models import WordEntry
 from wiktextract.wxr_context import WiktextractContext


-class TestInflection(unittest.TestCase):
+class TestInflection(TestCase):
     def setUp(self) -> None:
         self.wxr = WiktextractContext(
             Wtp(lang_code="fr"), WiktionaryConfig(dump_file_lang_code="fr")
@@ -32,12 +32,14 @@ def tearDown(self) -> None:
 """,
     )
     def test_fr_reg(self, mock_node_to_wikitext):
-        page_data = [defaultdict(list, {"word": "productrice"})]
+        page_data = [
+            WordEntry(word="productrice", lang_code="fr", lang_name="Français")
+        ]
         node = TemplateNode(0)
         self.wxr.wtp.start_page("productrice")
         extract_inflection(self.wxr, page_data, node)
         self.assertEqual(
-            page_data[-1].get("forms"),
+            [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms],
             [{"form": "productrices", "tags": ["Pluriel"]}],
         )

@@ -59,25 +61,27 @@ def test_fr_reg(self, mock_node_to_wikitext):
     )
     def test_fr_accord_al(self, mock_node_to_wikitext):
         # https://fr.wiktionary.org/wiki/animal#Adjectif
-        page_data = [defaultdict(list, {"word": "animal", "lang_code": "fr"})]
+        page_data = [
+            WordEntry(word="animal", lang_code="fr", lang_name="Français")
+        ]
         node = TemplateNode(0)
         self.wxr.wtp.start_page("animal")
         extract_inflection(self.wxr, page_data, node)
         self.assertEqual(
-            page_data[-1].get("forms"),
+            [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms],
             [
                 {
-                    "ipa": "\\a.ni.mo\\",
+                    "ipas": ["\\a.ni.mo\\"],
                     "tags": ["Pluriel", "Masculin"],
                     "form": "animaux",
                 },
                 {
-                    "ipa": "\\a.ni.mal\\",
+                    "ipas": ["\\a.ni.mal\\"],
                     "tags": ["Singulier", "Féminin"],
                     "form": "animale",
                 },
                 {
-                    "ipa": "\\a.ni.mal\\",
+                    "ipas": ["\\a.ni.mal\\"],
                     "tags": ["Pluriel", "Féminin"],
                     "form": "animales",
                 },
@@ -96,12 +100,14 @@ def test_fr_accord_al(self, mock_node_to_wikitext):
     def test_multiple_lines_ipa(self, mock_node_to_wikitext):
         # https://fr.wiktionary.org/wiki/ration#Nom_commun_2
         # template "en-nom-rég"
-        page_data = [defaultdict(list, {"lang_code": "en", "word": "ration"})]
+        page_data = [
+            WordEntry(word="ration", lang_code="en", lang_name="Anglais")
+        ]
         node = TemplateNode(0)
         self.wxr.wtp.start_page("ration")
         extract_inflection(self.wxr, page_data, node)
         self.assertEqual(
-            page_data[-1].get("forms"),
+            [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms],
             [
                 {
                     "ipas": ["\\ˈɹæʃ.ənz\\", "\\ˈɹeɪʃ.ənz\\"],
@@ -124,12 +130,14 @@ def test_multiple_lines_ipa(self, mock_node_to_wikitext):
     def test_single_line_multiple_ipa(self, mock_node_to_wikitext):
         # https://fr.wiktionary.org/wiki/ration#Verbe
         # template "en-conj-rég"
-        page_data = [defaultdict(list, {"lang_code": "en", "word": "ration"})]
+        page_data = [
+            WordEntry(word="ration", lang_code="en", lang_name="Anglais")
+        ]
         node = TemplateNode(0)
         self.wxr.wtp.start_page("ration")
         extract_inflection(self.wxr, page_data, node)
         self.assertEqual(
-            page_data[-1].get("forms"),
+            [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms],
             [
                 {
                     "ipas": ["\\ˈɹæʃ.ən\\", "\\ˈɹeɪʃ.ən\\"],
@@ -152,12 +160,14 @@ def test_single_line_multiple_ipa(self, mock_node_to_wikitext):
     def test_invalid_ipa(self, mock_node_to_wikitext):
         # https://fr.wiktionary.org/wiki/animal#Nom_commun_3
         # template "ast-accord-mf"
-        page_data = [defaultdict(list, {"lang_code": "en", "word": "animal"})]
+        page_data = [
+            WordEntry(word="animal", lang_code="en", lang_name="Français")
+        ]
         node = TemplateNode(0)
         self.wxr.wtp.start_page("animal")
         extract_inflection(self.wxr, page_data, node)
         self.assertEqual(
-            page_data[-1].get("forms"),
+            [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms],
             [{"tags": ["Pluriel"], "form": "animales"}],
         )

@@ -175,12 +185,12 @@ def test_invalid_ipa(self, mock_node_to_wikitext):
     def test_no_column_headers(self, mock_node_to_wikitext):
         # https://fr.wiktionary.org/wiki/一万#Nom_commun
         # template "zh-formes"
-        page_data = [defaultdict(list, {"lang_code": "zh", "word": "一万"})]
+        page_data = [WordEntry(word="一万", lang_code="zh", lang_name="Chinois")]
         node = TemplateNode(0)
         self.wxr.wtp.start_page("一万")
         extract_inflection(self.wxr, page_data, node)
         self.assertEqual(
-            page_data[-1].get("forms"),
+            [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms],
             [{"tags": ["Traditionnel"], "form": "一萬"}],
         )

@@ -198,12 +208,14 @@ def test_no_column_headers(self, mock_node_to_wikitext):
     )
     def test_lt_décl_as(self, mock_node_to_wikitext):
         # empty table cells should be ignored
-        page_data = [defaultdict(list, {"lang_code": "lt", "word": "abadai"})]
+        page_data = [
+            WordEntry(word="abadai", lang_code="lt", lang_name="Lituanien")
+        ]
         node = TemplateNode(0)
         self.wxr.wtp.start_page("abadai")
         extract_inflection(self.wxr, page_data, node)
         self.assertEqual(
-            page_data[-1].get("forms"),
+            [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms],
             [{"tags": ["Singulier", "Nominatif"], "form": "abadas"}],
         )

@@ -229,23 +241,24 @@
 |}""",
     )
     def test_fr_accord_s(self, mock_node_to_wikitext):
-        # https://fr.wiktionary.org/wiki/
-        page_data = [defaultdict(list, {"lang_code": "fr", "word": "aastais"})]
+        page_data = [
+            WordEntry(word="aastais", lang_code="fr", lang_name="Français")
+        ]
         node = TemplateNode(0)
         self.wxr.wtp.start_page("aastais")
         extract_inflection(self.wxr, page_data, node)
         self.assertEqual(
-            page_data[-1].get("forms"),
+            [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms],
             [
                 {
                     "tags": ["Singulier", "Féminin"],
                     "form": "aastaise",
-                    "ipa": "\\a.a.stɛz\\",
+                    "ipas": ["\\a.a.stɛz\\"],
                 },
                 {
                     "tags": ["Pluriel", "Féminin"],
                     "form": "aastaises",
-                    "ipa": "\\a.a.stɛz\\",
+                    "ipas": ["\\a.a.stɛz\\"],
                 },
             ],
         )
@@ -268,31 +281,35 @@ def test_fr_accord_s(self, mock_node_to_wikitext):
     )
     def test_fr_accord_personne(self, mock_node_to_wikitext):
         # https://fr.wiktionary.org/wiki/enculé_de_ta_race
-        page_data = [defaultdict(list)]
+        page_data = [
+            WordEntry(
+                word="enculé de ta race", lang_code="fr", lang_name="Français"
+            )
+        ]
         node = TemplateNode(0)
         self.wxr.wtp.start_page("enculé de ta race")
         extract_inflection(self.wxr, page_data, node)
         self.assertEqual(
-            page_data[-1].get("forms"),
+            [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms],
             [
                 {
                     "form": "enculé de ma race",
-                    "ipa": "\\ɑ̃.ky.ˌle.də.ma.ˈʁas\\",
+                    "ipas": ["\\ɑ̃.ky.ˌle.də.ma.ˈʁas\\"],
                     "tags": ["Singulier", "1ᵉ personne", "Masculin"],
                 },
                 {
                     "form": "enculés de notre race",
-                    "ipa": "\\ɑ̃.ky.ˌle.də.nɔ.tʁə.ˈʁas\\",
+                    "ipas": ["\\ɑ̃.ky.ˌle.də.nɔ.tʁə.ˈʁas\\"],
                     "tags": ["Pluriel", "1ᵉ personne", "Masculin"],
                 },
                 {
                     "form": "enculée de ma race",
-                    "ipa": "\\ɑ̃.ky.ˌle.də.ma.ˈʁas\\",
+                    "ipas": ["\\ɑ̃.ky.ˌle.də.ma.ˈʁas\\"],
                     "tags": ["Singulier", "1ᵉ personne", "Féminin"],
                 },
                 {
                     "form": "enculées de notre race",
-                    "ipa": "\\ɑ̃.ky.ˌle.də.ma.ˈʁas\\",
+                    "ipas": ["\\ɑ̃.ky.ˌle.də.ma.ˈʁas\\"],
                     "tags": ["Pluriel", "1ᵉ personne", "Féminin"],
                 },
             ],
@@ -320,12 +337,14 @@ def test_fr_accord_personne(self, mock_node_to_wikitext):
     )
     def test_ro_nom_tab(self, mock_node_to_wikitext):
         # https://fr.wiktionary.org/wiki/fenil#Nom_commun_4
-        page_data = [defaultdict(list, {"word": "fenil"})]
+        page_data = [
+            WordEntry(word="fenil", lang_code="fr", lang_name="Français")
+        ]
         node = TemplateNode(0)
         self.wxr.wtp.start_page("fenil")
         extract_inflection(self.wxr, page_data, node)
         self.assertEqual(
-            page_data[-1].get("forms"),
+            [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms],
             [
                 {
                     "form": "fenilul",
@@ -364,12 +383,14 @@ def test_ro_nom_tab(self, mock_node_to_wikitext):
     )
     def test_sv_nom_c_ar(self, mock_node_to_wikitext):
         # https://fr.wiktionary.org/wiki/robot#Nom_commun_7
-        page_data = [defaultdict(list, {"word": "robot"})]
+        page_data = [
+            WordEntry(word="robot", lang_code="fr", lang_name="Français")
+        ]
         node = TemplateNode(0)
         self.wxr.wtp.start_page("robot")
         extract_inflection(self.wxr, page_data, node)
         self.assertEqual(
-            page_data[-1].get("forms"),
+            [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms],
             [
                 {"form": "roboten", "tags": ["Défini", "Singulier"]},
                 {"form": "robotar", "tags": ["Indéfini", "Pluriel"]},
@@ -392,12 +413,14 @@ def test_sv_nom_c_ar(self, mock_node_to_wikitext):
     )
     def test_cs_decl_nom_ma_dur(self, mock_node_to_wikitext):
         # https://fr.wiktionary.org/wiki/robot#Nom_commun_1_2
-        page_data = [defaultdict(list, {"word": "robot"})]
+        page_data = [
+            WordEntry(word="robot", lang_code="fr", lang_name="Français")
+        ]
         node = TemplateNode(0)
         self.wxr.wtp.start_page("robot")
         extract_inflection(self.wxr, page_data, node)
         self.assertEqual(
-            page_data[-1].get("forms"),
+            [d.model_dump(exclude_defaults=True) for d in page_data[-1].forms],
             [
                 {"form": "roboti", "tags": ["Pluriel", "Nominatif"]},
                 {"form": "robotové", "tags": ["Pluriel", "Nominatif"]},
diff --git a/tests/test_fr_linkage.py b/tests/test_fr_linkage.py
index 498faf3ff..da2aadb2e 100644
--- a/tests/test_fr_linkage.py
+++ b/tests/test_fr_linkage.py
@@ -1,13 +1,13 @@
-import unittest
-from collections import defaultdict
+from unittest import TestCase

 from wikitextprocessor import Wtp

 from wiktextract.config import WiktionaryConfig
 from wiktextract.extractor.fr.linkage import extract_linkage
+from wiktextract.extractor.fr.models import WordEntry
 from wiktextract.wxr_context import WiktextractContext


-class TestLinkage(unittest.TestCase):
+class TestLinkage(TestCase):
     def setUp(self) -> None:
         self.wxr = WiktextractContext(
             Wtp(lang_code="fr"), WiktionaryConfig(dump_file_lang_code="fr")
@@ -17,7 +17,9 @@ def tearDown(self) -> None:
         self.wxr.wtp.close_db_conn()

     def test_tags(self):
-        page_data = [defaultdict(list)]
+        page_data = [
+            WordEntry(word="test", lang_code="fr", lang_name="Français")
+        ]
         self.wxr.wtp.start_page("bonjour")
         self.wxr.wtp.add_page("Modèle:Canada", 10, body="(Canada)")
         self.wxr.wtp.add_page("Modèle:Louisiane", 10, body="(Louisiane)")
@@ -26,41 +28,33 @@ def test_tags(self):
         )
         extract_linkage(self.wxr, page_data, root, "synonymes")
         self.assertEqual(
-            page_data,
-            [
-                {
-                    "synonyms": [
-                        {"word": "bon matin", "tags": ["Canada", "Louisiane"]}
-                    ]
-                }
-            ],
+            page_data[-1].synonyms[0].model_dump(exclude_defaults=True),
+            {"word": "bon matin", "tags": ["Canada", "Louisiane"]},
         )

     def test_zh_synonyms(self):
-        page_data = [defaultdict(list)]
+        page_data = [
+            WordEntry(word="test", lang_code="fr", lang_name="Français")
+        ]
         self.wxr.wtp.start_page("你好")
         root = self.wxr.wtp.parse(
             "* {{zh-lien|你们好|nǐmen hǎo|你們好}} — Bonjour (au pluriel)."
         )
         extract_linkage(self.wxr, page_data, root, "synonymes")
         self.assertEqual(
-            page_data,
-            [
-                {
-                    "synonyms": [
-                        {
-                            "word": "你们好",
-                            "roman": "nǐmen hǎo",
-                            "alt": "你們好",
-                            "translation": "Bonjour (au pluriel).",
-                        }
-                    ]
-                }
-            ],
+            page_data[-1].synonyms[0].model_dump(exclude_defaults=True),
+            {
+                "word": "你们好",
+                "roman": "nǐmen hǎo",
+                "alt": "你們好",
+                "translation": "Bonjour (au pluriel).",
+            },
         )

     def test_template_as_partial_tag(self):
-        page_data = [defaultdict(list)]
+        page_data = [
+            WordEntry(word="test", lang_code="fr", lang_name="Français")
+        ]
         self.wxr.wtp.start_page("bonjour")
         self.wxr.wtp.add_page("Modèle:lien", 10, body="kwei")
         self.wxr.wtp.add_page("Modèle:Canada", 10, body="(Canada)")
@@ -70,37 +64,34 @@ def test_template_as_partial_tag(self):
         )
         extract_linkage(self.wxr, page_data, root, "synonymes")
         self.assertEqual(
-            page_data,
-            [
-                {
-                    "synonyms": [
-                        {"word": "kwei", "tags": ["Canada", "mot Atikamekw"]}
-                    ]
-                }
-            ],
+            page_data[-1].synonyms[0].model_dump(exclude_defaults=True),
+            {"word": "kwei", "tags": ["Canada", "mot Atikamekw"]},
         )

     def test_list_item_has_two_words(self):
-        page_data = [defaultdict(list)]
+        page_data = [
+            WordEntry(word="test", lang_code="fr", lang_name="Français")
+        ]
         self.wxr.wtp.start_page("masse")
         root = self.wxr.wtp.parse(
             "* [[être à la masse]], [[mettre à la masse]]"
         )
         extract_linkage(self.wxr, page_data, root, "dérivés")
         self.assertEqual(
-            page_data,
             [
-                {
-                    "derived": [
-                        {"word": "être à la masse"},
-                        {"word": "mettre à la masse"},
-                    ]
-                }
+                d.model_dump(exclude_defaults=True)
+                for d in page_data[-1].derived
+            ],
+            [
+                {"word": "être à la masse"},
+                {"word": "mettre à la masse"},
             ],
         )

     def test_sub_list(self):
-        page_data = [defaultdict(list)]
+        page_data = [
+            WordEntry(word="test", lang_code="fr", lang_name="Français")
+        ]
         self.wxr.wtp.start_page("lézard ocellé")
         root = self.wxr.wtp.parse(
             """* [[saurien]]s (Sauria)
@@ -109,25 +100,26 @@ def test_sub_list(self):
         )
         extract_linkage(self.wxr, page_data, root, "hyper")
         self.assertEqual(
-            page_data,
             [
+                d.model_dump(exclude_defaults=True)
+                for d in page_data[-1].hypernyms
+            ],
+            [
+                {"tags": ["Sauria"], "word": "sauriens"},
                 {
-                    "hypernyms": [
-                        {"tags": ["Sauria"], "word": "sauriens"},
-                        {
-                            "tags": [
-                                "Lacertidae",
-                                "famille des lézards typiques",
-                            ],
-                            "word": "lacertidés",
-                        },
-                    ]
-                }
+                    "tags": [
+                        "Lacertidae",
+                        "famille des lézards typiques",
+                    ],
+                    "word": "lacertidés",
+                },
             ],
         )

     def test_sense(self):
-        page_data = [defaultdict(list)]
+        page_data = [
+            WordEntry(word="test", lang_code="fr", lang_name="Français")
+        ]
         self.wxr.wtp.start_page("autrice")
         root = self.wxr.wtp.parse(
             """{{(|Celle qui est à l’origine de quelque chose|1}}
@@ -136,17 +128,16 @@ def test_sense(self):
         )
         extract_linkage(self.wxr, page_data, root, "synonymes")
         self.assertEqual(
-            page_data,
+            [
+                d.model_dump(exclude_defaults=True)
+                for d in page_data[-1].synonyms
+            ],
             [
                 {
-                    "synonyms": [
-                        {
-                            "word": "artisane",
-                            "sense": "Celle qui est à l’origine de quelque chose",
-                            "sense_index": 1,
-                        },
-                    ]
-                }
+                    "word": "artisane",
+                    "sense": "Celle qui est à l’origine de quelque chose",
+                    "sense_index": 1,
                 },
             ],
         )
@@ -154,45 +145,47 @@ def test_derives_autres_langues_section(self):
         # https://fr.wiktionary.org/wiki/eau#Dérivés_dans_d’autres_langues
         self.wxr.wtp.add_page("Modèle:lien", 10, body="{{{1}}}")
         self.wxr.wtp.add_page("Modèle:L", 10, body="Karipúna")
-        page_data = [defaultdict(list)]
+        page_data = [
+            WordEntry(word="test", lang_code="fr", lang_name="Français")
+        ]
         self.wxr.wtp.start_page("eau")
         root = self.wxr.wtp.parse(
             "* {{L|kmv}} : {{lien|dlo|kmv}}, {{lien|djilo|kmv}}"
         )
         extract_linkage(self.wxr, page_data, root, "dérivés autres langues")
         self.assertEqual(
-            page_data,
             [
+                d.model_dump(exclude_defaults=True)
+                for d in page_data[-1].derived
+            ],
+            [
+                {
+                    "word": "dlo",
+                    "lang_code": "kmv",
+                    "lang_name": "Karipúna",
+                },
                 {
-                    "derived": [
-                        {
-                            "word": "dlo",
-                            "lang_code": "kmv",
-                            "lang_name": "Karipúna",
-                        },
-                        {
-                            "word": "djilo",
-                            "lang_code": "kmv",
-                            "lang_name": "Karipúna",
-                        },
-                    ]
-                }
+                    "word": "djilo",
+                    "lang_code": "kmv",
+                    "lang_name": "Karipúna",
+                },
             ],
         )

     def test_words_divided_by_slash(self):
-        page_data = [defaultdict(list)]
+        page_data = [
+            WordEntry(word="test", lang_code="fr", lang_name="Français")
+        ]
         self.wxr.wtp.start_page("eau")
         root = self.wxr.wtp.parse("* [[benoîte d’eau]] / [[benoite d’eau]]")
         extract_linkage(self.wxr, page_data, root, "dérivés")
         self.assertEqual(
-            page_data,
             [
-                {
-                    "derived": [
-                        {"word": "benoîte d’eau"},
-                        {"word": "benoite d’eau"},
-                    ]
-                }
+                d.model_dump(exclude_defaults=True)
+                for d in page_data[-1].derived
+            ],
+            [
+                {"word": "benoîte d’eau"},
+                {"word": "benoite d’eau"},
             ],
         )
diff --git a/tests/test_fr_note.py b/tests/test_fr_note.py
index 9d08d206a..46105e1e0 100644
--- a/tests/test_fr_note.py
+++ b/tests/test_fr_note.py
@@ -1,13 +1,13 @@
-import unittest
-from collections import defaultdict
+from unittest import TestCase

 from wikitextprocessor import Wtp

 from wiktextract.config import WiktionaryConfig
+from wiktextract.extractor.fr.models import WordEntry
 from wiktextract.extractor.fr.note import extract_note
 from wiktextract.wxr_context import WiktextractContext


-class TestNotes(unittest.TestCase):
+class TestNotes(TestCase):
     def setUp(self) -> None:
         self.wxr = WiktextractContext(
             Wtp(lang_code="fr"), WiktionaryConfig(dump_file_lang_code="fr")
@@ -28,8 +28,11 @@ def test_list_notes(self):
 paragrapy 1
 {{note-féminisation}}"""
         )
-        page_data = [defaultdict(list)]
+        page_data = [
+            WordEntry(word="test", lang_code="fr", lang_name="Français")
+        ]
         extract_note(self.wxr, page_data, nodes.children[0])
         self.assertEqual(
-            page_data, [{"notes": ["paragrapy 1", "list 1", "list 2"]}]
+            page_data[-1].notes,
+            ["paragrapy 1", "list 1", "list 2"],
         )
diff --git a/tests/test_fr_page.py b/tests/test_fr_page.py
index f1c53b98d..d94e6d11a 100644
--- a/tests/test_fr_page.py
+++ b/tests/test_fr_page.py
@@ -2,7 +2,7 @@
 #
 # Copyright (c) 2021 Tatu Ylonen.  See file LICENSE and https://ylonen.org
-import unittest
+from unittest import TestCase

 from wikitextprocessor import Wtp

 from wiktextract.config import WiktionaryConfig
@@ -10,7 +10,7 @@
 from wiktextract.wxr_context import WiktextractContext


-class FrPageTests(unittest.TestCase):
+class TestFrPage(TestCase):
     def setUp(self):
         self.maxDiff = None
         conf1 = WiktionaryConfig(
@@ -25,7 +25,7 @@ def tearDown(self) -> None:
     def test_fr_parse_page(self):
         self.wxr.wtp.add_page("Modèle:langue", 10, "Français")
         self.wxr.wtp.add_page("Modèle:S", 10, "Nom commun")
-        lst = parse_page(
+        page_data = parse_page(
             self.wxr,
             "exemple",
             """
@@ -35,7 +35,7 @@ def test_fr_parse_page(self):
 """,
         )
         self.assertEqual(
-            lst,
+            page_data,
             [
                 {
                     "lang_name": "Français",
diff --git a/tests/test_fr_pronunciation.py b/tests/test_fr_pronunciation.py
index 3473a2b73..ff9063748 100644
--- a/tests/test_fr_pronunciation.py
+++ b/tests/test_fr_pronunciation.py
@@ -1,13 +1,15 @@
-import unittest
-from collections import defaultdict
+from unittest import TestCase

 from wikitextprocessor import Wtp

 from wiktextract.config import WiktionaryConfig
+from wiktextract.extractor.fr.models import WordEntry
 from wiktextract.extractor.fr.pronunciation import extract_pronunciation
 from wiktextract.wxr_context import WiktextractContext


-class TestPronunciation(unittest.TestCase):
+class TestPronunciation(TestCase):
+    maxDiff = None
+
     def setUp(self) -> None:
         self.wxr = WiktextractContext(
             Wtp(lang_code="fr"), WiktionaryConfig(dump_file_lang_code="fr")
@@ -18,22 +20,29 @@ def tearDown(self) -> None:

     def test_pron_list(self):
         page_data = [
-            defaultdict(list, {"lang_code": "en"}),
-            defaultdict(list, {"lang_code": "fr"}),
-            defaultdict(list, {"lang_code": "fr"}),
+            WordEntry(word="bonjour", lang_code="en", lang_name="Anglais"),
+            WordEntry(word="bonjour", lang_code="fr", lang_name="Français"),
+            WordEntry(word="bonjour", lang_code="fr", lang_name="Français"),
         ]
         self.wxr.wtp.add_page("Modèle:pron", 10, body="\\bɔ̃.ʒuʁ\\")
-        self.wxr.wtp.start_page("")
+        self.wxr.wtp.start_page("bonjour")
         root = self.wxr.wtp.parse(
             "=== Prononciation ===\n* {{pron|bɔ̃.ʒuʁ|fr}}\n** {{écouter|France (Paris)|bõ.ʒuːʁ|audio=Fr-bonjour.ogg|lang=fr}}"
         )
-        extract_pronunciation(self.wxr, page_data, root.children[0], {})
-        self.assertEqual(
+        extract_pronunciation(
+            self.wxr,
             page_data,
+            root.children[0],
+            WordEntry(word="bonjour", lang_code="fr", lang_name="Français"),
+        )
+        self.assertEqual(
+            [d.model_dump(exclude_defaults=True) for d in page_data],
             [
-                {"lang_code": "en"},
+                {"word": "bonjour", "lang_code": "en", "lang_name": "Anglais"},
                 {
+                    "word": "bonjour",
                     "lang_code": "fr",
+                    "lang_name": "Français",
                     "sounds": [
                         {
                             "ipa": "bõ.ʒuːʁ",
@@ -45,7 +54,9 @@ def test_pron_list(self):
                     ],
                 },
                 {
+                    "word": "bonjour",
                     "lang_code": "fr",
+                    "lang_name": "Français",
                     "sounds": [
                         {
                             "ipa": "bõ.ʒuːʁ",
@@ -62,7 +73,7 @@ def test_pron_list(self):
     def test_str_pron(self):
         page_data = []
         self.wxr.wtp.add_page("Modèle:Yale-zh", 10, body="Yale")
-        self.wxr.wtp.start_page("")
+        self.wxr.wtp.start_page("你好")
         root = self.wxr.wtp.parse(
             "=== {{S|prononciation}} ===\n* '''cantonais''' {{pron||yue}}\n** {{Yale-zh}} : nei⁵hou²"
         )
@@ -70,11 +81,14 @@ def test_str_pron(self):
             self.wxr,
             page_data,
             root.children[0],
-            defaultdict(list, {"lang_code": "zh"}),
+            WordEntry(word="你好", lang_code="zh", lang_name="Chinois"),
         )
         self.assertEqual(
-            page_data[0].get("sounds"),
-            [{"tags": ["cantonais", "Yale"], "zh-pron": "nei⁵hou²"}],
+            [
+                sound.model_dump(exclude_defaults=True)
+                for sound in page_data[-1].sounds
+            ],
+            [{"tags": ["cantonais", "Yale"], "zh_pron": "nei⁵hou²"}],
         )

     def test_no_ipa(self):
@@ -84,24 +98,25 @@
         """
         Test wikitext from https://fr.wiktionary.org/wiki/mars
         """
         page_data = []
-        self.wxr.wtp.start_page("")
+        self.wxr.wtp.start_page("mars")
         root = self.wxr.wtp.parse(
             """=== {{S|prononciation}} ===
 {{ébauche-pron|sv}}
 * {{écouter|lang=sv|Suède||audio=LL-Q9027 (swe)-Moonhouse-mars.wav}}"""
         )
         extract_pronunciation(
-            self.wxr, page_data, root.children[0], defaultdict(list)
+            self.wxr,
+            page_data,
+            root.children[0],
+            WordEntry(word="mars", lang_code="fr", lang_name="Français"),
         )
         self.assertEqual(
-            page_data[0].get("sounds"),
-            [
-                {
-                    "tags": ["Suède"],
-                    "audio": "LL-Q9027 (swe)-Moonhouse-mars.wav",
-                    "wav_url": "https://commons.wikimedia.org/wiki/Special:FilePath/LL-Q9027 (swe)-Moonhouse-mars.wav",
-                    "ogg_url": "https://upload.wikimedia.org/wikipedia/commons/transcoded/3/3f/LL-Q9027_(swe)-Moonhouse-mars.wav/LL-Q9027_(swe)-Moonhouse-mars.wav.ogg",
-                    "mp3_url": "https://upload.wikimedia.org/wikipedia/commons/transcoded/3/3f/LL-Q9027_(swe)-Moonhouse-mars.wav/LL-Q9027_(swe)-Moonhouse-mars.wav.mp3",
-                }
-            ],
+            page_data[-1].sounds[0].model_dump(exclude_defaults=True),
+            {
+                "tags": ["Suède"],
+                "audio": "LL-Q9027 (swe)-Moonhouse-mars.wav",
+                "wav_url": "https://commons.wikimedia.org/wiki/Special:FilePath/LL-Q9027 (swe)-Moonhouse-mars.wav",
+                "ogg_url": "https://upload.wikimedia.org/wikipedia/commons/transcoded/3/3f/LL-Q9027_(swe)-Moonhouse-mars.wav/LL-Q9027_(swe)-Moonhouse-mars.wav.ogg",
+                "mp3_url": "https://upload.wikimedia.org/wikipedia/commons/transcoded/3/3f/LL-Q9027_(swe)-Moonhouse-mars.wav/LL-Q9027_(swe)-Moonhouse-mars.wav.mp3",
+            },
         )
diff --git a/tests/test_fr_translation.py b/tests/test_fr_translation.py
index bfc9f0aac..6feaf6c6f 100644
--- a/tests/test_fr_translation.py
+++ b/tests/test_fr_translation.py
@@ -1,13 +1,13 @@
-import unittest
-from collections import defaultdict
+from unittest import TestCase

 from wikitextprocessor import Wtp

 from wiktextract.config import WiktionaryConfig
+from wiktextract.extractor.fr.models import WordEntry
 from wiktextract.extractor.fr.translation import extract_translation
 from wiktextract.wxr_context import WiktextractContext


-class TestTranslation(unittest.TestCase):
+class TestTranslation(TestCase):
     def setUp(self) -> None:
         self.wxr = WiktextractContext(
             Wtp(lang_code="fr"), WiktionaryConfig(dump_file_lang_code="fr")
@@ -17,111 +17,125 @@ def tearDown(self) -> None:
         self.wxr.wtp.close_db_conn()

     def test_italic_tag(self):
-        self.wxr.wtp.start_page("")
+        # https://fr.wiktionary.org/wiki/bonjour
+        self.wxr.wtp.start_page("bonjour")
         self.wxr.wtp.add_page("Modèle:T", 10, body="Albanais")
         root = self.wxr.wtp.parse(
             "=== Traductions ===\n* {{trad-début|Formule pour saluer}}\n* {{T|sq}} : {{trad+|sq|mirëdita}}, {{trad-|sq|mirë mëngjes}} ''(le matin)''"
         )
-        page_data = [defaultdict(list)]
+        page_data = [
+            WordEntry(word="bonjour", lang_code="fr", lang_name="Français")
+        ]
         extract_translation(self.wxr, page_data, root.children[0])
         self.assertEqual(
-            page_data,
-            [
-                {
-                    "translations": [
-                        {
-                            "lang_code": "sq",
-                            "lang_name": "Albanais",
-                            "word": "mirëdita",
-                            "sense": "Formule pour saluer",
-                        },
-                        {
-                            "lang_code": "sq",
-                            "lang_name": "Albanais",
-                            "word": "mirë mëngjes",
-                            "sense": "Formule pour saluer",
-                            "tags": ["le matin"],
-                        },
-                    ]
-                }
-            ],
+            page_data[-1].model_dump(exclude_defaults=True),
+            {
+                "word": "bonjour",
+                "lang_code": "fr",
+                "lang_name": "Français",
+                "translations": [
+                    {
+                        "lang_code": "sq",
+                        "lang_name": "Albanais",
+                        "word": "mirëdita",
+                        "sense": "Formule pour saluer",
+                    },
+                    {
+                        "lang_code": "sq",
+                        "lang_name": "Albanais",
+                        "word": "mirë mëngjes",
+                        "sense": "Formule pour saluer",
+                        "tags": ["le matin"],
+                    },
+                ],
+            },
         )

     def test_template_tag(self):
-        self.wxr.wtp.start_page("")
+        self.wxr.wtp.start_page("bonjour")
         self.wxr.wtp.add_page("Modèle:T", 10, body="Arabe")
         self.wxr.wtp.add_page("Modèle:transliterator", 10, body="mrḥbā")
         self.wxr.wtp.add_page("Modèle:informel", 10, body="(Informel)")
         root = self.wxr.wtp.parse(
             "=== Traductions ===\n* {{T|ar}} : {{trad+|ar|مرحبا|dif=مرحبًا|tr={{transliterator|ar|مرحبا}}}} {{informel|nocat=1}}"
         )
-        page_data = [defaultdict(list)]
+        page_data = [
+            WordEntry(word="bonjour", lang_code="fr", lang_name="Français")
+        ]
         extract_translation(self.wxr, page_data, root.children[0])
         self.assertEqual(
-            page_data,
-            [
-                {
-                    "translations": [
-                        {
-                            "lang_code": "ar",
-                            "lang_name": "Arabe",
-                            "word": "مرحبًا",
-                            "roman": "mrḥbā",
-                            "tags": ["Informel"],
-                        },
-                    ]
-                }
-            ],
+            page_data[-1].model_dump(exclude_defaults=True),
+            {
+                "word": "bonjour",
+                "lang_code": "fr",
+                "lang_name": "Français",
+                "translations": [
+                    {
+                        "lang_code": "ar",
+                        "lang_name": "Arabe",
+                        "word": "مرحبًا",
+                        "roman": "mrḥbā",
+                        "tags": ["Informel"],
+                    },
+                ],
+            },
         )

     def test_traditional_writing(self):
-        self.wxr.wtp.start_page("")
+        self.wxr.wtp.start_page("bonjour")
         self.wxr.wtp.add_page("Modèle:T", 10, body="Mongol")
         root = self.wxr.wtp.parse(
             "=== Traductions ===\n* {{T|mn}} : {{trad+|mn|сайн байна уу|tr=sain baina uu|tradi=ᠰᠠᠶᠢᠨ ᠪᠠᠶᠢᠨ᠎ᠠ ᠤᠤ}}"
         )
-        page_data = [defaultdict(list)]
+        page_data = [
+            WordEntry(word="bonjour", lang_code="fr", lang_name="Français")
+        ]
         extract_translation(self.wxr, page_data, root.children[0])
         self.assertEqual(
-            page_data,
-            [
-                {
-                    "translations": [
-                        {
-                            "lang_code": "mn",
-                            "lang_name": "Mongol",
-                            "word": "сайн байна уу",
-                            "roman": "sain baina uu",
-                            "traditional_writing": "ᠰᠠᠶᠢᠨ ᠪᠠᠶᠢᠨ᠎ᠠ ᠤᠤ",
-                        },
-                    ]
-                }
-            ],
+            page_data[-1].model_dump(exclude_defaults=True),
+            {
+                "word": "bonjour",
+                "lang_code": "fr",
+                "lang_name": "Français",
+                "translations": [
+                    {
+                        "lang_code": "mn",
+                        "lang_name": "Mongol",
+                        "word": "сайн байна уу",
+                        "roman": "sain baina uu",
+                        "traditional_writing": "ᠰᠠᠶᠢᠨ ᠪᠠᠶᠢᠨ᠎ᠠ ᠤᠤ",
+                    },
+                ],
+            },
         )

     def test_trad_template_gender_parameter(self):
-        self.wxr.wtp.start_page("")
+        # https://fr.wiktionary.org/wiki/cambium
+        self.wxr.wtp.start_page("cambium")
         self.wxr.wtp.add_page("Modèle:T", 10, body="Allemand")
         self.wxr.wtp.add_page("Modèle:trad", 10, body="''neutre''")
         root = self.wxr.wtp.parse(
             "=== Traductions ===\n* {{T|de}} : {{trad|de|Kambium|n}}"
         )
-        page_data = [defaultdict(list)]
+        page_data = [
+            WordEntry(word="cambium", lang_code="fr", lang_name="Français")
+        ]
         extract_translation(self.wxr, page_data, root.children[0])
         self.assertEqual(
-            page_data,
-            [
-                {
-                    "translations": [
-                        {
-                            "lang_code": "de",
-                            "lang_name": "Allemand",
-                            "word": "Kambium",
-                            "tags": ["neutre"],
-                        },
-                    ]
-                }
-            ],
+            page_data[-1].model_dump(exclude_defaults=True),
+            {
+                "word": "cambium",
+                "lang_code": "fr",
+                "lang_name": "Français",
+                "translations": [
+                    {
+                        "lang_code": "de",
+                        "lang_name": "Allemand",
+                        "word": "Kambium",
+                        "tags": ["neutre"],
+                    },
+                ],
+            },
        )

     def test_template_sense_parameter(self):
@@ -134,20 +148,23 @@ def test_template_sense_parameter(self):
 {{trad-début|{{info lex|finance}}|12}}
 * {{T|hr}} : {{trad+|hr|masa}}"""
         )
-        page_data = [defaultdict(list)]
+        page_data = [
+            WordEntry(word="masse", lang_code="fr", lang_name="Français")
+        ]
         extract_translation(self.wxr, page_data, root.children[0])
         self.assertEqual(
-            page_data,
-            [
-                {
-                    "translations": [
-                        {
-                            "lang_code": "hr",
-                            "lang_name": "Croate",
-                            "word": "masa",
-                            "sense": "(Finance)",
-                        },
-                    ]
-                }
-            ],
+            page_data[-1].model_dump(exclude_defaults=True),
+            {
+                "word": "masse",
+                "lang_code": "fr",
+                "lang_name": "Français",
+                "translations": [
+                    {
+                        "lang_code": "hr",
+                        "lang_name": "Croate",
+                        "word": "masa",
+                        "sense": "(Finance)",
+                    },
+                ],
+            },
        )