From 64b9513ab29a90e3a3c019ffb865309ed1eb195b Mon Sep 17 00:00:00 2001
From: xxyzz
Date: Fri, 26 Jan 2024 10:31:27 +0800
Subject: [PATCH 1/3] Add "onomatopoeic" tag to fr edition POS type "onomatopoeia"

---
 src/wiktextract/data/fr/pos_subtitles.json | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/wiktextract/data/fr/pos_subtitles.json b/src/wiktextract/data/fr/pos_subtitles.json
index c018a7192..b126f4112 100644
--- a/src/wiktextract/data/fr/pos_subtitles.json
+++ b/src/wiktextract/data/fr/pos_subtitles.json
@@ -444,13 +444,22 @@
     "pos": "num"
   },
   "onom": {
-    "pos": "onomatopoeia"
+    "pos": "onomatopoeia",
+    "tags": [
+      "onomatopoeic"
+    ]
   },
   "onoma": {
-    "pos": "onomatopoeia"
+    "pos": "onomatopoeia",
+    "tags": [
+      "onomatopoeic"
+    ]
   },
   "onomatopée": {
-    "pos": "onomatopoeia"
+    "pos": "onomatopoeia",
+    "tags": [
+      "onomatopoeic"
+    ]
   },
   "part": {
     "pos": "particle"
@@ -776,4 +785,4 @@
       "pronominal"
     ]
   }
-}
\ No newline at end of file
+}

From 4ba54111bf1f72ba8a99ce13dee779859e52438c Mon Sep 17 00:00:00 2001
From: xxyzz
Date: Fri, 26 Jan 2024 10:59:13 +0800
Subject: [PATCH 2/3] Add "alt-of" tag to fr edition "typographic variant" POS

---
 src/wiktextract/data/fr/pos_subtitles.json | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/src/wiktextract/data/fr/pos_subtitles.json b/src/wiktextract/data/fr/pos_subtitles.json
index b126f4112..62066eae7 100644
--- a/src/wiktextract/data/fr/pos_subtitles.json
+++ b/src/wiktextract/data/fr/pos_subtitles.json
@@ -753,16 +753,28 @@
     "pos": "symbol"
   },
   "var-typo": {
-    "pos": "typographic variant"
+    "pos": "typographic variant",
+    "tags": [
+      "alt-of"
+    ]
   },
   "variante par contrainte typographique": {
-    "pos": "typographic variant"
+    "pos": "typographic variant",
+    "tags": [
+      "alt-of"
+    ]
   },
   "variante typo": {
-    "pos": "typographic variant"
+    "pos": "typographic variant",
+    "tags": [
+      "alt-of"
+    ]
   },
   "variante typographique": {
-    "pos": "typographic variant"
+    "pos": "typographic variant",
+    "tags": [
+      "alt-of"
+    ]
   },
   "verb pr": {
     "pos": "verb",

From 3fa7188b25ae8a0d92cfc3ef389aa5db60a52a72 Mon Sep 17 00:00:00 2001
From: xxyzz
Date: Fri, 26 Jan 2024 12:42:39 +0800
Subject: [PATCH 3/3] Find alternative form of the "typographic variant" POS word

---
 src/wiktextract/extractor/fr/gloss.py  | 31 +++++++++++++-
 src/wiktextract/extractor/fr/models.py |  5 +++
 src/wiktextract/extractor/fr/page.py   |  4 +-
 tests/test_fr_gloss.py                 | 57 ++++++++++++++++++++++++++
 4 files changed, 95 insertions(+), 2 deletions(-)

diff --git a/src/wiktextract/extractor/fr/gloss.py b/src/wiktextract/extractor/fr/gloss.py
index cfb920173..412ff2823 100644
--- a/src/wiktextract/extractor/fr/gloss.py
+++ b/src/wiktextract/extractor/fr/gloss.py
@@ -1,11 +1,12 @@
 from collections import defaultdict
+from typing import Union
 
 from wikitextprocessor import NodeKind, WikiNode
 from wikitextprocessor.parser import TemplateNode
 from wiktextract.page import clean_node
 from wiktextract.wxr_context import WiktextractContext
 
-from .models import Example, Sense, WordEntry
+from .models import AltForm, Example, Sense, WordEntry
 
 
 def extract_gloss(
@@ -65,6 +66,9 @@ def extract_gloss(
                     and gloss_only_nodes[index].template_name == "note"
                 ):
                     note_index = index
+            find_alt_of_form(
+                wxr, gloss_only_nodes[:note_index], page_data[-1].pos, gloss_data
+            )
             gloss_text = clean_node(
                 wxr, gloss_data, gloss_only_nodes[:note_index]
             ).strip(" ()")
@@ -146,3 +150,28 @@ def process_exemple_template(
     )
     if len(example_data.text) > 0:
         gloss_data.examples.append(example_data)
+
+
+def find_alt_of_form(
+    wxr: WiktextractContext,
+    gloss_nodes: list[Union[str, WikiNode]],
+    pos_type: str,
+    gloss_data: Sense,
+):
+    if pos_type == "typographic variant":
+        alt_of = ""
+        for gloss_node in filter(
+            lambda n: isinstance(n, WikiNode), gloss_nodes
+        ):
+            # use the last link
+            if gloss_node.kind == NodeKind.LINK:
+                alt_of = clean_node(wxr, None, gloss_node)
+            if isinstance(gloss_node, TemplateNode):
+                gloss_node = wxr.wtp.parse(
+                    wxr.wtp.node_to_wikitext(gloss_node), expand_all=True
+                )
+                for link in gloss_node.find_child_recursively(NodeKind.LINK):
+                    alt_of = clean_node(wxr, None, link)
+        if len(alt_of) > 0:
+            gloss_data.alt_of.append(AltForm(word=alt_of))
+            gloss_data.tags.append("alt-of")
diff --git a/src/wiktextract/extractor/fr/models.py b/src/wiktextract/extractor/fr/models.py
index 461a810ec..dd8b5021a 100644
--- a/src/wiktextract/extractor/fr/models.py
+++ b/src/wiktextract/extractor/fr/models.py
@@ -76,12 +76,17 @@ class Linkage(FrenchBaseModel):
     lang_code: str = Field("", description="Wiktionary language code")
 
 
+class AltForm(FrenchBaseModel):
+    word: str
+
+
 class Sense(FrenchBaseModel):
     glosses: list[str] = []
     tags: list[str] = []
     categories: list[str] = []
     examples: list[Example] = []
     note: str = ""
+    alt_of: list[AltForm] = []
 
 
 class WordEntry(FrenchBaseModel):
diff --git a/src/wiktextract/extractor/fr/page.py b/src/wiktextract/extractor/fr/page.py
index 1c5200a20..f835a09de 100644
--- a/src/wiktextract/extractor/fr/page.py
+++ b/src/wiktextract/extractor/fr/page.py
@@ -113,11 +113,13 @@ def process_pos_block(
     pos_argument: str,
     pos_title: str,
 ):
-    pos_type = wxr.config.POS_SUBTITLES[pos_argument]["pos"]
+    pos_data = wxr.config.POS_SUBTITLES[pos_argument]
+    pos_type = pos_data["pos"]
     if len(page_data) == 0 or "pos" in page_data[-1].model_fields_set:
         page_data.append(base_data.model_copy(deep=True))
     page_data[-1].pos = pos_type
     page_data[-1].pos_title = pos_title
+    page_data[-1].tags.extend(pos_data.get("tags", []))
     child_nodes = list(pos_title_node.filter_empty_str_child())
     form_line_start = 0  # Ligne de forme
     gloss_start = len(child_nodes)
diff --git a/tests/test_fr_gloss.py b/tests/test_fr_gloss.py
index c27c126e1..6adc51f5f 100644
--- a/tests/test_fr_gloss.py
+++ b/tests/test_fr_gloss.py
@@ -308,3 +308,60 @@ def test_gloss_note_template(self):
                 }
             ],
         )
+
+    def test_typographic_variant_alt_of_template(self):
+        self.wxr.wtp.start_page("abajhuro")
+        self.wxr.wtp.add_page(
+            "Modèle:eo-sys-h",
+            10,
+            """''Orthographe par contrainte typographique par [[Annexe:Systèmes h et x en espéranto#Système h|système h]] de'' [[abaĵuro#eo|abaĵuro]]""",
+        )
+        root = self.wxr.wtp.parse("# {{eo-sys-h|abaĵuro}}.")
+        page_data = [
+            WordEntry(
+                word="abajhuro",
+                lang_code="fr",
+                lang="Français",
+                pos="typographic variant",
+            )
+        ]
+        extract_gloss(self.wxr, page_data, root.children[0])
+        self.assertEqual(
+            [d.model_dump(exclude_defaults=True) for d in page_data[-1].senses],
+            [
+                {
+                    "glosses": [
+                        "Orthographe par contrainte typographique par système h de abaĵuro."
+                    ],
+                    "alt_of": [{"word": "abaĵuro"}],
+                    "tags": ["alt-of"],
+                }
+            ],
+        )
+
+    def test_typographic_variant_alt_of_text(self):
+        self.wxr.wtp.start_page("alphoenix")
+        root = self.wxr.wtp.parse(
+            "# ''Variante par contrainte typographique de'' [[alphœnix]]."
+        )
+        page_data = [
+            WordEntry(
+                word="alphoenix",
+                lang_code="fr",
+                lang="Français",
+                pos="typographic variant",
+            )
+        ]
+        extract_gloss(self.wxr, page_data, root.children[0])
+        self.assertEqual(
+            [d.model_dump(exclude_defaults=True) for d in page_data[-1].senses],
+            [
+                {
+                    "glosses": [
+                        "Variante par contrainte typographique de alphœnix."
+                    ],
+                    "alt_of": [{"word": "alphœnix"}],
+                    "tags": ["alt-of"],
+                }
+            ],
+        )