From d4b786b91c3c996facda09779072cea86fe857a5 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 17 Jan 2025 16:25:49 +0800 Subject: [PATCH] [th] extract "syn", "ant" linkage templates --- src/wiktextract/extractor/th/example.py | 13 ++++++++-- src/wiktextract/extractor/th/linkage.py | 33 +++++++++++++++++++++++++ src/wiktextract/extractor/th/models.py | 1 + src/wiktextract/extractor/th/pos.py | 2 +- tests/test_th_linkage.py | 14 +++++++++++ 5 files changed, 60 insertions(+), 3 deletions(-) diff --git a/src/wiktextract/extractor/th/example.py b/src/wiktextract/extractor/th/example.py index f91d9469..107609b9 100644 --- a/src/wiktextract/extractor/th/example.py +++ b/src/wiktextract/extractor/th/example.py @@ -5,16 +5,19 @@ from ...page import clean_node from ...wxr_context import WiktextractContext from ..ruby import extract_ruby -from .models import Example, Sense +from .models import Example, Sense, WordEntry from .tags import translate_raw_tags def extract_example_list_item( wxr: WiktextractContext, + word_entry: WordEntry, sense: Sense, list_item: WikiNode, ref: str = "", ) -> None: + from .linkage import LINKAGE_TEMPLATES, extract_syn_template + for node in list_item.children: if isinstance(node, TemplateNode): if node.template_name in ["ux", "usex", "ko-usex"]: @@ -25,9 +28,15 @@ def extract_example_list_item( extract_template_ja_usex(wxr, sense, node, ref) elif node.template_name.startswith("quote-"): ref = extract_quote_template(wxr, sense, node) + elif node.template_name in LINKAGE_TEMPLATES: + extract_syn_template( + wxr, word_entry, node, LINKAGE_TEMPLATES[node.template_name] + ) elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST: for child_list_item in node.find_child(NodeKind.LIST_ITEM): - extract_example_list_item(wxr, sense, child_list_item, ref) + extract_example_list_item( + wxr, word_entry, sense, child_list_item, ref + ) def extract_ux_template( diff --git a/src/wiktextract/extractor/th/linkage.py b/src/wiktextract/extractor/th/linkage.py index 5b841c4f..e08070ce 100644 --- a/src/wiktextract/extractor/th/linkage.py +++ b/src/wiktextract/extractor/th/linkage.py @@ -1,3 +1,5 @@ +from itertools import count + from wikitextprocessor.parser import ( LEVEL_KIND_FLAGS, LevelNode, @@ -129,3 +131,34 @@ def extract_ws_template( if word != "": l_data = Linkage(word=word, source=source) getattr(word_entry, linkage_type).append(l_data) + + +LINKAGE_TEMPLATES = { + "syn": "synonyms", + "synonyms": "synonyms", + "synsee": "synonyms", + "ant": "antonyms", + "antonyms": "antonyms", + "cot": "coordinate_terms", + "coordinate terms": "coordinate_terms", + "hyper": "hypernyms", + "hypernyms": "hypernyms", + "hypo": "hyponyms", + "hyponyms": "hyponyms", +} + + +def extract_syn_template( + wxr: WiktextractContext, + word_entry: WordEntry, + t_node: TemplateNode, + linkage_type: str, +) -> None: + for arg_name in count(2): + if arg_name not in t_node.template_parameters: + break + arg_value = clean_node(wxr, None, t_node.template_parameters[arg_name]) + if arg_value.startswith("อรรถาภิธาน:"): + extract_thesaurus_page(wxr, word_entry, linkage_type, arg_value) + elif arg_value != "": + getattr(word_entry, linkage_type).append(Linkage(word=arg_value)) diff --git a/src/wiktextract/extractor/th/models.py b/src/wiktextract/extractor/th/models.py index a18da70d..ec1acbe0 100644 --- a/src/wiktextract/extractor/th/models.py +++ b/src/wiktextract/extractor/th/models.py @@ -102,3 +102,4 @@ class WordEntry(ThaiBaseModel): hyponyms: list[Linkage] = [] hypernyms: list[Linkage] = [] idioms: list[Linkage] = [] + coordinate_terms: list[Linkage] = [] diff --git a/src/wiktextract/extractor/th/pos.py b/src/wiktextract/extractor/th/pos.py index fab32407..386ae2c6 100644 --- a/src/wiktextract/extractor/th/pos.py +++ b/src/wiktextract/extractor/th/pos.py @@ -66,7 +66,7 @@ def extract_gloss_list_item( (":", "*") ): for e_list_item in child_list.find_child(NodeKind.LIST_ITEM): - extract_example_list_item(wxr, sense, e_list_item) + extract_example_list_item(wxr, word_entry, sense, e_list_item) if gloss_str != "": sense.glosses.append(gloss_str) diff --git a/tests/test_th_linkage.py b/tests/test_th_linkage.py index 58501449..a9c06651 100644 --- a/tests/test_th_linkage.py +++ b/tests/test_th_linkage.py @@ -83,3 +83,17 @@ def test_theasurus_page(self): page_data[0]["synonyms"], [{"word": "ต่อมโลหิต", "source": "อรรถาภิธาน:ระดู"}], ) + + def test_syn_template(self): + page_data = parse_page( + self.wxr, + "โทรทัศน์", + """== ภาษาไทย == +=== คำนาม === +# กระบวนการถ่ายทอด +#: {{syn|th|ทีวี|โทรภาพ}}""", + ) + self.assertEqual( + page_data[0]["synonyms"], + [{"word": "ทีวี"}, {"word": "โทรภาพ"}], + )