From d25ab61a8e4c2db399c8368e3d752b26efb148b5 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 10 Jan 2025 16:55:24 +0800 Subject: [PATCH 1/2] [th] extract translation section --- src/wiktextract/extractor/th/models.py | 14 +++ src/wiktextract/extractor/th/page.py | 5 + src/wiktextract/extractor/th/tags.py | 21 +++- src/wiktextract/extractor/th/translation.py | 101 ++++++++++++++++++ tests/test_th_translation.py | 111 ++++++++++++++++++++ 5 files changed, 251 insertions(+), 1 deletion(-) create mode 100644 src/wiktextract/extractor/th/translation.py create mode 100644 tests/test_th_translation.py diff --git a/src/wiktextract/extractor/th/models.py b/src/wiktextract/extractor/th/models.py index 74896417..f5c0ec42 100644 --- a/src/wiktextract/extractor/th/models.py +++ b/src/wiktextract/extractor/th/models.py @@ -43,6 +43,19 @@ class Form(ThaiBaseModel): raw_tags: list[str] = [] +class Translation(ThaiBaseModel): + lang_code: str = Field( + description="Wiktionary language code of the translation term", + ) + lang: str = Field(description="Translation language name") + word: str = Field(description="Translation term") + sense: str = Field(default="", description="Translation gloss") + tags: list[str] = [] + raw_tags: list[str] = [] + roman: str = "" + lit: str = Field(default="", description="Literal translation") + + class WordEntry(ThaiBaseModel): model_config = ConfigDict(title="Thai Wiktionary") word: str = Field(description="Word string", min_length=1) @@ -58,3 +71,4 @@ class WordEntry(ThaiBaseModel): etymology_text: str = "" classifiers: list[str] = [] forms: list[Form] = [] + translations: list[Translation] = [] diff --git a/src/wiktextract/extractor/th/page.py b/src/wiktextract/extractor/th/page.py index 7c52cf87..643ed002 100644 --- a/src/wiktextract/extractor/th/page.py +++ b/src/wiktextract/extractor/th/page.py @@ -10,6 +10,7 @@ from .models import Sense, WordEntry from .pos import extract_pos_section from .section_titles import POS_DATA +from .translation import extract_translation_section def parse_section( @@ -25,6 +26,10 @@ def parse_section( extract_pos_section(wxr, page_data, base_data, level_node, title_text) elif title_text == "รากศัพท์": extract_etymology_section(wxr, base_data, level_node) + elif title_text == "คำแปลภาษาอื่น": + extract_translation_section( + wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node + ) for next_level in level_node.find_child(LEVEL_KIND_FLAGS): parse_section(wxr, page_data, base_data, next_level) diff --git a/src/wiktextract/extractor/th/tags.py b/src/wiktextract/extractor/th/tags.py index 6de4f51d..62304a4e 100644 --- a/src/wiktextract/extractor/th/tags.py +++ b/src/wiktextract/extractor/th/tags.py @@ -8,8 +8,27 @@ "simp.": "Simplified Chinese", } +TRANSLATION_TAGS = { + # แม่แบบ:t + # https://th.wiktionary.org/wiki/มอดูล:gender_and_number/data + "ญ.": "feminine", + "ช.": "masculine", + "ก.": "neuter", + "ร.": "common", + "ชีว.": "animate", + "อชีว.": "inanimate", + "สัต.": "animal-not-person", + "บุค.": "personal", + "อบุค.": "impersonal", + "เอก.": "singular", + "ทวิ.": "dual", + "พหู.": "plural", + "ไม่สมบูรณ์": "imperfective", + "สมบูรณ์": "perfective", +} + -TAGS = {**EXAMPLE_TAGS} +TAGS = {**EXAMPLE_TAGS, **TRANSLATION_TAGS} def translate_raw_tags(data: WordEntry) -> None: diff --git a/src/wiktextract/extractor/th/translation.py b/src/wiktextract/extractor/th/translation.py new file mode 100644 index 00000000..93883fa5 --- /dev/null +++ b/src/wiktextract/extractor/th/translation.py @@ -0,0 +1,101 @@ +from mediawiki_langcodes import name_to_code +from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode + +from ...page import clean_node +from ...wxr_context import WiktextractContext +from .models import Translation, WordEntry +from .tags import translate_raw_tags + + +def extract_translation_section( + wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode +) -> None: + sense = "" + for node in level_node.children: + if isinstance(node, TemplateNode) and node.template_name == "trans-top": + sense = clean_node(wxr, None, node.template_parameters.get(1, "")) + clean_node(wxr, word_entry, node) + elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST: + for list_item in node.find_child(NodeKind.LIST_ITEM): + extract_translation_list_item(wxr, word_entry, list_item, sense) + + +def extract_translation_list_item( + wxr: WiktextractContext, + word_entry: WordEntry, + list_item: WikiNode, + sense: str, +) -> None: + lang_name = "unknown" + lang_code = "unknown" + for index, node in enumerate(list_item.children): + if isinstance(node, str) and ":" in node and lang_name == "unknown": + lang_name = ( + clean_node(wxr, None, list_item.children[:index]) + + node[: node.index(":")].strip() + ) + lang_code = name_to_code(lang_name, "th") + if lang_code == "": + lang_code = "unknown" + elif isinstance(node, TemplateNode) and node.template_name in [ + "t", + "t+", + ]: + extract_t_template(wxr, word_entry, node, lang_name, sense) + elif ( + isinstance(node, WikiNode) + and node.kind == NodeKind.LINK + and lang_name != "unknown" + ): + word = clean_node(wxr, None, node) + if word != "": + word_entry.translations.append( + Translation( + word=word, + lang=lang_name, + lang_code=lang_code, + sense=sense, + ) + ) + elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST: + for child_list_item in node.find_child(NodeKind.LIST_ITEM): + extract_translation_list_item( + wxr, word_entry, child_list_item, sense + ) + + +def extract_t_template( + wxr: WiktextractContext, + word_entry: WordEntry, + t_node: TemplateNode, + lang_name: str, + sense: str, +) -> None: + lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, "")) + if lang_code == "": + lang_code = "unknown" + tr_data = Translation( + word="", lang=lang_name, lang_code=lang_code, sense=sense + ) + expanded_node = wxr.wtp.parse( + wxr.wtp.node_to_wikitext(t_node), expand_all=True + ) + for span_tag in expanded_node.find_html_recursively("span"): + if span_tag.attrs.get("lang") == lang_code and tr_data.word == "": + tr_data.word = clean_node(wxr, None, span_tag) + else: + span_class = span_tag.attrs.get("class", "") + if "Latn" in span_class: + tr_data.roman = clean_node(wxr, None, span_tag) + + tr_data.lit = clean_node( + wxr, None, t_node.template_parameters.get("lit", "") + ) + for abbr_tag in expanded_node.find_html_recursively("abbr"): + tr_data.raw_tags.append(clean_node(wxr, None, abbr_tag)) + + if tr_data.word != "": + translate_raw_tags(tr_data) + word_entry.translations.append(tr_data) + for link_node in expanded_node.find_child(NodeKind.LINK): + clean_node(wxr, word_entry, link_node) diff --git a/tests/test_th_translation.py b/tests/test_th_translation.py new file mode 100644 index 00000000..8341b451 --- /dev/null +++ b/tests/test_th_translation.py @@ -0,0 +1,111 @@ +from unittest import TestCase + +from wikitextprocessor import Wtp + +from wiktextract.config import WiktionaryConfig +from wiktextract.extractor.th.page import parse_page +from wiktextract.wxr_context import WiktextractContext + + +class TestThTranslation(TestCase): + maxDiff = None + + def setUp(self) -> None: + self.wxr = WiktextractContext( + Wtp(lang_code="th"), + WiktionaryConfig( + dump_file_lang_code="th", capture_language_codes=None + ), + ) + + def test_nested_list(self): + self.wxr.wtp.add_page( + "แม่แบบ:trans-top", + 10, + """
ชื่อสัตว์สะเทินน้ำสะเทินบกชนิดหนึ่ง