diff --git a/src/wiktextract/extractor/th/etymology.py b/src/wiktextract/extractor/th/etymology.py
new file mode 100644
index 000000000..b5419ca9b
--- /dev/null
+++ b/src/wiktextract/extractor/th/etymology.py
@@ -0,0 +1,21 @@
+from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .models import WordEntry
+
+
+def extract_etymology_section(
+    wxr: WiktextractContext,
+    base_data: WordEntry,
+    level_node: LevelNode,
+) -> None:
+    base_data.etymology_text = ""
+    base_data.categories.clear()
+    index = len(level_node.children)
+    for node_index, _ in level_node.find_child(LEVEL_KIND_FLAGS, True):
+        index = node_index
+        break
+    e_str = clean_node(wxr, base_data, level_node.children[:index])
+    if e_str != "":
+        base_data.etymology_text = e_str
diff --git a/src/wiktextract/extractor/th/models.py b/src/wiktextract/extractor/th/models.py
new file mode 100644
index 000000000..969ea485d
--- /dev/null
+++ b/src/wiktextract/extractor/th/models.py
@@ -0,0 +1,31 @@
+from pydantic import BaseModel, ConfigDict, Field
+
+
+class ThaiBaseModel(BaseModel):
+    model_config = ConfigDict(
+        extra="forbid",
+        strict=True,
+        validate_assignment=True,
+        validate_default=True,
+    )
+
+
+class Sense(ThaiBaseModel):
+    glosses: list[str] = []
+    tags: list[str] = []
+    raw_tags: list[str] = []
+    categories: list[str] = []
+
+
+class WordEntry(ThaiBaseModel):
+    model_config = ConfigDict(title="Thai Wiktionary")
+    word: str = Field(description="Word string", min_length=1)
+    lang_code: str = Field(description="Wiktionary language code", min_length=1)
+    lang: str = Field(description="Localized language name", min_length=1)
+    pos: str = Field(description="Part of speech type", min_length=1)
+    pos_title: str = ""
+    senses: list[Sense] = []
+    categories: list[str] = []
+    tags: list[str] = []
+    raw_tags: list[str] = []
+    etymology_text: str = ""
diff --git a/src/wiktextract/extractor/th/page.py b/src/wiktextract/extractor/th/page.py
new file mode 100644
index 000000000..9aa2ad2a2
--- /dev/null
+++ b/src/wiktextract/extractor/th/page.py
@@ -0,0 +1,62 @@
+import string
+from typing import Any
+
+from mediawiki_langcodes import name_to_code
+from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .etymology import extract_etymology_section
+from .models import Sense, WordEntry
+from .pos import extract_pos_section
+from .section_titles import POS_DATA
+
+
+def parse_section(
+    wxr: WiktextractContext,
+    page_data: list[WordEntry],
+    base_data: WordEntry,
+    level_node: LevelNode,
+) -> None:
+    title_text = clean_node(wxr, None, level_node.largs)
+    title_text = title_text.rstrip(string.digits + string.whitespace)
+    wxr.wtp.start_subsection(title_text)
+    if title_text in POS_DATA:
+        extract_pos_section(wxr, page_data, base_data, level_node, title_text)
+    elif title_text == "รากศัพท์":
+        extract_etymology_section(wxr, base_data, level_node)
+
+    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
+        parse_section(wxr, page_data, base_data, next_level)
+
+
+def parse_page(
+    wxr: WiktextractContext, page_title: str, page_text: str
+) -> list[dict[str, Any]]:
+    # page layout
+    # https://th.wiktionary.org/wiki/วิธีใช้:คู่มือในการเขียน
+    wxr.wtp.start_page(page_title)
+    tree = wxr.wtp.parse(page_text, pre_expand=True)
+    page_data: list[WordEntry] = []
+    for level2_node in tree.find_child(NodeKind.LEVEL2):
+        lang_name = clean_node(wxr, None, level2_node.largs)
+        lang_name = lang_name.removeprefix("ภาษา")
+        lang_code = name_to_code(lang_name, "th")
+        if lang_code == "":
+            lang_code = "unknown"
+        if lang_name == "":
+            lang_name = "unknown"
+        wxr.wtp.start_section(lang_name)
+        base_data = WordEntry(
+            word=wxr.wtp.title,
+            lang_code=lang_code,
+            lang=lang_name,
+            pos="unknown",
+        )
+        for next_level_node in level2_node.find_child(LEVEL_KIND_FLAGS):
+            parse_section(wxr, page_data, base_data, next_level_node)
+
+    for data in page_data:
+        if len(data.senses) == 0:
+            data.senses.append(Sense(tags=["no-gloss"]))
+    return [m.model_dump(exclude_defaults=True) for m in page_data]
diff --git a/src/wiktextract/extractor/th/pos.py b/src/wiktextract/extractor/th/pos.py
new file mode 100644
index 000000000..47c8ac9d6
--- /dev/null
+++ b/src/wiktextract/extractor/th/pos.py
@@ -0,0 +1,39 @@
+from wikitextprocessor import LevelNode, NodeKind, WikiNode
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .models import Sense, WordEntry
+from .section_titles import POS_DATA
+
+
+def extract_pos_section(
+    wxr: WiktextractContext,
+    page_data: list[WordEntry],
+    base_data: WordEntry,
+    level_node: LevelNode,
+    pos_title: str,
+) -> None:
+    page_data.append(base_data.model_copy(deep=True))
+    page_data[-1].pos_title = pos_title
+    pos_data = POS_DATA[pos_title]
+    page_data[-1].pos = pos_data["pos"]
+    page_data[-1].tags.extend(pos_data.get("tags", []))
+
+    for list_node in level_node.find_child(NodeKind.LIST):
+        if list_node.sarg.startswith("#") and list_node.sarg.endswith("#"):
+            for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+                extract_gloss_list_item(wxr, page_data[-1], list_item)
+
+
+def extract_gloss_list_item(
+    wxr: WiktextractContext,
+    word_entry: WordEntry,
+    list_item: WikiNode,
+) -> None:
+    sense = Sense()
+    gloss_str = clean_node(
+        wxr, sense, list(list_item.invert_find_child(NodeKind.LIST))
+    )
+    if gloss_str != "":
+        sense.glosses.append(gloss_str)
+        word_entry.senses.append(sense)
diff --git a/src/wiktextract/extractor/th/section_titles.py b/src/wiktextract/extractor/th/section_titles.py
new file mode 100644
index 000000000..43415e367
--- /dev/null
+++ b/src/wiktextract/extractor/th/section_titles.py
@@ -0,0 +1,15 @@
+POS_DATA = {
+    "คำกริยา": {"pos": "verb"},
+    "คำกริยาวิเศษณ์": {"pos": "adv"},
+    "คำคุณศัพท์": {"pos": "adj"},
+    "คำนาม": {"pos": "noun"},
+    "คำบุพบท": {"pos": "prep"},
+    "คำลักษณนาม": {"pos": "classifier"},
+    "คำวิสามานยนาม": {"pos": "name"},
+    "คำสรรพนาม": {"pos": "pron"},
+    "คำสันธาน": {"pos": "conj"},
+    "คำอนุภาค": {"pos": "particle"},
+    "คำอุทาน": {"pos": "intj"},
+    "วลี": {"pos": "phrase"},
+    "เลข": {"pos": "num", "tags": ["number"]},
+}
diff --git a/tests/test_th_gloss.py b/tests/test_th_gloss.py
new file mode 100644
index 000000000..41ba3d3cf
--- /dev/null
+++ b/tests/test_th_gloss.py
@@ -0,0 +1,71 @@
+from unittest import TestCase
+
+from wikitextprocessor import Wtp
+
+from wiktextract.config import WiktionaryConfig
+from wiktextract.extractor.th.page import parse_page
+from wiktextract.wxr_context import WiktextractContext
+
+
+class TestThGloss(TestCase):
+    maxDiff = None
+
+    def setUp(self) -> None:
+        self.wxr = WiktextractContext(
+            Wtp(lang_code="th"),
+            WiktionaryConfig(
+                dump_file_lang_code="th", capture_language_codes=None
+            ),
+        )
+
+    def test_do_not_share_etymology_data(self):
+        self.wxr.wtp.add_page(
+            "แม่แบบ:inh+",
+            10,
+            """สืบทอดจาก[[w:ภาษาไทดั้งเดิม|ไทดั้งเดิม]][[Category:ศัพท์ภาษาไทยที่สืบทอดจากภาษาไทดั้งเดิม|กบ]][[Category:ศัพท์ภาษาไทยที่รับมาจากภาษาไทดั้งเดิม|กบ]] [[การสืบสร้าง:ไทดั้งเดิม/kɤpᴰ|*kɤpᴰ]]""",
+        )
+        page_data = parse_page(
+            self.wxr,
+            "กบ",
+            """== ภาษาไทย ==
+=== รากศัพท์ 2 ===
+{{inh+|th|tai-pro|*kɤpᴰ}}
+
+==== คำนาม ====
+{{th-noun|ตัว}}
+
+# [[ชื่อ]]
+
+=== รากศัพท์ 3 ===
+
+==== คำนาม ====
+{{th-noun|ตัว}}
+
+# [[ปลา]]""",
+        )
+        self.assertEqual(
+            page_data,
+            [
+                {
+                    "categories": [
+                        "ศัพท์ภาษาไทยที่สืบทอดจากภาษาไทดั้งเดิม",
+                        "ศัพท์ภาษาไทยที่รับมาจากภาษาไทดั้งเดิม",
+                    ],
+                    "etymology_text": "สืบทอดจากไทดั้งเดิม *kɤpᴰ",
+                    "senses": [{"glosses": ["ชื่อ"]}],
+                    "pos": "noun",
+                    "pos_title": "คำนาม",
+                    "word": "กบ",
+                    "lang": "ไทย",
+                    "lang_code": "th",
+                },
+                {
+                    "senses": [{"glosses": ["ปลา"]}],
+                    "pos": "noun",
+                    "pos_title": "คำนาม",
+                    "word": "กบ",
+                    "lang": "ไทย",
+                    "lang_code": "th",
+                },
+            ],
+        )