diff --git a/src/wiktextract/extractor/th/alt_form.py b/src/wiktextract/extractor/th/alt_form.py
new file mode 100644
index 000000000..55e9febf4
--- /dev/null
+++ b/src/wiktextract/extractor/th/alt_form.py
@@ -0,0 +1,43 @@
+from wikitextprocessor import LevelNode, NodeKind, TemplateNode
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .models import Form, WordEntry
+from .tags import translate_raw_tags
+
+
+def extract_alt_form_section(
+    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
+) -> None:
+    for list_node in level_node.find_child(NodeKind.LIST):
+        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+            for node in list_item.children:
+                if (
+                    isinstance(node, TemplateNode)
+                    and node.template_name == "alt"
+                ):
+                    extract_alt_template(wxr, word_entry, node)
+
+
+def extract_alt_template(
+    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
+) -> None:
+    expanded_node = wxr.wtp.parse(
+        wxr.wtp.node_to_wikitext(t_node), expand_all=True
+    )
+
+    raw_tags = []
+    for italic_node in expanded_node.find_child(NodeKind.ITALIC):
+        raw_tags_str = clean_node(wxr, None, italic_node)
+        for raw_tag in raw_tags_str.split(","):
+            raw_tag = raw_tag.strip()
+            if raw_tag != "":
+                raw_tags.append(raw_tag)
+        break
+
+    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
+    for span_tag in expanded_node.find_html("span"):
+        if span_tag.attrs.get("lang", "") == lang_code:
+            form = Form(form=clean_node(wxr, None, span_tag), raw_tags=raw_tags)
+            translate_raw_tags(form)
+            word_entry.forms.append(form)
diff --git a/src/wiktextract/extractor/th/descendant.py b/src/wiktextract/extractor/th/descendant.py
new file mode 100644
index 000000000..889ded10b
--- /dev/null
+++ b/src/wiktextract/extractor/th/descendant.py
@@ -0,0 +1,77 @@
+from mediawiki_langcodes import code_to_name
+from wikitextprocessor import NodeKind, TemplateNode, WikiNode
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .models import Descendant, WordEntry
+
+
+def extract_descendant_section(
+    wxr: WiktextractContext,
+    word_entry: WordEntry,
+    level_node: WikiNode,
+) -> None:
+    for list_node in level_node.find_child(NodeKind.LIST):
+        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+            extract_desc_list_item(wxr, word_entry, [], list_item)
+
+
+def extract_desc_list_item(
+    wxr: WiktextractContext,
+    word_entry: WordEntry,
+    parent_data: list[Descendant],
+    list_item: WikiNode,
+) -> None:
+    desc_list = []
+    for node in list_item.children:
+        if isinstance(node, TemplateNode) and node.template_name in [
+            "desc",
+            "descendant",
+            "desctree",
+            "descendants tree",
+        ]:
+            desc_list.extend(
+                extract_desc_template(wxr, word_entry, parent_data, node)
+            )
+        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
+            for child_list_item in node.find_child(NodeKind.LIST_ITEM):
+                extract_desc_list_item(
+                    wxr, word_entry, desc_list, child_list_item
+                )
+
+
+def extract_desc_template(
+    wxr: WiktextractContext,
+    word_entry: WordEntry,
+    parent_data: list[Descendant],
+    t_node: TemplateNode,
+) -> list[Descendant]:
+    desc_data = []
+    expanded_node = wxr.wtp.parse(
+        wxr.wtp.node_to_wikitext(t_node), expand_all=True
+    )
+    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
+    lang_name = code_to_name(lang_code, "th") or "unknown"
+    for span_tag in expanded_node.find_html("span"):
+        span_lang = span_tag.attrs.get("lang", "")
+        span_class = span_tag.attrs.get("class", "")
+        if span_lang == lang_code:
+            desc_data.append(
+                Descendant(
+                    lang_code=lang_code,
+                    lang=lang_name,
+                    word=clean_node(wxr, None, span_tag),
+                )
+            )
+        elif span_lang.endswith("-Latn") and len(desc_data) > 0:
+            desc_data[-1].roman = clean_node(wxr, None, span_tag)
+        elif span_class == "mention-gloss" and len(desc_data) > 0:
+            desc_data[-1].sense = clean_node(wxr, None, span_tag)
+
+    if len(parent_data) > 0:
+        for p_data in parent_data:
+            p_data.descendants.extend(desc_data)
+    else:
+        word_entry.descendants.extend(desc_data)
+    clean_node(wxr, word_entry, expanded_node)
+    return desc_data
diff --git a/src/wiktextract/extractor/th/models.py b/src/wiktextract/extractor/th/models.py
index 31d1b91ea..c04492a96 100644
--- a/src/wiktextract/extractor/th/models.py
+++ b/src/wiktextract/extractor/th/models.py
@@ -63,6 +63,17 @@ class Linkage(ThaiBaseModel):
     roman: str = ""
 
 
+class Descendant(ThaiBaseModel):
+    lang_code: str = Field(description="Wiktionary language code")
+    lang: str = Field(description="Language name")
+    word: str
+    roman: str = ""
+    tags: list[str] = []
+    raw_tags: list[str] = []
+    descendants: list["Descendant"] = []
+    sense: str = ""
+
+
 class WordEntry(ThaiBaseModel):
     model_config = ConfigDict(title="Thai Wiktionary")
     word: str = Field(description="Word string", min_length=1)
@@ -82,3 +93,5 @@
     antonyms: list[Linkage] = []
     synonyms: list[Linkage] = []
     derived: list[Linkage] = []
+    related: list[Linkage] = []
+    descendants: list[Descendant] = []
diff --git a/src/wiktextract/extractor/th/page.py b/src/wiktextract/extractor/th/page.py
index b3dafa36d..03bf20d2e 100644
--- a/src/wiktextract/extractor/th/page.py
+++ b/src/wiktextract/extractor/th/page.py
@@ -6,6 +6,8 @@
 
 from ...page import clean_node
 from ...wxr_context import WiktextractContext
+from .alt_form import extract_alt_form_section
+from .descendant import extract_descendant_section
 from .etymology import extract_etymology_section
 from .linkage import extract_linkage_section
 from .models import Sense, WordEntry
@@ -38,6 +40,18 @@
             level_node,
             LINKAGE_SECTIONS[title_text],
         )
+    elif title_text == "คำสืบทอด":
+        extract_descendant_section(
+            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
+        )
+    elif title_text == "การออกเสียง":
+        pass  # sounds
+    elif title_text == "รูปแบบอื่น":
+        extract_alt_form_section(
+            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
+        )
+    elif title_text not in ["ดูเพิ่ม"]:
+        wxr.wtp.debug(f"Unknown title: {title_text}")
 
     for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
         parse_section(wxr, page_data, base_data, next_level)
diff --git a/src/wiktextract/extractor/th/section_titles.py b/src/wiktextract/extractor/th/section_titles.py
index 61f036c56..ff14f847c 100644
--- a/src/wiktextract/extractor/th/section_titles.py
+++ b/src/wiktextract/extractor/th/section_titles.py
@@ -19,4 +19,5 @@
     "คำตรงข้าม": "antonyms",
     "คำพ้องความ": "synonyms",
     "ลูกคำ": "derived",
+    "คำเกี่ยวข้อง": "related",
 }
diff --git a/tests/test_de_gloss.py b/tests/test_de_gloss.py
index d9bebf693..fe6c1cf5e 100644
--- a/tests/test_de_gloss.py
+++ b/tests/test_de_gloss.py
@@ -5,7 +5,7 @@
 from wiktextract.config import WiktionaryConfig
 from wiktextract.extractor.de.gloss import extract_glosses
 from wiktextract.extractor.de.page import parse_page
-from wiktextract.extractor.es.models import WordEntry
+from wiktextract.extractor.de.models import WordEntry
 from wiktextract.wxr_context import WiktextractContext
 
 
diff --git a/tests/test_th_desc.py b/tests/test_th_desc.py
new file mode 100644
index 000000000..f9c47d58c
--- /dev/null
+++ b/tests/test_th_desc.py
@@ -0,0 +1,56 @@
+import unittest
+
+from wikitextprocessor import Wtp
+
+from wiktextract.config import WiktionaryConfig
+from wiktextract.extractor.th.page import parse_page
+from wiktextract.wxr_context import WiktextractContext
+
+
+class TestThDesc(unittest.TestCase):
+    maxDiff = None
+
+    def setUp(self) -> None:
+        self.wxr = WiktextractContext(
+            Wtp(lang_code="th"),
+            WiktionaryConfig(
+                dump_file_lang_code="th", capture_language_codes=None
+            ),
+        )
+
+    def test_desc_template(self):
+        self.wxr.wtp.add_page(
+            "แม่แบบ:desc",
+            10,
+            """ พม่า: [[{{{2}}}]] ({{{tr|}}}, {{{t|}}})""",
+        )
+        page_data = parse_page(
+            self.wxr,
+            "สยาม",
+            """== ภาษาไทย ==
+=== คำคุณศัพท์ ===
+# [[ของ]]ประเทศไทย (โบราณหรือปัจจุบัน)
+===== คำสืบทอด =====
+* {{desc|my|သျှမ်း|bor=1|t=Shan}}
+* {{desc|pt|Sciam|bor=1}}
+** {{desc|en|Siam|bor=1}}""",
+        )
+        self.assertEqual(
+            page_data[0]["descendants"],
+            [
+                {
+                    "lang": "พม่า",
+                    "lang_code": "my",
+                    "word": "သျှမ်း",
+                    "sense": "Shan",
+                },
+                {
+                    "lang": "โปรตุเกส",
+                    "lang_code": "pt",
+                    "word": "Sciam",
+                    "descendants": [
+                        {"lang": "อังกฤษ", "lang_code": "en", "word": "Siam"}
+                    ],
+                },
+            ],
+        )
diff --git a/tests/test_th_gloss.py b/tests/test_th_gloss.py
index 22505401d..119543259 100644
--- a/tests/test_th_gloss.py
+++ b/tests/test_th_gloss.py
@@ -185,3 +185,26 @@
                 "lang_code": "th",
             },
         )
+
+    def test_alt_template(self):
+        self.wxr.wtp.add_page(
+            "แม่แบบ:alt",
+            10,
+            """(''เลิกใช้'') [[เดอร#ภาษาไทย|เดอร]], [[เดิร#ภาษาไทย|เดิร]]""",
+        )
+        page_data = parse_page(
+            self.wxr,
+            "เดิน",
+            """== ภาษาไทย ==
+=== รูปแบบอื่น ===
+* {{alt|th|เดอร|เดิร||เลิกใช้}}
+=== คำกริยา ===
+# [[ยก]][[เท้า]][[ก้าว]][[ไป]]""",
+        )
+        self.assertEqual(
+            page_data[0]["forms"],
+            [
+                {"form": "เดอร", "raw_tags": ["เลิกใช้"]},
+                {"form": "เดิร", "raw_tags": ["เลิกใช้"]},
+            ],
+        )