tatuylonen · xxyzz · Jan 14, 2025 · Jan 14, 2025 · Jan 14, 2025
diff --git a/src/wiktextract/extractor/th/alt_form.py b/src/wiktextract/extractor/th/alt_form.py
@@ -0,0 +1,61 @@
+from wikitextprocessor import LevelNode, NodeKind, TemplateNode
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .models import Form, WordEntry
+from .tags import translate_raw_tags
+
+
+def extract_alt_form_section(
+    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
+) -> None:
+    """Extract alternative forms from a "รูปแบบอื่น" section.
+
+    Every `alt` template that is a direct child of a list item under
+    `level_node` is passed to `extract_alt_template`, which appends the
+    forms it finds to `word_entry.forms`.
+    """
+    for list_node in level_node.find_child(NodeKind.LIST):
+        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+            for node in list_item.children:
+                if (
+                    isinstance(node, TemplateNode)
+                    and node.template_name == "alt"
+                ):
+                    extract_alt_template(wxr, word_entry, node)
+
+
+def extract_alt_template(
+    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
+) -> None:
+    """Append the forms rendered by one `alt` template to the entry.
+
+    The template is expanded; the text of its first italic node is split
+    on commas into raw tags, and each span whose `lang` attribute equals
+    the template's first argument becomes a `Form` carrying those tags.
+    """
+    expanded_node = wxr.wtp.parse(
+        wxr.wtp.node_to_wikitext(t_node), expand_all=True
+    )
+
+    raw_tags = []
+    # Only the first italic node is read, hence the unconditional break.
+    for italic_node in expanded_node.find_child(NodeKind.ITALIC):
+        raw_tags_str = clean_node(wxr, None, italic_node)
+        raw_tags = [
+            raw_tag.strip()
+            for raw_tag in raw_tags_str.split(",")
+            if raw_tag.strip() != ""
+        ]
+        break
+
+    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
+    if lang_code == "":
+        # Without a language code, spans that have no `lang` attribute
+        # would compare equal to "" and be recorded as forms.
+        return
+    for span_tag in expanded_node.find_html("span"):
+        if span_tag.attrs.get("lang", "") == lang_code:
+            form = Form(form=clean_node(wxr, None, span_tag), raw_tags=raw_tags)
+            translate_raw_tags(form)
+            word_entry.forms.append(form)
diff --git a/src/wiktextract/extractor/th/descendant.py b/src/wiktextract/extractor/th/descendant.py
@@ -0,0 +1,104 @@
+from mediawiki_langcodes import code_to_name
+from wikitextprocessor import NodeKind, TemplateNode, WikiNode
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .models import Descendant, WordEntry
+
+
+def extract_descendant_section(
+    wxr: WiktextractContext,
+    word_entry: WordEntry,
+    level_node: WikiNode,
+) -> None:
+    """Extract descendants from each list item of a "คำสืบทอด" section.
+
+    Top-level list items are processed with no parent descendants, so
+    what they yield is stored in `word_entry.descendants`.
+    """
+    for list_node in level_node.find_child(NodeKind.LIST):
+        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+            extract_desc_list_item(wxr, word_entry, [], list_item)
+
+
+def extract_desc_list_item(
+    wxr: WiktextractContext,
+    word_entry: WordEntry,
+    parent_data: list[Descendant],
+    list_item: WikiNode,
+) -> None:
+    """Extract descendant templates from one list item and its sublists.
+
+    `parent_data` holds the descendants of the enclosing list item and is
+    empty for top-level items. Descendants found in this item are passed
+    as the parents of items in any nested list that comes after them.
+    """
+    desc_list = []
+    for node in list_item.children:
+        if isinstance(node, TemplateNode) and node.template_name in [
+            "desc",
+            "descendant",
+            "desctree",
+            "descendants tree",
+        ]:
+            desc_list.extend(
+                extract_desc_template(wxr, word_entry, parent_data, node)
+            )
+        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
+            for child_list_item in node.find_child(NodeKind.LIST_ITEM):
+                extract_desc_list_item(
+                    wxr, word_entry, desc_list, child_list_item
+                )
+
+
+def extract_desc_template(
+    wxr: WiktextractContext,
+    word_entry: WordEntry,
+    parent_data: list[Descendant],
+    t_node: TemplateNode,
+) -> list[Descendant]:
+    """Build `Descendant` objects from one expanded descendant template.
+
+    A span whose `lang` attribute equals the template's first argument
+    starts a new descendant. For the most recent descendant, a span whose
+    `lang` ends with "-Latn" sets `roman` and a span with the
+    "mention-gloss" class sets `sense`.
+
+    The new descendants are added to every item of `parent_data`, or to
+    `word_entry.descendants` when `parent_data` is empty, and are returned
+    so the caller can use them as parents of nested list items.
+    """
+    desc_data = []
+    expanded_node = wxr.wtp.parse(
+        wxr.wtp.node_to_wikitext(t_node), expand_all=True
+    )
+    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
+    lang_name = code_to_name(lang_code, "th") or "unknown"
+    for span_tag in expanded_node.find_html("span"):
+        span_lang = span_tag.attrs.get("lang", "")
+        # `class` can list several space-separated names.
+        span_classes = span_tag.attrs.get("class", "").split()
+        # An empty language code must not match spans that merely lack a
+        # `lang` attribute, such as a leading arrow span.
+        if lang_code != "" and span_lang == lang_code:
+            desc_data.append(
+                Descendant(
+                    lang_code=lang_code,
+                    lang=lang_name,
+                    word=clean_node(wxr, None, span_tag),
+                )
+            )
+        elif span_lang.endswith("-Latn") and len(desc_data) > 0:
+            desc_data[-1].roman = clean_node(wxr, None, span_tag)
+        elif "mention-gloss" in span_classes and len(desc_data) > 0:
+            desc_data[-1].sense = clean_node(wxr, None, span_tag)
+
+    if len(parent_data) > 0:
+        for p_data in parent_data:
+            p_data.descendants.extend(desc_data)
+    else:
+        word_entry.descendants.extend(desc_data)
+    # NOTE(review): passing `word_entry` here presumably lets clean_node
+    # record the expansion's category links on the entry; confirm.
+    clean_node(wxr, word_entry, expanded_node)
+    return desc_data
diff --git a/src/wiktextract/extractor/th/models.py b/src/wiktextract/extractor/th/models.py
@@ -63,6 +63,17 @@ class Linkage(ThaiBaseModel):
     roman: str = ""
 
 
+class Descendant(ThaiBaseModel):  # one descendant term; nests recursively
+    lang_code: str = Field(description="Wiktionary language code")
+    lang: str = Field(description="Language name")
+    word: str  # text of the span tagged with `lang_code`
+    roman: str = ""  # text of a following "*-Latn" span, if any
+    tags: list[str] = []
+    raw_tags: list[str] = []
+    descendants: list["Descendant"] = []  # terms from nested list items
+    sense: str = ""  # text of a following "mention-gloss" span, if any
+
+
 class WordEntry(ThaiBaseModel):
     model_config = ConfigDict(title="Thai Wiktionary")
     word: str = Field(description="Word string", min_length=1)
@@ -82,3 +93,5 @@ class WordEntry(ThaiBaseModel):
     antonyms: list[Linkage] = []
     synonyms: list[Linkage] = []
     derived: list[Linkage] = []
+    related: list[Linkage] = []
+    descendants: list[Descendant] = []
diff --git a/src/wiktextract/extractor/th/page.py b/src/wiktextract/extractor/th/page.py
@@ -6,6 +6,8 @@
 
 from ...page import clean_node
 from ...wxr_context import WiktextractContext
+from .alt_form import extract_alt_form_section
+from .descendant import extract_descendant_section
 from .etymology import extract_etymology_section
 from .linkage import extract_linkage_section
 from .models import Sense, WordEntry
@@ -38,6 +40,18 @@ def parse_section(
             level_node,
             LINKAGE_SECTIONS[title_text],
         )
+    elif title_text == "คำสืบทอด":
+        extract_descendant_section(
+            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
+        )
+    elif title_text == "การออกเสียง":
+        pass  # sounds
+    elif title_text == "รูปแบบอื่น":
+        extract_alt_form_section(
+            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
+        )
+    elif title_text not in ["ดูเพิ่ม"]:
+        wxr.wtp.debug(f"Unknown title: {title_text}")
 
     for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
         parse_section(wxr, page_data, base_data, next_level)

diff --git a/src/wiktextract/extractor/th/section_titles.py b/src/wiktextract/extractor/th/section_titles.py
@@ -19,4 +19,5 @@
     "คำตรงข้าม": "antonyms",
     "คำพ้องความ": "synonyms",
     "ลูกคำ": "derived",
+    "คำเกี่ยวข้อง": "related",
 }
diff --git a/tests/test_de_gloss.py b/tests/test_de_gloss.py
@@ -5,7 +5,7 @@
 from wiktextract.config import WiktionaryConfig
 from wiktextract.extractor.de.gloss import extract_glosses
 from wiktextract.extractor.de.page import parse_page
-from wiktextract.extractor.es.models import WordEntry
+from wiktextract.extractor.de.models import WordEntry
 from wiktextract.wxr_context import WiktextractContext
 
 

diff --git a/tests/test_th_desc.py b/tests/test_th_desc.py
@@ -0,0 +1,49 @@
+import unittest
+
+from wikitextprocessor import Wtp
+
+from wiktextract.config import WiktionaryConfig
+from wiktextract.extractor.th.page import parse_page
+from wiktextract.wxr_context import WiktextractContext
+
+
+class TestThDesc(unittest.TestCase):
+    # Covers extraction of the descendants ("คำสืบทอด") section.
+    maxDiff = None
+
+    def setUp(self) -> None:
+        wtp = Wtp(lang_code="th")
+        config = WiktionaryConfig(
+            dump_file_lang_code="th", capture_language_codes=None
+        )
+        self.wxr = WiktextractContext(wtp, config)
+
+    def test_desc_template(self):
+        # The stubbed `desc` template prints a fixed layout: an arrow span,
+        # a term span tagged with argument 1, a "-Latn" span and a gloss span.
+        desc_template = """<span class="desc-arr" title="borrowed">→</span> พม่า: <span class="Mymr" lang="{{{1}}}">[[{{{2}}}]]</span> <span class="mention-gloss-paren annotation-paren">(</span><span lang="my-Latn" class="tr Latn">{{{tr|}}}</span>, <span class="mention-gloss-double-quote">“</span><span class="mention-gloss">{{{t|}}}</span><span class="mention-gloss-double-quote">”</span><span class="mention-gloss-paren annotation-paren">)</span>"""
+        self.wxr.wtp.add_page("แม่แบบ:desc", 10, desc_template)
+        wikitext = """== ภาษาไทย ==
+=== คำคุณศัพท์ ===
+# [[ของ]]ประเทศไทย (โบราณหรือปัจจุบัน)
+===== คำสืบทอด =====
+* {{desc|my|သျှမ်း|bor=1|t=Shan}}
+* {{desc|pt|Sciam|bor=1}}
+** {{desc|en|Siam|bor=1}}"""
+        page_data = parse_page(self.wxr, "สยาม", wikitext)
+
+        burmese = {
+            "lang": "พม่า",
+            "lang_code": "my",
+            "word": "သျှမ်း",
+            "sense": "Shan",
+        }
+        english = {"lang": "อังกฤษ", "lang_code": "en", "word": "Siam"}
+        portuguese = {
+            "lang": "โปรตุเกส",
+            "lang_code": "pt",
+            "word": "Sciam",
+            "descendants": [english],
+        }
+        # The nested "**" item must end up under its Portuguese parent.
+        self.assertEqual(page_data[0]["descendants"], [burmese, portuguese])
diff --git a/tests/test_th_gloss.py b/tests/test_th_gloss.py
@@ -185,3 +185,20 @@ def test_th_verb(self):
                 "lang_code": "th",
             },
         )
+
+    def test_alt_template(self):
+        # Stub for `alt`: an italic qualifier followed by two Thai spans.
+        alt_template = """(''เลิกใช้'') <span class="Thai" lang="th">[[เดอร#ภาษาไทย|เดอร]]</span>, <span class="Thai" lang="th">[[เดิร#ภาษาไทย|เดิร]]</span>"""
+        self.wxr.wtp.add_page("แม่แบบ:alt", 10, alt_template)
+        wikitext = """== ภาษาไทย ==
+=== รูปแบบอื่น ===
+* {{alt|th|เดอร|เดิร||เลิกใช้}}
+=== คำกริยา ===
+# [[ยก]][[เท้า]][[ก้าว]][[ไป]]"""
+        page_data = parse_page(self.wxr, "เดิน", wikitext)
+        # Both forms carry the qualifier taken from the italic node.
+        expected_forms = [
+            {"form": "เดอร", "raw_tags": ["เลิกใช้"]},
+            {"form": "เดิร", "raw_tags": ["เลิกใช้"]},
+        ]
+        self.assertEqual(page_data[0]["forms"], expected_forms)