diff --git a/src/wiktextract/extractor/th/alt_form.py b/src/wiktextract/extractor/th/alt_form.py
new file mode 100644
index 000000000..55e9febf4
--- /dev/null
+++ b/src/wiktextract/extractor/th/alt_form.py
@@ -0,0 +1,43 @@
+from wikitextprocessor import LevelNode, NodeKind, TemplateNode
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .models import Form, WordEntry
+from .tags import translate_raw_tags
+
+
+def extract_alt_form_section(
+ wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
+) -> None:
+ for list_node in level_node.find_child(NodeKind.LIST):
+ for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+ for node in list_item.children:
+ if (
+ isinstance(node, TemplateNode)
+ and node.template_name == "alt"
+ ):
+ extract_alt_template(wxr, word_entry, node)
+
+
+def extract_alt_template(
+ wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
+) -> None:
+ expanded_node = wxr.wtp.parse(
+ wxr.wtp.node_to_wikitext(t_node), expand_all=True
+ )
+
+ raw_tags = []
+ for italic_node in expanded_node.find_child(NodeKind.ITALIC):
+ raw_tags_str = clean_node(wxr, None, italic_node)
+ for raw_tag in raw_tags_str.split(","):
+ raw_tag = raw_tag.strip()
+ if raw_tag != "":
+ raw_tags.append(raw_tag)
+ break
+
+ lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
+ for span_tag in expanded_node.find_html("span"):
+ if span_tag.attrs.get("lang", "") == lang_code:
+ form = Form(form=clean_node(wxr, None, span_tag), raw_tags=raw_tags)
+ translate_raw_tags(form)
+ word_entry.forms.append(form)
diff --git a/src/wiktextract/extractor/th/descendant.py b/src/wiktextract/extractor/th/descendant.py
new file mode 100644
index 000000000..889ded10b
--- /dev/null
+++ b/src/wiktextract/extractor/th/descendant.py
@@ -0,0 +1,77 @@
+from mediawiki_langcodes import code_to_name
+from wikitextprocessor import NodeKind, TemplateNode, WikiNode
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .models import Descendant, WordEntry
+
+
+def extract_descendant_section(
+ wxr: WiktextractContext,
+ word_entry: WordEntry,
+ level_node: WikiNode,
+) -> None:
+ for list_node in level_node.find_child(NodeKind.LIST):
+ for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+ extract_desc_list_item(wxr, word_entry, [], list_item)
+
+
+def extract_desc_list_item(
+ wxr: WiktextractContext,
+ word_entry: WordEntry,
+ parent_data: list[Descendant],
+ list_item: WikiNode,
+) -> None:
+ desc_list = []
+ for node in list_item.children:
+ if isinstance(node, TemplateNode) and node.template_name in [
+ "desc",
+ "descendant",
+ "desctree",
+ "descendants tree",
+ ]:
+ desc_list.extend(
+ extract_desc_template(wxr, word_entry, parent_data, node)
+ )
+ elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
+ for child_list_item in node.find_child(NodeKind.LIST_ITEM):
+ extract_desc_list_item(
+ wxr, word_entry, desc_list, child_list_item
+ )
+
+
+def extract_desc_template(
+ wxr: WiktextractContext,
+ word_entry: WordEntry,
+ parent_data: list[Descendant],
+ t_node: TemplateNode,
+) -> list[Descendant]:
+ desc_data = []
+ expanded_node = wxr.wtp.parse(
+ wxr.wtp.node_to_wikitext(t_node), expand_all=True
+ )
+ lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
+ lang_name = code_to_name(lang_code, "th") or "unknown"
+ for span_tag in expanded_node.find_html("span"):
+ span_lang = span_tag.attrs.get("lang", "")
+ span_class = span_tag.attrs.get("class", "")
+ if span_lang == lang_code:
+ desc_data.append(
+ Descendant(
+ lang_code=lang_code,
+ lang=lang_name,
+ word=clean_node(wxr, None, span_tag),
+ )
+ )
+ elif span_lang.endswith("-Latn") and len(desc_data) > 0:
+ desc_data[-1].roman = clean_node(wxr, None, span_tag)
+ elif span_class == "mention-gloss" and len(desc_data) > 0:
+ desc_data[-1].sense = clean_node(wxr, None, span_tag)
+
+ if len(parent_data) > 0:
+ for p_data in parent_data:
+ p_data.descendants.extend(desc_data)
+ else:
+ word_entry.descendants.extend(desc_data)
+ clean_node(wxr, word_entry, expanded_node)
+ return desc_data
diff --git a/src/wiktextract/extractor/th/models.py b/src/wiktextract/extractor/th/models.py
index 31d1b91ea..c04492a96 100644
--- a/src/wiktextract/extractor/th/models.py
+++ b/src/wiktextract/extractor/th/models.py
@@ -63,6 +63,17 @@ class Linkage(ThaiBaseModel):
roman: str = ""
+class Descendant(ThaiBaseModel):
+ lang_code: str = Field(description="Wiktionary language code")
+ lang: str = Field(description="Language name")
+ word: str
+ roman: str = ""
+ tags: list[str] = []
+ raw_tags: list[str] = []
+ descendants: list["Descendant"] = []
+ sense: str = ""
+
+
class WordEntry(ThaiBaseModel):
model_config = ConfigDict(title="Thai Wiktionary")
word: str = Field(description="Word string", min_length=1)
@@ -82,3 +93,5 @@ class WordEntry(ThaiBaseModel):
antonyms: list[Linkage] = []
synonyms: list[Linkage] = []
derived: list[Linkage] = []
+ related: list[Linkage] = []
+ descendants: list[Descendant] = []
diff --git a/src/wiktextract/extractor/th/page.py b/src/wiktextract/extractor/th/page.py
index b3dafa36d..03bf20d2e 100644
--- a/src/wiktextract/extractor/th/page.py
+++ b/src/wiktextract/extractor/th/page.py
@@ -6,6 +6,8 @@
from ...page import clean_node
from ...wxr_context import WiktextractContext
+from .alt_form import extract_alt_form_section
+from .descendant import extract_descendant_section
from .etymology import extract_etymology_section
from .linkage import extract_linkage_section
from .models import Sense, WordEntry
@@ -38,6 +40,18 @@ def parse_section(
level_node,
LINKAGE_SECTIONS[title_text],
)
+ elif title_text == "คำสืบทอด":
+ extract_descendant_section(
+ wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
+ )
+ elif title_text == "การออกเสียง":
+ pass # sounds
+ elif title_text == "รูปแบบอื่น":
+ extract_alt_form_section(
+ wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
+ )
+ elif title_text not in ["ดูเพิ่ม"]:
+ wxr.wtp.debug(f"Unknown title: {title_text}")
for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
parse_section(wxr, page_data, base_data, next_level)
diff --git a/src/wiktextract/extractor/th/section_titles.py b/src/wiktextract/extractor/th/section_titles.py
index 61f036c56..ff14f847c 100644
--- a/src/wiktextract/extractor/th/section_titles.py
+++ b/src/wiktextract/extractor/th/section_titles.py
@@ -19,4 +19,5 @@
"คำตรงข้าม": "antonyms",
"คำพ้องความ": "synonyms",
"ลูกคำ": "derived",
+ "คำเกี่ยวข้อง": "related",
}
diff --git a/tests/test_de_gloss.py b/tests/test_de_gloss.py
index d9bebf693..fe6c1cf5e 100644
--- a/tests/test_de_gloss.py
+++ b/tests/test_de_gloss.py
@@ -5,7 +5,7 @@
from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.de.gloss import extract_glosses
from wiktextract.extractor.de.page import parse_page
-from wiktextract.extractor.es.models import WordEntry
+from wiktextract.extractor.de.models import WordEntry
from wiktextract.wxr_context import WiktextractContext
diff --git a/tests/test_th_desc.py b/tests/test_th_desc.py
new file mode 100644
index 000000000..f9c47d58c
--- /dev/null
+++ b/tests/test_th_desc.py
@@ -0,0 +1,56 @@
+import unittest
+
+from wikitextprocessor import Wtp
+
+from wiktextract.config import WiktionaryConfig
+from wiktextract.extractor.th.page import parse_page
+from wiktextract.wxr_context import WiktextractContext
+
+
+class TestThDesc(unittest.TestCase):
+ maxDiff = None
+
+ def setUp(self) -> None:
+ self.wxr = WiktextractContext(
+ Wtp(lang_code="th"),
+ WiktionaryConfig(
+ dump_file_lang_code="th", capture_language_codes=None
+ ),
+ )
+
+ def test_desc_template(self):
+ self.wxr.wtp.add_page(
+ "แม่แบบ:desc",
+ 10,
+        """→ พม่า: <span class="Mymr" lang="{{{1}}}">[[{{{2}}}]]</span> (“<span class="mention-gloss">{{{t|}}}</span>”)""",
+ )
+ page_data = parse_page(
+ self.wxr,
+ "สยาม",
+ """== ภาษาไทย ==
+=== คำคุณศัพท์ ===
+# [[ของ]]ประเทศไทย (โบราณหรือปัจจุบัน)
+===== คำสืบทอด =====
+* {{desc|my|သျှမ်း|bor=1|t=Shan}}
+* {{desc|pt|Sciam|bor=1}}
+** {{desc|en|Siam|bor=1}}""",
+ )
+ self.assertEqual(
+ page_data[0]["descendants"],
+ [
+ {
+ "lang": "พม่า",
+ "lang_code": "my",
+ "word": "သျှမ်း",
+ "sense": "Shan",
+ },
+ {
+ "lang": "โปรตุเกส",
+ "lang_code": "pt",
+ "word": "Sciam",
+ "descendants": [
+ {"lang": "อังกฤษ", "lang_code": "en", "word": "Siam"}
+ ],
+ },
+ ],
+ )
diff --git a/tests/test_th_gloss.py b/tests/test_th_gloss.py
index 22505401d..119543259 100644
--- a/tests/test_th_gloss.py
+++ b/tests/test_th_gloss.py
@@ -185,3 +185,26 @@ def test_th_verb(self):
"lang_code": "th",
},
)
+
+ def test_alt_template(self):
+ self.wxr.wtp.add_page(
+ "แม่แบบ:alt",
+ 10,
+            """(''เลิกใช้'') <span class="Thai" lang="th">[[เดอร#ภาษาไทย|เดอร]]</span>, <span class="Thai" lang="th">[[เดิร#ภาษาไทย|เดิร]]</span>""",
+ )
+ page_data = parse_page(
+ self.wxr,
+ "เดิน",
+ """== ภาษาไทย ==
+=== รูปแบบอื่น ===
+* {{alt|th|เดอร|เดิร||เลิกใช้}}
+=== คำกริยา ===
+# [[ยก]][[เท้า]][[ก้าว]][[ไป]]""",
+ )
+ self.assertEqual(
+ page_data[0]["forms"],
+ [
+ {"form": "เดอร", "raw_tags": ["เลิกใช้"]},
+ {"form": "เดิร", "raw_tags": ["เลิกใช้"]},
+ ],
+ )