Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[th] extract translation and linkage sections #984

Merged
merged 2 commits into from
Jan 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 63 additions & 0 deletions src/wiktextract/extractor/th/linkage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Linkage, WordEntry


def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    """Collect linkage terms (synonyms, antonyms, ...) from a section.

    Two layouts are handled: ``col*`` column-list templates and plain
    wiki lists of ``{{l}}`` templates. ``linkage_type`` names the
    ``WordEntry`` list field the results are appended to.
    """
    for child in level_node.children:
        if isinstance(child, TemplateNode) and child.template_name.startswith(
            "col"
        ):
            extract_col_template(wxr, word_entry, child, linkage_type)
            continue
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                extract_linkage_lite_item(wxr, word_entry, item, linkage_type)


def extract_col_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
) -> None:
    """Expand a ``col*`` template and harvest one linkage per ``<li>``.

    Inside each list item, a span carrying a ``lang`` attribute holds the
    term itself, while a span whose class contains ``Latn`` holds the
    romanization. Items with an empty term are dropped.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    target_list = getattr(word_entry, linkage_type)
    for li_tag in expanded.find_html_recursively("li"):
        linkage = Linkage(word="")
        for span_tag in li_tag.find_html("span"):
            css_class = span_tag.attrs.get("class", "")
            if "Latn" in css_class:
                # Latin-script span carries the romanization.
                linkage.roman = clean_node(wxr, None, span_tag)
            elif "lang" in span_tag.attrs:
                linkage.word = clean_node(wxr, None, span_tag)
        if linkage.word != "":
            target_list.append(linkage)


def extract_linkage_lite_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
) -> None:
    """Extract ``{{l}}`` link templates from a single wiki-list item.

    The second positional template parameter is the linked term; empty
    terms are skipped. Results extend the ``linkage_type`` list field.
    """
    collected = [
        Linkage(word=term)
        for node in list_item.children
        if isinstance(node, TemplateNode)
        and node.template_name == "l"
        and (term := clean_node(wxr, None, node.template_parameters.get(2, "")))
        != ""
    ]
    getattr(word_entry, linkage_type).extend(collected)
24 changes: 24 additions & 0 deletions src/wiktextract/extractor/th/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,26 @@ class Form(ThaiBaseModel):
raw_tags: list[str] = []


class Translation(ThaiBaseModel):
    # One translation term parsed from a "คำแปลภาษาอื่น" (translations)
    # section. NOTE(review): no class docstring on purpose — pydantic would
    # surface it in the generated JSON schema.
    lang_code: str = Field(
        description="Wiktionary language code of the translation term",
    )
    lang: str = Field(description="Translation language name")
    word: str = Field(description="Translation term")
    sense: str = Field(default="", description="Translation gloss")
    tags: list[str] = []  # canonical English tags (filled by translate_raw_tags)
    raw_tags: list[str] = []  # abbreviation strings not yet mapped to tags
    roman: str = ""  # romanization taken from "Latn" spans
    lit: str = Field(default="", description="Literal translation")


class Linkage(ThaiBaseModel):
    # A related term (synonym, antonym, derived term, ...).
    word: str  # the linked term; extractors only store non-empty values
    tags: list[str] = []
    raw_tags: list[str] = []
    roman: str = ""  # romanization, e.g. from "Latn" spans in col templates


class WordEntry(ThaiBaseModel):
model_config = ConfigDict(title="Thai Wiktionary")
word: str = Field(description="Word string", min_length=1)
Expand All @@ -58,3 +78,7 @@ class WordEntry(ThaiBaseModel):
etymology_text: str = ""
classifiers: list[str] = []
forms: list[Form] = []
translations: list[Translation] = []
antonyms: list[Linkage] = []
synonyms: list[Linkage] = []
derived: list[Linkage] = []
15 changes: 14 additions & 1 deletion src/wiktextract/extractor/th/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@
from ...page import clean_node
from ...wxr_context import WiktextractContext
from .etymology import extract_etymology_section
from .linkage import extract_linkage_section
from .models import Sense, WordEntry
from .pos import extract_pos_section
from .section_titles import POS_DATA
from .section_titles import LINKAGE_SECTIONS, POS_DATA
from .translation import extract_translation_section


def parse_section(
Expand All @@ -25,6 +27,17 @@ def parse_section(
extract_pos_section(wxr, page_data, base_data, level_node, title_text)
elif title_text == "รากศัพท์":
extract_etymology_section(wxr, base_data, level_node)
elif title_text == "คำแปลภาษาอื่น":
extract_translation_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
)
elif title_text in LINKAGE_SECTIONS:
extract_linkage_section(
wxr,
page_data[-1] if len(page_data) > 0 else base_data,
level_node,
LINKAGE_SECTIONS[title_text],
)

for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
parse_section(wxr, page_data, base_data, next_level)
Expand Down
7 changes: 7 additions & 0 deletions src/wiktextract/extractor/th/section_titles.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,10 @@
"วลี": {"pos": "phrase"},
"เลข": {"pos": "num", "tags": ["number"]},
}


# Thai Wiktionary section headings mapped to the WordEntry linkage list
# field they populate (used by page.parse_section to dispatch sections).
LINKAGE_SECTIONS = {
    "คำตรงข้าม": "antonyms",
    "คำพ้องความ": "synonyms",
    "ลูกคำ": "derived",
}
21 changes: 20 additions & 1 deletion src/wiktextract/extractor/th/tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,27 @@
"simp.": "Simplified Chinese",
}

# Thai gender/number abbreviations shown by translation templates mapped to
# canonical English tag names. Keys match the <abbr> text produced when
# expanding แม่แบบ:t ({{t}}); the full list lives in the wiki module below.
TRANSLATION_TAGS = {
    # แม่แบบ:t
    # https://th.wiktionary.org/wiki/มอดูล:gender_and_number/data
    "ญ.": "feminine",
    "ช.": "masculine",
    "ก.": "neuter",
    "ร.": "common",
    "ชีว.": "animate",
    "อชีว.": "inanimate",
    "สัต.": "animal-not-person",
    "บุค.": "personal",
    "อบุค.": "impersonal",
    "เอก.": "singular",
    "ทวิ.": "dual",
    "พหู.": "plural",
    "ไม่สมบูรณ์": "imperfective",
    "สมบูรณ์": "perfective",
}


# Combined raw-tag lookup used by translate_raw_tags. The earlier
# `TAGS = {**EXAMPLE_TAGS}` assignment was immediately shadowed by this
# merged version and has been dropped; later sources win on key collisions.
TAGS = {**EXAMPLE_TAGS, **TRANSLATION_TAGS}


def translate_raw_tags(data: WordEntry) -> None:
Expand Down
101 changes: 101 additions & 0 deletions src/wiktextract/extractor/th/translation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
from mediawiki_langcodes import name_to_code
from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Translation, WordEntry
from .tags import translate_raw_tags


def extract_translation_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Collect translations from a "คำแปลภาษาอื่น" (translations) section.

    Each ``{{trans-top}}`` template updates the current gloss; wiki-list
    items that follow are parsed as individual translation lines under
    that gloss.
    """
    current_sense = ""
    for child in level_node.children:
        is_trans_top = (
            isinstance(child, TemplateNode)
            and child.template_name == "trans-top"
        )
        if is_trans_top:
            current_sense = clean_node(
                wxr, None, child.template_parameters.get(1, "")
            )
            # Also feed the template through clean_node with the entry —
            # presumably to attach category links; verify against clean_node.
            clean_node(wxr, word_entry, child)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                extract_translation_list_item(
                    wxr, word_entry, item, current_sense
                )


def extract_translation_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    sense: str,
) -> None:
    """Parse one translation list line of the form "Language: term, ...".

    The language name is the text before the first ":" (including any
    earlier nodes on the line); terms come from ``{{t}}``/``{{t+}}``
    templates or bare wiki links. Nested sub-lists are handled
    recursively with the same *sense*.
    """
    lang_name = "unknown"
    lang_code = "unknown"
    for index, node in enumerate(list_item.children):
        # Only the first ":" string sets the language (guarded by the
        # lang_name == "unknown" check).
        if isinstance(node, str) and ":" in node and lang_name == "unknown":
            # The language name may span earlier child nodes (e.g. links)
            # plus the plain text before the colon.
            lang_name = (
                clean_node(wxr, None, list_item.children[:index])
                + node[: node.index(":")].strip()
            )
            lang_code = name_to_code(lang_name, "th")
            if lang_code == "":
                lang_code = "unknown"
        elif isinstance(node, TemplateNode) and node.template_name in [
            "t",
            "t+",
        ]:
            extract_t_template(wxr, word_entry, node, lang_name, sense)
        elif (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.LINK
            and lang_name != "unknown"
        ):
            # A bare link after the language name is a plain translation.
            word = clean_node(wxr, None, node)
            if word != "":
                word_entry.translations.append(
                    Translation(
                        word=word,
                        lang=lang_name,
                        lang_code=lang_code,
                        sense=sense,
                    )
                )
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            # Sub-list: child items inherit the current sense but detect
            # their own language prefix.
            for child_list_item in node.find_child(NodeKind.LIST_ITEM):
                extract_translation_list_item(
                    wxr, word_entry, child_list_item, sense
                )


def extract_t_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    lang_name: str,
    sense: str,
) -> None:
    """Expand a ``{{t}}``/``{{t+}}`` template into a Translation.

    The first positional parameter is the language code. In the expanded
    HTML, the first span whose ``lang`` attribute matches that code is the
    term; a span with class ``Latn`` is the romanization; ``<abbr>`` tags
    carry gender/number abbreviations stored as raw tags. Entries with an
    empty term are discarded.
    """
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    if lang_code == "":
        lang_code = "unknown"
    tr_data = Translation(
        word="", lang=lang_name, lang_code=lang_code, sense=sense
    )
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for span_tag in expanded_node.find_html_recursively("span"):
        # Only the first matching span sets the word (tr_data.word == "").
        if span_tag.attrs.get("lang") == lang_code and tr_data.word == "":
            tr_data.word = clean_node(wxr, None, span_tag)
        else:
            span_class = span_tag.attrs.get("class", "")
            if "Latn" in span_class:
                tr_data.roman = clean_node(wxr, None, span_tag)

    # Optional |lit= parameter gives the literal translation.
    tr_data.lit = clean_node(
        wxr, None, t_node.template_parameters.get("lit", "")
    )
    for abbr_tag in expanded_node.find_html_recursively("abbr"):
        tr_data.raw_tags.append(clean_node(wxr, None, abbr_tag))

    if tr_data.word != "":
        # Map raw abbreviations (e.g. "ญ.") to canonical tags.
        translate_raw_tags(tr_data)
        word_entry.translations.append(tr_data)
        # Run top-level links through clean_node with the entry —
        # presumably to capture category links; verify against clean_node.
        for link_node in expanded_node.find_child(NodeKind.LINK):
            clean_node(wxr, word_entry, link_node)
56 changes: 56 additions & 0 deletions tests/test_th_linkage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from unittest import TestCase

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.th.page import parse_page
from wiktextract.wxr_context import WiktextractContext


class TestThLinkage(TestCase):
    """Tests for the Thai extractor's linkage-section handling."""

    maxDiff = None

    def setUp(self) -> None:
        # Fresh Thai-wiki context per test; capture_language_codes=None
        # disables language filtering so all entries are kept.
        self.wxr = WiktextractContext(
            Wtp(lang_code="th"),
            WiktionaryConfig(
                dump_file_lang_code="th", capture_language_codes=None
            ),
        )

    def test_col(self):
        """A "ลูกคำ" (derived terms) section using a {{col2}} template."""
        # Pre-expanded HTML the template produces; the extractor reads the
        # <li>/<span lang=...> structure from it.
        self.wxr.wtp.add_page(
            "แม่แบบ:col2",
            10,
            """<div class="list-switcher" data-toggle-category="derived terms"><div class="columns-bg term-list ul-column-count" data-column-count="2"><ul><li><span class="Thai" lang="th">[[กบทูด#ภาษาไทย|กบทูด]]</span></li></ul></div><div class="list-switcher-element" data-showtext=" show more ▼ " data-hidetext=" show less ▲ " style="display:none"> </div></div>""",
        )
        page_data = parse_page(
            self.wxr,
            "กบ",
            """== ภาษาไทย ==
=== รากศัพท์ 2 ===
==== คำนาม ====
# [[ชื่อ]]
===== ลูกคำ =====
{{col2|th|กบทูด}}""",
        )
        self.assertEqual(
            page_data[0]["derived"],
            [{"word": "กบทูด"}],
        )

    def test_list(self):
        """A "คำพ้องความ" (synonyms) section as a plain {{l}} wiki list."""
        page_data = parse_page(
            self.wxr,
            "กบ",
            """== ภาษาไทย ==
=== รากศัพท์ 2 ===
==== คำนาม ====
# [[ชื่อ]]
===== คำพ้องความ =====
* {{l|th|มณฑก}}""",
        )
        self.assertEqual(
            page_data[0]["synonyms"],
            [{"word": "มณฑก"}],
        )
Loading
Loading