From d25ab61a8e4c2db399c8368e3d752b26efb148b5 Mon Sep 17 00:00:00 2001
From: xxyzz <gitpull@protonmail.com>
Date: Fri, 10 Jan 2025 16:55:24 +0800
Subject: [PATCH 1/2] [th] extract translation section

---
 src/wiktextract/extractor/th/models.py      |  14 +++
 src/wiktextract/extractor/th/page.py        |   5 +
 src/wiktextract/extractor/th/tags.py        |  21 +++-
 src/wiktextract/extractor/th/translation.py | 101 ++++++++++++++++++
 tests/test_th_translation.py                | 111 ++++++++++++++++++++
 5 files changed, 251 insertions(+), 1 deletion(-)
 create mode 100644 src/wiktextract/extractor/th/translation.py
 create mode 100644 tests/test_th_translation.py

diff --git a/src/wiktextract/extractor/th/models.py b/src/wiktextract/extractor/th/models.py
index 74896417..f5c0ec42 100644
--- a/src/wiktextract/extractor/th/models.py
+++ b/src/wiktextract/extractor/th/models.py
@@ -43,6 +43,19 @@ class Form(ThaiBaseModel):
     raw_tags: list[str] = []
 
 
+class Translation(ThaiBaseModel):  # one translation of the entry word
+    lang_code: str = Field(
+        description="Wiktionary language code of the translation term",
+    )
+    lang: str = Field(description="Translation language name")
+    word: str = Field(description="Translation term")
+    sense: str = Field(default="", description="Translation gloss")
+    tags: list[str] = []  # normalized tags (presumably from raw_tags)
+    raw_tags: list[str] = []  # tag texts as found on the page
+    roman: str = ""  # romanization of the term
+    lit: str = Field(default="", description="Literal translation")
+
+
 class WordEntry(ThaiBaseModel):
     model_config = ConfigDict(title="Thai Wiktionary")
     word: str = Field(description="Word string", min_length=1)
@@ -58,3 +71,4 @@ class WordEntry(ThaiBaseModel):
     etymology_text: str = ""
     classifiers: list[str] = []
     forms: list[Form] = []
+    translations: list[Translation] = []
diff --git a/src/wiktextract/extractor/th/page.py b/src/wiktextract/extractor/th/page.py
index 7c52cf87..643ed002 100644
--- a/src/wiktextract/extractor/th/page.py
+++ b/src/wiktextract/extractor/th/page.py
@@ -10,6 +10,7 @@
 from .models import Sense, WordEntry
 from .pos import extract_pos_section
 from .section_titles import POS_DATA
+from .translation import extract_translation_section
 
 
 def parse_section(
@@ -25,6 +26,10 @@ def parse_section(
         extract_pos_section(wxr, page_data, base_data, level_node, title_text)
     elif title_text == "รากศัพท์":
         extract_etymology_section(wxr, base_data, level_node)
+    elif title_text == "คำแปลภาษาอื่น":
+        extract_translation_section(
+            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
+        )
 
     for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
         parse_section(wxr, page_data, base_data, next_level)
diff --git a/src/wiktextract/extractor/th/tags.py b/src/wiktextract/extractor/th/tags.py
index 6de4f51d..62304a4e 100644
--- a/src/wiktextract/extractor/th/tags.py
+++ b/src/wiktextract/extractor/th/tags.py
@@ -8,8 +8,27 @@
     "simp.": "Simplified Chinese",
 }
 
+TRANSLATION_TAGS = {
+    # Gender/number abbreviations shown by แม่แบบ:t (the "t" template), see
+    # https://th.wiktionary.org/wiki/มอดูล:gender_and_number/data
+    "ญ.": "feminine",
+    "ช.": "masculine",
+    "ก.": "neuter",
+    "ร.": "common",
+    "ชีว.": "animate",
+    "อชีว.": "inanimate",
+    "สัต.": "animal-not-person",
+    "บุค.": "personal",
+    "อบุค.": "impersonal",
+    "เอก.": "singular",
+    "ทวิ.": "dual",
+    "พหู.": "plural",
+    "ไม่สมบูรณ์": "imperfective",
+    "สมบูรณ์": "perfective",
+}
+
 
-TAGS = {**EXAMPLE_TAGS}
+TAGS = {**EXAMPLE_TAGS, **TRANSLATION_TAGS}
 
 
 def translate_raw_tags(data: WordEntry) -> None:
diff --git a/src/wiktextract/extractor/th/translation.py b/src/wiktextract/extractor/th/translation.py
new file mode 100644
index 00000000..93883fa5
--- /dev/null
+++ b/src/wiktextract/extractor/th/translation.py
@@ -0,0 +1,134 @@
+from mediawiki_langcodes import name_to_code
+from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .models import Translation, WordEntry
+from .tags import translate_raw_tags
+
+
+def extract_translation_section(
+    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
+) -> None:
+    """Collect the translations listed in a translation section.
+
+    Walks the direct children of ``level_node``: a ``trans-top`` template
+    supplies the gloss (its first argument) for the lists that follow it,
+    and every list item is handed to ``extract_translation_list_item``,
+    which appends to ``word_entry.translations``.
+    """
+    sense = ""
+    for node in level_node.children:
+        if isinstance(node, TemplateNode) and node.template_name == "trans-top":
+            sense = clean_node(wxr, None, node.template_parameters.get(1, ""))
+            # Clean once more with the entry as target so that the
+            # template's category links are recorded on the entry.
+            clean_node(wxr, word_entry, node)
+        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
+            for list_item in node.find_child(NodeKind.LIST_ITEM):
+                extract_translation_list_item(wxr, word_entry, list_item, sense)
+
+
+def extract_translation_list_item(
+    wxr: WiktextractContext,
+    word_entry: WordEntry,
+    list_item: WikiNode,
+    sense: str,
+) -> None:
+    """Extract the translations found in one list item.
+
+    The language name is everything before the first ``:`` of the item.
+    After it, ``t``/``t+`` templates and plain wiki links are read as
+    translations; nested lists are processed recursively with the same
+    ``sense``.
+    """
+    lang_name = "unknown"
+    lang_code = "unknown"
+    for index, node in enumerate(list_item.children):
+        if isinstance(node, str) and ":" in node and lang_name == "unknown":
+            # The name may be split between earlier nodes (e.g. a link)
+            # and the start of this string, so join both parts.
+            lang_name = (
+                clean_node(wxr, None, list_item.children[:index])
+                + node[: node.index(":")].strip()
+            )
+            lang_code = name_to_code(lang_name, "th")
+            if lang_code == "":
+                lang_code = "unknown"
+        elif isinstance(node, TemplateNode) and node.template_name in [
+            "t",
+            "t+",
+        ]:
+            extract_t_template(wxr, word_entry, node, lang_name, sense)
+        elif (
+            isinstance(node, WikiNode)
+            and node.kind == NodeKind.LINK
+            and lang_name != "unknown"
+        ):
+            # A bare link only counts as a translation once the language
+            # name has been read; links before the colon belong to the name.
+            word = clean_node(wxr, None, node)
+            if word != "":
+                word_entry.translations.append(
+                    Translation(
+                        word=word,
+                        lang=lang_name,
+                        lang_code=lang_code,
+                        sense=sense,
+                    )
+                )
+        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
+            for child_list_item in node.find_child(NodeKind.LIST_ITEM):
+                extract_translation_list_item(
+                    wxr, word_entry, child_list_item, sense
+                )
+
+
+def extract_t_template(
+    wxr: WiktextractContext,
+    word_entry: WordEntry,
+    t_node: TemplateNode,
+    lang_name: str,
+    sense: str,
+) -> None:
+    """Build a ``Translation`` from an expanded ``t``/``t+`` template.
+
+    The language code is the template's first argument.  In the expanded
+    HTML the first ``<span>`` whose ``lang`` attribute equals that code
+    gives the word, a span whose class contains ``Latn`` gives the
+    romanization and non-empty ``<abbr>`` texts become raw tags.
+    Nothing is appended when no word is found.
+    """
+    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
+    if lang_code == "":
+        lang_code = "unknown"
+    tr_data = Translation(
+        word="", lang=lang_name, lang_code=lang_code, sense=sense
+    )
+    expanded_node = wxr.wtp.parse(
+        wxr.wtp.node_to_wikitext(t_node), expand_all=True
+    )
+    for span_tag in expanded_node.find_html_recursively("span"):
+        if span_tag.attrs.get("lang") == lang_code and tr_data.word == "":
+            tr_data.word = clean_node(wxr, None, span_tag)
+        else:
+            span_class = span_tag.attrs.get("class", "")
+            if "Latn" in span_class:
+                tr_data.roman = clean_node(wxr, None, span_tag)
+
+    tr_data.lit = clean_node(
+        wxr, None, t_node.template_parameters.get("lit", "")
+    )
+    for abbr_tag in expanded_node.find_html_recursively("abbr"):
+        raw_tag = clean_node(wxr, None, abbr_tag)
+        # An <abbr> without text would only add a meaningless "" tag.
+        if raw_tag != "":
+            tr_data.raw_tags.append(raw_tag)
+
+    if tr_data.word != "":
+        translate_raw_tags(tr_data)
+        word_entry.translations.append(tr_data)
+        # Top-level links of the expansion are cleaned against the entry
+        # so that category links are recorded on it.
+        for link_node in expanded_node.find_child(NodeKind.LINK):
+            clean_node(wxr, word_entry, link_node)
diff --git a/tests/test_th_translation.py b/tests/test_th_translation.py
new file mode 100644
index 00000000..8341b451
--- /dev/null
+++ b/tests/test_th_translation.py
@@ -0,0 +1,111 @@
+from unittest import TestCase
+
+from wikitextprocessor import Wtp
+
+from wiktextract.config import WiktionaryConfig
+from wiktextract.extractor.th.page import parse_page
+from wiktextract.wxr_context import WiktextractContext
+
+
+class TestThTranslation(TestCase):
+    maxDiff = None
+
+    def setUp(self) -> None:
+        self.wxr = WiktextractContext(
+            Wtp(lang_code="th"),
+            WiktionaryConfig(
+                dump_file_lang_code="th", capture_language_codes=None
+            ),
+        )
+
+    def test_nested_list(self):
+        self.wxr.wtp.add_page(
+            "แม่แบบ:trans-top",
+            10,
+            """<div><div>ชื่อสัตว์สะเทินน้ำสะเทินบกชนิดหนึ่ง</div><div class="NavContent"><table class="translations" role="presentation" data-gloss="ชื่อสัตว์สะเทินน้ำสะเทินบกชนิดหนึ่ง"><tr><td class="translations-cell multicolumn-list" colspan="3">[[Category:รายการที่มีกล่องคำแปล|กบ]]""",
+        )
+        self.wxr.wtp.add_page(
+            "แม่แบบ:t+",
+            10,
+            """<span class="Hani" lang="cmn">[[蛙#ภาษาจีนกลาง|蛙]]</span><span class="tpos">&nbsp;[[:zh&#x3A;蛙|(zh)]]</span> <span class="mention-gloss-paren annotation-paren">(</span><span lang="cmn-Latn" class="tr Latn">wā</span><span class="mention-gloss-paren annotation-paren">)</span>[[Category:หน้าที่มีคำแปลภาษาจีนกลาง|กบ]]""",
+        )
+        page_data = parse_page(
+            self.wxr,
+            "กบ",
+            """== ภาษาไทย ==
+=== รากศัพท์ 2 ===
+==== คำนาม ====
+# [[ชื่อ]]
+===== คำแปลภาษาอื่น =====
+{{trans-top|ชื่อสัตว์สะเทินน้ำสะเทินบกชนิดหนึ่ง}}
+* จีน:
+*: จีนกลาง: {{t+|cmn|蛙|tr=wā}}
+{{trans-bottom}} """,
+        )
+        self.assertEqual(
+            page_data[0]["translations"],
+            [
+                {
+                    "word": "蛙",
+                    "lang": "จีนกลาง",
+                    "lang_code": "cmn",
+                    "roman": "wā",
+                    "sense": "ชื่อสัตว์สะเทินน้ำสะเทินบกชนิดหนึ่ง",
+                }
+            ],
+        )
+        self.assertEqual(
+            page_data[0]["categories"],
+            ["รายการที่มีกล่องคำแปล", "หน้าที่มีคำแปลภาษาจีนกลาง"],
+        )
+
+    def test_t_tag(self):
+        self.wxr.wtp.add_page(
+            "แม่แบบ:t+",
+            10,
+            """<span class="Latn" lang="gl">[[ra#ภาษากาลิเซีย|ra]]</span><span class="tpos">&nbsp;[[:gl&#x3A;ra|(gl)]]</span>&nbsp;<span class="gender"><abbr title="เพศหญิง">ญ.</abbr></span>[[Category:หน้าที่มีคำแปลภาษากาลิเซีย|กบ]]""",
+        )
+        page_data = parse_page(
+            self.wxr,
+            "กบ",
+            """== ภาษาไทย ==
+=== รากศัพท์ 2 ===
+==== คำนาม ====
+# [[ชื่อ]]
+===== คำแปลภาษาอื่น =====
+{{trans-top}}
+* กาลิเซีย: {{t+|gl|ra|f}}
+{{trans-bottom}} """,
+        )
+        self.assertEqual(
+            page_data[0]["translations"],
+            [
+                {
+                    "word": "ra",
+                    "lang": "กาลิเซีย",
+                    "lang_code": "gl",
+                    "tags": ["feminine"],
+                }
+            ],
+        )
+
+    def test_no_template(self):
+        page_data = parse_page(
+            self.wxr,
+            "เกาหลี",
+            """== ภาษาไทย ==
+=== คำวิสามานยนาม ===
+# ชื่อ
+==== คำแปลภาษาอื่น ====
+* [[อินเทอร์ลิงกวา]] : [[Corea]]""",
+        )
+        self.assertEqual(
+            page_data[0]["translations"],
+            [
+                {
+                    "word": "Corea",
+                    "lang": "อินเทอร์ลิงกวา",
+                    "lang_code": "unknown",
+                }
+            ],
+            )

From 5f942122c92e132c2febfdd683bde9916a72c57e Mon Sep 17 00:00:00 2001
From: xxyzz <gitpull@protonmail.com>
Date: Fri, 10 Jan 2025 17:33:39 +0800
Subject: [PATCH 2/2] [th] extract some linkage sections

---
 src/wiktextract/extractor/th/linkage.py       | 63 +++++++++++++++++++
 src/wiktextract/extractor/th/models.py        | 10 +++
 src/wiktextract/extractor/th/page.py          | 10 ++-
 .../extractor/th/section_titles.py            |  7 +++
 tests/test_th_linkage.py                      | 56 +++++++++++++++++
 tests/test_th_translation.py                  |  6 +-
 6 files changed, 148 insertions(+), 4 deletions(-)
 create mode 100644 src/wiktextract/extractor/th/linkage.py
 create mode 100644 tests/test_th_linkage.py

diff --git a/src/wiktextract/extractor/th/linkage.py b/src/wiktextract/extractor/th/linkage.py
new file mode 100644
index 00000000..dbc72430
--- /dev/null
+++ b/src/wiktextract/extractor/th/linkage.py
@@ -0,0 +1,89 @@
+from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .models import Linkage, WordEntry
+
+
+def extract_linkage_section(
+    wxr: WiktextractContext,
+    word_entry: WordEntry,
+    level_node: LevelNode,
+    linkage_type: str,
+) -> None:
+    """Collect the related terms listed in a linkage section.
+
+    ``linkage_type`` names the ``WordEntry`` list attribute that receives
+    the results.  Templates whose name starts with ``col`` and wikitext
+    lists among the direct children of ``level_node`` are processed.
+    """
+    for node in level_node.children:
+        if isinstance(node, TemplateNode) and node.template_name.startswith(
+            "col"
+        ):
+            extract_col_template(wxr, word_entry, node, linkage_type)
+        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
+            for list_item in node.find_child(NodeKind.LIST_ITEM):
+                extract_linkage_lite_item(
+                    wxr, word_entry, list_item, linkage_type
+                )
+
+
+def extract_col_template(
+    wxr: WiktextractContext,
+    word_entry: WordEntry,
+    t_node: TemplateNode,
+    linkage_type: str,
+) -> None:
+    """Read one ``Linkage`` per ``<li>`` of an expanded ``col*`` template.
+
+    Within each ``<li>`` the ``<span>`` children are inspected: a
+    romanization span fills ``roman`` and a span with a ``lang`` attribute
+    fills ``word``.  Items without a word are dropped.
+    """
+    expanded_node = wxr.wtp.parse(
+        wxr.wtp.node_to_wikitext(t_node), expand_all=True
+    )
+    for li_tag in expanded_node.find_html_recursively("li"):
+        l_data = Linkage(word="")
+        for span_tag in li_tag.find_html("span"):
+            span_class = span_tag.attrs.get("class", "")
+            span_lang = span_tag.attrs.get("lang", "")
+            # Terms written in Latin script also carry the "Latn" class
+            # (with a plain language code in "lang"), so "Latn" alone must
+            # not be read as a romanization or such terms would be lost.
+            is_roman = "Latn" in span_class and (
+                span_lang == ""
+                or span_lang.endswith("-Latn")
+                or "tr" in span_class.split()
+            )
+            if is_roman:
+                l_data.roman = clean_node(wxr, None, span_tag)
+            elif "lang" in span_tag.attrs:
+                l_data.word = clean_node(wxr, None, span_tag)
+        if l_data.word != "":
+            getattr(word_entry, linkage_type).append(l_data)
+
+
+def extract_linkage_lite_item(
+    wxr: WiktextractContext,
+    word_entry: WordEntry,
+    list_item: WikiNode,
+    linkage_type: str,
+) -> None:
+    """Extract linkages from the ``l`` templates of one list item.
+
+    The second positional argument of each ``l`` template is the term;
+    templates with an empty term are skipped.
+    """
+    linkages = []
+
+    for node in list_item.children:
+        if isinstance(node, TemplateNode) and node.template_name == "l":
+            l_data = Linkage(
+                word=clean_node(wxr, None, node.template_parameters.get(2, ""))
+            )
+            if l_data.word != "":
+                linkages.append(l_data)
+
+    getattr(word_entry, linkage_type).extend(linkages)
diff --git a/src/wiktextract/extractor/th/models.py b/src/wiktextract/extractor/th/models.py
index f5c0ec42..31d1b91e 100644
--- a/src/wiktextract/extractor/th/models.py
+++ b/src/wiktextract/extractor/th/models.py
@@ -56,6 +56,13 @@ class Translation(ThaiBaseModel):
     lit: str = Field(default="", description="Literal translation")
 
 
+class Linkage(ThaiBaseModel):  # item of WordEntry antonyms/synonyms/derived
+    word: str  # the related term
+    tags: list[str] = []
+    raw_tags: list[str] = []
+    roman: str = ""  # romanization of the term
+
+
 class WordEntry(ThaiBaseModel):
     model_config = ConfigDict(title="Thai Wiktionary")
     word: str = Field(description="Word string", min_length=1)
@@ -72,3 +79,6 @@ class WordEntry(ThaiBaseModel):
     classifiers: list[str] = []
     forms: list[Form] = []
     translations: list[Translation] = []
+    antonyms: list[Linkage] = []
+    synonyms: list[Linkage] = []
+    derived: list[Linkage] = []
diff --git a/src/wiktextract/extractor/th/page.py b/src/wiktextract/extractor/th/page.py
index 643ed002..b3dafa36 100644
--- a/src/wiktextract/extractor/th/page.py
+++ b/src/wiktextract/extractor/th/page.py
@@ -7,9 +7,10 @@
 from ...page import clean_node
 from ...wxr_context import WiktextractContext
 from .etymology import extract_etymology_section
+from .linkage import extract_linkage_section
 from .models import Sense, WordEntry
 from .pos import extract_pos_section
-from .section_titles import POS_DATA
+from .section_titles import LINKAGE_SECTIONS, POS_DATA
 from .translation import extract_translation_section
 
 
@@ -30,6 +31,13 @@ def parse_section(
         extract_translation_section(
             wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
         )
+    elif title_text in LINKAGE_SECTIONS:
+        extract_linkage_section(
+            wxr,
+            page_data[-1] if len(page_data) > 0 else base_data,
+            level_node,
+            LINKAGE_SECTIONS[title_text],
+        )
 
     for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
         parse_section(wxr, page_data, base_data, next_level)
diff --git a/src/wiktextract/extractor/th/section_titles.py b/src/wiktextract/extractor/th/section_titles.py
index 43415e36..61f036c5 100644
--- a/src/wiktextract/extractor/th/section_titles.py
+++ b/src/wiktextract/extractor/th/section_titles.py
@@ -13,3 +13,10 @@
     "วลี": {"pos": "phrase"},
     "เลข": {"pos": "num", "tags": ["number"]},
 }
+
+
+LINKAGE_SECTIONS = {  # section title -> name of the WordEntry list field
+    "คำตรงข้าม": "antonyms",
+    "คำพ้องความ": "synonyms",
+    "ลูกคำ": "derived",
+}
diff --git a/tests/test_th_linkage.py b/tests/test_th_linkage.py
new file mode 100644
index 00000000..5a3310b5
--- /dev/null
+++ b/tests/test_th_linkage.py
@@ -0,0 +1,56 @@
+from unittest import TestCase
+
+from wikitextprocessor import Wtp
+
+from wiktextract.config import WiktionaryConfig
+from wiktextract.extractor.th.page import parse_page
+from wiktextract.wxr_context import WiktextractContext
+
+
+class TestThLinkage(TestCase):  # linkage sections of the Thai extractor
+    maxDiff = None  # do not truncate assertion diffs
+
+    def setUp(self) -> None:  # fresh "th" context for every test
+        self.wxr = WiktextractContext(
+            Wtp(lang_code="th"),
+            WiktionaryConfig(
+                dump_file_lang_code="th", capture_language_codes=None
+            ),
+        )
+
+    def test_col(self):  # a col2 template under "ลูกคำ" fills "derived"
+        self.wxr.wtp.add_page(
+            "แม่แบบ:col2",
+            10,
+            """<div class="list-switcher" data-toggle-category="derived terms"><div class="columns-bg term-list ul-column-count" data-column-count="2"><ul><li><span class="Thai" lang="th">[[กบทูด#ภาษาไทย|กบทูด]]</span></li></ul></div><div class="list-switcher-element" data-showtext=" show more ▼ " data-hidetext=" show less ▲ " style="display:none"> </div></div>""",
+        )
+        page_data = parse_page(
+            self.wxr,
+            "กบ",
+            """== ภาษาไทย ==
+=== รากศัพท์ 2 ===
+==== คำนาม ====
+# [[ชื่อ]]
+===== ลูกคำ =====
+{{col2|th|กบทูด}}""",
+        )
+        self.assertEqual(
+            page_data[0]["derived"],
+            [{"word": "กบทูด"}],
+        )
+
+    def test_list(self):  # an l template in a list fills "synonyms"
+        page_data = parse_page(
+            self.wxr,
+            "กบ",
+            """== ภาษาไทย ==
+=== รากศัพท์ 2 ===
+==== คำนาม ====
+# [[ชื่อ]]
+===== คำพ้องความ =====
+* {{l|th|มณฑก}}""",
+        )
+        self.assertEqual(
+            page_data[0]["synonyms"],
+            [{"word": "มณฑก"}],
+        )
diff --git a/tests/test_th_translation.py b/tests/test_th_translation.py
index 8341b451..2a43dd69 100644
--- a/tests/test_th_translation.py
+++ b/tests/test_th_translation.py
@@ -40,7 +40,7 @@ def test_nested_list(self):
 {{trans-top|ชื่อสัตว์สะเทินน้ำสะเทินบกชนิดหนึ่ง}}
 * จีน:
 *: จีนกลาง: {{t+|cmn|蛙|tr=wā}}
-{{trans-bottom}} """,
+{{trans-bottom}}""",
         )
         self.assertEqual(
             page_data[0]["translations"],
@@ -75,7 +75,7 @@ def test_t_tag(self):
 ===== คำแปลภาษาอื่น =====
 {{trans-top}}
 * กาลิเซีย: {{t+|gl|ra|f}}
-{{trans-bottom}} """,
+{{trans-bottom}}""",
         )
         self.assertEqual(
             page_data[0]["translations"],
@@ -108,4 +108,4 @@ def test_no_template(self):
                     "lang_code": "unknown",
                 }
             ],
-            )
+        )