From 3ee45d6be52941a94d239ab72d5d1aa4dab8686b Mon Sep 17 00:00:00 2001
From: xxyzz <gitpull@protonmail.com>
Date: Wed, 15 Jan 2025 14:12:05 +0800
Subject: [PATCH 1/3] [th] extract "lo-alt" form template

---
 src/wiktextract/extractor/th/alt_form.py | 38 ++++++++++++++++++++----
 src/wiktextract/extractor/th/models.py   |  1 +
 tests/test_th_gloss.py                   | 23 ++++++++++++++
 3 files changed, 57 insertions(+), 5 deletions(-)

diff --git a/src/wiktextract/extractor/th/alt_form.py b/src/wiktextract/extractor/th/alt_form.py
index 55e9febf..fff7052e 100644
--- a/src/wiktextract/extractor/th/alt_form.py
+++ b/src/wiktextract/extractor/th/alt_form.py
@@ -1,4 +1,4 @@
-from wikitextprocessor import LevelNode, NodeKind, TemplateNode
+from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
 
 from ...page import clean_node
 from ...wxr_context import WiktextractContext
@@ -18,6 +18,10 @@ def extract_alt_form_section(
                 ):
                     extract_alt_template(wxr, word_entry, node)
 
+    for t_node in level_node.find_child(NodeKind.TEMPLATE):
+        if t_node.template_name == "lo-alt":
+            extract_lo_alt_template(wxr, word_entry, t_node)
+
 
 def extract_alt_template(
     wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
@@ -25,9 +29,18 @@ def extract_alt_template(
     expanded_node = wxr.wtp.parse(
         wxr.wtp.node_to_wikitext(t_node), expand_all=True
     )
+    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
+    extract_alt_expanded_nodes(wxr, word_entry, expanded_node, lang_code)
 
+
+def extract_alt_expanded_nodes(
+    wxr: WiktextractContext,
+    word_entry: WordEntry,
+    root: WikiNode,
+    lang_code: str,
+) -> None:
     raw_tags = []
-    for italic_node in expanded_node.find_child(NodeKind.ITALIC):
+    for italic_node in root.find_child(NodeKind.ITALIC):
         raw_tags_str = clean_node(wxr, None, italic_node)
         for raw_tag in raw_tags_str.split(","):
             raw_tag = raw_tag.strip()
@@ -35,9 +48,24 @@ def extract_alt_template(
                 raw_tags.append(raw_tag)
         break
 
-    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
-    for span_tag in expanded_node.find_html("span"):
-        if span_tag.attrs.get("lang", "") == lang_code:
+    for span_tag in root.find_html("span"):
+        span_lang = span_tag.attrs.get("lang", "")
+        if span_lang == lang_code:
             form = Form(form=clean_node(wxr, None, span_tag), raw_tags=raw_tags)
             translate_raw_tags(form)
             word_entry.forms.append(form)
+        elif span_lang.endswith("-Latn") and len(word_entry.forms) > 0:
+            word_entry.forms[-1].roman = clean_node(wxr, None, span_tag)
+
+    clean_node(wxr, word_entry, root)
+
+
+def extract_lo_alt_template(
+    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
+) -> None:
+    """Expand a "lo-alt" template and turn every list item into forms."""
+    template_text = wxr.wtp.node_to_wikitext(t_node)
+    expanded_root = wxr.wtp.parse(template_text, expand_all=True)
+    for bullet_list in expanded_root.find_child(NodeKind.LIST):
+        for item in bullet_list.find_child(NodeKind.LIST_ITEM):
+            extract_alt_expanded_nodes(wxr, word_entry, item, "lo")
diff --git a/src/wiktextract/extractor/th/models.py b/src/wiktextract/extractor/th/models.py
index c04492a9..d42347df 100644
--- a/src/wiktextract/extractor/th/models.py
+++ b/src/wiktextract/extractor/th/models.py
@@ -41,6 +41,7 @@ class Form(ThaiBaseModel):
     form: str
     tags: list[str] = []
     raw_tags: list[str] = []
+    roman: str = ""
 
 
 class Translation(ThaiBaseModel):
diff --git a/tests/test_th_gloss.py b/tests/test_th_gloss.py
index 11954325..0bb2c6ae 100644
--- a/tests/test_th_gloss.py
+++ b/tests/test_th_gloss.py
@@ -208,3 +208,26 @@ def test_alt_template(self):
                 {"form": "เดิร", "raw_tags": ["เลิกใช้"]},
             ],
         )
+
+    def test_lo_alt(self):  # "lo-alt" item -> form, raw tag and roman
+        self.wxr.wtp.add_page(
+            "แม่แบบ:lo-alt",
+            10,  # same namespace id the "Template:" pages in other tests use
+            """* (''ล้าสมัย'') <span class="Laoo" lang="lo">[[ທຸຣຽນ#ภาษาลาว|ທຸຣຽນ]]</span> <span class="mention-gloss-paren annotation-paren">(</span><span lang="lo-Latn" class="tr Latn">ทุรย̂น</span><span class="mention-gloss-paren annotation-paren">)</span>""",
+        )
+        page_data = parse_page(
+            self.wxr,
+            "ທຸລຽນ",
+            """== ภาษาลาว ==
+=== รูปแบบอื่น ===
+{{lo-alt|d=ທຸຣຽນ}}
+=== คำนาม ===
+{{lo-noun}}
+# [[ทุเรียน]]""",
+        )
+        self.assertEqual(
+            page_data[0]["forms"],  # forms of the first parsed entry
+            [
+                {"form": "ທຸຣຽນ", "raw_tags": ["ล้าสมัย"], "roman": "ทุรย̂น"},
+            ],
+        )

From ef7147c2724ca18a1644f1f20cfa18358ced4817 Mon Sep 17 00:00:00 2001
From: xxyzz <gitpull@protonmail.com>
Date: Wed, 15 Jan 2025 17:44:08 +0800
Subject: [PATCH 2/3] [th] extract notes section and add more section titles

---
 src/wiktextract/extractor/th/models.py        |  5 +++++
 src/wiktextract/extractor/th/page.py          | 12 ++++++----
 src/wiktextract/extractor/th/pos.py           | 16 ++++++++++++++
 .../extractor/th/section_titles.py            | 22 +++++++++++++++++++
 4 files changed, 51 insertions(+), 4 deletions(-)

diff --git a/src/wiktextract/extractor/th/models.py b/src/wiktextract/extractor/th/models.py
index d42347df..82f5b626 100644
--- a/src/wiktextract/extractor/th/models.py
+++ b/src/wiktextract/extractor/th/models.py
@@ -96,3 +96,8 @@ class WordEntry(ThaiBaseModel):
     derived: list[Linkage] = []
     related: list[Linkage] = []
     descendants: list[Descendant] = []
+    anagrams: list[Linkage] = []
+    notes: list[str] = []
+    hyponyms: list[Linkage] = []
+    hypernyms: list[Linkage] = []
+    idioms: list[Linkage] = []
diff --git a/src/wiktextract/extractor/th/page.py b/src/wiktextract/extractor/th/page.py
index 03bf20d2..418dbaf2 100644
--- a/src/wiktextract/extractor/th/page.py
+++ b/src/wiktextract/extractor/th/page.py
@@ -11,7 +11,7 @@
 from .etymology import extract_etymology_section
 from .linkage import extract_linkage_section
 from .models import Sense, WordEntry
-from .pos import extract_pos_section
+from .pos import extract_note_section, extract_pos_section
 from .section_titles import LINKAGE_SECTIONS, POS_DATA
 from .translation import extract_translation_section
 
@@ -29,7 +29,7 @@ def parse_section(
         extract_pos_section(wxr, page_data, base_data, level_node, title_text)
     elif title_text == "รากศัพท์":
         extract_etymology_section(wxr, base_data, level_node)
-    elif title_text == "คำแปลภาษาอื่น":
+    elif title_text in ["คำแปลภาษาอื่น", "คำแปล"]:
         extract_translation_section(
             wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
         )
@@ -44,13 +44,17 @@ def parse_section(
         extract_descendant_section(
             wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
         )
-    elif title_text == "การออกเสียง":
+    elif title_text.startswith(("การออกเสียง", "การอ่านออกเสียง")):
         pass  # sounds
     elif title_text == "รูปแบบอื่น":
         extract_alt_form_section(
             wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
         )
-    elif title_text not in ["ดูเพิ่ม"]:
+    elif title_text == "การใช้":
+        extract_note_section(
+            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
+        )
+    elif title_text not in ["ดูเพิ่ม", "อ้างอิง", "อ่านเพิ่ม", "อ่านเพิ่มเติม"]:
         wxr.wtp.debug(f"Unknown title: {title_text}")
 
     for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
diff --git a/src/wiktextract/extractor/th/pos.py b/src/wiktextract/extractor/th/pos.py
index d5d272e5..fab32407 100644
--- a/src/wiktextract/extractor/th/pos.py
+++ b/src/wiktextract/extractor/th/pos.py
@@ -151,3 +151,19 @@ def extract_th_verb_adj_template(
             )
 
     clean_node(wxr, word_entry, expanded_node)
+
+
+def extract_note_section(
+    wxr: WiktextractContext,
+    word_entry: WordEntry,
+    level_node: LevelNode,
+) -> None:
+    """Append the text of each list item in the section to `notes`."""
+    for note_list in level_node.find_child(NodeKind.LIST):
+        for item in note_list.find_child(NodeKind.LIST_ITEM):
+            # nested lists are left out of the note text
+            item_nodes = list(item.invert_find_child(NodeKind.LIST))
+            note_str = clean_node(wxr, word_entry, item_nodes)
+            if len(note_str) == 0:
+                continue
+            word_entry.notes.append(note_str)
diff --git a/src/wiktextract/extractor/th/section_titles.py b/src/wiktextract/extractor/th/section_titles.py
index ff14f847..5bd7a6f5 100644
--- a/src/wiktextract/extractor/th/section_titles.py
+++ b/src/wiktextract/extractor/th/section_titles.py
@@ -12,6 +12,20 @@
     "คำอุทาน": {"pos": "intj"},
     "วลี": {"pos": "phrase"},
     "เลข": {"pos": "num", "tags": ["number"]},
+    "อักษรจีน": {"pos": "character"},
+    "การถอดเป็นอักษรโรมัน": {"pos": "romanization"},
+    "สัญลักษณ์": {"pos": "symbol"},
+    "คำวิเศษณ์": {"pos": "adj"},
+    "ตัวอักษร": {"pos": "character"},
+    "รูปผันคำกำกับนาม": {"pos": "noun", "tags": ["form-of"]},
+    "รูปผันคำสรรพนาม": {"pos": "pron", "tags": ["form-of"]},
+    "คำกำกับนาม": {"pos": "noun"},
+    "ตัวเลข": {"pos": "num", "tags": ["number"]},
+    "พาร์ทิซิเพิล": {"pos": "verb", "tags": ["participle"]},
+    "พยางค์": {"pos": "syllable"},
+    "คันจิ": {"pos": "character", "tags": ["kanji"]},
+    "คำอาการนาม": {"pos": "adj_noun"},
+    "อุปสรรค": {"pos": "prefix", "tags": ["morpheme"]},
 }
 
 
@@ -20,4 +34,12 @@
     "คำพ้องความ": "synonyms",
     "ลูกคำ": "derived",
     "คำเกี่ยวข้อง": "related",
+    "คำที่เกี่ยวข้อง": "related",
+    "คำที่รับมา": "derived",
+    "คำตรงกันข้าม": "antonyms",
+    "คำสลับอักษร": "anagrams",
+    "การสลับอักษร": "anagrams",
+    "คำลูกกลุ่ม": "hyponyms",
+    "คำจ่ากลุ่ม": "hypernyms",
+    "สำนวน": "idioms",
 }

From 4fd882ad95a3a6f1a2c5b06c8b2a43fa94b2536b Mon Sep 17 00:00:00 2001
From: xxyzz <gitpull@protonmail.com>
Date: Wed, 15 Jan 2025 18:17:51 +0800
Subject: [PATCH 3/3] [it,pt,th] delete temp db file in tests

---
 tests/test_it_etymology.py   | 3 +++
 tests/test_it_example.py     | 3 +++
 tests/test_it_forms.py       | 3 +++
 tests/test_it_gloss.py       | 3 +++
 tests/test_it_linkage.py     | 3 +++
 tests/test_it_sound.py       | 3 +++
 tests/test_it_translation.py | 3 +++
 tests/test_pt_etymology.py   | 3 +++
 tests/test_pt_example.py     | 3 +++
 tests/test_pt_form.py        | 3 +++
 tests/test_pt_gloss.py       | 3 +++
 tests/test_pt_head_line.py   | 3 +++
 tests/test_pt_linkage.py     | 3 +++
 tests/test_pt_sound.py       | 3 +++
 tests/test_pt_translation.py | 3 +++
 tests/test_th_desc.py        | 3 +++
 tests/test_th_example.py     | 3 +++
 tests/test_th_gloss.py       | 3 +++
 tests/test_th_linkage.py     | 3 +++
 tests/test_th_translation.py | 3 +++
 20 files changed, 60 insertions(+)

diff --git a/tests/test_it_etymology.py b/tests/test_it_etymology.py
index 110bc18a..1b9c8101 100644
--- a/tests/test_it_etymology.py
+++ b/tests/test_it_etymology.py
@@ -18,6 +18,9 @@ def setUp(self) -> None:
             ),
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close db so its temp file is deleted
+
     def test_quote_template(self):
         self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
         data = parse_page(
diff --git a/tests/test_it_example.py b/tests/test_it_example.py
index e8079358..d367f3fe 100644
--- a/tests/test_it_example.py
+++ b/tests/test_it_example.py
@@ -18,6 +18,9 @@ def setUp(self) -> None:
             ),
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close db so its temp file is deleted
+
     def test_list_example(self):
         self.wxr.wtp.add_page("Template:-br-", 10, "Bretone")
         data = parse_page(
diff --git a/tests/test_it_forms.py b/tests/test_it_forms.py
index 5ad323f0..4cd9dba9 100644
--- a/tests/test_it_forms.py
+++ b/tests/test_it_forms.py
@@ -18,6 +18,9 @@ def setUp(self) -> None:
             ),
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close db so its temp file is deleted
+
     def test_tabs_template(self):
         self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
         data = parse_page(
diff --git a/tests/test_it_gloss.py b/tests/test_it_gloss.py
index b5d2a259..83f19e60 100644
--- a/tests/test_it_gloss.py
+++ b/tests/test_it_gloss.py
@@ -18,6 +18,9 @@ def setUp(self) -> None:
             ),
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close db so its temp file is deleted
+
     def test_gloss_list(self):
         self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
         self.wxr.wtp.add_page(
diff --git a/tests/test_it_linkage.py b/tests/test_it_linkage.py
index f1cf5c23..d0ad5d53 100644
--- a/tests/test_it_linkage.py
+++ b/tests/test_it_linkage.py
@@ -18,6 +18,9 @@ def setUp(self) -> None:
             ),
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close db so its temp file is deleted
+
     def test_synonyms(self):
         self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
         self.wxr.wtp.add_page(
diff --git a/tests/test_it_sound.py b/tests/test_it_sound.py
index 30ba4a95..d67736b5 100644
--- a/tests/test_it_sound.py
+++ b/tests/test_it_sound.py
@@ -18,6 +18,9 @@ def setUp(self) -> None:
             ),
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close db so its temp file is deleted
+
     def test_hyphenation_single_list(self):
         self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
         data = parse_page(
diff --git a/tests/test_it_translation.py b/tests/test_it_translation.py
index 1ab8dfb7..60c39faa 100644
--- a/tests/test_it_translation.py
+++ b/tests/test_it_translation.py
@@ -18,6 +18,9 @@ def setUp(self) -> None:
             ),
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close db so its temp file is deleted
+
     def test_common_lists(self):
         self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
         self.wxr.wtp.add_page("Template:ar", 10, "arabo")
diff --git a/tests/test_pt_etymology.py b/tests/test_pt_etymology.py
index de08b0f2..5045f16a 100644
--- a/tests/test_pt_etymology.py
+++ b/tests/test_pt_etymology.py
@@ -23,6 +23,9 @@ def setUp(self) -> None:
             conf,
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close db so its temp file is deleted
+
     def test_list(self):
         self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
         self.wxr.wtp.add_page(
diff --git a/tests/test_pt_example.py b/tests/test_pt_example.py
index e106ce58..3cfe8150 100644
--- a/tests/test_pt_example.py
+++ b/tests/test_pt_example.py
@@ -23,6 +23,9 @@ def setUp(self) -> None:
             conf,
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close db so its temp file is deleted
+
     def test_tradex_template(self):
         self.wxr.wtp.add_page("Predefinição:-ryu-", 10, "Okinawano")
         self.wxr.wtp.add_page("Predefinição:Substantivo", 10, "Substantivo")
diff --git a/tests/test_pt_form.py b/tests/test_pt_form.py
index 2b3fce00..3b7af273 100644
--- a/tests/test_pt_form.py
+++ b/tests/test_pt_form.py
@@ -23,6 +23,9 @@ def setUp(self) -> None:
             conf,
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close db so its temp file is deleted
+
     def test_flex_pt_subst_completa(self):
         self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
         self.wxr.wtp.add_page(
diff --git a/tests/test_pt_gloss.py b/tests/test_pt_gloss.py
index 7458db65..a454a33f 100644
--- a/tests/test_pt_gloss.py
+++ b/tests/test_pt_gloss.py
@@ -23,6 +23,9 @@ def setUp(self) -> None:
             conf,
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close db so its temp file is deleted
+
     def test_escopo(self):
         self.wxr.wtp.add_page(
             "Predefinição:-pt-",
diff --git a/tests/test_pt_head_line.py b/tests/test_pt_head_line.py
index 6e88274b..bdffc532 100644
--- a/tests/test_pt_head_line.py
+++ b/tests/test_pt_head_line.py
@@ -23,6 +23,9 @@ def setUp(self) -> None:
             conf,
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close db so its temp file is deleted
+
     def test_gramática_template(self):
         self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
         self.wxr.wtp.add_page("Predefinição:g", 10, "''masculino''")
diff --git a/tests/test_pt_linkage.py b/tests/test_pt_linkage.py
index f260b4cd..09775fe7 100644
--- a/tests/test_pt_linkage.py
+++ b/tests/test_pt_linkage.py
@@ -23,6 +23,9 @@ def setUp(self) -> None:
             conf,
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close db so its temp file is deleted
+
     def test_expression(self):
         self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
         self.wxr.wtp.add_page("Predefinição:g", 10, "''masculino''")
diff --git a/tests/test_pt_sound.py b/tests/test_pt_sound.py
index 2078c8df..2cc887cd 100644
--- a/tests/test_pt_sound.py
+++ b/tests/test_pt_sound.py
@@ -23,6 +23,9 @@ def setUp(self) -> None:
             conf,
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close db so its temp file is deleted
+
     def test_subsection(self):
         self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
         self.wxr.wtp.add_page(
diff --git a/tests/test_pt_translation.py b/tests/test_pt_translation.py
index dff4a040..18ddc2c1 100644
--- a/tests/test_pt_translation.py
+++ b/tests/test_pt_translation.py
@@ -23,6 +23,9 @@ def setUp(self) -> None:
             conf,
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close db so its temp file is deleted
+
     def test_subpage(self):
         self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
         self.wxr.wtp.add_page(
diff --git a/tests/test_th_desc.py b/tests/test_th_desc.py
index f9c47d58..584faa6f 100644
--- a/tests/test_th_desc.py
+++ b/tests/test_th_desc.py
@@ -18,6 +18,9 @@ def setUp(self) -> None:
             ),
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close db so its temp file is deleted
+
     def test_desc_template(self):
         self.wxr.wtp.add_page(
             "แม่แบบ:desc",
diff --git a/tests/test_th_example.py b/tests/test_th_example.py
index 97f40da3..d0a042c7 100644
--- a/tests/test_th_example.py
+++ b/tests/test_th_example.py
@@ -18,6 +18,9 @@ def setUp(self) -> None:
             ),
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close db so its temp file is deleted
+
     def test_ux(self):
         self.wxr.wtp.add_page(
             "แม่แบบ:ko-usex",
diff --git a/tests/test_th_gloss.py b/tests/test_th_gloss.py
index 0bb2c6ae..bc10beec 100644
--- a/tests/test_th_gloss.py
+++ b/tests/test_th_gloss.py
@@ -18,6 +18,9 @@ def setUp(self) -> None:
             ),
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close db so its temp file is deleted
+
     def test_do_not_share_etymology_data(self):
         self.wxr.wtp.add_page(
             "แม่แบบ:inh+",
diff --git a/tests/test_th_linkage.py b/tests/test_th_linkage.py
index 5a3310b5..a22f03d9 100644
--- a/tests/test_th_linkage.py
+++ b/tests/test_th_linkage.py
@@ -18,6 +18,9 @@ def setUp(self) -> None:
             ),
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close db so its temp file is deleted
+
     def test_col(self):
         self.wxr.wtp.add_page(
             "แม่แบบ:col2",
diff --git a/tests/test_th_translation.py b/tests/test_th_translation.py
index 2a43dd69..644ab46c 100644
--- a/tests/test_th_translation.py
+++ b/tests/test_th_translation.py
@@ -18,6 +18,9 @@ def setUp(self) -> None:
             ),
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close db so its temp file is deleted
+
     def test_nested_list(self):
         self.wxr.wtp.add_page(
             "แม่แบบ:trans-top",