Merge pull request #988 from xxyzz/th

[th] extract "lo-alt" form template and notes section
tatuylonen · Jan 15, 2025
2 parents b941637 + 4fd882a
commit 1565781

Showing 25 changed files with 168 additions and 9 deletions.
diff --git a/src/wiktextract/extractor/th/alt_form.py b/src/wiktextract/extractor/th/alt_form.py
@@ -1,4 +1,4 @@
-from wikitextprocessor import LevelNode, NodeKind, TemplateNode
+from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
 
 from ...page import clean_node
 from ...wxr_context import WiktextractContext
@@ -18,26 +18,54 @@ def extract_alt_form_section(
                 ):
                     extract_alt_template(wxr, word_entry, node)
 
+    for t_node in level_node.find_child(NodeKind.TEMPLATE):
+        if t_node.template_name == "lo-alt":
+            extract_lo_alt_template(wxr, word_entry, t_node)
+
 
 def extract_alt_template(
     wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
 ) -> None:
     expanded_node = wxr.wtp.parse(
         wxr.wtp.node_to_wikitext(t_node), expand_all=True
     )
+    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
+    extract_alt_expanded_nodes(wxr, word_entry, expanded_node, lang_code)
 
+
+def extract_alt_expanded_nodes(
+    wxr: WiktextractContext,
+    word_entry: WordEntry,
+    root: WikiNode,
+    lang_code: str,
+) -> None:  # extract forms (and their romanization) found under `root`
     raw_tags = []
-    for italic_node in expanded_node.find_child(NodeKind.ITALIC):
+    for italic_node in root.find_child(NodeKind.ITALIC):
         raw_tags_str = clean_node(wxr, None, italic_node)
         for raw_tag in raw_tags_str.split(","):
             raw_tag = raw_tag.strip()
             if raw_tag != "":
                 raw_tags.append(raw_tag)
         break
 
-    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
-    for span_tag in expanded_node.find_html("span"):
-        if span_tag.attrs.get("lang", "") == lang_code:
+    old_len = len(word_entry.forms)  # romanize only forms added by this call
+    for span_tag in root.find_html("span"):
+        span_lang = span_tag.attrs.get("lang", "")
+        if span_lang == lang_code:
             form = Form(form=clean_node(wxr, None, span_tag), raw_tags=raw_tags)
             translate_raw_tags(form)
             word_entry.forms.append(form)
+        elif span_lang.endswith("-Latn") and len(word_entry.forms) > old_len:
+            word_entry.forms[-1].roman = clean_node(wxr, None, span_tag)
+    clean_node(wxr, word_entry, root)  # result unused; word_entry is passed
+
+
+def extract_lo_alt_template(
+    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
+) -> None:
+    """Expand `t_node` and extract "lo" forms from each expanded list item."""
+    template_text = wxr.wtp.node_to_wikitext(t_node)
+    expanded_root = wxr.wtp.parse(template_text, expand_all=True)
+    for item_list in expanded_root.find_child(NodeKind.LIST):
+        for item in item_list.find_child(NodeKind.LIST_ITEM):
+            extract_alt_expanded_nodes(wxr, word_entry, item, "lo")
diff --git a/src/wiktextract/extractor/th/models.py b/src/wiktextract/extractor/th/models.py
@@ -41,6 +41,7 @@ class Form(ThaiBaseModel):
     form: str
     tags: list[str] = []
     raw_tags: list[str] = []
+    roman: str = ""
 
 
 class Translation(ThaiBaseModel):
@@ -95,3 +96,8 @@ class WordEntry(ThaiBaseModel):
     derived: list[Linkage] = []
     related: list[Linkage] = []
     descendants: list[Descendant] = []
+    anagrams: list[Linkage] = []
+    notes: list[str] = []
+    hyponyms: list[Linkage] = []
+    hypernyms: list[Linkage] = []
+    idioms: list[Linkage] = []
diff --git a/src/wiktextract/extractor/th/page.py b/src/wiktextract/extractor/th/page.py
@@ -11,7 +11,7 @@
 from .etymology import extract_etymology_section
 from .linkage import extract_linkage_section
 from .models import Sense, WordEntry
-from .pos import extract_pos_section
+from .pos import extract_note_section, extract_pos_section
 from .section_titles import LINKAGE_SECTIONS, POS_DATA
 from .translation import extract_translation_section
 
@@ -29,7 +29,7 @@ def parse_section(
         extract_pos_section(wxr, page_data, base_data, level_node, title_text)
     elif title_text == "รากศัพท์":
         extract_etymology_section(wxr, base_data, level_node)
-    elif title_text == "คำแปลภาษาอื่น":
+    elif title_text in ["คำแปลภาษาอื่น", "คำแปล"]:
         extract_translation_section(
             wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
         )
@@ -44,13 +44,17 @@ def parse_section(
         extract_descendant_section(
             wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
         )
-    elif title_text == "การออกเสียง":
+    elif title_text.startswith(("การออกเสียง", "การอ่านออกเสียง")):
         pass  # sounds
     elif title_text == "รูปแบบอื่น":
         extract_alt_form_section(
             wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
         )
-    elif title_text not in ["ดูเพิ่ม"]:
+    elif title_text == "การใช้":
+        extract_note_section(
+            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
+        )
+    elif title_text not in ["ดูเพิ่ม", "อ้างอิง", "อ่านเพิ่ม", "อ่านเพิ่มเติม"]:
         wxr.wtp.debug(f"Unknown title: {title_text}")
 
     for next_level in level_node.find_child(LEVEL_KIND_FLAGS):

diff --git a/src/wiktextract/extractor/th/pos.py b/src/wiktextract/extractor/th/pos.py
@@ -151,3 +151,19 @@ def extract_th_verb_adj_template(
             )
 
     clean_node(wxr, word_entry, expanded_node)
+
+
+def extract_note_section(
+    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
+) -> None:
+    """Append the cleaned text of each list item to `word_entry.notes`.
+
+    Child nodes of kind LIST inside an item are left out of its text, and
+    items whose cleaned text is empty are not stored.
+    """
+    for note_list in level_node.find_child(NodeKind.LIST):
+        for item in note_list.find_child(NodeKind.LIST_ITEM):
+            item_nodes = list(item.invert_find_child(NodeKind.LIST))
+            note_text = clean_node(wxr, word_entry, item_nodes)
+            if note_text != "":
+                word_entry.notes.append(note_text)
diff --git a/src/wiktextract/extractor/th/section_titles.py b/src/wiktextract/extractor/th/section_titles.py
@@ -12,6 +12,20 @@
     "คำอุทาน": {"pos": "intj"},
     "วลี": {"pos": "phrase"},
     "เลข": {"pos": "num", "tags": ["number"]},
+    "อักษรจีน": {"pos": "character"},
+    "การถอดเป็นอักษรโรมัน": {"pos": "romanization"},
+    "สัญลักษณ์": {"pos": "symbol"},
+    "คำวิเศษณ์": {"pos": "adj"},
+    "ตัวอักษร": {"pos": "character"},
+    "รูปผันคำกำกับนาม": {"pos": "noun", "tags": ["form-of"]},
+    "รูปผันคำสรรพนาม": {"pos": "pron", "tags": ["form-of"]},
+    "คำกำกับนาม": {"pos": "noun"},
+    "ตัวเลข": {"pos": "num", "tags": ["number"]},
+    "พาร์ทิซิเพิล": {"pos": "verb", "tags": ["participle"]},
+    "พยางค์": {"pos": "syllable"},
+    "คันจิ": {"pos": "character", "tags": ["kanji"]},
+    "คำอาการนาม": {"pos": "adj_noun"},
+    "อุปสรรค": {"pos": "prefix", "tags": ["morpheme"]},
 }
 
 
@@ -20,4 +34,12 @@
     "คำพ้องความ": "synonyms",
     "ลูกคำ": "derived",
     "คำเกี่ยวข้อง": "related",
+    "คำที่เกี่ยวข้อง": "related",
+    "คำที่รับมา": "derived",
+    "คำตรงกันข้าม": "antonyms",
+    "คำสลับอักษร": "anagrams",
+    "การสลับอักษร": "anagrams",
+    "คำลูกกลุ่ม": "hyponyms",
+    "คำจ่ากลุ่ม": "hypernyms",
+    "สำนวน": "idioms",
 }
diff --git a/tests/test_it_etymology.py b/tests/test_it_etymology.py
@@ -18,6 +18,9 @@ def setUp(self) -> None:
             ),
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close the wtp database connection
+
     def test_quote_template(self):
         self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
         data = parse_page(

diff --git a/tests/test_it_example.py b/tests/test_it_example.py
@@ -18,6 +18,9 @@ def setUp(self) -> None:
             ),
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close the wtp database connection
+
     def test_list_example(self):
         self.wxr.wtp.add_page("Template:-br-", 10, "Bretone")
         data = parse_page(

diff --git a/tests/test_it_forms.py b/tests/test_it_forms.py
@@ -18,6 +18,9 @@ def setUp(self) -> None:
             ),
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close the wtp database connection
+
     def test_tabs_template(self):
         self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
         data = parse_page(

diff --git a/tests/test_it_gloss.py b/tests/test_it_gloss.py
@@ -18,6 +18,9 @@ def setUp(self) -> None:
             ),
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close the wtp database connection
+
     def test_gloss_list(self):
         self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
         self.wxr.wtp.add_page(

diff --git a/tests/test_it_linkage.py b/tests/test_it_linkage.py
@@ -18,6 +18,9 @@ def setUp(self) -> None:
             ),
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close the wtp database connection
+
     def test_synonyms(self):
         self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
         self.wxr.wtp.add_page(

diff --git a/tests/test_it_sound.py b/tests/test_it_sound.py
@@ -18,6 +18,9 @@ def setUp(self) -> None:
             ),
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close the wtp database connection
+
     def test_hyphenation_single_list(self):
         self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
         data = parse_page(

diff --git a/tests/test_it_translation.py b/tests/test_it_translation.py
@@ -18,6 +18,9 @@ def setUp(self) -> None:
             ),
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close the wtp database connection
+
     def test_common_lists(self):
         self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
         self.wxr.wtp.add_page("Template:ar", 10, "arabo")

diff --git a/tests/test_pt_etymology.py b/tests/test_pt_etymology.py
@@ -23,6 +23,9 @@ def setUp(self) -> None:
             conf,
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close the wtp database connection
+
     def test_list(self):
         self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
         self.wxr.wtp.add_page(

diff --git a/tests/test_pt_example.py b/tests/test_pt_example.py
@@ -23,6 +23,9 @@ def setUp(self) -> None:
             conf,
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close the wtp database connection
+
     def test_tradex_template(self):
         self.wxr.wtp.add_page("Predefinição:-ryu-", 10, "Okinawano")
         self.wxr.wtp.add_page("Predefinição:Substantivo", 10, "Substantivo")

diff --git a/tests/test_pt_form.py b/tests/test_pt_form.py
@@ -23,6 +23,9 @@ def setUp(self) -> None:
             conf,
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close the wtp database connection
+
     def test_flex_pt_subst_completa(self):
         self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
         self.wxr.wtp.add_page(

diff --git a/tests/test_pt_gloss.py b/tests/test_pt_gloss.py
@@ -23,6 +23,9 @@ def setUp(self) -> None:
             conf,
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close the wtp database connection
+
     def test_escopo(self):
         self.wxr.wtp.add_page(
             "Predefinição:-pt-",

diff --git a/tests/test_pt_head_line.py b/tests/test_pt_head_line.py
@@ -23,6 +23,9 @@ def setUp(self) -> None:
             conf,
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close the wtp database connection
+
     def test_gramática_template(self):
         self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
         self.wxr.wtp.add_page("Predefinição:g", 10, "''masculino''")

diff --git a/tests/test_pt_linkage.py b/tests/test_pt_linkage.py
@@ -23,6 +23,9 @@ def setUp(self) -> None:
             conf,
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close the wtp database connection
+
     def test_expression(self):
         self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
         self.wxr.wtp.add_page("Predefinição:g", 10, "''masculino''")

diff --git a/tests/test_pt_sound.py b/tests/test_pt_sound.py
@@ -23,6 +23,9 @@ def setUp(self) -> None:
             conf,
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close the wtp database connection
+
     def test_subsection(self):
         self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
         self.wxr.wtp.add_page(

diff --git a/tests/test_pt_translation.py b/tests/test_pt_translation.py
@@ -23,6 +23,9 @@ def setUp(self) -> None:
             conf,
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close the wtp database connection
+
     def test_subpage(self):
         self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
         self.wxr.wtp.add_page(

diff --git a/tests/test_th_desc.py b/tests/test_th_desc.py
@@ -18,6 +18,9 @@ def setUp(self) -> None:
             ),
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close the wtp database connection
+
     def test_desc_template(self):
         self.wxr.wtp.add_page(
             "แม่แบบ:desc",

diff --git a/tests/test_th_example.py b/tests/test_th_example.py
@@ -18,6 +18,9 @@ def setUp(self) -> None:
             ),
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close the wtp database connection
+
     def test_ux(self):
         self.wxr.wtp.add_page(
             "แม่แบบ:ko-usex",

diff --git a/tests/test_th_gloss.py b/tests/test_th_gloss.py
@@ -18,6 +18,9 @@ def setUp(self) -> None:
             ),
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close the wtp database connection
+
     def test_do_not_share_etymology_data(self):
         self.wxr.wtp.add_page(
             "แม่แบบ:inh+",
@@ -208,3 +211,26 @@ def test_alt_template(self):
                 {"form": "เดิร", "raw_tags": ["เลิกใช้"]},
             ],
         )
+
+    def test_lo_alt(self):  # "lo-alt" item -> form, raw tag and romanization
+        self.wxr.wtp.add_page(
+            "แม่แบบ:lo-alt",
+            10,  # presumably the Template namespace id — TODO confirm
+            """* (''ล้าสมัย'') <span class="Laoo" lang="lo">[[ທຸຣຽນ#ภาษาลาว|ທຸຣຽນ]]</span> <span class="mention-gloss-paren annotation-paren">(</span><span lang="lo-Latn" class="tr Latn">ทุรย̂น</span><span class="mention-gloss-paren annotation-paren">)</span>""",
+        )
+        page_data = parse_page(
+            self.wxr,
+            "ທຸລຽນ",
+            """== ภาษาลาว ==
+=== รูปแบบอื่น ===
+{{lo-alt|d=ທຸຣຽນ}}
+=== คำนาม ===
+{{lo-noun}}
+# [[ทุเรียน]]""",
+        )
+        self.assertEqual(
+            page_data[0]["forms"],  # forms of the first parsed entry
+            [
+                {"form": "ທຸຣຽນ", "raw_tags": ["ล้าสมัย"], "roman": "ทุรย̂น"},
+            ],
+        )
diff --git a/tests/test_th_linkage.py b/tests/test_th_linkage.py
@@ -18,6 +18,9 @@ def setUp(self) -> None:
             ),
         )
 
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()  # close the wtp database connection
+
     def test_col(self):
         self.wxr.wtp.add_page(
             "แม่แบบ:col2",