diff --git a/src/wiktextract/extractor/th/alt_form.py b/src/wiktextract/extractor/th/alt_form.py
index 55e9febf..fff7052e 100644
--- a/src/wiktextract/extractor/th/alt_form.py
+++ b/src/wiktextract/extractor/th/alt_form.py
@@ -1,4 +1,4 @@
-from wikitextprocessor import LevelNode, NodeKind, TemplateNode
+from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
from ...page import clean_node
from ...wxr_context import WiktextractContext
@@ -18,6 +18,10 @@ def extract_alt_form_section(
):
extract_alt_template(wxr, word_entry, node)
+ for t_node in level_node.find_child(NodeKind.TEMPLATE):
+ if t_node.template_name == "lo-alt":
+ extract_lo_alt_template(wxr, word_entry, t_node)
+
def extract_alt_template(
wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
@@ -25,9 +29,18 @@ def extract_alt_template(
expanded_node = wxr.wtp.parse(
wxr.wtp.node_to_wikitext(t_node), expand_all=True
)
+ lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
+ extract_alt_expanded_nodes(wxr, word_entry, expanded_node, lang_code)
+
+def extract_alt_expanded_nodes(
+ wxr: WiktextractContext,
+ word_entry: WordEntry,
+ root: WikiNode,
+ lang_code: str,
+) -> None:
raw_tags = []
- for italic_node in expanded_node.find_child(NodeKind.ITALIC):
+ for italic_node in root.find_child(NodeKind.ITALIC):
raw_tags_str = clean_node(wxr, None, italic_node)
for raw_tag in raw_tags_str.split(","):
raw_tag = raw_tag.strip()
@@ -35,9 +48,24 @@ def extract_alt_template(
raw_tags.append(raw_tag)
break
- lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
- for span_tag in expanded_node.find_html("span"):
- if span_tag.attrs.get("lang", "") == lang_code:
+ for span_tag in root.find_html("span"):
+ span_lang = span_tag.attrs.get("lang", "")
+ if span_lang == lang_code:
form = Form(form=clean_node(wxr, None, span_tag), raw_tags=raw_tags)
translate_raw_tags(form)
word_entry.forms.append(form)
+ elif span_lang.endswith("-Latn") and len(word_entry.forms) > 0:
+ word_entry.forms[-1].roman = clean_node(wxr, None, span_tag)
+
+ clean_node(wxr, word_entry, root)
+
+
+def extract_lo_alt_template(
+    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
+) -> None:
+    """Extract alternative forms from a ``lo-alt`` template.
+
+    The template is expanded with ``expand_all=True``; every list item of
+    the lists found directly under the expanded root is then passed to
+    ``extract_alt_expanded_nodes`` with the language code ``"lo"``, which
+    does the actual form extraction on ``word_entry``.
+    """
+    expanded_node = wxr.wtp.parse(
+        wxr.wtp.node_to_wikitext(t_node), expand_all=True
+    )
+    for list_node in expanded_node.find_child(NodeKind.LIST):
+        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+            extract_alt_expanded_nodes(wxr, word_entry, list_item, "lo")
diff --git a/src/wiktextract/extractor/th/models.py b/src/wiktextract/extractor/th/models.py
index c04492a9..82f5b626 100644
--- a/src/wiktextract/extractor/th/models.py
+++ b/src/wiktextract/extractor/th/models.py
@@ -41,6 +41,7 @@ class Form(ThaiBaseModel):
form: str
tags: list[str] = []
raw_tags: list[str] = []
+ roman: str = ""
class Translation(ThaiBaseModel):
@@ -95,3 +96,8 @@ class WordEntry(ThaiBaseModel):
derived: list[Linkage] = []
related: list[Linkage] = []
descendants: list[Descendant] = []
+ anagrams: list[Linkage] = []
+ notes: list[str] = []
+ hyponyms: list[Linkage] = []
+ hypernyms: list[Linkage] = []
+ idioms: list[Linkage] = []
diff --git a/src/wiktextract/extractor/th/page.py b/src/wiktextract/extractor/th/page.py
index 03bf20d2..418dbaf2 100644
--- a/src/wiktextract/extractor/th/page.py
+++ b/src/wiktextract/extractor/th/page.py
@@ -11,7 +11,7 @@
from .etymology import extract_etymology_section
from .linkage import extract_linkage_section
from .models import Sense, WordEntry
-from .pos import extract_pos_section
+from .pos import extract_note_section, extract_pos_section
from .section_titles import LINKAGE_SECTIONS, POS_DATA
from .translation import extract_translation_section
@@ -29,7 +29,7 @@ def parse_section(
extract_pos_section(wxr, page_data, base_data, level_node, title_text)
elif title_text == "รากศัพท์":
extract_etymology_section(wxr, base_data, level_node)
- elif title_text == "คำแปลภาษาอื่น":
+ elif title_text in ["คำแปลภาษาอื่น", "คำแปล"]:
extract_translation_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
)
@@ -44,13 +44,17 @@ def parse_section(
extract_descendant_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
)
- elif title_text == "การออกเสียง":
+ elif title_text.startswith(("การออกเสียง", "การอ่านออกเสียง")):
pass # sounds
elif title_text == "รูปแบบอื่น":
extract_alt_form_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
)
- elif title_text not in ["ดูเพิ่ม"]:
+ elif title_text == "การใช้":
+ extract_note_section(
+ wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
+ )
+ elif title_text not in ["ดูเพิ่ม", "อ้างอิง", "อ่านเพิ่ม", "อ่านเพิ่มเติม"]:
wxr.wtp.debug(f"Unknown title: {title_text}")
for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
diff --git a/src/wiktextract/extractor/th/pos.py b/src/wiktextract/extractor/th/pos.py
index d5d272e5..fab32407 100644
--- a/src/wiktextract/extractor/th/pos.py
+++ b/src/wiktextract/extractor/th/pos.py
@@ -151,3 +151,19 @@ def extract_th_verb_adj_template(
)
clean_node(wxr, word_entry, expanded_node)
+
+
+def extract_note_section(
+    wxr: WiktextractContext,
+    word_entry: WordEntry,
+    level_node: LevelNode,
+) -> None:
+    """Collect the list items of a section as notes on ``word_entry``.
+
+    Each list item found in the lists directly under ``level_node`` is
+    cleaned to plain text and, when the result is not empty, appended to
+    ``word_entry.notes``.
+    """
+    for list_node in level_node.find_child(NodeKind.LIST):
+        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+            # NOTE(review): invert_find_child(NodeKind.LIST) presumably
+            # yields the children that are not nested lists, so a note
+            # holds only the item's own text -- confirm against
+            # wikitextprocessor.
+            note_str = clean_node(
+                wxr,
+                word_entry,
+                list(list_item.invert_find_child(NodeKind.LIST)),
+            )
+            if note_str != "":
+                word_entry.notes.append(note_str)
diff --git a/src/wiktextract/extractor/th/section_titles.py b/src/wiktextract/extractor/th/section_titles.py
index ff14f847..5bd7a6f5 100644
--- a/src/wiktextract/extractor/th/section_titles.py
+++ b/src/wiktextract/extractor/th/section_titles.py
@@ -12,6 +12,20 @@
"คำอุทาน": {"pos": "intj"},
"วลี": {"pos": "phrase"},
"เลข": {"pos": "num", "tags": ["number"]},
+ "อักษรจีน": {"pos": "character"},
+ "การถอดเป็นอักษรโรมัน": {"pos": "romanization"},
+ "สัญลักษณ์": {"pos": "symbol"},
+ "คำวิเศษณ์": {"pos": "adj"},
+ "ตัวอักษร": {"pos": "character"},
+ "รูปผันคำกำกับนาม": {"pos": "noun", "tags": ["form-of"]},
+ "รูปผันคำสรรพนาม": {"pos": "pron", "tags": ["form-of"]},
+ "คำกำกับนาม": {"pos": "noun"},
+ "ตัวเลข": {"pos": "num", "tags": ["number"]},
+ "พาร์ทิซิเพิล": {"pos": "verb", "tags": ["participle"]},
+ "พยางค์": {"pos": "syllable"},
+ "คันจิ": {"pos": "character", "tags": ["kanji"]},
+ "คำอาการนาม": {"pos": "adj_noun"},
+ "อุปสรรค": {"pos": "prefix", "tags": ["morpheme"]},
}
@@ -20,4 +34,12 @@
"คำพ้องความ": "synonyms",
"ลูกคำ": "derived",
"คำเกี่ยวข้อง": "related",
+ "คำที่เกี่ยวข้อง": "related",
+ "คำที่รับมา": "derived",
+ "คำตรงกันข้าม": "antonyms",
+ "คำสลับอักษร": "anagrams",
+ "การสลับอักษร": "anagrams",
+ "คำลูกกลุ่ม": "hyponyms",
+ "คำจ่ากลุ่ม": "hypernyms",
+ "สำนวน": "idioms",
}
diff --git a/tests/test_it_etymology.py b/tests/test_it_etymology.py
index 110bc18a..1b9c8101 100644
--- a/tests/test_it_etymology.py
+++ b/tests/test_it_etymology.py
@@ -18,6 +18,9 @@ def setUp(self) -> None:
),
)
+    def tearDown(self):
+        """Close the wtp database connection after each test."""
+        self.wxr.wtp.close_db_conn()
+
def test_quote_template(self):
self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
data = parse_page(
diff --git a/tests/test_it_example.py b/tests/test_it_example.py
index e8079358..d367f3fe 100644
--- a/tests/test_it_example.py
+++ b/tests/test_it_example.py
@@ -18,6 +18,9 @@ def setUp(self) -> None:
),
)
+    def tearDown(self):
+        """Close the wtp database connection after each test."""
+        self.wxr.wtp.close_db_conn()
+
def test_list_example(self):
self.wxr.wtp.add_page("Template:-br-", 10, "Bretone")
data = parse_page(
diff --git a/tests/test_it_forms.py b/tests/test_it_forms.py
index 5ad323f0..4cd9dba9 100644
--- a/tests/test_it_forms.py
+++ b/tests/test_it_forms.py
@@ -18,6 +18,9 @@ def setUp(self) -> None:
),
)
+    def tearDown(self):
+        """Close the wtp database connection after each test."""
+        self.wxr.wtp.close_db_conn()
+
def test_tabs_template(self):
self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
data = parse_page(
diff --git a/tests/test_it_gloss.py b/tests/test_it_gloss.py
index b5d2a259..83f19e60 100644
--- a/tests/test_it_gloss.py
+++ b/tests/test_it_gloss.py
@@ -18,6 +18,9 @@ def setUp(self) -> None:
),
)
+    def tearDown(self):
+        """Close the wtp database connection after each test."""
+        self.wxr.wtp.close_db_conn()
+
def test_gloss_list(self):
self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
self.wxr.wtp.add_page(
diff --git a/tests/test_it_linkage.py b/tests/test_it_linkage.py
index f1cf5c23..d0ad5d53 100644
--- a/tests/test_it_linkage.py
+++ b/tests/test_it_linkage.py
@@ -18,6 +18,9 @@ def setUp(self) -> None:
),
)
+    def tearDown(self):
+        """Close the wtp database connection after each test."""
+        self.wxr.wtp.close_db_conn()
+
def test_synonyms(self):
self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
self.wxr.wtp.add_page(
diff --git a/tests/test_it_sound.py b/tests/test_it_sound.py
index 30ba4a95..d67736b5 100644
--- a/tests/test_it_sound.py
+++ b/tests/test_it_sound.py
@@ -18,6 +18,9 @@ def setUp(self) -> None:
),
)
+    def tearDown(self):
+        """Close the wtp database connection after each test."""
+        self.wxr.wtp.close_db_conn()
+
def test_hyphenation_single_list(self):
self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
data = parse_page(
diff --git a/tests/test_it_translation.py b/tests/test_it_translation.py
index 1ab8dfb7..60c39faa 100644
--- a/tests/test_it_translation.py
+++ b/tests/test_it_translation.py
@@ -18,6 +18,9 @@ def setUp(self) -> None:
),
)
+    def tearDown(self):
+        """Close the wtp database connection after each test."""
+        self.wxr.wtp.close_db_conn()
+
def test_common_lists(self):
self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
self.wxr.wtp.add_page("Template:ar", 10, "arabo")
diff --git a/tests/test_pt_etymology.py b/tests/test_pt_etymology.py
index de08b0f2..5045f16a 100644
--- a/tests/test_pt_etymology.py
+++ b/tests/test_pt_etymology.py
@@ -23,6 +23,9 @@ def setUp(self) -> None:
conf,
)
+    def tearDown(self):
+        """Close the wtp database connection after each test."""
+        self.wxr.wtp.close_db_conn()
+
def test_list(self):
self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
self.wxr.wtp.add_page(
diff --git a/tests/test_pt_example.py b/tests/test_pt_example.py
index e106ce58..3cfe8150 100644
--- a/tests/test_pt_example.py
+++ b/tests/test_pt_example.py
@@ -23,6 +23,9 @@ def setUp(self) -> None:
conf,
)
+    def tearDown(self):
+        """Close the wtp database connection after each test."""
+        self.wxr.wtp.close_db_conn()
+
def test_tradex_template(self):
self.wxr.wtp.add_page("Predefinição:-ryu-", 10, "Okinawano")
self.wxr.wtp.add_page("Predefinição:Substantivo", 10, "Substantivo")
diff --git a/tests/test_pt_form.py b/tests/test_pt_form.py
index 2b3fce00..3b7af273 100644
--- a/tests/test_pt_form.py
+++ b/tests/test_pt_form.py
@@ -23,6 +23,9 @@ def setUp(self) -> None:
conf,
)
+    def tearDown(self):
+        """Close the wtp database connection after each test."""
+        self.wxr.wtp.close_db_conn()
+
def test_flex_pt_subst_completa(self):
self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
self.wxr.wtp.add_page(
diff --git a/tests/test_pt_gloss.py b/tests/test_pt_gloss.py
index 7458db65..a454a33f 100644
--- a/tests/test_pt_gloss.py
+++ b/tests/test_pt_gloss.py
@@ -23,6 +23,9 @@ def setUp(self) -> None:
conf,
)
+    def tearDown(self):
+        """Close the wtp database connection after each test."""
+        self.wxr.wtp.close_db_conn()
+
def test_escopo(self):
self.wxr.wtp.add_page(
"Predefinição:-pt-",
diff --git a/tests/test_pt_head_line.py b/tests/test_pt_head_line.py
index 6e88274b..bdffc532 100644
--- a/tests/test_pt_head_line.py
+++ b/tests/test_pt_head_line.py
@@ -23,6 +23,9 @@ def setUp(self) -> None:
conf,
)
+    def tearDown(self):
+        """Close the wtp database connection after each test."""
+        self.wxr.wtp.close_db_conn()
+
def test_gramática_template(self):
self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
self.wxr.wtp.add_page("Predefinição:g", 10, "''masculino''")
diff --git a/tests/test_pt_linkage.py b/tests/test_pt_linkage.py
index f260b4cd..09775fe7 100644
--- a/tests/test_pt_linkage.py
+++ b/tests/test_pt_linkage.py
@@ -23,6 +23,9 @@ def setUp(self) -> None:
conf,
)
+    def tearDown(self):
+        """Close the wtp database connection after each test."""
+        self.wxr.wtp.close_db_conn()
+
def test_expression(self):
self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
self.wxr.wtp.add_page("Predefinição:g", 10, "''masculino''")
diff --git a/tests/test_pt_sound.py b/tests/test_pt_sound.py
index 2078c8df..2cc887cd 100644
--- a/tests/test_pt_sound.py
+++ b/tests/test_pt_sound.py
@@ -23,6 +23,9 @@ def setUp(self) -> None:
conf,
)
+    def tearDown(self):
+        """Close the wtp database connection after each test."""
+        self.wxr.wtp.close_db_conn()
+
def test_subsection(self):
self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
self.wxr.wtp.add_page(
diff --git a/tests/test_pt_translation.py b/tests/test_pt_translation.py
index dff4a040..18ddc2c1 100644
--- a/tests/test_pt_translation.py
+++ b/tests/test_pt_translation.py
@@ -23,6 +23,9 @@ def setUp(self) -> None:
conf,
)
+    def tearDown(self):
+        """Close the wtp database connection after each test."""
+        self.wxr.wtp.close_db_conn()
+
def test_subpage(self):
self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
self.wxr.wtp.add_page(
diff --git a/tests/test_th_desc.py b/tests/test_th_desc.py
index f9c47d58..584faa6f 100644
--- a/tests/test_th_desc.py
+++ b/tests/test_th_desc.py
@@ -18,6 +18,9 @@ def setUp(self) -> None:
),
)
+    def tearDown(self):
+        """Close the wtp database connection after each test."""
+        self.wxr.wtp.close_db_conn()
+
def test_desc_template(self):
self.wxr.wtp.add_page(
"แม่แบบ:desc",
diff --git a/tests/test_th_example.py b/tests/test_th_example.py
index 97f40da3..d0a042c7 100644
--- a/tests/test_th_example.py
+++ b/tests/test_th_example.py
@@ -18,6 +18,9 @@ def setUp(self) -> None:
),
)
+    def tearDown(self):
+        """Close the wtp database connection after each test."""
+        self.wxr.wtp.close_db_conn()
+
def test_ux(self):
self.wxr.wtp.add_page(
"แม่แบบ:ko-usex",
diff --git a/tests/test_th_gloss.py b/tests/test_th_gloss.py
index 11954325..bc10beec 100644
--- a/tests/test_th_gloss.py
+++ b/tests/test_th_gloss.py
@@ -18,6 +18,9 @@ def setUp(self) -> None:
),
)
+    def tearDown(self):
+        """Close the wtp database connection after each test."""
+        self.wxr.wtp.close_db_conn()
+
def test_do_not_share_etymology_data(self):
self.wxr.wtp.add_page(
"แม่แบบ:inh+",
@@ -208,3 +211,26 @@ def test_alt_template(self):
{"form": "เดิร", "raw_tags": ["เลิกใช้"]},
],
)
+
+    def test_lo_alt(self):
+        """Check ``lo-alt`` yields a form with raw tag and romanization."""
+        self.wxr.wtp.add_page(
+            "แม่แบบ:lo-alt",
+            10,
+            """* (''ล้าสมัย'') [[ທຸຣຽນ#ภาษาลาว|ທຸຣຽນ]] (ทุรย̂น)""",
+        )
+        page_data = parse_page(
+            self.wxr,
+            "ທຸລຽນ",
+            """== ภาษาลาว ==
+=== รูปแบบอื่น ===
+{{lo-alt|d=ທຸຣຽນ}}
+=== คำนาม ===
+{{lo-noun}}
+# [[ทุเรียน]]""",
+        )
+        self.assertEqual(
+            page_data[0]["forms"],
+            [
+                {"form": "ທຸຣຽນ", "raw_tags": ["ล้าสมัย"], "roman": "ทุรย̂น"},
+            ],
+        )
diff --git a/tests/test_th_linkage.py b/tests/test_th_linkage.py
index 5a3310b5..a22f03d9 100644
--- a/tests/test_th_linkage.py
+++ b/tests/test_th_linkage.py
@@ -18,6 +18,9 @@ def setUp(self) -> None:
),
)
+    def tearDown(self):
+        """Close the wtp database connection after each test."""
+        self.wxr.wtp.close_db_conn()
+
def test_col(self):
self.wxr.wtp.add_page(
"แม่แบบ:col2",
diff --git a/tests/test_th_translation.py b/tests/test_th_translation.py
index 2a43dd69..644ab46c 100644
--- a/tests/test_th_translation.py
+++ b/tests/test_th_translation.py
@@ -18,6 +18,9 @@ def setUp(self) -> None:
),
)
+    def tearDown(self):
+        """Close the wtp database connection after each test."""
+        self.wxr.wtp.close_db_conn()
+
def test_nested_list(self):
self.wxr.wtp.add_page(
"แม่แบบ:trans-top",