Skip to content

Commit

Permalink
Merge pull request #988 from xxyzz/th
Browse files Browse the repository at this point in the history
[th] extract "lo-alt" form template and notes section
  • Loading branch information
xxyzz authored Jan 15, 2025
2 parents b941637 + 4fd882a commit 1565781
Show file tree
Hide file tree
Showing 25 changed files with 168 additions and 9 deletions.
38 changes: 33 additions & 5 deletions src/wiktextract/extractor/th/alt_form.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from wikitextprocessor import LevelNode, NodeKind, TemplateNode
from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
Expand All @@ -18,26 +18,54 @@ def extract_alt_form_section(
):
extract_alt_template(wxr, word_entry, node)

for t_node in level_node.find_child(NodeKind.TEMPLATE):
if t_node.template_name == "lo-alt":
extract_lo_alt_template(wxr, word_entry, t_node)


def extract_alt_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Expand an alternative-form template and harvest its forms.

    The template node is turned back into wikitext and re-parsed with all
    templates expanded.  The expanded tree, together with the cleaned first
    positional template argument (used as the language code), is handed to
    ``extract_alt_expanded_nodes``.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)
    first_arg = t_node.template_parameters.get(1, "")
    extract_alt_expanded_nodes(
        wxr, word_entry, expanded_root, clean_node(wxr, None, first_arg)
    )


def extract_alt_expanded_nodes(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    root: WikiNode,
    lang_code: str,
) -> None:
    """Collect alternative forms from an already expanded node.

    Args:
        wxr: Extraction context passed through to ``clean_node``.
        word_entry: Entry whose ``forms`` list receives the new forms.
        root: Expanded node whose italic children and ``span`` tags are
            inspected.
        lang_code: Value of the ``lang`` attribute that marks a ``span``
            as holding a form.

    The first italic child of ``root`` is read as a comma-separated list of
    raw tags shared by every form found in ``root``.  A ``span`` whose
    ``lang`` ends with ``-Latn`` supplies the romanization of the form that
    precedes it.
    """
    raw_tags = []
    for italic_node in root.find_child(NodeKind.ITALIC):
        raw_tags_str = clean_node(wxr, None, italic_node)
        for raw_tag in raw_tags_str.split(","):
            raw_tag = raw_tag.strip()
            if raw_tag != "":
                raw_tags.append(raw_tag)
        # Only the first italic node is treated as the tag list.
        break

    # Track the form created in this call so a romanization span is never
    # attached to an unrelated form that was already on ``word_entry``.
    last_form = None
    for span_tag in root.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        if span_lang == lang_code:
            last_form = Form(
                form=clean_node(wxr, None, span_tag), raw_tags=raw_tags
            )
            translate_raw_tags(last_form)
            word_entry.forms.append(last_form)
        elif span_lang.endswith("-Latn") and last_form is not None:
            last_form.roman = clean_node(wxr, None, span_tag)

    # Result is discarded; presumably this pass lets ``clean_node`` record
    # side data (e.g. categories) on ``word_entry`` -- confirm in page.py.
    clean_node(wxr, word_entry, root)


def extract_lo_alt_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract forms from every list item produced by a ``lo-alt`` template.

    The template is re-parsed with all templates expanded, and each list
    item of each list child is passed to ``extract_alt_expanded_nodes`` with
    the fixed language code ``"lo"``.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)
    # Lazily flatten list -> list item so the traversal order is unchanged.
    items = (
        item
        for lst in expanded_root.find_child(NodeKind.LIST)
        for item in lst.find_child(NodeKind.LIST_ITEM)
    )
    for item in items:
        extract_alt_expanded_nodes(wxr, word_entry, item, "lo")
6 changes: 6 additions & 0 deletions src/wiktextract/extractor/th/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ class Form(ThaiBaseModel):
form: str
tags: list[str] = []
raw_tags: list[str] = []
roman: str = ""


class Translation(ThaiBaseModel):
Expand Down Expand Up @@ -95,3 +96,8 @@ class WordEntry(ThaiBaseModel):
derived: list[Linkage] = []
related: list[Linkage] = []
descendants: list[Descendant] = []
anagrams: list[Linkage] = []
notes: list[str] = []
hyponyms: list[Linkage] = []
hypernyms: list[Linkage] = []
idioms: list[Linkage] = []
12 changes: 8 additions & 4 deletions src/wiktextract/extractor/th/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from .etymology import extract_etymology_section
from .linkage import extract_linkage_section
from .models import Sense, WordEntry
from .pos import extract_pos_section
from .pos import extract_note_section, extract_pos_section
from .section_titles import LINKAGE_SECTIONS, POS_DATA
from .translation import extract_translation_section

Expand All @@ -29,7 +29,7 @@ def parse_section(
extract_pos_section(wxr, page_data, base_data, level_node, title_text)
elif title_text == "รากศัพท์":
extract_etymology_section(wxr, base_data, level_node)
elif title_text == "คำแปลภาษาอื่น":
elif title_text in ["คำแปลภาษาอื่น", "คำแปล"]:
extract_translation_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
)
Expand All @@ -44,13 +44,17 @@ def parse_section(
extract_descendant_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
)
elif title_text == "การออกเสียง":
elif title_text.startswith(("การออกเสียง", "การอ่านออกเสียง")):
pass # sounds
elif title_text == "รูปแบบอื่น":
extract_alt_form_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
)
elif title_text not in ["ดูเพิ่ม"]:
elif title_text == "การใช้":
extract_note_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
)
elif title_text not in ["ดูเพิ่ม", "อ้างอิง", "อ่านเพิ่ม", "อ่านเพิ่มเติม"]:
wxr.wtp.debug(f"Unknown title: {title_text}")

for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
Expand Down
16 changes: 16 additions & 0 deletions src/wiktextract/extractor/th/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,3 +151,19 @@ def extract_th_verb_adj_template(
)

clean_node(wxr, word_entry, expanded_node)


def extract_note_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Append one note per list item of the section to ``word_entry.notes``.

    For each list item, the children that are not nested lists are cleaned
    into text; empty results are dropped.
    """
    for notes_list in level_node.find_child(NodeKind.LIST):
        for item in notes_list.find_child(NodeKind.LIST_ITEM):
            # Leave nested lists out of the note text.
            own_children = list(item.invert_find_child(NodeKind.LIST))
            text = clean_node(wxr, word_entry, own_children)
            if text == "":
                continue
            word_entry.notes.append(text)
22 changes: 22 additions & 0 deletions src/wiktextract/extractor/th/section_titles.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,20 @@
"คำอุทาน": {"pos": "intj"},
"วลี": {"pos": "phrase"},
"เลข": {"pos": "num", "tags": ["number"]},
"อักษรจีน": {"pos": "character"},
"การถอดเป็นอักษรโรมัน": {"pos": "romanization"},
"สัญลักษณ์": {"pos": "symbol"},
"คำวิเศษณ์": {"pos": "adj"},
"ตัวอักษร": {"pos": "character"},
"รูปผันคำกำกับนาม": {"pos": "noun", "tags": ["form-of"]},
"รูปผันคำสรรพนาม": {"pos": "pron", "tags": ["form-of"]},
"คำกำกับนาม": {"pos": "noun"},
"ตัวเลข": {"pos": "num", "tags": ["number"]},
"พาร์ทิซิเพิล": {"pos": "verb", "tags": ["participle"]},
"พยางค์": {"pos": "syllable"},
"คันจิ": {"pos": "character", "tags": ["kanji"]},
"คำอาการนาม": {"pos": "adj_noun"},
"อุปสรรค": {"pos": "prefix", "tags": ["morpheme"]},
}


Expand All @@ -20,4 +34,12 @@
"คำพ้องความ": "synonyms",
"ลูกคำ": "derived",
"คำเกี่ยวข้อง": "related",
"คำที่เกี่ยวข้อง": "related",
"คำที่รับมา": "derived",
"คำตรงกันข้าม": "antonyms",
"คำสลับอักษร": "anagrams",
"การสลับอักษร": "anagrams",
"คำลูกกลุ่ม": "hyponyms",
"คำจ่ากลุ่ม": "hypernyms",
"สำนวน": "idioms",
}
3 changes: 3 additions & 0 deletions tests/test_it_etymology.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ def setUp(self) -> None:
),
)

def tearDown(self):
    """Close the database connection held by ``self.wxr.wtp``."""
    processor = self.wxr.wtp
    processor.close_db_conn()

def test_quote_template(self):
self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
data = parse_page(
Expand Down
3 changes: 3 additions & 0 deletions tests/test_it_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ def setUp(self) -> None:
),
)

def tearDown(self):
    """Close the database connection held by ``self.wxr.wtp``."""
    processor = self.wxr.wtp
    processor.close_db_conn()

def test_list_example(self):
self.wxr.wtp.add_page("Template:-br-", 10, "Bretone")
data = parse_page(
Expand Down
3 changes: 3 additions & 0 deletions tests/test_it_forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ def setUp(self) -> None:
),
)

def tearDown(self):
    """Close the database connection held by ``self.wxr.wtp``."""
    processor = self.wxr.wtp
    processor.close_db_conn()

def test_tabs_template(self):
self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
data = parse_page(
Expand Down
3 changes: 3 additions & 0 deletions tests/test_it_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ def setUp(self) -> None:
),
)

def tearDown(self):
    """Close the database connection held by ``self.wxr.wtp``."""
    processor = self.wxr.wtp
    processor.close_db_conn()

def test_gloss_list(self):
self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
self.wxr.wtp.add_page(
Expand Down
3 changes: 3 additions & 0 deletions tests/test_it_linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ def setUp(self) -> None:
),
)

def tearDown(self):
    """Close the database connection held by ``self.wxr.wtp``."""
    processor = self.wxr.wtp
    processor.close_db_conn()

def test_synonyms(self):
self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
self.wxr.wtp.add_page(
Expand Down
3 changes: 3 additions & 0 deletions tests/test_it_sound.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ def setUp(self) -> None:
),
)

def tearDown(self):
    """Close the database connection held by ``self.wxr.wtp``."""
    processor = self.wxr.wtp
    processor.close_db_conn()

def test_hyphenation_single_list(self):
self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
data = parse_page(
Expand Down
3 changes: 3 additions & 0 deletions tests/test_it_translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ def setUp(self) -> None:
),
)

def tearDown(self):
    """Close the database connection held by ``self.wxr.wtp``."""
    processor = self.wxr.wtp
    processor.close_db_conn()

def test_common_lists(self):
self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
self.wxr.wtp.add_page("Template:ar", 10, "arabo")
Expand Down
3 changes: 3 additions & 0 deletions tests/test_pt_etymology.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ def setUp(self) -> None:
conf,
)

def tearDown(self):
    """Close the database connection held by ``self.wxr.wtp``."""
    processor = self.wxr.wtp
    processor.close_db_conn()

def test_list(self):
self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
self.wxr.wtp.add_page(
Expand Down
3 changes: 3 additions & 0 deletions tests/test_pt_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ def setUp(self) -> None:
conf,
)

def tearDown(self):
    """Close the database connection held by ``self.wxr.wtp``."""
    processor = self.wxr.wtp
    processor.close_db_conn()

def test_tradex_template(self):
self.wxr.wtp.add_page("Predefinição:-ryu-", 10, "Okinawano")
self.wxr.wtp.add_page("Predefinição:Substantivo", 10, "Substantivo")
Expand Down
3 changes: 3 additions & 0 deletions tests/test_pt_form.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ def setUp(self) -> None:
conf,
)

def tearDown(self):
    """Close the database connection held by ``self.wxr.wtp``."""
    processor = self.wxr.wtp
    processor.close_db_conn()

def test_flex_pt_subst_completa(self):
self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
self.wxr.wtp.add_page(
Expand Down
3 changes: 3 additions & 0 deletions tests/test_pt_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ def setUp(self) -> None:
conf,
)

def tearDown(self):
    """Close the database connection held by ``self.wxr.wtp``."""
    processor = self.wxr.wtp
    processor.close_db_conn()

def test_escopo(self):
self.wxr.wtp.add_page(
"Predefinição:-pt-",
Expand Down
3 changes: 3 additions & 0 deletions tests/test_pt_head_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ def setUp(self) -> None:
conf,
)

def tearDown(self):
    """Close the database connection held by ``self.wxr.wtp``."""
    processor = self.wxr.wtp
    processor.close_db_conn()

def test_gramática_template(self):
self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
self.wxr.wtp.add_page("Predefinição:g", 10, "''masculino''")
Expand Down
3 changes: 3 additions & 0 deletions tests/test_pt_linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ def setUp(self) -> None:
conf,
)

def tearDown(self):
    """Close the database connection held by ``self.wxr.wtp``."""
    processor = self.wxr.wtp
    processor.close_db_conn()

def test_expression(self):
self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
self.wxr.wtp.add_page("Predefinição:g", 10, "''masculino''")
Expand Down
3 changes: 3 additions & 0 deletions tests/test_pt_sound.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ def setUp(self) -> None:
conf,
)

def tearDown(self):
    """Close the database connection held by ``self.wxr.wtp``."""
    processor = self.wxr.wtp
    processor.close_db_conn()

def test_subsection(self):
self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
self.wxr.wtp.add_page(
Expand Down
3 changes: 3 additions & 0 deletions tests/test_pt_translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ def setUp(self) -> None:
conf,
)

def tearDown(self):
    """Close the database connection held by ``self.wxr.wtp``."""
    processor = self.wxr.wtp
    processor.close_db_conn()

def test_subpage(self):
self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
self.wxr.wtp.add_page(
Expand Down
3 changes: 3 additions & 0 deletions tests/test_th_desc.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ def setUp(self) -> None:
),
)

def tearDown(self):
    """Close the database connection held by ``self.wxr.wtp``."""
    processor = self.wxr.wtp
    processor.close_db_conn()

def test_desc_template(self):
self.wxr.wtp.add_page(
"แม่แบบ:desc",
Expand Down
3 changes: 3 additions & 0 deletions tests/test_th_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ def setUp(self) -> None:
),
)

def tearDown(self):
    """Close the database connection held by ``self.wxr.wtp``."""
    processor = self.wxr.wtp
    processor.close_db_conn()

def test_ux(self):
self.wxr.wtp.add_page(
"แม่แบบ:ko-usex",
Expand Down
26 changes: 26 additions & 0 deletions tests/test_th_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ def setUp(self) -> None:
),
)

def tearDown(self):
    """Close the database connection held by ``self.wxr.wtp``."""
    processor = self.wxr.wtp
    processor.close_db_conn()

def test_do_not_share_etymology_data(self):
self.wxr.wtp.add_page(
"แม่แบบ:inh+",
Expand Down Expand Up @@ -208,3 +211,26 @@ def test_alt_template(self):
{"form": "เดิร", "raw_tags": ["เลิกใช้"]},
],
)

def test_lo_alt(self):
    """``lo-alt`` output yields a form with its raw tag and romanization."""
    # Canned expansion registered for the template page; the second
    # argument (10) is presumably the Template namespace id.
    lo_alt_expansion = """* (''ล้าสมัย'') <span class="Laoo" lang="lo">[[ທຸຣຽນ#ภาษาลาว|ທຸຣຽນ]]</span> <span class="mention-gloss-paren annotation-paren">(</span><span lang="lo-Latn" class="tr Latn">ทุรย̂น</span><span class="mention-gloss-paren annotation-paren">)</span>"""
    self.wxr.wtp.add_page("แม่แบบ:lo-alt", 10, lo_alt_expansion)

    page_wikitext = """== ภาษาลาว ==
=== รูปแบบอื่น ===
{{lo-alt|d=ທຸຣຽນ}}
=== คำนาม ===
{{lo-noun}}
# [[ทุเรียน]]"""
    page_data = parse_page(self.wxr, "ທຸລຽນ", page_wikitext)

    expected_forms = [
        {"form": "ທຸຣຽນ", "raw_tags": ["ล้าสมัย"], "roman": "ทุรย̂น"},
    ]
    self.assertEqual(page_data[0]["forms"], expected_forms)
3 changes: 3 additions & 0 deletions tests/test_th_linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ def setUp(self) -> None:
),
)

def tearDown(self):
    """Close the database connection held by ``self.wxr.wtp``."""
    processor = self.wxr.wtp
    processor.close_db_conn()

def test_col(self):
self.wxr.wtp.add_page(
"แม่แบบ:col2",
Expand Down
Loading

0 comments on commit 1565781

Please sign in to comment.