Skip to content

Commit

Permalink
[th] extract alternate forms section
Browse files Browse the repository at this point in the history
  • Loading branch information
xxyzz committed Jan 14, 2025
1 parent c9beecc commit 46f49be
Show file tree
Hide file tree
Showing 5 changed files with 77 additions and 2 deletions.
43 changes: 43 additions & 0 deletions src/wiktextract/extractor/th/alt_form.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from wikitextprocessor import LevelNode, NodeKind, TemplateNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Form, WordEntry
from .tags import translate_raw_tags


def extract_alt_form_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Extract alternate forms from an alternate-forms section.

    Walks each list that is a direct child of ``level_node`` and, for every
    list item, hands each ``alt`` template found among the item's direct
    children to ``extract_alt_template``, which appends the resulting forms
    to ``word_entry.forms``. Any other node is ignored.
    """
    for alt_list in level_node.find_child(NodeKind.LIST):
        for item in alt_list.find_child(NodeKind.LIST_ITEM):
            for child in item.children:
                # Plain text and non-template nodes carry no form data here.
                if not isinstance(child, TemplateNode):
                    continue
                if child.template_name != "alt":
                    continue
                extract_alt_template(wxr, word_entry, child)


def extract_alt_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Turn one ``alt`` template into ``Form`` entries on ``word_entry``.

    The template is expanded and re-parsed. The first italic node among the
    expanded root's direct children, if any, is read as a comma-separated
    list of raw tags. Every ``<span>`` whose ``lang`` attribute equals the
    template's first positional argument then becomes one ``Form`` carrying
    those raw tags; the form is passed through ``translate_raw_tags`` and
    appended to ``word_entry.forms``.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )

    # Only the first italic node is consulted; later ones are ignored.
    first_italic = next(
        iter(expanded_node.find_child(NodeKind.ITALIC)), None
    )
    raw_tags = []
    if first_italic is not None:
        tag_text = clean_node(wxr, None, first_italic)
        raw_tags = [
            tag
            for tag in (piece.strip() for piece in tag_text.split(","))
            if tag != ""
        ]

    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span in expanded_node.find_html("span"):
        # Skip spans for other languages/scripts (e.g. transliterations).
        if span.attrs.get("lang", "") != lang_code:
            continue
        alt_form = Form(form=clean_node(wxr, None, span), raw_tags=raw_tags)
        translate_raw_tags(alt_form)
        word_entry.forms.append(alt_form)
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/th/descendant.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def extract_desc_list_item(
"desc",
"descendant",
"desctree",
"descendants tree"
"descendants tree",
]:
desc_list.extend(
extract_desc_template(wxr, word_entry, parent_data, node)
Expand Down
9 changes: 9 additions & 0 deletions src/wiktextract/extractor/th/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .alt_form import extract_alt_form_section
from .descendant import extract_descendant_section
from .etymology import extract_etymology_section
from .linkage import extract_linkage_section
Expand Down Expand Up @@ -43,6 +44,14 @@ def parse_section(
extract_descendant_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
)
elif title_text == "การออกเสียง":
pass # sounds
elif title_text == "รูปแบบอื่น":
extract_alt_form_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
)
elif title_text not in ["ดูเพิ่ม"]:
wxr.wtp.debug(f"Unknown title: {title_text}")

for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
parse_section(wxr, page_data, base_data, next_level)
Expand Down
2 changes: 1 addition & 1 deletion tests/test_th_desc.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def test_desc_template(self):
self.wxr.wtp.add_page(
"แม่แบบ:desc",
10,
"""<span class="desc-arr" title="borrowed">→</span> พม่า: <span class="Mymr" lang="{{{1}}}">[[{{{2}}}]]</span> <span class="mention-gloss-paren annotation-paren">(</span><span lang="my-Latn" class="tr Latn">{{{tr|}}}</span>, <span class="mention-gloss-double-quote">“</span><span class="mention-gloss">{{{t|}}}</span><span class="mention-gloss-double-quote">”</span><span class="mention-gloss-paren annotation-paren">)</span>"""
"""<span class="desc-arr" title="borrowed">→</span> พม่า: <span class="Mymr" lang="{{{1}}}">[[{{{2}}}]]</span> <span class="mention-gloss-paren annotation-paren">(</span><span lang="my-Latn" class="tr Latn">{{{tr|}}}</span>, <span class="mention-gloss-double-quote">“</span><span class="mention-gloss">{{{t|}}}</span><span class="mention-gloss-double-quote">”</span><span class="mention-gloss-paren annotation-paren">)</span>""",
)
page_data = parse_page(
self.wxr,
Expand Down
23 changes: 23 additions & 0 deletions tests/test_th_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,3 +185,26 @@ def test_th_verb(self):
"lang_code": "th",
},
)

def test_alt_template(self):
    """An ``alt`` template yields one form per span, tagged by the label."""
    # Stub expansion of the "alt" template: an italic label followed by
    # two Thai-language spans.
    alt_expansion = """(''เลิกใช้'') <span class="Thai" lang="th">[[เดอร#ภาษาไทย|เดอร]]</span>, <span class="Thai" lang="th">[[เดิร#ภาษาไทย|เดิร]]</span>"""
    self.wxr.wtp.add_page("แม่แบบ:alt", 10, alt_expansion)

    page_text = """== ภาษาไทย ==
=== รูปแบบอื่น ===
* {{alt|th|เดอร|เดิร||เลิกใช้}}
=== คำกริยา ===
# [[ยก]][[เท้า]][[ก้าว]][[ไป]]"""
    page_data = parse_page(self.wxr, "เดิน", page_text)

    expected_forms = [
        {"form": "เดอร", "raw_tags": ["เลิกใช้"]},
        {"form": "เดิร", "raw_tags": ["เลิกใช้"]},
    ]
    self.assertEqual(page_data[0]["forms"], expected_forms)

0 comments on commit 46f49be

Please sign in to comment.