Skip to content

Commit

Permalink
[th] extract form-of templates
Browse files Browse the repository at this point in the history
  • Loading branch information
xxyzz committed Jan 17, 2025
1 parent d4b786b commit 42b3fdb
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 3 deletions.
6 changes: 6 additions & 0 deletions src/wiktextract/extractor/th/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,19 @@ class Example(ThaiBaseModel):
raw_tags: list[str] = []


class AltForm(ThaiBaseModel):
    """A word that a sense refers to through a form-of style template."""

    # Text of the referenced word.
    word: str
    # Romanization/transliteration shown next to the word; "" when absent.
    roman: str = ""


class Sense(ThaiBaseModel):
    """Data collected for one sense of an entry; every field is optional."""

    glosses: list[str] = []
    tags: list[str] = []
    raw_tags: list[str] = []
    categories: list[str] = []
    examples: list[Example] = []
    classifiers: list[str] = []
    # Words this sense is a form of; each entry is an `AltForm`.
    form_of: list[AltForm] = []


class Form(ThaiBaseModel):
Expand Down
37 changes: 35 additions & 2 deletions src/wiktextract/extractor/th/pos.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
import itertools

from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
from wikitextprocessor import (
LevelNode,
NodeKind,
TemplateNode,
WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .example import extract_example_list_item
from .models import Form, Sense, WordEntry
from .models import AltForm, Form, Sense, WordEntry
from .section_titles import POS_DATA
from .tags import translate_raw_tags

Expand Down Expand Up @@ -57,6 +62,15 @@ def extract_gloss_list_item(
extract_label_template(wxr, sense, node)
elif isinstance(node, TemplateNode) and node.template_name == "cls":
extract_cls_template(wxr, sense, node)
elif isinstance(node, TemplateNode) and (
node.template_name.endswith(" of")
or node.template_name == "altform"
):
expanded_node = wxr.wtp.parse(
wxr.wtp.node_to_wikitext(node), expand_all=True
)
extract_form_of_template(wxr, sense, expanded_node)
gloss_nodes.append(expanded_node)
elif not (isinstance(node, WikiNode) and node.kind == NodeKind.LIST):
gloss_nodes.append(node)

Expand Down Expand Up @@ -167,3 +181,22 @@ def extract_note_section(
)
if note_str != "":
word_entry.notes.append(note_str)


def extract_form_of_template(
    wxr: WiktextractContext,
    sense: Sense,
    expanded_node: WikiNode,
) -> None:
    """Record the target word of an expanded form-of template on `sense`.

    The cleaned text of the first `<i>` tag found in `expanded_node` is
    used as the word, and the cleaned text of the first `<span>` whose
    `class` attribute contains "mention-tr" is used as the romanization.
    When a non-empty word is found, an `AltForm` is appended to
    `sense.form_of` and the "form-of" tag is added to `sense.tags` if it
    is not already there. Otherwise `sense` is left untouched.
    """
    # Only the first italic tag is considered.
    first_italic = next(iter(expanded_node.find_html_recursively("i")), None)
    word_text = ""
    if first_italic is not None:
        word_text = clean_node(wxr, None, first_italic)
    alt_form = AltForm(word=word_text)

    # First span carrying the transliteration class, if any.
    tr_span = next(
        (
            candidate
            for candidate in expanded_node.find_html_recursively("span")
            if "mention-tr" in candidate.attrs.get("class", "")
        ),
        None,
    )
    if tr_span is not None:
        alt_form.roman = clean_node(wxr, None, tr_span)

    if alt_form.word == "":
        return
    sense.form_of.append(alt_form)
    if "form-of" not in sense.tags:
        sense.tags.append("form-of")
4 changes: 3 additions & 1 deletion tests/test_th_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ def test_ja_x(self):
self.wxr.wtp.add_page(
"แม่แบบ:syn of",
10,
"คำพ้องความของ 北極熊 (ฮกเกียวกุงุมะ, “หมีขั้วโลก”)",
"""<span class='form-of-definition use-with-mention'>คำพ้องความของ <span class='form-of-definition-link'><i class="Jpan mention" lang="ja">[[北極熊#ภาษาญี่ปุ่น|北極熊]]</i> <span class="mention-gloss-paren annotation-paren">(</span><span class="mention-tr tr">ฮกเกียวกุงุมะ</span>, <span class="mention-gloss-double-quote">“</span><span class="mention-gloss">หมีขั้วโลก</span><span class="mention-gloss-double-quote">”</span><span class="mention-gloss-paren annotation-paren">)</span></span></span>""",
)
page_data = parse_page(
self.wxr,
Expand Down Expand Up @@ -236,5 +236,7 @@ def test_ja_x(self):
"ref": "1990 มิถุนายน 15, Takahashi, Rumiko, “:แม่แบบ:jaru [PART.5 Snatching the Scroll of Secrets]”, in :แม่แบบ:wj [Ranma ½], volume 11 (fiction), Tokyo: Shogakukan, →ISBN, page 72:",
}
],
"form_of": [{"word": "北極熊", "roman": "ฮกเกียวกุงุมะ"}],
"tags": ["form-of"],
},
)
24 changes: 24 additions & 0 deletions tests/test_th_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,3 +234,27 @@ def test_lo_alt(self):
{"form": "ທຸຣຽນ", "raw_tags": ["ล้าสมัย"], "roman": "ทุรย̂น"},
],
)

def test_alt_form_template(self):
    """An `altform` gloss yields gloss text, `form_of` and a form-of tag."""
    # Pre-rendered HTML registered as the body of the template page.
    altform_html = """<span class='form-of-definition use-with-mention'>อีกรูปหนึ่งของ <span class='form-of-definition-link'><i class="Lana mention" lang="nod">[[ᨸᩣ᩠ᨠ#ภาษาคำเมือง|ᨸᩣ᩠ᨠ]]</i> <span class="mention-gloss-paren annotation-paren">(</span><span lang="nod-Latn" class="mention-tr tr Latn">ปาก</span><span class="mention-gloss-paren annotation-paren">)</span></span></span>"""
    self.wxr.wtp.add_page("แม่แบบ:altform", 10, altform_html)

    page_wikitext = """== ภาษาคำเมือง ==
=== คำนาม ===
{{nod-noun}}
# {{altform|nod|ᨸᩣ᩠ᨠ}}"""
    first_sense = parse_page(self.wxr, "ปาก", page_wikitext)[0]["senses"][0]

    expected_sense = {
        "glosses": ["อีกรูปหนึ่งของ ᨸᩣ᩠ᨠ (ปาก)"],
        "form_of": [{"word": "ᨸᩣ᩠ᨠ", "roman": "ปาก"}],
        "tags": ["form-of"],
    }
    self.assertEqual(first_sense, expected_sense)

0 comments on commit 42b3fdb

Please sign in to comment.