Skip to content

Commit

Permalink
[th] extract "ja-x" and "quote-*" example template
Browse files Browse the repository at this point in the history
  • Loading branch information
xxyzz committed Jan 8, 2025
1 parent 00f01cc commit 1f1ff3e
Show file tree
Hide file tree
Showing 3 changed files with 169 additions and 3 deletions.
77 changes: 76 additions & 1 deletion src/wiktextract/extractor/th/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,30 @@

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ..ruby import extract_ruby
from .models import Example, Sense
from .tags import translate_raw_tags


def extract_example_list_item(
    wxr: WiktextractContext,
    sense: Sense,
    list_item: WikiNode,
    ref: str = "",
) -> None:
    """Extract the examples found in one list item and attach them to ``sense``.

    Each direct child of ``list_item`` is inspected:

    * ``ux`` / ``usex`` / ``ko-usex`` templates go to ``extract_ux_template``;
    * ``zh-x`` / ``zh-usex`` templates go to ``extract_template_zh_x``;
    * ``ja-x`` / ``ja-usex`` templates go to ``extract_template_ja_usex``
      together with the current ``ref``;
    * ``quote-*`` templates go to ``extract_quote_template`` and its return
      value replaces ``ref`` for the rest of this list item;
    * nested lists are walked recursively, passing the current ``ref`` on.

    Args:
        wxr: Extraction context handed through to the template extractors.
        sense: Sense that receives the extracted examples.
        list_item: List item node whose children are scanned.
        ref: Reference text inherited from an enclosing list item; defaults
            to an empty string so existing three-argument callers still work.
    """
    for node in list_item.children:
        if isinstance(node, TemplateNode):
            name = node.template_name
            if name in ("ux", "usex", "ko-usex"):
                extract_ux_template(wxr, sense, node)
            elif name in ("zh-x", "zh-usex"):
                extract_template_zh_x(wxr, sense, node)
            elif name in ("ja-x", "ja-usex"):
                extract_template_ja_usex(wxr, sense, node, ref)
            elif name.startswith("quote-"):
                # Remember the citation so that a usage example placed in a
                # nested list below this quote template can reuse it.
                ref = extract_quote_template(wxr, sense, node)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            for child_list_item in node.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(wxr, sense, child_list_item, ref)


def extract_ux_template(
Expand Down Expand Up @@ -130,3 +141,67 @@ def extract_zh_x_no_dl_tag(
examples.append(example)

return examples


def extract_quote_template(
    wxr: WiktextractContext,
    sense: Sense,
    t_node: TemplateNode,
) -> str:
    """Handle a ``quote-*`` template and return its citation text, if any.

    When the template has none of the ``text``, ``passage`` or ``7``
    parameters, the whole template is cleaned (with ``sense`` passed to
    ``clean_node``) and the resulting string is returned.

    Otherwise the template is expanded and an ``Example`` is filled from the
    expanded HTML: ``cited-source`` span -> ``ref``, ``e-quotation`` span ->
    ``text``, ``e-translation`` span -> ``translation``, first
    ``e-transliteration`` ``<i>`` tag -> ``roman``. The example is appended
    to ``sense.examples`` only if its text is non-empty, and an empty string
    is returned.

    Args:
        wxr: Extraction context used for expanding and cleaning nodes.
        sense: Sense that receives the example.
        t_node: The ``quote-*`` template node.

    Returns:
        The cleaned citation text, or ``""`` when the template carried its
        own quotation.
    """
    params = t_node.template_parameters
    has_quotation = any(key in params for key in ("text", "passage", 7))
    if not has_quotation:
        return clean_node(wxr, sense, t_node)

    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    quote = Example(text="")
    for span in expanded.find_html_recursively("span"):
        css_class = span.attrs.get("class", "")
        # "cited-source" must be the whole class attribute; the other two
        # only need to occur somewhere inside it.
        if css_class == "cited-source":
            quote.ref = clean_node(wxr, None, span)
        elif "e-quotation" in css_class:
            quote.text = clean_node(wxr, None, span)
        elif "e-translation" in css_class:
            quote.translation = clean_node(wxr, None, span)

    # Only the first transliteration tag is used.
    first_translit = next(
        iter(
            expanded.find_html_recursively(
                "i", attr_name="class", attr_value="e-transliteration"
            )
        ),
        None,
    )
    if first_translit is not None:
        quote.roman = clean_node(wxr, None, first_translit)

    if quote.text != "":
        sense.examples.append(quote)
    # Clean the whole expansion once more with ``sense`` attached; the return
    # value is intentionally discarded.
    clean_node(wxr, sense, expanded)
    return ""


def extract_template_ja_usex(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode, ref: str
) -> None:
    """Build an ``Example`` from a ``ja-x`` / ``ja-usex`` template.

    The template is expanded and read as follows:

    * top-level ``span`` tags with class ``Jpan`` give the example text and
      its ruby data (via ``extract_ruby``);
    * ``span`` tags with class ``tr`` anywhere in the expansion give the
      romanization;
    * template parameter ``3`` gives the translation and ``lit`` the
      literal meaning.

    The example is appended to ``sense.examples`` only when its text is
    non-empty. Finally every link node that is a direct child of the
    expansion is cleaned with ``sense`` attached.

    Args:
        wxr: Extraction context used for expanding and cleaning nodes.
        sense: Sense that receives the example.
        t_node: The ``ja-x`` / ``ja-usex`` template node.
        ref: Reference text stored on the example.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    params = t_node.template_parameters
    usex = Example(text="", ref=ref)

    for jpan_span in expanded.find_html(
        "span", attr_name="class", attr_value="Jpan"
    ):
        ruby_pairs, stripped_node = extract_ruby(wxr, jpan_span)
        usex.text = clean_node(wxr, None, stripped_node)
        usex.ruby = ruby_pairs

    for tr_span in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="tr"
    ):
        usex.roman = clean_node(wxr, None, tr_span)

    usex.translation = clean_node(wxr, None, params.get(3, ""))
    usex.literal_meaning = clean_node(wxr, None, params.get("lit", ""))

    if usex.text != "":
        sense.examples.append(usex)

    # Return values are discarded; the links are cleaned only for whatever
    # clean_node records on ``sense``.
    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link)
26 changes: 26 additions & 0 deletions src/wiktextract/extractor/th/tags.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from .models import WordEntry

# Raw labels shown next to example lines, mapped to English tag names.
# Source pages: Template:zh-x (แม่แบบ:zh-x) and
# Module:zh-usex/data (มอดูล:zh-usex/data).
EXAMPLE_TAGS = {
    "MSC": "Modern Standard Chinese",
    "Pinyin": "Pinyin",
    "trad.": "Traditional Chinese",
    "simp.": "Simplified Chinese",
}


# Combined lookup table used by translate_raw_tags; a separate copy so that
# later additions do not modify EXAMPLE_TAGS.
TAGS = dict(EXAMPLE_TAGS)


def translate_raw_tags(data: WordEntry) -> None:
    """Move recognised raw tags of ``data`` into ``data.tags``.

    Every entry of ``data.raw_tags`` that is a key of ``TAGS`` is replaced
    by its mapped value (a single string is appended, a list is extended),
    provided ``data`` has a ``tags`` attribute. All other entries are kept,
    in their original order, and written back to ``data.raw_tags``.

    Args:
        data: Object with a ``raw_tags`` list; modified in place.
    """
    untranslated = []
    for label in data.raw_tags:
        if label not in TAGS or not hasattr(data, "tags"):
            untranslated.append(label)
            continue
        mapped = TAGS[label]
        if isinstance(mapped, str):
            data.tags.append(mapped)
        elif isinstance(mapped, list):
            data.tags.extend(mapped)
    data.raw_tags = untranslated
69 changes: 67 additions & 2 deletions tests/test_th_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,7 @@ def test_zh_x_two_text_lines(self):
self.wxr.wtp.add_page(
"แม่แบบ:zh-x",
10,
"""<dl class="zhusex"><span lang="zh-Hant" class="Hant">[[請#Chinese|請]]<b>大家</b>[[保持#Chinese|保持]][[安靜#Chinese|安靜]]。</span> <span style="color:darkgreen; font-size:x-small;">&#91;[[w:Standard Chinese|MSC]], <i>[[w:Traditional Chinese|trad.]]</i>&#93;</span><br><span lang="zh-Hans" class="Hans">[[请#Chinese|请]]<b>大家</b>[[保持#Chinese|保持]][[安静#Chinese|安静]]。</span> <span style="color:darkgreen; font-size:x-small;">&#91;[[w:Standard Chinese|MSC]], <i>[[w:Simplified Chinese|simp.]]</i>&#93;</span><dd><span lang="zh-Latn" style="color:#404D52"><i>Qǐng <b>dàjiā</b> bǎochí ānjìng.</i></span> <span style="color:darkgreen; font-size:x-small;">&#91;Pinyin&#93;</span></dd><dd>'''ทุกคน'''กรุณาเงียบ</dd></dl>[[Category:ศัพท์ภาษาจีนกลางที่มีตัวอย่างการใช้]]
""",
"""<dl class="zhusex"><span lang="zh-Hant" class="Hant">[[請#Chinese|請]]<b>大家</b>[[保持#Chinese|保持]][[安靜#Chinese|安靜]]。</span> <span style="color:darkgreen; font-size:x-small;">&#91;[[w:Standard Chinese|MSC]], <i>[[w:Traditional Chinese|trad.]]</i>&#93;</span><br><span lang="zh-Hans" class="Hans">[[请#Chinese|请]]<b>大家</b>[[保持#Chinese|保持]][[安静#Chinese|安静]]。</span> <span style="color:darkgreen; font-size:x-small;">&#91;[[w:Standard Chinese|MSC]], <i>[[w:Simplified Chinese|simp.]]</i>&#93;</span><dd><span lang="zh-Latn" style="color:#404D52"><i>Qǐng <b>dàjiā</b> bǎochí ānjìng.</i></span> <span style="color:darkgreen; font-size:x-small;">&#91;Pinyin&#93;</span></dd><dd>'''ทุกคน'''กรุณาเงียบ</dd></dl>[[Category:ศัพท์ภาษาจีนกลางที่มีตัวอย่างการใช้]]""",
)
page_data = parse_page(
self.wxr,
Expand Down Expand Up @@ -170,3 +169,69 @@ def test_zh_x_two_text_lines(self):
],
},
)

def test_ja_x(self):
    """A ``ja-usex`` example nested under ``quote-book`` inherits its ref.

    The quote template has no quotation text of its own, so its citation
    is expected to appear as ``ref`` on the example produced by the nested
    ``ja-usex`` template, and the category links of both templates are
    expected in the sense's categories.
    """
    # Pre-expanded template bodies, registered in namespace 10 in this order.
    template_pages = {
        "แม่แบบ:quote-book": """<div class="citation-whole"><span class="cited-source">'''1990''' มิถุนายน 15, [[w:Rumiko Takahashi|Takahashi, Rumiko]], “[[:แม่แบบ:jaru]] &#91;PART.5 Snatching the Scroll of Secrets&#93;”, in <cite>[[:แม่แบบ:wj]]</cite> &#91;<cite>[[w:Ranma ½|Ranma ½]]</cite>&#93;, volume 11 (fiction), Tokyo&#58; Shogakukan, <small>[[Special:BookSources/4-09-122431-8|→ISBN]]</small>, page 72:[[Category:ศัพท์ภาษาญี่ปุ่นที่มีการยกข้อความ|大00宀07]]</span><dl><dd></dd></dl></div>""",
        "แม่แบบ:ja-usex": """<span lang="ja" class="Jpan"><ruby>日<rp>(</rp><rt>にっ</rt><rp>)</rp></ruby><ruby>本<rp>(</rp><rt>ぽん</rt><rp>)</rp></ruby>の<ruby>山<rp>(</rp><rt>さん</rt><rp>)</rp></ruby><ruby>中<rp>(</rp><rt>ちゅう</rt><rp>)</rp></ruby>に…'''シロクマ'''がいるか—————っ‼</span><dl><dd><i><span class="tr">Nippon no sanchū ni… '''shirokuma''' ga iru ka—————'‼</span></i></dd><dd>ทำไมถึงมี...หมีขั้วโลกบนภูเขาญี่ปุ่นได้⁉</dd></dl>[[Category:ศัพท์ภาษาญี่ปุ่นที่มีตัวอย่างการใช้|大00宀07]]""",
        "แม่แบบ:syn of": "คำพ้องความของ 北極熊 (ฮกเกียวกุงุมะ, “หมีขั้วโลก”)",
    }
    for title, body in template_pages.items():
        self.wxr.wtp.add_page(title, 10, body)

    page_wikitext = """== ภาษาญี่ปุ่น ==
=== คำนาม ===
{{ja-noun|しろくま|シロクマ}}
# {{syn of|ja|北極熊|tr=ฮกเกียวกุงุมะ||หมีขั้วโลก}}
#* {{quote-book|ja
|| |{{wj|らんま1/2|らんま½}}
|| 72
| last=Takahashi
| first=Rumiko
| authorlink=Rumiko Takahashi
| chapter={{jaru|[PART] (パート).5 [秘] (ひ) [伝] (でん) [書] (しょ) を[奪] (うば) え}}
| trans-chapter=PART.5 Snatching the Scroll of Secrets
| trans-title={{w|Ranma ½}}
| genre=fiction
| location=Tokyo
| publisher=Shogakukan
| date=Jun 15 1990
| volume=11
| isbn=4-09-122431-8}}
#*: {{ja-usex|日%本の山%中に…'''シロクマ'''がいるか—————っ‼|^にっ%ぽん の さん%ちゅう に… '''シロクマ''' が いる か—————っ‼|ทำไมถึงมี...หมีขั้วโลกบนภูเขาญี่ปุ่นได้⁉}}"""
    page_data = parse_page(self.wxr, "白熊", page_wikitext)

    expected_sense = {
        "categories": [
            "ศัพท์ภาษาญี่ปุ่นที่มีการยกข้อความ",
            "ศัพท์ภาษาญี่ปุ่นที่มีตัวอย่างการใช้",
        ],
        "glosses": ["คำพ้องความของ 北極熊 (ฮกเกียวกุงุมะ, “หมีขั้วโลก”)"],
        "examples": [
            {
                "text": "日本の山中に…シロクマがいるか—————っ‼",
                "roman": "Nippon no sanchū ni… shirokuma ga iru ka—————'‼",
                "translation": "ทำไมถึงมี...หมีขั้วโลกบนภูเขาญี่ปุ่นได้⁉",
                "ruby": [
                    ("日", "にっ"),
                    ("本", "ぽん"),
                    ("山", "さん"),
                    ("中", "ちゅう"),
                ],
                "ref": "1990 มิถุนายน 15, Takahashi, Rumiko, “:แม่แบบ:jaru [PART.5 Snatching the Scroll of Secrets]”, in :แม่แบบ:wj [Ranma ½], volume 11 (fiction), Tokyo: Shogakukan, →ISBN, page 72:",
            }
        ],
    }
    self.assertEqual(page_data[0]["senses"][0], expected_sense)

0 comments on commit 1f1ff3e

Please sign in to comment.