def _expand_template(wxr: WiktextractContext, t_node: TemplateNode):
    """Re-parse *t_node* from its own wikitext with ``expand_all=True``."""
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    return wxr.wtp.parse(wikitext, expand_all=True)


def extract_quote_template(
    wxr: WiktextractContext,
    sense: Sense,
    t_node: TemplateNode,
) -> str:
    """Process a ``quote-*`` template found in an example list item.

    When the template has none of the ``text``, ``passage`` or ``7``
    arguments, the cleaned text of the template is returned (a caller can
    attach it as the reference of a later example).  Otherwise the template
    is expanded, an ``Example`` is filled from the expanded HTML and added
    to ``sense.examples`` if quotation text was found, and ``""`` is
    returned.
    """
    quotation_args = ("text", "passage", 7)
    if not any(arg in t_node.template_parameters for arg in quotation_args):
        # Citation only: the whole template output is the reference string.
        return clean_node(wxr, sense, t_node)

    root = _expand_template(wxr, t_node)
    quote = Example(text="")
    for span in root.find_html_recursively("span"):
        css_class = span.attrs.get("class", "")
        # "cited-source" must be the entire class value; the other two
        # classes only need to occur somewhere inside it.
        if css_class == "cited-source":
            quote.ref = clean_node(wxr, None, span)
        elif "e-quotation" in css_class:
            quote.text = clean_node(wxr, None, span)
        elif "e-translation" in css_class:
            quote.translation = clean_node(wxr, None, span)

    transliterations = root.find_html_recursively(
        "i", attr_name="class", attr_value="e-transliteration"
    )
    for italic in transliterations:
        # Only the first transliteration element is used.
        quote.roman = clean_node(wxr, None, italic)
        break

    if quote.text != "":
        sense.examples.append(quote)
    # Clean the whole expansion with `sense` as the target; presumably this
    # records side data such as categories on the sense -- TODO confirm.
    clean_node(wxr, sense, root)
    return ""


def extract_template_ja_usex(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode, ref: str
) -> None:
    """Build an ``Example`` from a ``ja-x``/``ja-usex`` template.

    The template is expanded; text and ruby come from direct child
    ``span`` elements with class ``Jpan``, the romanization from ``span``
    elements with class ``tr`` (searched recursively), the translation from
    positional argument ``3`` and the literal meaning from ``lit``.  The
    example, created with the given *ref*, is appended to
    ``sense.examples`` only when its text is not empty.
    """
    root = _expand_template(wxr, t_node)
    usage = Example(text="", ref=ref)

    japanese_spans = root.find_html(
        "span", attr_name="class", attr_value="Jpan"
    )
    for jpan in japanese_spans:
        # If several spans match, the values from the last one are kept.
        ruby_pairs, base_text_node = extract_ruby(wxr, jpan)
        usage.ruby = ruby_pairs
        usage.text = clean_node(wxr, None, base_text_node)

    roman_spans = root.find_html_recursively(
        "span", attr_name="class", attr_value="tr"
    )
    for roman_span in roman_spans:
        usage.roman = clean_node(wxr, None, roman_span)

    params = t_node.template_parameters
    usage.translation = clean_node(wxr, None, params.get(3, ""))
    usage.literal_meaning = clean_node(wxr, None, params.get("lit", ""))

    if usage.text != "":
        sense.examples.append(usage)
    # Top-level links of the expansion are cleaned with `sense` as the
    # target; presumably this is how category links reach the sense --
    # TODO confirm.
    for link in root.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link)
from .models import WordEntry

# Raw labels taken from "แม่แบบ:zh-x" (Template:zh-x) and
# "มอดูล:zh-usex/data" (Module:zh-usex/data).
EXAMPLE_TAGS = {
    "MSC": "Modern Standard Chinese",
    "Pinyin": "Pinyin",
    "trad.": "Traditional Chinese",
    "simp.": "Simplified Chinese",
}


# Combined lookup table used by translate_raw_tags(); currently a copy of
# EXAMPLE_TAGS.
TAGS = dict(EXAMPLE_TAGS)


def translate_raw_tags(data: WordEntry) -> None:
    """Move raw tags of *data* that have a known translation into ``tags``.

    A raw tag listed in ``TAGS`` is translated when *data* has a ``tags``
    attribute: a string value is appended to ``data.tags`` and a list value
    extends it (a value of any other type is dropped).  Every other raw tag
    is kept, in order, and ``data.raw_tags`` is replaced by that remainder.
    """
    untranslated = []
    for raw_tag in data.raw_tags:
        if raw_tag not in TAGS or not hasattr(data, "tags"):
            untranslated.append(raw_tag)
            continue
        mapped = TAGS[raw_tag]
        if isinstance(mapped, str):
            data.tags.append(mapped)
        elif isinstance(mapped, list):
            data.tags.extend(mapped)
    data.raw_tags = untranslated
[[請#Chinese|請]]大家[[保持#Chinese|保持]][[安靜#Chinese|安靜]]。 [[[w:Standard Chinese|MSC]], [[w:Traditional Chinese|trad.]]]
[[请#Chinese|请]]大家[[保持#Chinese|保持]][[安静#Chinese|安静]]。 [[[w:Standard Chinese|MSC]], [[w:Simplified Chinese|simp.]]]
Qǐng dàjiā bǎochí ānjìng. [Pinyin]
'''ทุกคน'''กรุณาเงียบ
[[Category:ศัพท์ภาษาจีนกลางที่มีตัวอย่างการใช้]] -""", + """
[[請#Chinese|請]]大家[[保持#Chinese|保持]][[安靜#Chinese|安靜]]。 [[[w:Standard Chinese|MSC]], [[w:Traditional Chinese|trad.]]]
[[请#Chinese|请]]大家[[保持#Chinese|保持]][[安静#Chinese|安静]]。 [[[w:Standard Chinese|MSC]], [[w:Simplified Chinese|simp.]]]
Qǐng dàjiā bǎochí ānjìng. [Pinyin]
'''ทุกคน'''กรุณาเงียบ
def test_ja_x(self):
    """A ``quote-book`` citation followed by a nested ``ja-usex`` example.

    The sense is expected to end up with one example whose ``ref`` is the
    cleaned ``quote-book`` output and whose text, ruby, romanization and
    translation come from ``ja-usex``, plus the categories of both
    templates.
    """
    # NOTE(review): as shown here the "ja-usex" fixture holds no
    # <span>/<ruby> markup although the expected example has text and ruby
    # data -- the fixture may have lost its HTML; verify against the
    # repository before relying on it.
    quote_book_output = """
'''1990''' มิถุนายน 15, [[w:Rumiko Takahashi|Takahashi, Rumiko]], “[[:แม่แบบ:jaru]] [PART.5 Snatching the Scroll of Secrets]”, in [[:แม่แบบ:wj]] [[[w:Ranma ½|Ranma ½]]], volume 11 (fiction), Tokyo: Shogakukan, [[Special:BookSources/4-09-122431-8|→ISBN]], page 72:[[Category:ศัพท์ภาษาญี่ปุ่นที่มีการยกข้อความ|大00宀07]]
"""
    ja_usex_output = """(にっ)(ぽん)(さん)(ちゅう)に…'''シロクマ'''がいるか—————っ‼
Nippon no sanchū ni… '''shirokuma''' ga iru ka—————'‼
ทำไมถึงมี...หมีขั้วโลกบนภูเขาญี่ปุ่นได้⁉
[[Category:ศัพท์ภาษาญี่ปุ่นที่มีตัวอย่างการใช้|大00宀07]]"""
    syn_of_output = "คำพ้องความของ 北極熊 (ฮกเกียวกุงุมะ, “หมีขั้วโลก”)"

    # Register the template pages in the same order as before.
    template_pages = (
        ("แม่แบบ:quote-book", quote_book_output),
        ("แม่แบบ:ja-usex", ja_usex_output),
        ("แม่แบบ:syn of", syn_of_output),
    )
    for title, body in template_pages:
        self.wxr.wtp.add_page(title, 10, body)

    wikitext = """== ภาษาญี่ปุ่น ==
=== คำนาม ===
{{ja-noun|しろくま|シロクマ}}

# {{syn of|ja|北極熊|tr=ฮกเกียวกุงุมะ||หมีขั้วโลก}}
#* {{quote-book|ja
|| |{{wj|らんま1/2|らんま½}}
|| 72
| last=Takahashi
| first=Rumiko
| authorlink=Rumiko Takahashi
| chapter={{jaru|[PART] (パート).5 [秘] (ひ) [伝] (でん) [書] (しょ) を[奪] (うば) え}}
| trans-chapter=PART.5 Snatching the Scroll of Secrets
| trans-title={{w|Ranma ½}}
| genre=fiction
| location=Tokyo
| publisher=Shogakukan
| date=Jun 15 1990
| volume=11
| isbn=4-09-122431-8}}
#*: {{ja-usex|日%本の山%中に…'''シロクマ'''がいるか—————っ‼|^にっ%ぽん の さん%ちゅう に… '''シロクマ''' が いる か—————っ‼|ทำไมถึงมี...หมีขั้วโลกบนภูเขาญี่ปุ่นได้⁉}}"""
    page_data = parse_page(self.wxr, "白熊", wikitext)

    expected_example = {
        "text": "日本の山中に…シロクマがいるか—————っ‼",
        "roman": "Nippon no sanchū ni… shirokuma ga iru ka—————'‼",
        "translation": "ทำไมถึงมี...หมีขั้วโลกบนภูเขาญี่ปุ่นได้⁉",
        "ruby": [
            ("日", "にっ"),
            ("本", "ぽん"),
            ("山", "さん"),
            ("中", "ちゅう"),
        ],
        "ref": "1990 มิถุนายน 15, Takahashi, Rumiko, “:แม่แบบ:jaru [PART.5 Snatching the Scroll of Secrets]”, in :แม่แบบ:wj [Ranma ½], volume 11 (fiction), Tokyo: Shogakukan, →ISBN, page 72:",
    }
    expected_sense = {
        "categories": [
            "ศัพท์ภาษาญี่ปุ่นที่มีการยกข้อความ",
            "ศัพท์ภาษาญี่ปุ่นที่มีตัวอย่างการใช้",
        ],
        "glosses": ["คำพ้องความของ 北極熊 (ฮกเกียวกุงุมะ, “หมีขั้วโลก”)"],
        "examples": [expected_example],
    }
    self.assertEqual(page_data[0]["senses"][0], expected_sense)