-
Notifications
You must be signed in to change notification settings - Fork 89
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
same expanded HTML nodes as en and zh editions
- Loading branch information
Showing 4 changed files with 126 additions and 2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
from wikitextprocessor import NodeKind, TemplateNode, WikiNode | ||
|
||
from ...page import clean_node | ||
from ...wxr_context import WiktextractContext | ||
from .models import Example, Sense | ||
|
||
|
||
def extract_example_list_item(
    wxr: WiktextractContext, sense: Sense, list_item: WikiNode
) -> None:
    """Process the usage-example templates found in one list item.

    Walks the direct children of ``list_item`` and passes every ``ux``,
    ``usex`` or ``ko-usex`` template node to ``extract_ux_template``
    together with ``wxr`` and ``sense``. All other children are ignored.
    """
    ux_template_names = ("ux", "usex", "ko-usex")
    for child in list_item.children:
        if not isinstance(child, TemplateNode):
            continue
        if child.template_name not in ux_template_names:
            continue
        extract_ux_template(wxr, sense, child)
|
||
|
||
def extract_ux_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Turn one usage-example template into an ``Example`` on ``sense``.

    The template is converted back to wikitext and re-parsed with full
    expansion. The resulting HTML is then searched by CSS class:

    * ``<i>`` containing ``e-example`` -> ``text``
    * ``<i>`` containing ``e-transliteration`` -> ``roman``
    * ``<span>`` containing ``e-translation`` -> ``translation``
    * ``<span>`` containing ``e-literally`` -> ``literal_meaning``
    * ``<span>`` containing ``qualifier-content`` -> appended to
      ``raw_tags`` when the cleaned text is not empty

    ``ref`` is taken from the template's ``ref`` argument. The example is
    stored only when its text is not empty.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    example = Example(text="")

    for italic in root.find_html_recursively("i"):
        css = italic.attrs.get("class", "")
        if "e-example" in css:
            example.text = clean_node(wxr, None, italic)
        elif "e-transliteration" in css:
            example.roman = clean_node(wxr, None, italic)

    for span in root.find_html_recursively("span"):
        css = span.attrs.get("class", "")
        if "e-translation" in css:
            example.translation = clean_node(wxr, None, span)
        elif "e-literally" in css:
            example.literal_meaning = clean_node(wxr, None, span)
        elif "qualifier-content" in css:
            qualifier = clean_node(wxr, None, span)
            if qualifier:
                example.raw_tags.append(qualifier)

    ref_arg = t_node.template_parameters.get("ref", "")
    example.ref = clean_node(wxr, None, ref_arg)

    if example.text == "":
        # Nothing usable was produced by the template; leave sense untouched.
        return

    sense.examples.append(example)
    # Top-level links of the expansion are cleaned against the sense itself
    # (rather than None) so clean_node can record them there -- presumably
    # this is how category links end up on the sense.
    for link in root.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
from unittest import TestCase | ||
|
||
from wikitextprocessor import Wtp | ||
|
||
from wiktextract.config import WiktionaryConfig | ||
from wiktextract.extractor.th.page import parse_page | ||
from wiktextract.wxr_context import WiktextractContext | ||
|
||
|
||
class TestThExample(TestCase):
    """Usage-example extraction tests for the Thai Wiktionary extractor."""

    # Show complete diffs when the dictionary comparison fails.
    maxDiff = None

    def setUp(self) -> None:
        """Create a fresh Thai-edition extraction context for each test."""
        wtp = Wtp(lang_code="th")
        config = WiktionaryConfig(
            dump_file_lang_code="th", capture_language_codes=None
        )
        self.wxr = WiktextractContext(wtp, config)

    def test_ux(self):
        """A ``ko-usex`` example yields text, romanization and translation.

        The template page is stubbed with pre-rendered HTML, so the test
        only exercises how the extractor reads the expanded nodes.
        """
        template_html = """<div class="h-usage-example"><i class="Kore mention e-example" lang="ko">^파리는 ^프랑스의 '''서울'''이다.</i><dl><dd><i lang="ko-Latn" class="e-transliteration tr Latn">Pari-neun Peurangseu-ui '''seour'''-ida.</i></dd><dd><span class="e-translation">ปารีสคือเมืองหลวงของฝรั่งเศส</span></dd></dl></div>[[Category:ศัพท์ภาษาเกาหลีที่มีตัวอย่างการใช้|서울]]"""
        self.wxr.wtp.add_page("แม่แบบ:ko-usex", 10, template_html)

        page_text = """== ภาษาเกาหลี ==
=== คำนาม ===
{{ko-noun}}
# [[เมืองหลวง]]; [[เมือง]][[ใหญ่]]
#: {{ko-usex|^파리-는 ^프랑스-의 '''서울'''-이다.|ปารีสคือเมืองหลวงของฝรั่งเศส}}"""
        page_data = parse_page(self.wxr, "서울", page_text)

        expected_sense = {
            "categories": ["ศัพท์ภาษาเกาหลีที่มีตัวอย่างการใช้"],
            "glosses": ["เมืองหลวง; เมืองใหญ่"],
            "examples": [
                {
                    "text": "^파리는 ^프랑스의 서울이다.",
                    "roman": "Pari-neun Peurangseu-ui seour-ida.",
                    "translation": "ปารีสคือเมืองหลวงของฝรั่งเศส",
                }
            ],
        }
        self.assertEqual(page_data[0]["senses"][0], expected_sense)