Skip to content

Commit

Permalink
[th] extract "ux" example template
Browse files Browse the repository at this point in the history
same expanded HTML nodes as en and zh editions
  • Loading branch information
xxyzz committed Jan 7, 2025
1 parent af15047 commit e52475f
Show file tree
Hide file tree
Showing 4 changed files with 126 additions and 2 deletions.
47 changes: 47 additions & 0 deletions src/wiktextract/extractor/th/example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from wikitextprocessor import NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Example, Sense


def extract_example_list_item(
    wxr: WiktextractContext, sense: Sense, list_item: WikiNode
) -> None:
    """Scan one gloss sub-list item and extract any usage-example templates.

    Only template nodes whose name is one of the known example templates
    ("ux", "usex", "ko-usex") are handed off to ``extract_ux_template``;
    all other children are ignored.
    """
    example_templates = ("ux", "usex", "ko-usex")
    for child in list_item.children:
        if not isinstance(child, TemplateNode):
            continue
        if child.template_name in example_templates:
            extract_ux_template(wxr, sense, child)


def extract_ux_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Expand a usage-example template and collect its parts into an Example.

    The expanded HTML marks each piece with a CSS class — "e-example" and
    "e-transliteration" on ``<i>`` tags; "e-translation", "e-literally" and
    "qualifier-content" on ``<span>`` tags — the same node classes the en
    and zh editions produce. The example is kept only when its text is
    non-empty; category links in the expansion are still recorded on the
    sense either way.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    example = Example(text="")

    for italic in expanded.find_html_recursively("i"):
        css = italic.attrs.get("class", "")
        if "e-example" in css:
            example.text = clean_node(wxr, None, italic)
        elif "e-transliteration" in css:
            example.roman = clean_node(wxr, None, italic)

    for span in expanded.find_html_recursively("span"):
        css = span.attrs.get("class", "")
        if "e-translation" in css:
            example.translation = clean_node(wxr, None, span)
        elif "e-literally" in css:
            example.literal_meaning = clean_node(wxr, None, span)
        elif "qualifier-content" in css:
            qualifier = clean_node(wxr, None, span)
            if qualifier != "":
                example.raw_tags.append(qualifier)

    # "ref" is taken from the raw template arguments, not the expansion.
    example.ref = clean_node(
        wxr, None, t_node.template_parameters.get("ref", "")
    )
    if example.text != "":
        sense.examples.append(example)
    # Harvest category links (e.g. [[Category:...]]) onto the sense.
    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link)
19 changes: 19 additions & 0 deletions src/wiktextract/extractor/th/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,30 @@ class ThaiBaseModel(BaseModel):
)


class Example(ThaiBaseModel):
    # Example sentence text, taken from the expanded "e-example" HTML node.
    text: str
    # Thai translation of the sentence ("e-translation" node).
    translation: str = ""
    # Literal rendering when it differs from the translation ("e-literally").
    literal_meaning: str = ""
    roman: str = Field(
        default="", description="Romanization of the example sentence"
    )
    ref: str = Field(
        default="",
        description="Source of the sentence, like book title and page number",
    )
    # NOTE(review): not populated by example.py in this change — presumably
    # reserved for Japanese examples, mirroring other editions; confirm.
    ruby: list[tuple[str, ...]] = Field(
        default=[], description="Japanese Kanji and furigana"
    )
    tags: list[str] = []
    # Unrecognized qualifier text ("qualifier-content" nodes) kept verbatim.
    raw_tags: list[str] = []


class Sense(ThaiBaseModel):
    # Gloss strings for this sense; may stay empty if the list item had
    # only child lists or templates.
    glosses: list[str] = []
    tags: list[str] = []
    raw_tags: list[str] = []
    # Category names collected from links while cleaning nodes.
    categories: list[str] = []
    # Usage examples attached to this sense (see example.py).
    examples: list[Example] = []


class WordEntry(ThaiBaseModel):
Expand Down
12 changes: 10 additions & 2 deletions src/wiktextract/extractor/th/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .example import extract_example_list_item
from .models import Sense, WordEntry
from .section_titles import POS_DATA

Expand All @@ -20,8 +21,8 @@ def extract_pos_section(
page_data[-1].tags.extend(pos_data.get("tags", []))

for list_node in level_node.find_child(NodeKind.LIST):
if list_node.sarg.startswith("#") and list_node.sarg.endswith("#"):
for list_item in list_node.find_child(NodeKind.LIST_ITEM):
for list_item in list_node.find_child(NodeKind.LIST_ITEM):
if list_node.sarg.startswith("#") and list_node.sarg.endswith("#"):
extract_gloss_list_item(wxr, page_data[-1], list_item)


Expand All @@ -34,6 +35,13 @@ def extract_gloss_list_item(
gloss_str = clean_node(
wxr, sense, list(list_item.invert_find_child(NodeKind.LIST))
)
for child_list in list_item.find_child(NodeKind.LIST):
if child_list.sarg.startswith("#") and child_list.sarg.endswith(
(":", "*")
):
for e_list_item in child_list.find_child(NodeKind.LIST_ITEM):
extract_example_list_item(wxr, sense, e_list_item)

if gloss_str != "":
sense.glosses.append(gloss_str)
word_entry.senses.append(sense)
50 changes: 50 additions & 0 deletions tests/test_th_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from unittest import TestCase

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.th.page import parse_page
from wiktextract.wxr_context import WiktextractContext


class TestThExample(TestCase):
    """Tests for the Thai edition usage-example extractor."""

    maxDiff = None

    def setUp(self) -> None:
        # Fresh Thai-dump parser context per test; capture_language_codes=None
        # presumably disables language filtering so the Korean section in the
        # test page is processed — confirm against WiktionaryConfig.
        self.wxr = WiktextractContext(
            Wtp(lang_code="th"),
            WiktionaryConfig(
                dump_file_lang_code="th", capture_language_codes=None
            ),
        )

    def test_ux(self):
        # The mocked "ko-usex" expansion uses the same node classes
        # ("e-example", "e-transliteration", "e-translation") as the en and
        # zh editions; verify text, roman, translation and the category link.
        self.wxr.wtp.add_page(
            "แม่แบบ:ko-usex",
            10,
            """<div class="h-usage-example"><i class="Kore mention e-example" lang="ko">^파리는 ^프랑스의 '''서울'''이다.</i><dl><dd><i lang="ko-Latn" class="e-transliteration tr Latn">Pari-neun Peurangseu-ui '''seour'''-ida.</i></dd><dd><span class="e-translation">ปารีสคือเมืองหลวงของฝรั่งเศส</span></dd></dl></div>[[Category:ศัพท์ภาษาเกาหลีที่มีตัวอย่างการใช้|서울]]""",
        )
        page_data = parse_page(
            self.wxr,
            "서울",
            """== ภาษาเกาหลี ==
=== คำนาม ===
{{ko-noun}}
# [[เมืองหลวง]]; [[เมือง]][[ใหญ่]]
#: {{ko-usex|^파리-는 ^프랑스-의 '''서울'''-이다.|ปารีสคือเมืองหลวงของฝรั่งเศส}}""",
        )
        self.assertEqual(
            page_data[0]["senses"][0],
            {
                "categories": ["ศัพท์ภาษาเกาหลีที่มีตัวอย่างการใช้"],
                "glosses": ["เมืองหลวง; เมืองใหญ่"],
                "examples": [
                    {
                        "text": "^파리는 ^프랑스의 서울이다.",
                        "roman": "Pari-neun Peurangseu-ui seour-ida.",
                        "translation": "ปารีสคือเมืองหลวงของฝรั่งเศส",
                    }
                ],
            },
        )

0 comments on commit e52475f

Please sign in to comment.