Skip to content

Commit

Permalink
[th] extract "ux" example template
Browse files Browse the repository at this point in the history
same expanded HTML nodes as en and zh editions
  • Loading branch information
xxyzz committed Jan 7, 2025
1 parent af15047 commit e52475f
Show file tree
Hide file tree
Showing 4 changed files with 126 additions and 2 deletions.
47 changes: 47 additions & 0 deletions src/wiktextract/extractor/th/example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from wikitextprocessor import NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Example, Sense


def extract_example_list_item(
    wxr: WiktextractContext, sense: Sense, list_item: WikiNode
) -> None:
    """Scan one gloss sub-list item and extract any usage-example templates.

    Only template nodes whose name is one of the known example templates
    ("ux", "usex", "ko-usex") are handed off to ``extract_ux_template``;
    all other children are ignored.
    """
    example_templates = ("ux", "usex", "ko-usex")
    for child in list_item.children:
        if not isinstance(child, TemplateNode):
            continue
        if child.template_name in example_templates:
            extract_ux_template(wxr, sense, child)


def extract_ux_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Expand a usage-example template and collect its parts into an Example.

    The expanded HTML marks each piece with a CSS class — "e-example" and
    "e-transliteration" on ``<i>`` tags; "e-translation", "e-literally" and
    "qualifier-content" on ``<span>`` tags — the same node classes the en
    and zh editions produce. The example is kept only when its text is
    non-empty; category links in the expansion are still recorded on the
    sense either way.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    example = Example(text="")

    for italic in expanded.find_html_recursively("i"):
        css = italic.attrs.get("class", "")
        if "e-example" in css:
            example.text = clean_node(wxr, None, italic)
        elif "e-transliteration" in css:
            example.roman = clean_node(wxr, None, italic)

    for span in expanded.find_html_recursively("span"):
        css = span.attrs.get("class", "")
        if "e-translation" in css:
            example.translation = clean_node(wxr, None, span)
        elif "e-literally" in css:
            example.literal_meaning = clean_node(wxr, None, span)
        elif "qualifier-content" in css:
            qualifier = clean_node(wxr, None, span)
            if qualifier != "":
                example.raw_tags.append(qualifier)

    # "ref" is taken from the raw template arguments, not the expansion.
    example.ref = clean_node(
        wxr, None, t_node.template_parameters.get("ref", "")
    )
    if example.text != "":
        sense.examples.append(example)
    # Harvest category links (e.g. [[Category:...]]) onto the sense.
    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link)
19 changes: 19 additions & 0 deletions src/wiktextract/extractor/th/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,30 @@ class ThaiBaseModel(BaseModel):
)


class Example(ThaiBaseModel):
    # Example sentence text, taken from the expanded "e-example" HTML node.
    text: str
    # Thai translation of the sentence ("e-translation" node).
    translation: str = ""
    # Literal rendering when it differs from the translation ("e-literally").
    literal_meaning: str = ""
    roman: str = Field(
        default="", description="Romanization of the example sentence"
    )
    ref: str = Field(
        default="",
        description="Source of the sentence, like book title and page number",
    )
    # NOTE(review): not populated by example.py in this change — presumably
    # reserved for Japanese examples, mirroring other editions; confirm.
    ruby: list[tuple[str, ...]] = Field(
        default=[], description="Japanese Kanji and furigana"
    )
    tags: list[str] = []
    # Unrecognized qualifier text ("qualifier-content" nodes) kept verbatim.
    raw_tags: list[str] = []


class Sense(ThaiBaseModel):
    # Gloss strings for this sense; may stay empty if the list item had
    # only child lists or templates.
    glosses: list[str] = []
    tags: list[str] = []
    raw_tags: list[str] = []
    # Category names collected from links while cleaning nodes.
    categories: list[str] = []
    # Usage examples attached to this sense (see example.py).
    examples: list[Example] = []


class WordEntry(ThaiBaseModel):
Expand Down
12 changes: 10 additions & 2 deletions src/wiktextract/extractor/th/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .example import extract_example_list_item
from .models import Sense, WordEntry
from .section_titles import POS_DATA

Expand All @@ -20,8 +21,8 @@ def extract_pos_section(
page_data[-1].tags.extend(pos_data.get("tags", []))

for list_node in level_node.find_child(NodeKind.LIST):
if list_node.sarg.startswith("#") and list_node.sarg.endswith("#"):
for list_item in list_node.find_child(NodeKind.LIST_ITEM):
for list_item in list_node.find_child(NodeKind.LIST_ITEM):
if list_node.sarg.startswith("#") and list_node.sarg.endswith("#"):
extract_gloss_list_item(wxr, page_data[-1], list_item)


Expand All @@ -34,6 +35,13 @@ def extract_gloss_list_item(
gloss_str = clean_node(
wxr, sense, list(list_item.invert_find_child(NodeKind.LIST))
)
for child_list in list_item.find_child(NodeKind.LIST):
if child_list.sarg.startswith("#") and child_list.sarg.endswith(
(":", "*")
):
for e_list_item in child_list.find_child(NodeKind.LIST_ITEM):
extract_example_list_item(wxr, sense, e_list_item)

if gloss_str != "":
sense.glosses.append(gloss_str)
word_entry.senses.append(sense)
50 changes: 50 additions & 0 deletions tests/test_th_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from unittest import TestCase

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.th.page import parse_page
from wiktextract.wxr_context import WiktextractContext


class TestThExample(TestCase):
    """Tests for the Thai edition usage-example extractor."""

    maxDiff = None

    def setUp(self) -> None:
        # Fresh Thai-dump parser context per test; capture_language_codes=None
        # presumably disables language filtering so the Korean section in the
        # test page is processed — confirm against WiktionaryConfig.
        self.wxr = WiktextractContext(
            Wtp(lang_code="th"),
            WiktionaryConfig(
                dump_file_lang_code="th", capture_language_codes=None
            ),
        )

    def test_ux(self):
        # The mocked "ko-usex" expansion uses the same node classes
        # ("e-example", "e-transliteration", "e-translation") as the en and
        # zh editions; verify text, roman, translation and the category link.
        self.wxr.wtp.add_page(
            "แม่แบบ:ko-usex",
            10,
            """<div class="h-usage-example"><i class="Kore mention e-example" lang="ko">^파리는 ^프랑스의 '''서울'''이다.</i><dl><dd><i lang="ko-Latn" class="e-transliteration tr Latn">Pari-neun Peurangseu-ui '''seour'''-ida.</i></dd><dd><span class="e-translation">ปารีสคือเมืองหลวงของฝรั่งเศส</span></dd></dl></div>[[Category:ศัพท์ภาษาเกาหลีที่มีตัวอย่างการใช้|서울]]""",
        )
        page_data = parse_page(
            self.wxr,
            "서울",
            """== ภาษาเกาหลี ==
=== คำนาม ===
{{ko-noun}}
# [[เมืองหลวง]]; [[เมือง]][[ใหญ่]]
#: {{ko-usex|^파리-는 ^프랑스-의 '''서울'''-이다.|ปารีสคือเมืองหลวงของฝรั่งเศส}}""",
        )
        self.assertEqual(
            page_data[0]["senses"][0],
            {
                "categories": ["ศัพท์ภาษาเกาหลีที่มีตัวอย่างการใช้"],
                "glosses": ["เมืองหลวง; เมืองใหญ่"],
                "examples": [
                    {
                        "text": "^파리는 ^프랑스의 서울이다.",
                        "roman": "Pari-neun Peurangseu-ui seour-ida.",
                        "translation": "ปารีสคือเมืองหลวงของฝรั่งเศส",
                    }
                ],
            },
        )

0 comments on commit e52475f

Please sign in to comment.