
Merge pull request #982 from xxyzz/th
[th] extract gloss tag and example templates
xxyzz authored Jan 8, 2025
2 parents 3839b34 + 1f1ff3e commit ef1c2aa
Showing 5 changed files with 428 additions and 6 deletions.
164 changes: 162 additions & 2 deletions src/wiktextract/extractor/th/example.py
@@ -1,17 +1,33 @@
from wikitextprocessor import NodeKind, TemplateNode, WikiNode
import re

from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ..ruby import extract_ruby
from .models import Example, Sense
from .tags import translate_raw_tags


def extract_example_list_item(
wxr: WiktextractContext, sense: Sense, list_item: WikiNode
wxr: WiktextractContext,
sense: Sense,
list_item: WikiNode,
ref: str = "",
) -> None:
for node in list_item.children:
if isinstance(node, TemplateNode):
if node.template_name in ["ux", "usex", "ko-usex"]:
extract_ux_template(wxr, sense, node)
elif node.template_name in ["zh-x", "zh-usex"]:
extract_template_zh_x(wxr, sense, node)
elif node.template_name in ["ja-x", "ja-usex"]:
extract_template_ja_usex(wxr, sense, node, ref)
elif node.template_name.startswith("quote-"):
ref = extract_quote_template(wxr, sense, node)
elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
for child_list_item in node.find_child(NodeKind.LIST_ITEM):
extract_example_list_item(wxr, sense, child_list_item, ref)


def extract_ux_template(
@@ -42,6 +58,150 @@ def extract_ux_template(
wxr, None, t_node.template_parameters.get("ref", "")
)
if e_data.text != "":
translate_raw_tags(e_data)
sense.examples.append(e_data)
for link_node in expanded_node.find_child(NodeKind.LINK):
clean_node(wxr, sense, link_node)


def extract_template_zh_x(
wxr: WiktextractContext,
sense: Sense,
t_node: TemplateNode,
) -> None:
expanded_node = wxr.wtp.parse(
wxr.wtp.node_to_wikitext(t_node), expand_all=True
)
examples = []
for dl_tag in expanded_node.find_html("dl"):
examples.extend(extract_zh_x_dl_tag(wxr, dl_tag))
if len(examples) == 0:
examples.extend(extract_zh_x_no_dl_tag(wxr, expanded_node))

translation = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
for e_data in examples:
e_data.translation = translation
translate_raw_tags(e_data)

for link_node in expanded_node.find_child(NodeKind.LINK):
clean_node(wxr, sense, link_node)

sense.examples.extend(examples)


def extract_zh_x_dl_tag(
wxr: WiktextractContext, dl_tag: HTMLNode
) -> list[Example]:
examples = []
for span_tag in dl_tag.find_html("span"):
if "lang" in span_tag.attrs:
e_text = clean_node(wxr, None, span_tag)
if e_text != "":
examples.append(Example(text=e_text))
else:
raw_tags = clean_node(wxr, None, span_tag).strip("[] ")
for raw_tag in re.split(r", | and ", raw_tags):
raw_tag = raw_tag.strip()
if raw_tag != "" and len(examples) > 0:
examples[-1].raw_tags.append(raw_tag)
for dd_tag in dl_tag.find_html("dd"):
for span_tag in dd_tag.find_html("span"):
if "Latn" in span_tag.attrs.get("lang", ""):
roman = clean_node(wxr, None, span_tag)
for e_data in examples:
e_data.roman = roman
else:
raw_tag = clean_node(wxr, None, span_tag).strip("[] ")
if raw_tag != "":
for e_data in examples:
e_data.raw_tags.append(raw_tag)
return examples


def extract_zh_x_no_dl_tag(
wxr: WiktextractContext, expanded_node: WikiNode
) -> list[Example]:
examples = []
for span_tag in expanded_node.find_html("span"):
lang = span_tag.attrs.get("lang", "")
match lang:
case "zh-Latn":
roman = clean_node(wxr, None, span_tag)
for e_data in examples:
e_data.roman = roman
case "zh-Hant" | "zh-Hans":
e_text = clean_node(wxr, None, span_tag)
example = Example(text=e_text)
example.tags.append(
"Traditional Chinese"
if lang == "zh-Hant"
else "Simplified Chinese"
)
if example.text != "":
examples.append(example)

return examples


def extract_quote_template(
wxr: WiktextractContext,
sense: Sense,
t_node: TemplateNode,
) -> str:
ref = ""
if all(
arg not in t_node.template_parameters for arg in ["text", "passage", 7]
):
ref = clean_node(wxr, sense, t_node)
else:
expanded_node = wxr.wtp.parse(
wxr.wtp.node_to_wikitext(t_node), expand_all=True
)
example = Example(text="")
for span_tag in expanded_node.find_html_recursively("span"):
span_class = span_tag.attrs.get("class", "")
if "cited-source" == span_class:
example.ref = clean_node(wxr, None, span_tag)
elif "e-quotation" in span_class:
example.text = clean_node(wxr, None, span_tag)
elif "e-translation" in span_class:
example.translation = clean_node(wxr, None, span_tag)
for i_tag in expanded_node.find_html_recursively(
"i", attr_name="class", attr_value="e-transliteration"
):
example.roman = clean_node(wxr, None, i_tag)
break
if example.text != "":
sense.examples.append(example)
clean_node(wxr, sense, expanded_node)

return ref


def extract_template_ja_usex(
wxr: WiktextractContext, sense: Sense, t_node: TemplateNode, ref: str
) -> None:
expanded_node = wxr.wtp.parse(
wxr.wtp.node_to_wikitext(t_node), expand_all=True
)
example = Example(text="", ref=ref)
for span_tag in expanded_node.find_html(
"span", attr_name="class", attr_value="Jpan"
):
ruby_data, node_without_ruby = extract_ruby(wxr, span_tag)
example.text = clean_node(wxr, None, node_without_ruby)
example.ruby = ruby_data
for span_tag in expanded_node.find_html_recursively(
"span", attr_name="class", attr_value="tr"
):
example.roman = clean_node(wxr, None, span_tag)
example.translation = clean_node(
wxr, None, t_node.template_parameters.get(3, "")
)
example.literal_meaning = clean_node(
wxr, None, t_node.template_parameters.get("lit", "")
)
if example.text != "":
sense.examples.append(example)
for link_node in expanded_node.find_child(NodeKind.LINK):
clean_node(wxr, sense, link_node)
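
The zh-x handling in example.py above turns bracketed label text (the dialect/script annotations rendered next to a Chinese example) into raw tags with `re.split(r", | and ", ...)` after stripping the brackets. A minimal standalone sketch of just that splitting step, using a made-up label string for illustration:

```python
import re

# Hypothetical label text as a zh-x template might render it, e.g. "[MSC, trad. and Pinyin]".
# The extractor strips the brackets/spaces and splits on ", " or " and " to get raw tags.
label = "[MSC, trad. and Pinyin]"
raw_tags = [
    tag.strip()
    for tag in re.split(r", | and ", label.strip("[] "))
    if tag.strip() != ""
]
print(raw_tags)  # ['MSC', 'trad.', 'Pinyin']
```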
31 changes: 27 additions & 4 deletions src/wiktextract/extractor/th/pos.py
@@ -1,10 +1,11 @@
from wikitextprocessor import LevelNode, NodeKind, WikiNode
from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .example import extract_example_list_item
from .models import Sense, WordEntry
from .section_titles import POS_DATA
from .tags import translate_raw_tags


def extract_pos_section(
@@ -32,9 +33,18 @@ def extract_gloss_list_item(
list_item: WikiNode,
) -> None:
sense = Sense()
gloss_str = clean_node(
wxr, sense, list(list_item.invert_find_child(NodeKind.LIST))
)
gloss_nodes = []
for node in list_item.children:
if isinstance(node, TemplateNode) and node.template_name in [
"label",
"lb",
"lbl",
]:
extract_label_template(wxr, sense, node)
elif not (isinstance(node, WikiNode) and node.kind == NodeKind.LIST):
gloss_nodes.append(node)

gloss_str = clean_node(wxr, sense, gloss_nodes)
for child_list in list_item.find_child(NodeKind.LIST):
if child_list.sarg.startswith("#") and child_list.sarg.endswith(
(":", "*")
@@ -44,4 +54,17 @@ def extract_gloss_list_item(

if gloss_str != "":
sense.glosses.append(gloss_str)
translate_raw_tags(sense)
word_entry.senses.append(sense)


def extract_label_template(
wxr: WiktextractContext,
sense: Sense,
t_node: TemplateNode,
) -> None:
raw_tag_str = clean_node(wxr, sense, t_node).strip("() ")
for raw_tag in raw_tag_str.split(","):
raw_tag = raw_tag.strip()
if raw_tag != "":
sense.raw_tags.append(raw_tag)
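
`extract_label_template` in pos.py reduces an expanded {{label}}/{{lb}} template to its rendered text, strips the surrounding parentheses, and splits on commas to collect raw tags on the sense. A small sketch of just that string handling, with a hypothetical rendered label (the real function gets the string from `clean_node` on the expanded template):

```python
# Hypothetical rendered output of a label template, e.g. "(informal, slang)".
rendered = "(informal, slang)"
raw_tags = [
    tag.strip()
    for tag in rendered.strip("() ").split(",")
    if tag.strip() != ""
]
print(raw_tags)  # ['informal', 'slang']
```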
26 changes: 26 additions & 0 deletions src/wiktextract/extractor/th/tags.py
@@ -0,0 +1,26 @@
from .models import WordEntry

EXAMPLE_TAGS = {
# แม่แบบ:zh-x, มอดูล:zh-usex/data
"MSC": "Modern Standard Chinese",
"Pinyin": "Pinyin",
"trad.": "Traditional Chinese",
"simp.": "Simplified Chinese",
}


TAGS = {**EXAMPLE_TAGS}


def translate_raw_tags(data: WordEntry) -> None:
raw_tags = []
for raw_tag in data.raw_tags:
if raw_tag in TAGS and hasattr(data, "tags"):
tr_tag = TAGS[raw_tag]
if isinstance(tr_tag, str):
data.tags.append(tr_tag)
elif isinstance(tr_tag, list):
data.tags.extend(tr_tag)
else:
raw_tags.append(raw_tag)
data.raw_tags = raw_tags
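
`translate_raw_tags` in tags.py moves any raw tag found in the TAGS table into the structured `tags` list and keeps unrecognized labels in `raw_tags`. A self-contained sketch of that behaviour, using a stand-in dataclass instead of the project's real pydantic models and a trimmed tag table (the real table can also map one raw tag to a list of tags):

```python
from dataclasses import dataclass, field

# Stand-in for the real Example/WordEntry models, just to exercise the mapping logic.
@dataclass
class FakeExample:
    tags: list[str] = field(default_factory=list)
    raw_tags: list[str] = field(default_factory=list)

TAGS = {"MSC": "Modern Standard Chinese", "trad.": "Traditional Chinese"}

def translate_raw_tags(data) -> None:
    leftovers = []
    for raw_tag in data.raw_tags:
        if raw_tag in TAGS and hasattr(data, "tags"):
            data.tags.append(TAGS[raw_tag])
        else:
            leftovers.append(raw_tag)
    data.raw_tags = leftovers

example = FakeExample(raw_tags=["MSC", "unmapped label"])
translate_raw_tags(example)
print(example.tags)      # ['Modern Standard Chinese']
print(example.raw_tags)  # ['unmapped label']
```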
