Merge pull request #476 from xxyzz/zh
Improve zh edition translation code
xxyzz authored Jan 29, 2024
2 parents 889e026 + f957217 commit 8c5a85c
Showing 7 changed files with 277 additions and 188 deletions.
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/fr/models.py
@@ -66,7 +66,7 @@ class Linkage(FrenchBaseModel):
word: str = ""
tags: list[str] = []
roman: str = ""
alt: str = Field("", description="ALternative form")
alt: str = Field("", description="Alternative form")
translation: str = Field("", description="French translation")
sense: str = Field("", description="Definition of the word")
sense_index: int = Field(
8 changes: 5 additions & 3 deletions src/wiktextract/extractor/zh/models.py
@@ -65,10 +65,12 @@ class Translation(ChineseBaseModel):
"", description="Wiktionary language code of the translation term"
)
lang: str = Field("", description="Translation language name")
word: str = Field("", description="Translation term")
word: str = Field(description="Translation term")
sense: str = Field("", description="Translation gloss")
tags: list[str] = []
roman: str = ""
roman: str = Field("", description="Roman script")
alt: str = Field("", description="Alternative form")
lit: str = Field("", description="Literal translation for the term")


class Linkage(ChineseBaseModel):
@@ -127,5 +129,5 @@ class WordEntry(ChineseBaseModel):
descendants: list[Descendant] = []
redirects: list[str] = Field(
[],
description="Soft redirect page, extracted from template zh-see and ja-see",
description="Soft redirect page, extracted from template zh-see ja-see",
)
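
Note: dropping the empty-string default from `word` makes it a required field in Pydantic, so every `Translation` must now be constructed with an explicit `word` (the rewritten extractor passes `word=""` up front and fills it in later). A minimal sketch of that behaviour, using a plain `pydantic.BaseModel` stand-in rather than the real `ChineseBaseModel`; only the field definitions mirror the diff, the rest is illustrative:

```python
from pydantic import BaseModel, Field, ValidationError


class Translation(BaseModel):
    lang_code: str = Field("", description="Wiktionary language code of the translation term")
    lang: str = Field("", description="Translation language name")
    word: str = Field(description="Translation term")  # no default -> required
    roman: str = Field("", description="Roman script")


Translation(word="chat", lang_code="fr", lang="法語")  # OK
try:
    Translation(lang_code="fr", lang="法語")  # missing `word`
except ValidationError as err:
    print(err)  # reports that `word` is a required field
```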
5 changes: 3 additions & 2 deletions src/wiktextract/extractor/zh/page.py
@@ -31,8 +31,6 @@
# Additional templates to be expanded in the pre-expand phase
ADDITIONAL_EXPAND_TEMPLATES = frozenset(
{
"multitrans",
"multitrans-nowiki",
"col1",
"col2",
"col3",
@@ -198,6 +196,9 @@ def extract_pronunciation(
def parse_page(
wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
# page layout documents
# https://zh.wiktionary.org/wiki/Wiktionary:佈局解釋
# https://zh.wiktionary.org/wiki/Wiktionary:体例说明
if wxr.config.verbose:
logging.info(f"Parsing page: {page_title}")

265 changes: 136 additions & 129 deletions src/wiktextract/extractor/zh/translation.py
@@ -1,144 +1,161 @@
import re
from typing import Optional, Union

from mediawiki_langcodes import name_to_code
from mediawiki_langcodes import code_to_name, name_to_code
from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LEVEL_KIND_FLAGS
from wikitextprocessor.parser import LEVEL_KIND_FLAGS, TemplateNode
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

from ..share import capture_text_in_parentheses
from .models import Translation, WordEntry


def extract_translation(
wxr: WiktextractContext, page_data: list[WordEntry], node: WikiNode
wxr: WiktextractContext,
page_data: list[WordEntry],
level_node: WikiNode,
sense: str = "",
) -> None:
sense_text = ""
for child in node.children:
if isinstance(child, WikiNode):
if child.kind == NodeKind.TEMPLATE:
template_name = child.template_name.lower()
if (
template_name in {"trans-top", "翻譯-頂", "trans-top-also"}
and 1 in child.template_parameters
):
sense_text = clean_node(
wxr, None, child.template_parameters.get(1)
)
elif template_name == "checktrans-top":
return
elif template_name == "see translation subpage":
translation_subpage(
wxr, page_data, child.template_parameters
)
elif child.kind == NodeKind.LIST:
for list_item_node in child.find_child(NodeKind.LIST_ITEM):
if not list_item_node.contain_node(NodeKind.LIST):
process_translation_list_item(
wxr,
page_data,
clean_node(wxr, None, list_item_node.children),
sense_text,
)
else:
nested_list_index = 0
for index, item_child in enumerate(
list_item_node.children
):
if (
isinstance(item_child, WikiNode)
and item_child.kind == NodeKind.LIST
):
nested_list_index = index
break

process_translation_list_item(
wxr,
page_data,
clean_node(
wxr,
None,
list_item_node.children[:nested_list_index],
),
sense_text,
)
for nested_list_node in list_item_node.find_child(
NodeKind.LIST
):
for nested_list_item in nested_list_node.find_child(
NodeKind.LIST_ITEM
):
process_translation_list_item(
wxr,
page_data,
clean_node(
wxr, None, nested_list_item.children
),
sense_text,
)
for child in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LIST):
if isinstance(child, TemplateNode):
template_name = child.template_name.lower()
if (
template_name in {"trans-top", "翻譯-頂", "trans-top-also"}
and 1 in child.template_parameters
):
sense = clean_node(wxr, None, child.template_parameters.get(1))
elif template_name in {"see translation subpage", "trans-see"}:
translation_subpage(wxr, page_data, child)
elif template_name == "multitrans":
wikitext = "".join(
wxr.wtp.node_to_wikitext(c)
for c in child.template_parameters.get("data", [])
)
multitrans = wxr.wtp.parse(wikitext)
extract_translation(wxr, page_data, multitrans, sense)
else:
for list_item in child.find_child_recursively(NodeKind.LIST_ITEM):
process_translation_list_item(
wxr,
page_data,
list_item,
sense,
)


def process_translation_list_item(
wxr: WiktextractContext,
page_data: list[WordEntry],
expanded_text: str,
list_item: WikiNode,
sense: str,
) -> None:
from .headword_line import GENDERS

split_results = re.split(r":|:", expanded_text, maxsplit=1)
if len(split_results) != 2:
return
lang_text, words_text = split_results
lang_text = lang_text.strip()
words_text = words_text.strip()
if len(words_text) == 0:
return
lang_code = name_to_code(lang_text, "zh")

# split words by `,` or `;` that are not inside `()`
for word_and_tags in re.split(r"[,;、](?![^(]*\))\s*", words_text):
tags, word = capture_text_in_parentheses(word_and_tags)
tags = [tag for tag in tags if tag != lang_code] # rm Wiktionary link
translation_data = Translation(
lang_code=lang_code, lang=lang_text, word=word
)
tags_without_roman = []
for tag in tags:
if re.search(r"[a-z]", tag):
translation_data.roman = tag
tr_data = Translation(word="", sense=sense)

for child_index, child in enumerate(list_item.filter_empty_str_child()):
if child_index == 0:
lang_text = ""
if isinstance(child, str):
if ":" in child:
lang_text = child[: child.index(":")]
elif ":" in child:
lang_text = child[: child.index(":")]
else:
tags_without_roman.append(tag)

if len(tags_without_roman) > 0:
translation_data.tags = tags_without_roman

gender = word.split(" ")[-1]
if gender in GENDERS:
translation_data.word = word.removesuffix(f" {gender}")
translation_data.tags.append(GENDERS.get(gender))

if len(sense) > 0:
translation_data.sense = sense
page_data[-1].translations.append(translation_data)
lang_text = clean_node(wxr, None, child)
if len(lang_text) > 0:
tr_data.lang = lang_text.strip()
tr_data.lang_code = name_to_code(tr_data.lang, "zh")
elif isinstance(child, TemplateNode):
template_name = child.template_name
if template_name in {
"t",
"t+",
"tt",
"tt+",
"t-check",
"t+check",
}:
if len(tr_data.word) > 0:
page_data[-1].translations.append(
tr_data.model_copy(deep=True)
)
tr_data = Translation(
word="",
lang=tr_data.lang,
lang_code=tr_data.lang_code,
sense=sense,
)
if tr_data.lang_code == "":
tr_data.lang_code = child.template_parameters.get(1, "")
if tr_data.lang == "":
tr_data.lang = code_to_name(tr_data.lang_code, "zh")
tr_data.word = clean_node(
wxr, None, child.template_parameters.get(2, "")
)
tr_data.roman = clean_node(
wxr, None, child.template_parameters.get("tr", "")
)
tr_data.alt = clean_node(
wxr, None, child.template_parameters.get("alt", "")
)
tr_data.lit = clean_node(
wxr, None, child.template_parameters.get("lit", "")
)
# find gender tags
expanded_template = wxr.wtp.parse(
wxr.wtp.node_to_wikitext(child), expand_all=True
)
for span_node in expanded_template.find_html("span"):
class_str = span_node.attrs.get("class", "")
if "gender" in class_str:
for abbr_tag in span_node.find_html("abbr"):
if len(abbr_tag.attrs.get("title")) > 0:
tr_data.tags.append(
clean_node(
wxr, None, abbr_tag.attrs.get("title")
)
)
elif tr_data.roman == "" and class_str.startswith("tr "):
tr_data.roman = clean_node(wxr, None, span_node)
elif template_name == "t-needed":
# ignore empty translation
continue
else:
# qualifier template
tag = clean_node(wxr, None, child)
if len(tag) > 0:
tr_data.tags.append(tag.strip("()"))
elif isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
if len(tr_data.word) > 0:
page_data[-1].translations.append(tr_data.model_copy(deep=True))
tr_data = Translation(
word="",
lang=tr_data.lang,
lang_code=tr_data.lang_code,
sense=sense,
)
tr_data.word = clean_node(wxr, None, child)

if len(tr_data.word) > 0:
page_data[-1].translations.append(tr_data.model_copy(deep=True))


def translation_subpage(
wxr: WiktextractContext,
page_data: list[WordEntry],
template_args: dict[str, str],
template_node: TemplateNode,
) -> None:
# https://zh.wiktionary.org/wiki/Template:翻譯-見
# https://zh.wiktionary.org/wiki/Template:See_translation_subpage
from .page import ADDITIONAL_EXPAND_TEMPLATES

page_title = wxr.wtp.title
target_section = None
if len(template_args) > 0:
target_section = template_args.get(1)
if len(template_args) > 1:
page_title = template_args.get(2)
if template_node.template_name == "see translation subpage":
target_section = template_node.template_parameters.get(1)
page_title = template_node.template_parameters.get(2, wxr.wtp.title)

translation_subpage_title = f"{page_title}/翻譯"
translation_subpage_title = page_title
if page_title == wxr.wtp.title:
translation_subpage_title = f"{page_title}/翻譯"
subpage = wxr.wtp.get_page(translation_subpage_title)
if subpage is None:
return
@@ -165,22 +182,12 @@ def find_subpage_section(
node: Union[WikiNode, str],
target_section: Union[str, list[str]],
) -> Optional[WikiNode]:
if isinstance(node, WikiNode):
if node.kind in LEVEL_KIND_FLAGS:
section_title = clean_node(wxr, None, node.largs)
if (
isinstance(target_section, str)
and section_title == target_section
):
return node
if (
isinstance(target_section, list)
and section_title in target_section
):
return node

for child in node.children:
returned_node = find_subpage_section(wxr, child, target_section)
if returned_node is not None:
return returned_node
if not isinstance(node, WikiNode):
return None
for level_node in node.find_child_recursively(LEVEL_KIND_FLAGS):
section_title = clean_node(wxr, None, level_node.largs)
if isinstance(target_section, str) and section_title == target_section:
return level_node
if isinstance(target_section, list) and section_title in target_section:
return level_node
return None
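
For context, a minimal, self-contained sketch (not part of this diff) of how the rewritten `extract_translation()` might be driven on a single translation list item. The `WiktionaryConfig`, `WiktextractContext`, and `WordEntry` constructor arguments below are assumptions modelled on the test setup used elsewhere in this repository:

```python
from wikitextprocessor import Wtp
from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.zh.models import WordEntry
from wiktextract.extractor.zh.translation import extract_translation
from wiktextract.wxr_context import WiktextractContext

wxr = WiktextractContext(
    Wtp(lang_code="zh"), WiktionaryConfig(dump_file_lang_code="zh")
)
wxr.wtp.start_page("貓")
# A translation list item using the {{t+}} template, which is handled by the
# new TemplateNode branch of process_translation_list_item().
root = wxr.wtp.parse("* 英語:{{t+|en|cat}}")
page_data = [WordEntry(word="貓", lang_code="zh", lang="漢語")]
extract_translation(wxr, page_data, root)
for tr in page_data[-1].translations:
    print(tr.lang, tr.lang_code, tr.word)  # expected: 英語 en cat
```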
2 changes: 1 addition & 1 deletion tests/test_zh_gloss.py
@@ -6,8 +6,8 @@
from wiktextract.extractor.zh.models import Sense, WordEntry
from wiktextract.extractor.zh.page import (
extract_gloss,
parse_section,
parse_page,
parse_section,
)
from wiktextract.thesaurus import close_thesaurus_db
from wiktextract.wxr_context import WiktextractContext
2 changes: 1 addition & 1 deletion tests/test_zh_headword.py
@@ -1,5 +1,5 @@
from unittest import TestCase
from unittest.mock import Mock, patch
from unittest.mock import Mock

from wikitextprocessor import Wtp
from wiktextract.extractor.zh.headword_line import extract_headword_line