diff --git a/src/wiktextract/extractor/fr/models.py b/src/wiktextract/extractor/fr/models.py
index dd8b5021a..2ad0137cf 100644
--- a/src/wiktextract/extractor/fr/models.py
+++ b/src/wiktextract/extractor/fr/models.py
@@ -66,7 +66,7 @@ class Linkage(FrenchBaseModel):
word: str = ""
tags: list[str] = []
roman: str = ""
- alt: str = Field("", description="ALternative form")
+ alt: str = Field("", description="Alternative form")
translation: str = Field("", description="French translation")
sense: str = Field("", description="Definition of the word")
sense_index: int = Field(
diff --git a/src/wiktextract/extractor/zh/models.py b/src/wiktextract/extractor/zh/models.py
index 86b89e591..674aab89f 100644
--- a/src/wiktextract/extractor/zh/models.py
+++ b/src/wiktextract/extractor/zh/models.py
@@ -65,10 +65,12 @@ class Translation(ChineseBaseModel):
"", description="Wiktionary language code of the translation term"
)
lang: str = Field("", description="Translation language name")
- word: str = Field("", description="Translation term")
+ word: str = Field(description="Translation term")
sense: str = Field("", description="Translation gloss")
tags: list[str] = []
- roman: str = ""
+ roman: str = Field("", description="Roman script")
+ alt: str = Field("", description="Alternative form")
+ lit: str = Field("", description="Literal translation for the term")
class Linkage(ChineseBaseModel):
@@ -127,5 +129,5 @@ class WordEntry(ChineseBaseModel):
descendants: list[Descendant] = []
redirects: list[str] = Field(
[],
- description="Soft redirect page, extracted from template zh-see and ja-see",
+        description="Soft redirect page, extracted from templates zh-see and ja-see",
)
diff --git a/src/wiktextract/extractor/zh/page.py b/src/wiktextract/extractor/zh/page.py
index f942d8c5b..61f35eb3d 100644
--- a/src/wiktextract/extractor/zh/page.py
+++ b/src/wiktextract/extractor/zh/page.py
@@ -31,8 +31,6 @@
# Additional templates to be expanded in the pre-expand phase
ADDITIONAL_EXPAND_TEMPLATES = frozenset(
{
- "multitrans",
- "multitrans-nowiki",
"col1",
"col2",
"col3",
@@ -198,6 +196,9 @@ def extract_pronunciation(
def parse_page(
wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
+    # Page layout documentation:
+ # https://zh.wiktionary.org/wiki/Wiktionary:佈局解釋
+ # https://zh.wiktionary.org/wiki/Wiktionary:体例说明
if wxr.config.verbose:
logging.info(f"Parsing page: {page_title}")
diff --git a/src/wiktextract/extractor/zh/translation.py b/src/wiktextract/extractor/zh/translation.py
index 8978ef375..eb4786b13 100644
--- a/src/wiktextract/extractor/zh/translation.py
+++ b/src/wiktextract/extractor/zh/translation.py
@@ -1,144 +1,161 @@
-import re
from typing import Optional, Union
-from mediawiki_langcodes import name_to_code
+from mediawiki_langcodes import code_to_name, name_to_code
from wikitextprocessor import NodeKind, WikiNode
-from wikitextprocessor.parser import LEVEL_KIND_FLAGS
+from wikitextprocessor.parser import LEVEL_KIND_FLAGS, TemplateNode
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext
-from ..share import capture_text_in_parentheses
from .models import Translation, WordEntry
def extract_translation(
- wxr: WiktextractContext, page_data: list[WordEntry], node: WikiNode
+ wxr: WiktextractContext,
+ page_data: list[WordEntry],
+ level_node: WikiNode,
+ sense: str = "",
) -> None:
- sense_text = ""
- for child in node.children:
- if isinstance(child, WikiNode):
- if child.kind == NodeKind.TEMPLATE:
- template_name = child.template_name.lower()
- if (
- template_name in {"trans-top", "翻譯-頂", "trans-top-also"}
- and 1 in child.template_parameters
- ):
- sense_text = clean_node(
- wxr, None, child.template_parameters.get(1)
- )
- elif template_name == "checktrans-top":
- return
- elif template_name == "see translation subpage":
- translation_subpage(
- wxr, page_data, child.template_parameters
- )
- elif child.kind == NodeKind.LIST:
- for list_item_node in child.find_child(NodeKind.LIST_ITEM):
- if not list_item_node.contain_node(NodeKind.LIST):
- process_translation_list_item(
- wxr,
- page_data,
- clean_node(wxr, None, list_item_node.children),
- sense_text,
- )
- else:
- nested_list_index = 0
- for index, item_child in enumerate(
- list_item_node.children
- ):
- if (
- isinstance(item_child, WikiNode)
- and item_child.kind == NodeKind.LIST
- ):
- nested_list_index = index
- break
-
- process_translation_list_item(
- wxr,
- page_data,
- clean_node(
- wxr,
- None,
- list_item_node.children[:nested_list_index],
- ),
- sense_text,
- )
- for nested_list_node in list_item_node.find_child(
- NodeKind.LIST
- ):
- for nested_list_item in nested_list_node.find_child(
- NodeKind.LIST_ITEM
- ):
- process_translation_list_item(
- wxr,
- page_data,
- clean_node(
- wxr, None, nested_list_item.children
- ),
- sense_text,
- )
+ for child in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LIST):
+ if isinstance(child, TemplateNode):
+ template_name = child.template_name.lower()
+ if (
+ template_name in {"trans-top", "翻譯-頂", "trans-top-also"}
+ and 1 in child.template_parameters
+ ):
+ sense = clean_node(wxr, None, child.template_parameters.get(1))
+ elif template_name in {"see translation subpage", "trans-see"}:
+ translation_subpage(wxr, page_data, child)
+ elif template_name == "multitrans":
+ wikitext = "".join(
+ wxr.wtp.node_to_wikitext(c)
+ for c in child.template_parameters.get("data", [])
+ )
+ multitrans = wxr.wtp.parse(wikitext)
+ extract_translation(wxr, page_data, multitrans, sense)
+ else:
+ for list_item in child.find_child_recursively(NodeKind.LIST_ITEM):
+ process_translation_list_item(
+ wxr,
+ page_data,
+ list_item,
+ sense,
+ )
def process_translation_list_item(
wxr: WiktextractContext,
page_data: list[WordEntry],
- expanded_text: str,
+ list_item: WikiNode,
sense: str,
) -> None:
- from .headword_line import GENDERS
-
- split_results = re.split(r":|:", expanded_text, maxsplit=1)
- if len(split_results) != 2:
- return
- lang_text, words_text = split_results
- lang_text = lang_text.strip()
- words_text = words_text.strip()
- if len(words_text) == 0:
- return
- lang_code = name_to_code(lang_text, "zh")
-
- # split words by `,` or `;` that are not inside `()`
- for word_and_tags in re.split(r"[,;、](?![^(]*\))\s*", words_text):
- tags, word = capture_text_in_parentheses(word_and_tags)
- tags = [tag for tag in tags if tag != lang_code] # rm Wiktionary link
- translation_data = Translation(
- lang_code=lang_code, lang=lang_text, word=word
- )
- tags_without_roman = []
- for tag in tags:
- if re.search(r"[a-z]", tag):
- translation_data.roman = tag
+ tr_data = Translation(word="", sense=sense)
+
+ for child_index, child in enumerate(list_item.filter_empty_str_child()):
+ if child_index == 0:
+ lang_text = ""
+ if isinstance(child, str):
+ if ":" in child:
+ lang_text = child[: child.index(":")]
+ elif ":" in child:
+ lang_text = child[: child.index(":")]
else:
- tags_without_roman.append(tag)
-
- if len(tags_without_roman) > 0:
- translation_data.tags = tags_without_roman
-
- gender = word.split(" ")[-1]
- if gender in GENDERS:
- translation_data.word = word.removesuffix(f" {gender}")
- translation_data.tags.append(GENDERS.get(gender))
-
- if len(sense) > 0:
- translation_data.sense = sense
- page_data[-1].translations.append(translation_data)
+ lang_text = clean_node(wxr, None, child)
+ if len(lang_text) > 0:
+ tr_data.lang = lang_text.strip()
+ tr_data.lang_code = name_to_code(tr_data.lang, "zh")
+ elif isinstance(child, TemplateNode):
+ template_name = child.template_name
+ if template_name in {
+ "t",
+ "t+",
+ "tt",
+ "tt+",
+ "t-check",
+ "t+check",
+ }:
+ if len(tr_data.word) > 0:
+ page_data[-1].translations.append(
+ tr_data.model_copy(deep=True)
+ )
+ tr_data = Translation(
+ word="",
+ lang=tr_data.lang,
+ lang_code=tr_data.lang_code,
+ sense=sense,
+ )
+ if tr_data.lang_code == "":
+ tr_data.lang_code = child.template_parameters.get(1, "")
+ if tr_data.lang == "":
+ tr_data.lang = code_to_name(tr_data.lang_code, "zh")
+ tr_data.word = clean_node(
+ wxr, None, child.template_parameters.get(2, "")
+ )
+ tr_data.roman = clean_node(
+ wxr, None, child.template_parameters.get("tr", "")
+ )
+ tr_data.alt = clean_node(
+ wxr, None, child.template_parameters.get("alt", "")
+ )
+ tr_data.lit = clean_node(
+ wxr, None, child.template_parameters.get("lit", "")
+ )
+ # find gender tags
+ expanded_template = wxr.wtp.parse(
+ wxr.wtp.node_to_wikitext(child), expand_all=True
+ )
+ for span_node in expanded_template.find_html("span"):
+ class_str = span_node.attrs.get("class", "")
+ if "gender" in class_str:
+ for abbr_tag in span_node.find_html("abbr"):
+                        if len(abbr_tag.attrs.get("title", "")) > 0:
+ tr_data.tags.append(
+ clean_node(
+ wxr, None, abbr_tag.attrs.get("title")
+ )
+ )
+ elif tr_data.roman == "" and class_str.startswith("tr "):
+ tr_data.roman = clean_node(wxr, None, span_node)
+ elif template_name == "t-needed":
+ # ignore empty translation
+ continue
+ else:
+ # qualifier template
+ tag = clean_node(wxr, None, child)
+ if len(tag) > 0:
+ tr_data.tags.append(tag.strip("()"))
+ elif isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
+ if len(tr_data.word) > 0:
+ page_data[-1].translations.append(tr_data.model_copy(deep=True))
+ tr_data = Translation(
+ word="",
+ lang=tr_data.lang,
+ lang_code=tr_data.lang_code,
+ sense=sense,
+ )
+ tr_data.word = clean_node(wxr, None, child)
+
+ if len(tr_data.word) > 0:
+ page_data[-1].translations.append(tr_data.model_copy(deep=True))
def translation_subpage(
wxr: WiktextractContext,
page_data: list[WordEntry],
- template_args: dict[str, str],
+ template_node: TemplateNode,
) -> None:
+ # https://zh.wiktionary.org/wiki/Template:翻譯-見
+ # https://zh.wiktionary.org/wiki/Template:See_translation_subpage
from .page import ADDITIONAL_EXPAND_TEMPLATES
page_title = wxr.wtp.title
target_section = None
- if len(template_args) > 0:
- target_section = template_args.get(1)
- if len(template_args) > 1:
- page_title = template_args.get(2)
+ if template_node.template_name == "see translation subpage":
+ target_section = template_node.template_parameters.get(1)
+ page_title = template_node.template_parameters.get(2, wxr.wtp.title)
- translation_subpage_title = f"{page_title}/翻譯"
+ translation_subpage_title = page_title
+ if page_title == wxr.wtp.title:
+ translation_subpage_title = f"{page_title}/翻譯"
subpage = wxr.wtp.get_page(translation_subpage_title)
if subpage is None:
return
@@ -165,22 +182,12 @@ def find_subpage_section(
node: Union[WikiNode, str],
target_section: Union[str, list[str]],
) -> Optional[WikiNode]:
- if isinstance(node, WikiNode):
- if node.kind in LEVEL_KIND_FLAGS:
- section_title = clean_node(wxr, None, node.largs)
- if (
- isinstance(target_section, str)
- and section_title == target_section
- ):
- return node
- if (
- isinstance(target_section, list)
- and section_title in target_section
- ):
- return node
-
- for child in node.children:
- returned_node = find_subpage_section(wxr, child, target_section)
- if returned_node is not None:
- return returned_node
+ if not isinstance(node, WikiNode):
+ return None
+ for level_node in node.find_child_recursively(LEVEL_KIND_FLAGS):
+ section_title = clean_node(wxr, None, level_node.largs)
+ if isinstance(target_section, str) and section_title == target_section:
+ return level_node
+ if isinstance(target_section, list) and section_title in target_section:
+ return level_node
return None
diff --git a/tests/test_zh_gloss.py b/tests/test_zh_gloss.py
index 3a5536903..d2af44788 100644
--- a/tests/test_zh_gloss.py
+++ b/tests/test_zh_gloss.py
@@ -6,8 +6,8 @@
from wiktextract.extractor.zh.models import Sense, WordEntry
from wiktextract.extractor.zh.page import (
extract_gloss,
- parse_section,
parse_page,
+ parse_section,
)
from wiktextract.thesaurus import close_thesaurus_db
from wiktextract.wxr_context import WiktextractContext
diff --git a/tests/test_zh_headword.py b/tests/test_zh_headword.py
index d9f95cbda..5a2739ef7 100644
--- a/tests/test_zh_headword.py
+++ b/tests/test_zh_headword.py
@@ -1,5 +1,5 @@
from unittest import TestCase
-from unittest.mock import Mock, patch
+from unittest.mock import Mock
from wikitextprocessor import Wtp
from wiktextract.extractor.zh.headword_line import extract_headword_line
diff --git a/tests/test_zh_translation.py b/tests/test_zh_translation.py
index 2090535b5..7401bf4d5 100644
--- a/tests/test_zh_translation.py
+++ b/tests/test_zh_translation.py
@@ -1,7 +1,6 @@
from unittest import TestCase
-from unittest.mock import patch
-from wikitextprocessor import Page, Wtp
+from wikitextprocessor import Wtp
from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.zh.models import WordEntry
from wiktextract.extractor.zh.translation import extract_translation
@@ -21,25 +20,22 @@ def tearDown(self) -> None:
self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
)
- @patch(
- "wikitextprocessor.Wtp.get_page",
- return_value=Page(title="", namespace_id=10, body=""),
- )
- def test_normal(self, mock_get_page) -> None:
- # test wikitext from page "你好" and "這裡"
- page_data = [WordEntry(word="你好", lang_code="zh", lang="漢語")]
- wikitext = """
-{{trans-top|靠近說話者的地方}}
-* 阿爾巴尼亞語:këtu (sq)
-* 阿帕切語:
-*: 西阿帕切語:kú
-* 阿拉伯語:هُنَا (hunā)
-*: 埃及阿拉伯語:هنا (henā)
-*俄语:[[привет|приве́т]] (privét) (非正式), [[здравствуйте|здра́вствуйте]] (zdrávstvujte) (正式, 第一个"в"不发音)
-{{trans-bottom}}
-* 斯洛伐克語:pracovať impf
- """
- self.wxr.wtp.start_page("你好")
+ def test_t_template(self):
+ self.wxr.wtp.start_page("太陽風")
+ self.wxr.wtp.add_page(
+ "Template:t+",
+ 10,
+ """{{#switch:{{{3}}}
+|f=f
+|m=m
+}}""",
+ )
+ self.wxr.wtp.add_page("Template:qualifier", 10, "({{{1}}})")
+ page_data = [WordEntry(word="太陽風", lang_code="zh", lang="漢語")]
+ wikitext = """{{trans-top|太陽上層大氣射出的超高速電漿流}}
+* 希伯来语:{{t+|he|רוח השמש|tr=ruakh ha-shemesh}}、{{t+|he|רוח סולרית|f|tr=ruakh solarit}}
+* 塞尔维亚-克罗地亚语:
+*: 西里尔字母:{{qualifier|Ekavian}} {{t+|sh|сунчев ветар|m}}"""
node = self.wxr.wtp.parse(wikitext)
extract_translation(self.wxr, page_data, node)
self.assertEqual(
@@ -49,52 +45,135 @@ def test_normal(self, mock_get_page) -> None:
],
[
{
- "lang_code": "sq",
- "lang": "阿爾巴尼亞語",
- "sense": "靠近說話者的地方",
- "word": "këtu",
+ "lang_code": "he",
+ "lang": "希伯来语",
+ "sense": "太陽上層大氣射出的超高速電漿流",
+ "word": "רוח השמש",
+ "roman": "ruakh ha-shemesh",
+ },
+ {
+ "lang_code": "he",
+ "lang": "希伯来语",
+ "sense": "太陽上層大氣射出的超高速電漿流",
+ "word": "רוח סולרית",
+ "roman": "ruakh solarit",
+ "tags": ["陰性名詞"],
+ },
+ {
+ "lang_code": "sh",
+ "lang": "西里尔字母",
+ "sense": "太陽上層大氣射出的超高速電漿流",
+ "word": "сунчев ветар",
+ "tags": ["Ekavian", "陽性名詞"],
},
+ ],
+ )
+
+ def test_link_words(self):
+ self.wxr.wtp.start_page("你好")
+ page_data = [WordEntry(word="你好", lang_code="zh", lang="漢語")]
+ wikitext = """{{翻譯-頂}}
+*英语:[[how do you do]]; [[how are you]]"""
+ node = self.wxr.wtp.parse(wikitext)
+ extract_translation(self.wxr, page_data, node)
+ self.assertEqual(
+ [
+ d.model_dump(exclude_defaults=True)
+ for d in page_data[0].translations
+ ],
+ [
{
- "lang": "西阿帕切語",
- "sense": "靠近說話者的地方",
- "word": "kú",
+ "lang_code": "en",
+ "lang": "英语",
+ "word": "how do you do",
},
{
- "lang_code": "ar",
- "lang": "阿拉伯語",
- "sense": "靠近說話者的地方",
- "roman": "hunā",
- "word": "هُنَا",
+ "lang_code": "en",
+ "lang": "英语",
+ "word": "how are you",
},
+ ],
+ )
+
+ def test_subpage_multitrans(self):
+ self.wxr.wtp.start_page("英語")
+ self.wxr.wtp.add_page(
+ "英語/翻譯",
+ 0,
+ """==漢語==
+===名詞===
+====翻譯====
+{{trans-top|一種源於英格蘭的語言}}{{multitrans|data=
+* 阿布哈茲語:{{tt|ab|англыз бызшәа}}
+* 阿拉貢語:{{t-needed|an}}
+}}""",
+ )
+ page_data = [WordEntry(word="英語", lang_code="zh", lang="漢語")]
+ wikitext = "{{trans-see|源於英格蘭的語言|英語/翻譯}}"
+ node = self.wxr.wtp.parse(wikitext)
+ extract_translation(self.wxr, page_data, node)
+ self.assertEqual(
+ [
+ d.model_dump(exclude_defaults=True)
+ for d in page_data[0].translations
+ ],
+ [
+ {
+ "lang_code": "ab",
+ "lang": "阿布哈茲語",
+ "word": "англыз бызшәа",
+ "sense": "一種源於英格蘭的語言",
+ }
+ ],
+ )
+
+ def test_strange_russian_translation(self):
+ self.wxr.wtp.start_page("林场")
+ page_data = [WordEntry(word="林场", lang_code="zh", lang="漢語")]
+ node = self.wxr.wtp.parse(
+ "*俄语:1) [[лесничество]], [[лесхоз]]; 2) [[лесосека]]"
+ )
+ extract_translation(self.wxr, page_data, node)
+ self.assertEqual(
+ [
+ d.model_dump(exclude_defaults=True)
+ for d in page_data[0].translations
+ ],
+ [
{
- "lang_code": "arz",
- "lang": "埃及阿拉伯語",
- "sense": "靠近說話者的地方",
- "roman": "henā",
- "word": "هنا",
+ "lang_code": "ru",
+ "lang": "俄语",
+ "word": "лесничество",
},
{
"lang_code": "ru",
"lang": "俄语",
- "sense": "靠近說話者的地方",
- "roman": "privét",
- "tags": ["非正式"],
- "word": "приве́т",
+ "word": "лесхоз",
},
{
"lang_code": "ru",
"lang": "俄语",
- "sense": "靠近說話者的地方",
- "roman": "zdrávstvujte",
- "tags": ['正式, 第一个"в"不发音'],
- "word": "здра́вствуйте",
+ "word": "лесосека",
},
+ ],
+ )
+
+ def test_language_name_template(self):
+ self.wxr.wtp.start_page("解析幾何")
+ page_data = [WordEntry(word="解析幾何", lang_code="zh", lang="漢語")]
+ self.wxr.wtp.add_page("Template:en", 10, "英語")
+ node = self.wxr.wtp.parse("* {{en}}:{{t+|en|analytic geometry}}")
+ extract_translation(self.wxr, page_data, node)
+ self.assertEqual(
+ [
+ d.model_dump(exclude_defaults=True)
+ for d in page_data[0].translations
+ ],
+ [
{
- "lang_code": "sk",
- "lang": "斯洛伐克語",
- "sense": "靠近說話者的地方",
- "tags": ["imperfective aspect"],
- "word": "pracovať",
+ "lang_code": "en",
+ "lang": "英語",
+ "word": "analytic geometry",
},
],
)