From aaf5f7aa72901dc22fd0713f607e99a609b4b83b Mon Sep 17 00:00:00 2001 From: xxyzz Date: Mon, 29 Jan 2024 14:21:47 +0800 Subject: [PATCH 1/5] Parse zh edition "t*" translation templates High-quality pages use "t*" templates in their translation lists, so we can get the language code from the "t" template argument when the language name text can't be converted to a code. --- src/wiktextract/extractor/fr/models.py | 2 +- src/wiktextract/extractor/zh/models.py | 8 +- src/wiktextract/extractor/zh/page.py | 5 +- src/wiktextract/extractor/zh/translation.py | 211 ++++++++++---------- tests/test_zh_gloss.py | 2 +- tests/test_zh_headword.py | 2 +- tests/test_zh_translation.py | 114 +++++------ 7 files changed, 172 insertions(+), 172 deletions(-) diff --git a/src/wiktextract/extractor/fr/models.py b/src/wiktextract/extractor/fr/models.py index dd8b5021a..2ad0137cf 100644 --- a/src/wiktextract/extractor/fr/models.py +++ b/src/wiktextract/extractor/fr/models.py @@ -66,7 +66,7 @@ class Linkage(FrenchBaseModel): word: str = "" tags: list[str] = [] roman: str = "" - alt: str = Field("", description="ALternative form") + alt: str = Field("", description="Alternative form") translation: str = Field("", description="French translation") sense: str = Field("", description="Definition of the word") sense_index: int = Field( diff --git a/src/wiktextract/extractor/zh/models.py b/src/wiktextract/extractor/zh/models.py index 86b89e591..674aab89f 100644 --- a/src/wiktextract/extractor/zh/models.py +++ b/src/wiktextract/extractor/zh/models.py @@ -65,10 +65,12 @@ class Translation(ChineseBaseModel): "", description="Wiktionary language code of the translation term" ) lang: str = Field("", description="Translation language name") - word: str = Field("", description="Translation term") + word: str = Field(description="Translation term") sense: str = Field("", description="Translation gloss") tags: list[str] = [] - roman: str = "" + roman: str = Field("", description="Roman script") + alt: str = Field("", 
description="Alternative form") + lit: str = Field("", description="Literal translation for the term") class Linkage(ChineseBaseModel): @@ -127,5 +129,5 @@ class WordEntry(ChineseBaseModel): descendants: list[Descendant] = [] redirects: list[str] = Field( [], - description="Soft redirect page, extracted from template zh-see and ja-see", + description="Soft redirect page, extracted from template zh-see ja-see", ) diff --git a/src/wiktextract/extractor/zh/page.py b/src/wiktextract/extractor/zh/page.py index f942d8c5b..61f35eb3d 100644 --- a/src/wiktextract/extractor/zh/page.py +++ b/src/wiktextract/extractor/zh/page.py @@ -31,8 +31,6 @@ # Additional templates to be expanded in the pre-expand phase ADDITIONAL_EXPAND_TEMPLATES = frozenset( { - "multitrans", - "multitrans-nowiki", "col1", "col2", "col3", @@ -198,6 +196,9 @@ def extract_pronunciation( def parse_page( wxr: WiktextractContext, page_title: str, page_text: str ) -> list[dict[str, Any]]: + # page layout documents + # https://zh.wiktionary.org/wiki/Wiktionary:佈局解釋 + # https://zh.wiktionary.org/wiki/Wiktionary:体例说明 if wxr.config.verbose: logging.info(f"Parsing page: {page_title}") diff --git a/src/wiktextract/extractor/zh/translation.py b/src/wiktextract/extractor/zh/translation.py index 8978ef375..8b28f78ba 100644 --- a/src/wiktextract/extractor/zh/translation.py +++ b/src/wiktextract/extractor/zh/translation.py @@ -1,127 +1,128 @@ -import re from typing import Optional, Union -from mediawiki_langcodes import name_to_code +from mediawiki_langcodes import code_to_name, name_to_code from wikitextprocessor import NodeKind, WikiNode -from wikitextprocessor.parser import LEVEL_KIND_FLAGS +from wikitextprocessor.parser import LEVEL_KIND_FLAGS, TemplateNode from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext -from ..share import capture_text_in_parentheses from .models import Translation, WordEntry def extract_translation( - wxr: WiktextractContext, page_data: 
list[WordEntry], node: WikiNode + wxr: WiktextractContext, + page_data: list[WordEntry], + level_node: WikiNode, + sense: str = "", ) -> None: - sense_text = "" - for child in node.children: - if isinstance(child, WikiNode): - if child.kind == NodeKind.TEMPLATE: - template_name = child.template_name.lower() - if ( - template_name in {"trans-top", "翻譯-頂", "trans-top-also"} - and 1 in child.template_parameters - ): - sense_text = clean_node( - wxr, None, child.template_parameters.get(1) - ) - elif template_name == "checktrans-top": - return - elif template_name == "see translation subpage": - translation_subpage( - wxr, page_data, child.template_parameters - ) - elif child.kind == NodeKind.LIST: - for list_item_node in child.find_child(NodeKind.LIST_ITEM): - if not list_item_node.contain_node(NodeKind.LIST): - process_translation_list_item( - wxr, - page_data, - clean_node(wxr, None, list_item_node.children), - sense_text, - ) - else: - nested_list_index = 0 - for index, item_child in enumerate( - list_item_node.children - ): - if ( - isinstance(item_child, WikiNode) - and item_child.kind == NodeKind.LIST - ): - nested_list_index = index - break - - process_translation_list_item( - wxr, - page_data, - clean_node( - wxr, - None, - list_item_node.children[:nested_list_index], - ), - sense_text, - ) - for nested_list_node in list_item_node.find_child( - NodeKind.LIST - ): - for nested_list_item in nested_list_node.find_child( - NodeKind.LIST_ITEM - ): - process_translation_list_item( - wxr, - page_data, - clean_node( - wxr, None, nested_list_item.children - ), - sense_text, - ) + for child in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LIST): + if isinstance(child, TemplateNode): + template_name = child.template_name.lower() + if ( + template_name in {"trans-top", "翻譯-頂", "trans-top-also"} + and 1 in child.template_parameters + ): + sense = clean_node(wxr, None, child.template_parameters.get(1)) + elif template_name == "see translation subpage": + 
translation_subpage(wxr, page_data, child.template_parameters) + else: + for list_item in child.find_child_recursively(NodeKind.LIST_ITEM): + process_translation_list_item( + wxr, + page_data, + list_item, + sense, + ) def process_translation_list_item( wxr: WiktextractContext, page_data: list[WordEntry], - expanded_text: str, + list_item: WikiNode, sense: str, ) -> None: - from .headword_line import GENDERS - - split_results = re.split(r":|:", expanded_text, maxsplit=1) - if len(split_results) != 2: - return - lang_text, words_text = split_results - lang_text = lang_text.strip() - words_text = words_text.strip() - if len(words_text) == 0: - return - lang_code = name_to_code(lang_text, "zh") - - # split words by `,` or `;` that are not inside `()` - for word_and_tags in re.split(r"[,;、](?![^(]*\))\s*", words_text): - tags, word = capture_text_in_parentheses(word_and_tags) - tags = [tag for tag in tags if tag != lang_code] # rm Wiktionary link - translation_data = Translation( - lang_code=lang_code, lang=lang_text, word=word - ) - tags_without_roman = [] - for tag in tags: - if re.search(r"[a-z]", tag): - translation_data.roman = tag + tr_data = Translation(word="", sense=sense) + + for child in list_item.children: + if isinstance(child, str) and child.strip().endswith((":", ":")): + tr_data.lang = clean_node(wxr, None, child).strip("::") + tr_data.lang_code = name_to_code(tr_data.lang, "zh") + elif isinstance(child, TemplateNode): + template_name = child.template_name + if template_name in { + "t", + "t+", + "tt", + "tt+", + "t-check", + "t+check", + "t-needed", + }: + if len(tr_data.word) > 0: + page_data[-1].translations.append( + tr_data.model_copy(deep=True) + ) + tr_data = Translation( + word="", + lang=tr_data.lang, + lang_code=tr_data.lang_code, + sense=sense, + ) + if tr_data.lang_code == "": + tr_data.lang_code = child.template_parameters[1] + if tr_data.lang == "": + tr_data.lang = code_to_name(tr_data.lang_code, "zh") + tr_data.word = clean_node( + wxr, 
None, child.template_parameters[2] + ) + tr_data.roman = clean_node( + wxr, None, child.template_parameters.get("tr", "") + ) + tr_data.alt = clean_node( + wxr, None, child.template_parameters.get("alt", "") + ) + tr_data.lit = clean_node( + wxr, None, child.template_parameters.get("lit", "") + ) + # find gender tags + expanded_template = wxr.wtp.parse( + wxr.wtp.node_to_wikitext(child), expand_all=True + ) + for span_node in expanded_template.find_html("span"): + class_str = span_node.attrs.get("class", "") + if "gender" in class_str: + for abbr_tag in span_node.find_html("abbr"): + if len(abbr_tag.attrs.get("title")) > 0: + tr_data.tags.append( + clean_node( + wxr, None, abbr_tag.attrs.get("title") + ) + ) + elif tr_data.roman == "" and class_str.startswith("tr "): + tr_data.roman = clean_node(wxr, None, span_node) + elif template_name == "multitrans": + multitrans = wxr.wtp.parse( + child.template_parameter.get("data", "") + ) + extract_translation(wxr, page_data, multitrans, sense) else: - tags_without_roman.append(tag) - - if len(tags_without_roman) > 0: - translation_data.tags = tags_without_roman - - gender = word.split(" ")[-1] - if gender in GENDERS: - translation_data.word = word.removesuffix(f" {gender}") - translation_data.tags.append(GENDERS.get(gender)) - - if len(sense) > 0: - translation_data.sense = sense - page_data[-1].translations.append(translation_data) + # qualifier template + tag = clean_node(wxr, None, child) + if len(tag) > 0: + tr_data.tags.append(tag.strip("()")) + elif isinstance(child, WikiNode) and child.kind == NodeKind.LINK: + if len(tr_data.word) > 0: + page_data[-1].translations.append(tr_data.model_copy(deep=True)) + tr_data = Translation( + word="", + lang=tr_data.lang, + lang_code=tr_data.lang_code, + sense=sense, + ) + tr_data.word = clean_node(wxr, None, child) + + if len(tr_data.word) > 0: + page_data[-1].translations.append(tr_data.model_copy(deep=True)) def translation_subpage( diff --git a/tests/test_zh_gloss.py 
b/tests/test_zh_gloss.py index 3a5536903..d2af44788 100644 --- a/tests/test_zh_gloss.py +++ b/tests/test_zh_gloss.py @@ -6,8 +6,8 @@ from wiktextract.extractor.zh.models import Sense, WordEntry from wiktextract.extractor.zh.page import ( extract_gloss, - parse_section, parse_page, + parse_section, ) from wiktextract.thesaurus import close_thesaurus_db from wiktextract.wxr_context import WiktextractContext diff --git a/tests/test_zh_headword.py b/tests/test_zh_headword.py index d9f95cbda..5a2739ef7 100644 --- a/tests/test_zh_headword.py +++ b/tests/test_zh_headword.py @@ -1,5 +1,5 @@ from unittest import TestCase -from unittest.mock import Mock, patch +from unittest.mock import Mock from wikitextprocessor import Wtp from wiktextract.extractor.zh.headword_line import extract_headword_line diff --git a/tests/test_zh_translation.py b/tests/test_zh_translation.py index 2090535b5..357b6c370 100644 --- a/tests/test_zh_translation.py +++ b/tests/test_zh_translation.py @@ -1,7 +1,6 @@ from unittest import TestCase -from unittest.mock import patch -from wikitextprocessor import Page, Wtp +from wikitextprocessor import Wtp from wiktextract.config import WiktionaryConfig from wiktextract.extractor.zh.models import WordEntry from wiktextract.extractor.zh.translation import extract_translation @@ -21,25 +20,22 @@ def tearDown(self) -> None: self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn ) - @patch( - "wikitextprocessor.Wtp.get_page", - return_value=Page(title="", namespace_id=10, body=""), - ) - def test_normal(self, mock_get_page) -> None: - # test wikitext from page "你好" and "這裡" - page_data = [WordEntry(word="你好", lang_code="zh", lang="漢語")] - wikitext = """ -{{trans-top|靠近說話者的地方}} -* 阿爾巴尼亞語:këtu (sq) -* 阿帕切語: -*: 西阿帕切語:kú -* 阿拉伯語:هُنَا‎ (hunā) -*: 埃及阿拉伯語:هنا‎ (henā) -*俄语:[[привет|приве́т]] ‎(privét) (非正式), [[здравствуйте|здра́вствуйте]] ‎(zdrávstvujte) (正式, 第一个"в"不发音) -{{trans-bottom}} -* 斯洛伐克語:pracovať impf - """ - self.wxr.wtp.start_page("你好") + def 
test_t_template(self): + self.wxr.wtp.start_page("太陽風") + self.wxr.wtp.add_page( + "Template:t+", + 10, + """{{#switch:{{{3}}} +|f=f +|m=m +}}""", + ) + self.wxr.wtp.add_page("Template:qualifier", 10, "({{{1}}})") + page_data = [WordEntry(word="太陽風", lang_code="zh", lang="漢語")] + wikitext = """{{trans-top|太陽上層大氣射出的超高速電漿流}} +* 希伯来语:{{t+|he|רוח השמש|tr=ruakh ha-shemesh}}、{{t+|he|רוח סולרית|f|tr=ruakh solarit}} +* 塞尔维亚-克罗地亚语: +*: 西里尔字母:{{qualifier|Ekavian}} {{t+|sh|сунчев ветар|m}}""" node = self.wxr.wtp.parse(wikitext) extract_translation(self.wxr, page_data, node) self.assertEqual( @@ -49,52 +45,52 @@ def test_normal(self, mock_get_page) -> None: ], [ { - "lang_code": "sq", - "lang": "阿爾巴尼亞語", - "sense": "靠近說話者的地方", - "word": "këtu", - }, - { - "lang": "西阿帕切語", - "sense": "靠近說話者的地方", - "word": "kú", + "lang_code": "he", + "lang": "希伯来语", + "sense": "太陽上層大氣射出的超高速電漿流", + "word": "רוח השמש", + "roman": "ruakh ha-shemesh", }, { - "lang_code": "ar", - "lang": "阿拉伯語", - "sense": "靠近說話者的地方", - "roman": "hunā", - "word": "هُنَا", + "lang_code": "he", + "lang": "希伯来语", + "sense": "太陽上層大氣射出的超高速電漿流", + "word": "רוח סולרית", + "roman": "ruakh solarit", + "tags": ["陰性名詞"], }, { - "lang_code": "arz", - "lang": "埃及阿拉伯語", - "sense": "靠近說話者的地方", - "roman": "henā", - "word": "هنا", - }, - { - "lang_code": "ru", - "lang": "俄语", - "sense": "靠近說話者的地方", - "roman": "privét", - "tags": ["非正式"], - "word": "приве́т", + "lang_code": "sh", + "lang": "西里尔字母", + "sense": "太陽上層大氣射出的超高速電漿流", + "word": "сунчев ветар", + "tags": ["Ekavian", "陽性名詞"], }, + ], + ) + + def test_link_words(self): + self.wxr.wtp.start_page("你好") + page_data = [WordEntry(word="你好", lang_code="zh", lang="漢語")] + wikitext = """{{翻譯-頂}} +*英语:[[how do you do]]; [[how are you]]""" + node = self.wxr.wtp.parse(wikitext) + extract_translation(self.wxr, page_data, node) + self.assertEqual( + [ + d.model_dump(exclude_defaults=True) + for d in page_data[0].translations + ], + [ { - "lang_code": "ru", - "lang": "俄语", - "sense": 
"靠近說話者的地方", - "roman": "zdrávstvujte", - "tags": ['正式, 第一个"в"不发音'], - "word": "здра́вствуйте", + "lang_code": "en", + "lang": "英语", + "word": "how do you do", }, { - "lang_code": "sk", - "lang": "斯洛伐克語", - "sense": "靠近說話者的地方", - "tags": ["imperfective aspect"], - "word": "pracovať", + "lang_code": "en", + "lang": "英语", + "word": "how are you", }, ], ) From 4f5c2f8e2819dc3b99d01c39d89f4831770de404 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Mon, 29 Jan 2024 15:24:06 +0800 Subject: [PATCH 2/5] Process zh edition "trans-see" subpage translation template --- src/wiktextract/extractor/zh/translation.py | 57 ++++++++++----------- tests/test_zh_translation.py | 31 +++++++++++ 2 files changed, 57 insertions(+), 31 deletions(-) diff --git a/src/wiktextract/extractor/zh/translation.py b/src/wiktextract/extractor/zh/translation.py index 8b28f78ba..aed6f63fc 100644 --- a/src/wiktextract/extractor/zh/translation.py +++ b/src/wiktextract/extractor/zh/translation.py @@ -23,8 +23,15 @@ def extract_translation( and 1 in child.template_parameters ): sense = clean_node(wxr, None, child.template_parameters.get(1)) - elif template_name == "see translation subpage": - translation_subpage(wxr, page_data, child.template_parameters) + elif template_name in {"see translation subpage", "trans-see"}: + translation_subpage(wxr, page_data, child) + elif template_name == "multitrans": + wikitext = "".join( + wxr.wtp.node_to_wikitext(c) + for c in child.template_parameters.get("data", []) + ) + multitrans = wxr.wtp.parse(wikitext) + extract_translation(wxr, page_data, multitrans, sense) else: for list_item in child.find_child_recursively(NodeKind.LIST_ITEM): process_translation_list_item( @@ -100,11 +107,6 @@ def process_translation_list_item( ) elif tr_data.roman == "" and class_str.startswith("tr "): tr_data.roman = clean_node(wxr, None, span_node) - elif template_name == "multitrans": - multitrans = wxr.wtp.parse( - child.template_parameter.get("data", "") - ) - extract_translation(wxr, 
page_data, multitrans, sense) else: # qualifier template tag = clean_node(wxr, None, child) @@ -128,18 +130,21 @@ def process_translation_list_item( def translation_subpage( wxr: WiktextractContext, page_data: list[WordEntry], - template_args: dict[str, str], + template_node: TemplateNode, ) -> None: + # https://zh.wiktionary.org/wiki/Template:翻譯-見 + # https://zh.wiktionary.org/wiki/Template:See_translation_subpage from .page import ADDITIONAL_EXPAND_TEMPLATES page_title = wxr.wtp.title target_section = None - if len(template_args) > 0: - target_section = template_args.get(1) - if len(template_args) > 1: - page_title = template_args.get(2) + if template_node.template_name == "see translation subpage": + target_section = template_node.template_parameters.get(1) + page_title = template_node.template_parameters.get(2, wxr.wtp.title) - translation_subpage_title = f"{page_title}/翻譯" + translation_subpage_title = page_title + if page_title == wxr.wtp.title: + translation_subpage_title = f"{page_title}/翻譯" subpage = wxr.wtp.get_page(translation_subpage_title) if subpage is None: return @@ -166,22 +171,12 @@ def find_subpage_section( node: Union[WikiNode, str], target_section: Union[str, list[str]], ) -> Optional[WikiNode]: - if isinstance(node, WikiNode): - if node.kind in LEVEL_KIND_FLAGS: - section_title = clean_node(wxr, None, node.largs) - if ( - isinstance(target_section, str) - and section_title == target_section - ): - return node - if ( - isinstance(target_section, list) - and section_title in target_section - ): - return node - - for child in node.children: - returned_node = find_subpage_section(wxr, child, target_section) - if returned_node is not None: - return returned_node + if not isinstance(node, WikiNode): + return None + for level_node in node.find_child_recursively(LEVEL_KIND_FLAGS): + section_title = clean_node(wxr, None, level_node.largs) + if isinstance(target_section, str) and section_title == target_section: + return level_node + if 
isinstance(target_section, list) and section_title in target_section: + return level_node return None diff --git a/tests/test_zh_translation.py b/tests/test_zh_translation.py index 357b6c370..4ee6a090c 100644 --- a/tests/test_zh_translation.py +++ b/tests/test_zh_translation.py @@ -94,3 +94,34 @@ def test_link_words(self): }, ], ) + + def test_subpage_multitrans(self): + self.wxr.wtp.start_page("英語") + self.wxr.wtp.add_page( + "英語/翻譯", + 0, + """==漢語== +===名詞=== +====翻譯==== +{{trans-top|一種源於英格蘭的語言}}{{multitrans|data= +* 阿布哈茲語:{{tt|ab|англыз бызшәа}} +}}""", + ) + page_data = [WordEntry(word="英語", lang_code="zh", lang="漢語")] + wikitext = "{{trans-see|源於英格蘭的語言|英語/翻譯}}" + node = self.wxr.wtp.parse(wikitext) + extract_translation(self.wxr, page_data, node) + self.assertEqual( + [ + d.model_dump(exclude_defaults=True) + for d in page_data[0].translations + ], + [ + { + "lang_code": "ab", + "lang": "阿布哈茲語", + "word": "англыз бызшәа", + "sense": "一種源於英格蘭的語言", + } + ], + ) From 565fc09ef77105ac1af6c4e6ec7b9ac710b29120 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Mon, 29 Jan 2024 16:00:59 +0800 Subject: [PATCH 3/5] Ignore zh edition empty translation template "t-needed" --- src/wiktextract/extractor/zh/translation.py | 8 +++++--- tests/test_zh_translation.py | 1 + 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/wiktextract/extractor/zh/translation.py b/src/wiktextract/extractor/zh/translation.py index aed6f63fc..92a9c10e2 100644 --- a/src/wiktextract/extractor/zh/translation.py +++ b/src/wiktextract/extractor/zh/translation.py @@ -63,7 +63,6 @@ def process_translation_list_item( "tt+", "t-check", "t+check", - "t-needed", }: if len(tr_data.word) > 0: page_data[-1].translations.append( @@ -76,11 +75,11 @@ def process_translation_list_item( sense=sense, ) if tr_data.lang_code == "": - tr_data.lang_code = child.template_parameters[1] + tr_data.lang_code = child.template_parameters.get(1, "") if tr_data.lang == "": tr_data.lang = code_to_name(tr_data.lang_code, 
"zh") tr_data.word = clean_node( - wxr, None, child.template_parameters[2] + wxr, None, child.template_parameters.get(2, "") ) tr_data.roman = clean_node( wxr, None, child.template_parameters.get("tr", "") @@ -107,6 +106,9 @@ def process_translation_list_item( ) elif tr_data.roman == "" and class_str.startswith("tr "): tr_data.roman = clean_node(wxr, None, span_node) + elif template_name == "t-needed": + # ignore empty translation + continue else: # qualifier template tag = clean_node(wxr, None, child) diff --git a/tests/test_zh_translation.py b/tests/test_zh_translation.py index 4ee6a090c..3282ec767 100644 --- a/tests/test_zh_translation.py +++ b/tests/test_zh_translation.py @@ -105,6 +105,7 @@ def test_subpage_multitrans(self): ====翻譯==== {{trans-top|一種源於英格蘭的語言}}{{multitrans|data= * 阿布哈茲語:{{tt|ab|англыз бызшәа}} +* 阿拉貢語:{{t-needed|an}} }}""", ) page_data = [WordEntry(word="英語", lang_code="zh", lang="漢語")] From 50c48eecc87d25cfa71a1cf790c924c476485989 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Mon, 29 Jan 2024 16:41:45 +0800 Subject: [PATCH 4/5] Handle Russian translation list in zh edition Some pages only have Russian translations and use "1)" after the language name text. 
--- src/wiktextract/extractor/zh/translation.py | 6 ++-- tests/test_zh_translation.py | 31 +++++++++++++++++++++ 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/src/wiktextract/extractor/zh/translation.py b/src/wiktextract/extractor/zh/translation.py index 92a9c10e2..6bab12381 100644 --- a/src/wiktextract/extractor/zh/translation.py +++ b/src/wiktextract/extractor/zh/translation.py @@ -50,9 +50,9 @@ def process_translation_list_item( ) -> None: tr_data = Translation(word="", sense=sense) - for child in list_item.children: - if isinstance(child, str) and child.strip().endswith((":", ":")): - tr_data.lang = clean_node(wxr, None, child).strip("::") + for child_index, child in enumerate(list_item.children): + if child_index == 0 and isinstance(child, str) and ":" in child: + tr_data.lang = clean_node(wxr, None, child[: child.index(":")]) tr_data.lang_code = name_to_code(tr_data.lang, "zh") elif isinstance(child, TemplateNode): template_name = child.template_name diff --git a/tests/test_zh_translation.py b/tests/test_zh_translation.py index 3282ec767..2281b9edc 100644 --- a/tests/test_zh_translation.py +++ b/tests/test_zh_translation.py @@ -126,3 +126,34 @@ def test_subpage_multitrans(self): } ], ) + + def test_strange_russian_translation(self): + self.wxr.wtp.start_page("林场") + page_data = [WordEntry(word="林场", lang_code="zh", lang="漢語")] + node = self.wxr.wtp.parse( + "*俄语:1) [[лесничество]], [[лесхоз]]; 2) [[лесосека]]" + ) + extract_translation(self.wxr, page_data, node) + self.assertEqual( + [ + d.model_dump(exclude_defaults=True) + for d in page_data[0].translations + ], + [ + { + "lang_code": "ru", + "lang": "俄语", + "word": "лесничество", + }, + { + "lang_code": "ru", + "lang": "俄语", + "word": "лесхоз", + }, + { + "lang_code": "ru", + "lang": "俄语", + "word": "лесосека", + }, + ], + ) From f957217e3e831ca22332fe3d6157d7fecd77fa25 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Mon, 29 Jan 2024 17:18:00 +0800 Subject: [PATCH 5/5] Get translation language 
name from expanded template text --- src/wiktextract/extractor/zh/translation.py | 17 +++++++++++++---- tests/test_zh_translation.py | 20 ++++++++++++++++++++ 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/src/wiktextract/extractor/zh/translation.py b/src/wiktextract/extractor/zh/translation.py index 6bab12381..eb4786b13 100644 --- a/src/wiktextract/extractor/zh/translation.py +++ b/src/wiktextract/extractor/zh/translation.py @@ -50,10 +50,19 @@ def process_translation_list_item( ) -> None: tr_data = Translation(word="", sense=sense) - for child_index, child in enumerate(list_item.children): - if child_index == 0 and isinstance(child, str) and ":" in child: - tr_data.lang = clean_node(wxr, None, child[: child.index(":")]) - tr_data.lang_code = name_to_code(tr_data.lang, "zh") + for child_index, child in enumerate(list_item.filter_empty_str_child()): + if child_index == 0: + lang_text = "" + if isinstance(child, str): + if ":" in child: + lang_text = child[: child.index(":")] + elif ":" in child: + lang_text = child[: child.index(":")] + else: + lang_text = clean_node(wxr, None, child) + if len(lang_text) > 0: + tr_data.lang = lang_text.strip() + tr_data.lang_code = name_to_code(tr_data.lang, "zh") elif isinstance(child, TemplateNode): template_name = child.template_name if template_name in { diff --git a/tests/test_zh_translation.py b/tests/test_zh_translation.py index 2281b9edc..7401bf4d5 100644 --- a/tests/test_zh_translation.py +++ b/tests/test_zh_translation.py @@ -157,3 +157,23 @@ def test_strange_russian_translation(self): }, ], ) + + def test_language_name_template(self): + self.wxr.wtp.start_page("解析幾何") + page_data = [WordEntry(word="解析幾何", lang_code="zh", lang="漢語")] + self.wxr.wtp.add_page("Template:en", 10, "英語") + node = self.wxr.wtp.parse("* {{en}}:{{t+|en|analytic geometry}}") + extract_translation(self.wxr, page_data, node) + self.assertEqual( + [ + d.model_dump(exclude_defaults=True) + for d in page_data[0].translations + ], + [ + { 
+ "lang_code": "en", + "lang": "英語", + "word": "analytic geometry", + }, + ], + )