From b8413ab1ceded05e62290ef3eede070d84e78783 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Thu, 29 Feb 2024 17:08:57 +0800 Subject: [PATCH 1/7] Add tag translation data to zh edition --- src/wiktextract/extractor/zh/tags.py | 90 ++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 src/wiktextract/extractor/zh/tags.py diff --git a/src/wiktextract/extractor/zh/tags.py b/src/wiktextract/extractor/zh/tags.py new file mode 100644 index 000000000..9e24dff6a --- /dev/null +++ b/src/wiktextract/extractor/zh/tags.py @@ -0,0 +1,90 @@ +from .models import WordEntry + +GENDER_TAGS: dict[str, str] = { + "陰性": "feminine", + "陽性": "masculine", + "中性": "neuter", +} + +NUMBER_TAGS: dict[str, str] = { + "單數": "singular", + "複數": "plural", + "定單數": "definite singular", + "不定複數": "indefinite plural", + "定複數": "definite plural", + "斜格複數": "oblique plural", + "主格單數": "nominative singular", + "主格複數": "nominative plural", + "屬格單數": "genitive singular", + "屬格複數": "genitive plural", + "陰性單數": "feminine singular", + "陽性單數": "masculine singular", + "陰性複數": "feminine plural", + "陽性複數": "masculine plural", + "中性複數": "neuter plural", + "中性單數": "neuter singular", +} + +# https://en.wikipedia.org/wiki/Count_noun +COUNT_TAGS: dict[str, str] = { + "可數": "countable", + "不可數": "uncountable", +} + +OTHER_TAGS: dict[str, str] = { + "指小詞": "diminutive", + "變格類型": "declension pattern", +} + +VERB_TAGS: dict[str, str] = { + "及物": "transitive", + "不及物": "intransitive", +} + +# https://en.wikipedia.org/wiki/Japanese_grammar#Stem_forms +JA_STEM_FORMS: dict[str, str] = { + "未然形": "imperfective", + "連用形": "continuative", + "終止形": "terminal", + "連體形": "attributive", + "連体形": "attributive", + "假定形": "hypothetical", + "仮定形": "hypothetical", + "命令形": "imperative", +} + +# https://en.wikipedia.org/wiki/Voice_(grammar) +VOICE_TAGS: dict[str, str] = { + "被動形": "passive", + "使役形": "causative", + "可能形": "potential", + "意志形": "volitional", + "否定形": "negative", + "否定連用形": "negative continuative", + "尊敬形": "formal", + "完成形": "perfective", + "接續形": "conjunctive", + "條件形": "hypothetical conditional", +} + + +GRAMMATICAL_TAGS: dict[str, str] = { + **GENDER_TAGS, + **NUMBER_TAGS, + **COUNT_TAGS, + **OTHER_TAGS, + **VERB_TAGS, + **JA_STEM_FORMS, + **VOICE_TAGS, +} + + +def translate_raw_tags(data: WordEntry) -> WordEntry: + raw_tags = [] + for raw_tag in data.raw_tags: + if raw_tag.lower() in GRAMMATICAL_TAGS: + data.tags.append(GRAMMATICAL_TAGS[raw_tag.lower()]) + else: + raw_tags.append(raw_tag) + data.raw_tags = raw_tags + return data From 2f720546855e283152ba2b327a84e0cc55c3fd13 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Thu, 29 Feb 2024 17:09:42 +0800 Subject: [PATCH 2/7] Translate zh edition headline form tags and update code to parse the new "head" template HTML layout --- src/wiktextract/extractor/zh/headword_line.py | 96 +++++++------------ src/wiktextract/extractor/zh/tags.py | 31 ++++++ tests/test_zh_headword.py | 19 ++-- 3 files changed, 76 insertions(+), 70 deletions(-) diff --git a/src/wiktextract/extractor/zh/headword_line.py b/src/wiktextract/extractor/zh/headword_line.py index ed55bf55d..d14f90e63 100644 --- a/src/wiktextract/extractor/zh/headword_line.py +++ b/src/wiktextract/extractor/zh/headword_line.py @@ -9,33 +9,7 @@ from ..ruby import extract_ruby from ..share import strip_nodes from .models import Form, WordEntry - -# https://zh.wiktionary.org/wiki/Module:Gender_and_number -GENDERS = { - "f": "feminine", - "m": "masculine", - "n": "neuter", - "c": "common", - # Animacy - "an": "animate", - "in": "inanimate", - # Animal (for Ukrainian, Belarusian, Polish) - "anml": "animal", - # Personal (for Ukrainian, Belarusian, Polish) - "pr": "personal", - # Nonpersonal not currently used - "np": "nonpersonal", - # Virility (for Polish) - "vr": "virile", - "nv": "nonvirile", - # Numbers - "s": "singular number", - "d": "dual number", - "p": "plural number", - # Verb qualifiers - "impf": "imperfective aspect", - "pf": "perfective aspect", -} +from .tags import TEMPLATE_TAG_ARGS, translate_raw_tags def extract_headword_line( @@ -55,32 +29,34 @@ def extract_headword_line( wxr.wtp.node_to_wikitext(node), expand_all=True ) forms_start_index = 0 - for index, child in expanded_node.find_child(NodeKind.HTML, True): - if child.tag == "strong" and "headword" in child.attrs.get("class", ""): - forms_start_index = index + 1 - elif child.tag == "span": - class_names = child.attrs.get("class", "") - if "headword-tr" in class_names: + for span_node in expanded_node.find_html( + "span", attr_name="class", attr_value="headword-line" + ): + for index, span_child in span_node.find_child(NodeKind.HTML, True): + if span_child.tag == "span": forms_start_index = index + 1 - - page_data[-1].forms.append( - Form( - form=clean_node(wxr, page_data[-1], child), - tags=["romanization"], + class_names = span_child.attrs.get("class", "") + if "headword-tr" in class_names: + page_data[-1].forms.append( + Form( + form=clean_node(wxr, page_data[-1], span_child), + tags=["romanization"], + ) ) - ) - elif "gender" in class_names: + elif "gender" in class_names: + for abbr_tag in span_child.find_html("abbr"): + gender = abbr_tag.children[0] + if gender in TEMPLATE_TAG_ARGS: + page_data[-1].tags.append(TEMPLATE_TAG_ARGS[gender]) + else: + page_data[-1].raw_tags.append(gender) + translate_raw_tags(page_data[-1]) + elif ( + span_child.tag == "strong" + and "headword" in span_child.attrs.get("class", "") + ): forms_start_index = index + 1 - for abbr_tag in child.find_html("abbr"): - gender = abbr_tag.children[0] - if gender in GENDERS: - page_data[-1].tags.append(GENDERS[gender]) - else: - page_data[-1].raw_tags.append(gender) - if lang_code == "ja": - for span_child in child.find_html( - "strong", attr_name="class", attr_value="headword" - ): + if lang_code == "ja": ruby_data, node_without_ruby = extract_ruby(wxr, span_child) page_data[-1].forms.append( Form( @@ -91,13 +67,13 @@ def extract_headword_line( tags=["canonical"], ) ) - elif child.tag == "b": - # this is a form tag, already inside form parentheses - break + elif span_child.tag == "b": + # this is a form tag, already inside form parentheses + break - extract_headword_forms( - wxr, page_data, expanded_node.children[forms_start_index:] - ) + extract_headword_forms( + wxr, page_data, span_node.children[forms_start_index:] + ) def extract_headword_forms( @@ -150,8 +126,8 @@ def process_forms_text( and "gender" in next_node.attrs.get("class", "") ): gender = clean_node(wxr, None, next_node) - if gender in GENDERS: - form_tags.append(GENDERS[gender]) + if gender in TEMPLATE_TAG_ARGS: + form_tags.append(TEMPLATE_TAG_ARGS[gender]) else: raw_form_tags.append(gender) @@ -161,6 +137,7 @@ def process_forms_text( tags=form_tags, ruby=ruby_data, ) + translate_raw_tags(form_data) page_data[-1].forms.append(form_data) elif ( node.tag == "span" @@ -180,6 +157,7 @@ def process_forms_text( ) if len(tags_list) > 0: page_data[-1].raw_tags.extend(tags_list) + translate_raw_tags(page_data[-1]) else: clean_node(wxr, page_data[-1], tag_nodes) # find categories @@ -187,7 +165,7 @@ def process_forms_text( def extract_headword_tags(tags_str: str) -> list[str]: tags = [] for tag_str in ( - s.strip() for s in re.split("&|或", tags_str) if len(s.strip()) > 0 + s.strip() for s in re.split("&|或|和", tags_str) if len(s.strip()) > 0 ): tags.append(tag_str) return tags diff --git a/src/wiktextract/extractor/zh/tags.py b/src/wiktextract/extractor/zh/tags.py index 9e24dff6a..26bbe5d65 100644 --- a/src/wiktextract/extractor/zh/tags.py +++ b/src/wiktextract/extractor/zh/tags.py @@ -88,3 +88,34 @@ def translate_raw_tags(data: WordEntry) -> WordEntry: raw_tags.append(raw_tag) data.raw_tags = raw_tags return data + + +# https://zh.wiktionary.org/wiki/Template:T +# https://zh.wiktionary.org/wiki/Template:Head +# https://zh.wiktionary.org/wiki/Module:Gender_and_number +TEMPLATE_TAG_ARGS = { + "f": "feminine", + "m": "masculine", + "n": "neuter", + "c": "common", + # Animacy + "an": "animate", + "in": "inanimate", + # Animal (for Ukrainian, Belarusian, Polish) + "anml": "animal", + # Personal (for Ukrainian, Belarusian, Polish) + "pr": "personal", + # Nonpersonal not currently used + "np": "nonpersonal", + # Virility (for Polish) + "vr": "virile", + "nv": "nonvirile", + # Numbers + "s": "singular number", + "d": "dual number", + "p": "plural number", + # Verb qualifiers + "impf": "imperfective aspect", + "pf": "perfective aspect", + "mf": "masculine feminine", +} diff --git a/tests/test_zh_headword.py b/tests/test_zh_headword.py index 60bffc110..aa6ada938 100644 --- a/tests/test_zh_headword.py +++ b/tests/test_zh_headword.py @@ -26,7 +26,7 @@ def test_english_headword(self) -> None: self.wxr.wtp.add_page( "Template:en-noun", 10, - 'manga ([[可數|可數]] & [[不可數|不可數]],複數 manga [[mangas#英語|mangas]])', + '-{manga}- ([[可數|可數]] [[不可數|不可數]]-{}-,複數-{ manga [[mangas#英語|-{mangas}-]]}-)', ) root = self.wxr.wtp.parse("{{en-noun|~|manga|s}}") page_data = [WordEntry(word="manga", lang_code="en", lang="英語")] @@ -40,10 +40,10 @@ def test_english_headword(self) -> None: "lang_code": "en", "lang": "英語", "forms": [ - {"form": "manga", "raw_tags": ["複數"]}, - {"form": "mangas", "raw_tags": ["複數"]}, + {"form": "manga", "tags": ["plural"]}, + {"form": "mangas", "tags": ["plural"]}, ], - "raw_tags": ["可數", "不可數"], + "tags": ["countable", "uncountable"], } ], ) @@ -56,7 +56,7 @@ def test_headword_gender(self) -> None: self.wxr.wtp.add_page( "Template:nl-noun", 10, - 'manga m (複數 [[manga\'s#荷蘭語|manga\'s]],指小詞 [[mangaatje#荷蘭語|mangaatje]] n)', + '-{manga}- m (複數-{ [[manga\'s#荷蘭語|-{manga\'s}-]]}-,指小詞-{ [[mangaatje#荷蘭語|-{mangaatje}-]] n}-)', ) root = self.wxr.wtp.parse("{{nl-noun|m|-'s|mangaatje}}") page_data = [WordEntry(word="manga", lang_code="en", lang="英語")] @@ -70,13 +70,10 @@ def test_headword_gender(self) -> None: "lang_code": "en", "lang": "英語", "forms": [ - {"form": "manga's", "raw_tags": ["複數"]}, + {"form": "manga's", "tags": ["plural"]}, { "form": "mangaatje", - "raw_tags": [ - "指小詞", - ], - "tags": ["neuter"], + "tags": ["neuter", "diminutive"], }, ], "tags": ["masculine"], @@ -92,7 +89,7 @@ def test_headword_roman(self) -> None: self.wxr.wtp.add_page( "Template:head", 10, - '-κρατίᾱς (-kratíāsf', + '-{-κρατίᾱς}- (-kratíāsf', ) root = self.wxr.wtp.parse("{{head|grc|後綴變格形|g=f|head=-κρατίᾱς}}") page_data = [ From 8ff624ef7a89f463f8bf575c8bec8594c5ccc2ae Mon Sep 17 00:00:00 2001 From: xxyzz Date: Thu, 29 Feb 2024 18:14:07 +0800 Subject: [PATCH 3/7] Translate some tags in zh edition's translation list --- src/wiktextract/extractor/zh/translation.py | 25 ++++++++------------- tests/test_zh_translation.py | 8 ++++--- 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/src/wiktextract/extractor/zh/translation.py b/src/wiktextract/extractor/zh/translation.py index b086388ce..cebf47624 100644 --- a/src/wiktextract/extractor/zh/translation.py +++ b/src/wiktextract/extractor/zh/translation.py @@ -8,6 +8,7 @@ from .models import Translation, WordEntry from .section_titles import TRANSLATIONS_TITLES +from .tags import TEMPLATE_TAG_ARGS def extract_translation( @@ -101,22 +102,14 @@ def process_translation_list_item( tr_data.lit = clean_node( wxr, None, child.template_parameters.get("lit", "") ) - # find gender tags - expanded_template = wxr.wtp.parse( - wxr.wtp.node_to_wikitext(child), expand_all=True - ) - for span_node in expanded_template.find_html("span"): - class_str = span_node.attrs.get("class", "") - if "gender" in class_str: - for abbr_tag in span_node.find_html("abbr"): - if len(abbr_tag.attrs.get("title")) > 0: - tr_data.raw_tags.append( - clean_node( - wxr, None, abbr_tag.attrs.get("title") - ) - ) - elif tr_data.roman == "" and class_str.startswith("tr "): - tr_data.roman = clean_node(wxr, None, span_node) + for arg_key, arg_value in child.template_parameters.items(): + if ( + isinstance(arg_key, int) and arg_key >= 3 + ) or arg_key == "g": # template "l" uses the "g" arg + for tag_arg in arg_value.split("-"): + if tag_arg in TEMPLATE_TAG_ARGS: + tr_data.tags.append(TEMPLATE_TAG_ARGS[tag_arg]) + elif template_name == "t-needed": # ignore empty translation continue diff --git a/tests/test_zh_translation.py b/tests/test_zh_translation.py index 1b3ad9611..b08583e96 100644 --- a/tests/test_zh_translation.py +++ b/tests/test_zh_translation.py @@ -57,14 +57,15 @@ def test_t_template(self): "sense": "太陽上層大氣射出的超高速電漿流", "word": "רוח סולרית", "roman": "ruakh solarit", - "raw_tags": ["陰性名詞"], + "tags": ["feminine"], }, { "lang_code": "sh", "lang": "西里尔字母", "sense": "太陽上層大氣射出的超高速電漿流", "word": "сунчев ветар", - "raw_tags": ["Ekavian", "陽性名詞"], + "tags": ["masculine"], + "raw_tags": ["Ekavian"], }, ], ) @@ -211,7 +212,8 @@ def test_l_template(self): "lang_code": "cs", "lang": "捷克语", "word": "patližán", - "raw_tags": ["陽性名詞", "口语词汇"], + "tags": ["masculine"], + "raw_tags": ["口语词汇"], }, ], ) From 4a5a06a409a60219b11fca35cf183b11284a956d Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 1 Mar 2024 13:36:58 +0800 Subject: [PATCH 4/7] Only add the "label" template before gloss text to `raw_tags` This template is used to add grammatical information to gloss text. --- src/wiktextract/extractor/zh/gloss.py | 70 ++++++++------------------ src/wiktextract/extractor/zh/models.py | 1 - tests/test_zh_gloss.py | 7 +-- 3 files changed, 24 insertions(+), 54 deletions(-) diff --git a/src/wiktextract/extractor/zh/gloss.py b/src/wiktextract/extractor/zh/gloss.py index 3bd6a11a9..fdbec7c60 100644 --- a/src/wiktextract/extractor/zh/gloss.py +++ b/src/wiktextract/extractor/zh/gloss.py @@ -1,6 +1,7 @@ import re from wikitextprocessor import NodeKind, WikiNode +from wikitextprocessor.parser import TemplateNode from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext @@ -8,6 +9,9 @@ from .example import extract_examples from .models import Sense, WordEntry +# https://zh.wiktionary.org/wiki/Template:Label +LABEL_TEMPLATES = frozenset(["lb", "lbl", "label"]) + def extract_gloss( wxr: WiktextractContext, @@ -17,11 +21,19 @@ def extract_gloss( ) -> None: lang_code = page_data[-1].lang_code for list_item_node in list_node.find_child(NodeKind.LIST_ITEM): - gloss_nodes = [ - child - for child in list_item_node.children - if not isinstance(child, WikiNode) or child.kind != NodeKind.LIST - ] + gloss_nodes = [] + raw_tags = [] + for node in list_item_node.children: + if ( + isinstance(node, TemplateNode) + and node.template_name in LABEL_TEMPLATES + ): + raw_tags.append(clean_node(wxr, None, node).strip("()")) + elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST: + continue + else: + gloss_nodes.append(node) + if lang_code == "ja": expanded_node = wxr.wtp.parse( wxr.wtp.node_to_wikitext(gloss_nodes), expand_all=True @@ -29,13 +41,13 @@ def extract_gloss( ruby_data, nodes_without_ruby = extract_ruby( wxr, expanded_node.children ) - raw_gloss_text = clean_node(wxr, gloss_data, nodes_without_ruby) + gloss_text = clean_node(wxr, gloss_data, nodes_without_ruby) else: ruby_data = [] - raw_gloss_text = clean_node(wxr, gloss_data, gloss_nodes) - new_gloss_data = merge_gloss_data( - gloss_data, extract_gloss_and_tags(raw_gloss_text) - ) + gloss_text = clean_node(wxr, gloss_data, gloss_nodes) + new_gloss_data = gloss_data.model_copy(deep=True) + new_gloss_data.raw_tags.extend(raw_tags) + new_gloss_data.glosses.append(gloss_text) if len(ruby_data) > 0: new_gloss_data.ruby = ruby_data @@ -50,41 +62,3 @@ def extract_gloss( if not has_nested_gloss: page_data[-1].senses.append(new_gloss_data) - - -def merge_gloss_data(data_a: Sense, data_b: Sense) -> Sense: - new_data = Sense() - for data in data_a, data_b: - for field in data.model_fields: - pre_data = getattr(new_data, field) - pre_data.extend(getattr(data, field)) - return new_data - - -def extract_gloss_and_tags(raw_gloss: str) -> Sense: - left_brackets = ("(", "(") - right_brackets = (")", ")") - if raw_gloss.startswith(left_brackets) or raw_gloss.endswith( - right_brackets - ): - tags = [] - split_tag_regex = r", ?|,|或" - front_tag_end = -1 - rear_tag_start = len(raw_gloss) - for index, left_bracket in enumerate(left_brackets): - if raw_gloss.startswith(left_bracket): - front_tag_end = raw_gloss.find(right_brackets[index]) - front_label = raw_gloss[1:front_tag_end] - tags += re.split(split_tag_regex, front_label) - for index, right_bracket in enumerate(right_brackets): - if raw_gloss.endswith(right_bracket): - rear_tag_start = raw_gloss.rfind(left_brackets[index]) - rear_label = raw_gloss.rstrip("".join(right_brackets))[ - rear_tag_start + 1 : - ] - tags += re.split(split_tag_regex, rear_label) - - gloss = raw_gloss[front_tag_end + 1 : rear_tag_start].strip() - return Sense(glosses=[gloss], raw_glosses=[raw_gloss], raw_tags=tags) - else: - return Sense(glosses=[raw_gloss]) diff --git a/src/wiktextract/extractor/zh/models.py b/src/wiktextract/extractor/zh/models.py index 05276ec85..c4adc5959 100644 --- a/src/wiktextract/extractor/zh/models.py +++ b/src/wiktextract/extractor/zh/models.py @@ -31,7 +31,6 @@ class Example(ChineseBaseModel): class Sense(ChineseBaseModel): glosses: list[str] = [] - raw_glosses: list[str] = Field([], description="Gloss text without tags") tags: list[str] = [] raw_tags: list[str] = [] categories: list[str] = [] diff --git a/tests/test_zh_gloss.py b/tests/test_zh_gloss.py index 9d8959859..5beeed8a6 100644 --- a/tests/test_zh_gloss.py +++ b/tests/test_zh_gloss.py @@ -40,12 +40,13 @@ def test_example_list(self) -> None: ## 有趣的,滑稽的,可笑的 ## 奇怪的,不正常的 ## 不合理的,不合邏輯的 -# (棄用) [[有趣]]的: +# {{lb|ja|棄用}} [[有趣]]的: ## [[有趣]]的 ## [[美味]]的 ## [[漂亮]]的 ## [[很好]]的,[[卓越]]的""" self.wxr.wtp.start_page("test") + self.wxr.wtp.add_page("Template:lb", 10, "({{{2|}}})") node = self.wxr.wtp.parse(wikitext) extract_gloss(self.wxr, page_data, node.children[0], Sense()) self.assertEqual( @@ -56,22 +57,18 @@ def test_example_list(self) -> None: {"glosses": ["好玩的:", "不合理的,不合邏輯的"]}, { "glosses": ["有趣的:", "有趣的"], - "raw_glosses": ["(棄用) 有趣的:"], "raw_tags": ["棄用"], }, { "glosses": ["有趣的:", "美味的"], - "raw_glosses": ["(棄用) 有趣的:"], "raw_tags": ["棄用"], }, { "glosses": ["有趣的:", "漂亮的"], - "raw_glosses": ["(棄用) 有趣的:"], "raw_tags": ["棄用"], }, { "glosses": ["有趣的:", "很好的,卓越的"], - "raw_glosses": ["(棄用) 有趣的:"], "raw_tags": ["棄用"], }, ], From 3ec084cbe27a0f74dfac96b060e56a31e2c56b68 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 1 Mar 2024 14:57:44 +0800 Subject: [PATCH 5/7] Only add qualifier templates in gloss and translation list to "raw_tags" --- src/wiktextract/extractor/zh/gloss.py | 11 ++++++----- src/wiktextract/extractor/zh/translation.py | 22 ++++++++------------- tests/test_zh_translation.py | 4 ++-- 3 files changed, 16 insertions(+), 21 deletions(-) diff --git a/src/wiktextract/extractor/zh/gloss.py b/src/wiktextract/extractor/zh/gloss.py index fdbec7c60..c50a808c6 100644 --- a/src/wiktextract/extractor/zh/gloss.py +++ b/src/wiktextract/extractor/zh/gloss.py @@ -24,11 +24,12 @@ def extract_gloss( gloss_nodes = [] raw_tags = [] for node in list_item_node.children: - if ( - isinstance(node, TemplateNode) - and node.template_name in LABEL_TEMPLATES - ): - raw_tags.append(clean_node(wxr, None, node).strip("()")) + if isinstance(node, TemplateNode): + raw_tag = clean_node(wxr, None, node) + if node.template_name in LABEL_TEMPLATES: + raw_tags.append(raw_tag.strip("()")) + elif raw_tag.startswith("〈") and raw_tag.endswith("〉"): + raw_tags.append(raw_tag.strip("〈〉")) elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST: continue else: diff --git a/src/wiktextract/extractor/zh/translation.py b/src/wiktextract/extractor/zh/translation.py index cebf47624..29a54fcab 100644 --- a/src/wiktextract/extractor/zh/translation.py +++ b/src/wiktextract/extractor/zh/translation.py @@ -113,21 +113,15 @@ def process_translation_list_item( elif template_name == "t-needed": # ignore empty translation continue + elif template_name in ("qualifier", "q"): + raw_tag = clean_node(wxr, None, child) + tr_data.raw_tags.append(raw_tag.strip("()")) else: - # qualifier template - expanded_template = wxr.wtp.parse( - wxr.wtp.node_to_wikitext(child), expand_all=True - ) - find_title = False - for span_node in expanded_template.find_html("span"): - tag = span_node.attrs.get("title", "") - if len(tag) > 0: - tr_data.raw_tags.append(tag.strip()) - find_title = True - if not find_title: - tag = clean_node(wxr, None, child) - if len(tag) > 0: - tr_data.raw_tags.append(tag.strip("()")) + # zh qualifier templates that use template "注释" + # https://zh.wiktionary.org/wiki/Template:注释 + raw_tag = clean_node(wxr, None, child) + if raw_tag.startswith("〈") and raw_tag.endswith("〉"): + tr_data.raw_tags.append(raw_tag.strip("〈〉")) elif isinstance(child, WikiNode) and child.kind == NodeKind.LINK: if len(tr_data.word) > 0: page_data[-1].translations.append(tr_data.model_copy(deep=True)) diff --git a/tests/test_zh_translation.py b/tests/test_zh_translation.py index b08583e96..42cbcc1f6 100644 --- a/tests/test_zh_translation.py +++ b/tests/test_zh_translation.py @@ -189,7 +189,7 @@ def test_l_template(self): {{#if:{{{g|}}}|m}}""", ) self.wxr.wtp.add_page( - "Template:口", 10, '〉' + "Template:口", 10, '〈〉' ) page_data = [WordEntry(word="茄子", lang_code="zh", lang="漢語")] node = self.wxr.wtp.parse( @@ -213,7 +213,7 @@ def test_l_template(self): "lang": "捷克语", "word": "patližán", "tags": ["masculine"], - "raw_tags": ["口语词汇"], + "raw_tags": ["口"], }, ], ) From 63d6a7a7a335414d9fddb0ce959b134cb295828c Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 1 Mar 2024 16:16:21 +0800 Subject: [PATCH 6/7] Translate some "label" and "qualifier" template raw tags --- src/wiktextract/extractor/zh/gloss.py | 3 ++- src/wiktextract/extractor/zh/tags.py | 28 +++++++++++++++++++-- src/wiktextract/extractor/zh/translation.py | 3 ++- tests/test_zh_gloss.py | 8 +++--- tests/test_zh_translation.py | 3 +-- 5 files changed, 35 insertions(+), 10 deletions(-) diff --git a/src/wiktextract/extractor/zh/gloss.py b/src/wiktextract/extractor/zh/gloss.py index c50a808c6..8e9b98d8a 100644 --- a/src/wiktextract/extractor/zh/gloss.py +++ b/src/wiktextract/extractor/zh/gloss.py @@ -1,4 +1,3 @@ -import re from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import TemplateNode @@ -8,6 +7,7 @@ from ..ruby import extract_ruby from .example import extract_examples from .models import Sense, WordEntry +from .tags import translate_raw_tags # https://zh.wiktionary.org/wiki/Template:Label LABEL_TEMPLATES = frozenset(["lb", "lbl", "label"]) @@ -62,4 +62,5 @@ def extract_gloss( extract_examples(wxr, new_gloss_data, child_node) if not has_nested_gloss: + translate_raw_tags(new_gloss_data) page_data[-1].senses.append(new_gloss_data) diff --git a/src/wiktextract/extractor/zh/tags.py b/src/wiktextract/extractor/zh/tags.py index 26bbe5d65..b380e1e03 100644 --- a/src/wiktextract/extractor/zh/tags.py +++ b/src/wiktextract/extractor/zh/tags.py @@ -78,12 +78,36 @@ **VOICE_TAGS, } +# https://zh.wiktionary.org/wiki/Template:Label +# https://zh.wiktionary.org/wiki/Template:Qualifier +# https://zh.wiktionary.org/wiki/Template:古 +# https://zh.wiktionary.org/wiki/Template:注释 +LABEL_TAGS = { + "棄用": "obsolete", + "古": "archaic", + "陽": "masculine", + "陰": "feminine", + "喻": "figuratively", + "書": "literary", + "口": "colloquial", + "俚": "slang", + "俗": "slang", + "方": "dialectal", + "废": "obsolete", + "貶": "derogatory", + "罕": "rare", + "引": "broadly", +} + + +ALL_TAGS = {**GRAMMATICAL_TAGS, **LABEL_TAGS} + def translate_raw_tags(data: WordEntry) -> WordEntry: raw_tags = [] for raw_tag in data.raw_tags: - if raw_tag.lower() in GRAMMATICAL_TAGS: - data.tags.append(GRAMMATICAL_TAGS[raw_tag.lower()]) + if raw_tag.lower() in ALL_TAGS: + data.tags.append(ALL_TAGS[raw_tag.lower()]) else: raw_tags.append(raw_tag) data.raw_tags = raw_tags diff --git a/src/wiktextract/extractor/zh/translation.py b/src/wiktextract/extractor/zh/translation.py index 29a54fcab..b069a5e88 100644 --- a/src/wiktextract/extractor/zh/translation.py +++ b/src/wiktextract/extractor/zh/translation.py @@ -8,7 +8,7 @@ from .models import Translation, WordEntry from .section_titles import TRANSLATIONS_TITLES -from .tags import TEMPLATE_TAG_ARGS +from .tags import TEMPLATE_TAG_ARGS, translate_raw_tags def extract_translation( @@ -134,6 +134,7 @@ def process_translation_list_item( tr_data.word = clean_node(wxr, None, child) if len(tr_data.word) > 0: + translate_raw_tags(tr_data) page_data[-1].translations.append(tr_data.model_copy(deep=True)) diff --git a/tests/test_zh_gloss.py b/tests/test_zh_gloss.py index 5beeed8a6..28ac6147f 100644 --- a/tests/test_zh_gloss.py +++ b/tests/test_zh_gloss.py @@ -57,19 +57,19 @@ def test_example_list(self) -> None: {"glosses": ["好玩的:", "不合理的,不合邏輯的"]}, { "glosses": ["有趣的:", "有趣的"], - "raw_tags": ["棄用"], + "tags": ["obsolete"], }, { "glosses": ["有趣的:", "美味的"], - "raw_tags": ["棄用"], + "tags": ["obsolete"], }, { "glosses": ["有趣的:", "漂亮的"], - "raw_tags": ["棄用"], + "tags": ["obsolete"], }, { "glosses": ["有趣的:", "很好的,卓越的"], - "raw_tags": ["棄用"], + "tags": ["obsolete"], }, ], ) diff --git a/tests/test_zh_translation.py b/tests/test_zh_translation.py index 42cbcc1f6..921ef3b4c 100644 --- a/tests/test_zh_translation.py +++ b/tests/test_zh_translation.py @@ -212,8 +212,7 @@ def test_l_template(self): "lang_code": "cs", "lang": "捷克语", "word": "patližán", - "tags": ["masculine"], - "raw_tags": ["口"], + "tags": ["masculine", "colloquial"], }, ], ) From 7296a7e3100b362ec539f7350e20246a4903a367 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 1 Mar 2024 17:14:58 +0800 Subject: [PATCH 7/7] Translate "qualifier" template tags in linkage list --- src/wiktextract/extractor/zh/gloss.py | 1 - src/wiktextract/extractor/zh/linkage.py | 2 ++ src/wiktextract/extractor/zh/tags.py | 10 ++++++++++ tests/test_zh_linkage.py | 17 +++++++++++++++++ 4 files changed, 29 insertions(+), 1 deletion(-) diff --git a/src/wiktextract/extractor/zh/gloss.py b/src/wiktextract/extractor/zh/gloss.py index 8e9b98d8a..9074548e5 100644 --- a/src/wiktextract/extractor/zh/gloss.py +++ b/src/wiktextract/extractor/zh/gloss.py @@ -1,4 +1,3 @@ - from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import TemplateNode from wiktextract.page import clean_node diff --git a/src/wiktextract/extractor/zh/linkage.py b/src/wiktextract/extractor/zh/linkage.py index 6885ada9c..266604381 100644 --- a/src/wiktextract/extractor/zh/linkage.py +++ b/src/wiktextract/extractor/zh/linkage.py @@ -13,6 +13,7 @@ ) from .descendant import DESCENDANT_TEMPLATES, extract_descendant_list_item from .models import Linkage, WordEntry +from .tags import translate_raw_tags def extract_linkages( @@ -54,6 +55,7 @@ def extract_linkages( linkage_data.raw_tags.append( clean_node(wxr, None, item_child).strip("()") ) + translate_raw_tags(linkage_data) elif template_name.lower() in DESCENDANT_TEMPLATES: not_term_indexes.add(index) extract_descendant_list_item( diff --git a/src/wiktextract/extractor/zh/tags.py b/src/wiktextract/extractor/zh/tags.py index b380e1e03..d1c33f8d9 100644 --- a/src/wiktextract/extractor/zh/tags.py +++ b/src/wiktextract/extractor/zh/tags.py @@ -84,12 +84,22 @@ # https://zh.wiktionary.org/wiki/Template:注释 LABEL_TAGS = { "棄用": "obsolete", + "非標準": "nonstandard", + "非正式": "informal", + "古舊": "dated", + "新詞": "neologism", + "定語": "attributive", + "書面": "literary", + "貶義": "derogatory", + "比喻": "figuratively", + "俗語": "slang", "古": "archaic", "陽": "masculine", "陰": "feminine", "喻": "figuratively", "書": "literary", "口": "colloquial", + "口語": "colloquial", "俚": "slang", "俗": "slang", "方": "dialectal", diff --git a/tests/test_zh_linkage.py b/tests/test_zh_linkage.py index a2913bfd2..e2ba8295b 100644 --- a/tests/test_zh_linkage.py +++ b/tests/test_zh_linkage.py @@ -67,3 +67,20 @@ def test_ja_r_template(self): "word": "家主", }, ) + + def test_qual_tag(self): + page_data = [WordEntry(lang="漢語", lang_code="zh", word="駱駝")] + self.wxr.wtp.add_page("Template:qual", 10, "({{{1}}})") + self.wxr.wtp.add_page("Template:zh-l", 10, "{{{1}}}") + self.wxr.wtp.start_page("駱駝") + node = self.wxr.wtp.parse("* {{qual|比喻}} {{zh-l|沙漠之舟}}") + extract_linkages(self.wxr, page_data, node.children, "synonyms", "") + self.assertEqual( + [ + s.model_dump(exclude_defaults=True) + for s in page_data[0].synonyms + ], + [ + {"tags": ["figuratively"], "word": "沙漠之舟"}, + ], + )