From 63d6a7a7a335414d9fddb0ce959b134cb295828c Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 1 Mar 2024 16:16:21 +0800 Subject: [PATCH] Translate some "label" and "qualifier" template raw tags --- src/wiktextract/extractor/zh/gloss.py | 3 ++- src/wiktextract/extractor/zh/tags.py | 28 +++++++++++++++++++-- src/wiktextract/extractor/zh/translation.py | 3 ++- tests/test_zh_gloss.py | 8 +++--- tests/test_zh_translation.py | 3 +-- 5 files changed, 35 insertions(+), 10 deletions(-) diff --git a/src/wiktextract/extractor/zh/gloss.py b/src/wiktextract/extractor/zh/gloss.py index c50a808c6..8e9b98d8a 100644 --- a/src/wiktextract/extractor/zh/gloss.py +++ b/src/wiktextract/extractor/zh/gloss.py @@ -1,4 +1,3 @@ -import re from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import TemplateNode @@ -8,6 +7,7 @@ from ..ruby import extract_ruby from .example import extract_examples from .models import Sense, WordEntry +from .tags import translate_raw_tags # https://zh.wiktionary.org/wiki/Template:Label LABEL_TEMPLATES = frozenset(["lb", "lbl", "label"]) @@ -62,4 +62,5 @@ def extract_gloss( extract_examples(wxr, new_gloss_data, child_node) if not has_nested_gloss: + translate_raw_tags(new_gloss_data) page_data[-1].senses.append(new_gloss_data) diff --git a/src/wiktextract/extractor/zh/tags.py b/src/wiktextract/extractor/zh/tags.py index 26bbe5d65..b380e1e03 100644 --- a/src/wiktextract/extractor/zh/tags.py +++ b/src/wiktextract/extractor/zh/tags.py @@ -78,12 +78,36 @@ **VOICE_TAGS, } +# https://zh.wiktionary.org/wiki/Template:Label +# https://zh.wiktionary.org/wiki/Template:Qualifier +# https://zh.wiktionary.org/wiki/Template:古 +# https://zh.wiktionary.org/wiki/Template:注释 +LABEL_TAGS = { + "棄用": "obsolete", + "古": "archaic", + "陽": "masculine", + "陰": "feminine", + "喻": "figuratively", + "書": "literary", + "口": "colloquial", + "俚": "slang", + "俗": "slang", + "方": "dialectal", + "废": "obsolete", + "貶": "derogatory", + "罕": "rare", + "引": "broadly", +} + + +ALL_TAGS = {**GRAMMATICAL_TAGS, **LABEL_TAGS} + def translate_raw_tags(data: WordEntry) -> WordEntry: raw_tags = [] for raw_tag in data.raw_tags: - if raw_tag.lower() in GRAMMATICAL_TAGS: - data.tags.append(GRAMMATICAL_TAGS[raw_tag.lower()]) + if raw_tag.lower() in ALL_TAGS: + data.tags.append(ALL_TAGS[raw_tag.lower()]) else: raw_tags.append(raw_tag) data.raw_tags = raw_tags diff --git a/src/wiktextract/extractor/zh/translation.py b/src/wiktextract/extractor/zh/translation.py index 29a54fcab..b069a5e88 100644 --- a/src/wiktextract/extractor/zh/translation.py +++ b/src/wiktextract/extractor/zh/translation.py @@ -8,7 +8,7 @@ from .models import Translation, WordEntry from .section_titles import TRANSLATIONS_TITLES -from .tags import TEMPLATE_TAG_ARGS +from .tags import TEMPLATE_TAG_ARGS, translate_raw_tags def extract_translation( @@ -134,6 +134,7 @@ def process_translation_list_item( tr_data.word = clean_node(wxr, None, child) if len(tr_data.word) > 0: + translate_raw_tags(tr_data) page_data[-1].translations.append(tr_data.model_copy(deep=True)) diff --git a/tests/test_zh_gloss.py b/tests/test_zh_gloss.py index 5beeed8a6..28ac6147f 100644 --- a/tests/test_zh_gloss.py +++ b/tests/test_zh_gloss.py @@ -57,19 +57,19 @@ def test_example_list(self) -> None: {"glosses": ["好玩的:", "不合理的,不合邏輯的"]}, { "glosses": ["有趣的:", "有趣的"], - "raw_tags": ["棄用"], + "tags": ["obsolete"], }, { "glosses": ["有趣的:", "美味的"], - "raw_tags": ["棄用"], + "tags": ["obsolete"], }, { "glosses": ["有趣的:", "漂亮的"], - "raw_tags": ["棄用"], + "tags": ["obsolete"], }, { "glosses": ["有趣的:", "很好的,卓越的"], - "raw_tags": ["棄用"], + "tags": ["obsolete"], }, ], ) diff --git a/tests/test_zh_translation.py b/tests/test_zh_translation.py index 42cbcc1f6..921ef3b4c 100644 --- a/tests/test_zh_translation.py +++ b/tests/test_zh_translation.py @@ -212,8 +212,7 @@ def test_l_template(self): "lang_code": "cs", "lang": "捷克语", "word": "patližán", - "tags": ["masculine"], - "raw_tags": ["口"], + "tags": ["masculine", "colloquial"], }, ], )