Skip to content

Commit

Permalink
Translate some "label" and "qualifier" template raw tags
Browse files Browse the repository at this point in the history
  • Loading branch information
xxyzz committed Mar 1, 2024
1 parent 3ec084c commit 63d6a7a
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 10 deletions.
3 changes: 2 additions & 1 deletion src/wiktextract/extractor/zh/gloss.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import re

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import TemplateNode
Expand All @@ -8,6 +7,7 @@
from ..ruby import extract_ruby
from .example import extract_examples
from .models import Sense, WordEntry
from .tags import translate_raw_tags

# https://zh.wiktionary.org/wiki/Template:Label
LABEL_TEMPLATES = frozenset(["lb", "lbl", "label"])
Expand Down Expand Up @@ -62,4 +62,5 @@ def extract_gloss(
extract_examples(wxr, new_gloss_data, child_node)

if not has_nested_gloss:
translate_raw_tags(new_gloss_data)
page_data[-1].senses.append(new_gloss_data)
28 changes: 26 additions & 2 deletions src/wiktextract/extractor/zh/tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,12 +78,36 @@
**VOICE_TAGS,
}

# https://zh.wiktionary.org/wiki/Template:Label
# https://zh.wiktionary.org/wiki/Template:Qualifier
# https://zh.wiktionary.org/wiki/Template:古
# https://zh.wiktionary.org/wiki/Template:注释
# Lookup table from the raw label text found in the templates listed above
# (key) to the English tag that replaces it (value).
# translate_raw_tags() looks keys up via `raw_tag.lower()`, so any
# Latin-script key added here must be written in lowercase.
# Several keys share a value (e.g. "棄用"/"废" -> "obsolete",
# "俚"/"俗" -> "slang").
LABEL_TAGS = {
"棄用": "obsolete",
"古": "archaic",
"陽": "masculine",
"陰": "feminine",
"喻": "figuratively",
"書": "literary",
"口": "colloquial",
"俚": "slang",
"俗": "slang",
"方": "dialectal",
"废": "obsolete",
"貶": "derogatory",
"罕": "rare",
"引": "broadly",
}


# Merged table consulted by translate_raw_tags(). LABEL_TAGS is unpacked
# last, so its entry wins if the same key appears in both tables.
ALL_TAGS = {**GRAMMATICAL_TAGS, **LABEL_TAGS}


def translate_raw_tags(data: WordEntry) -> WordEntry:
raw_tags = []
for raw_tag in data.raw_tags:
if raw_tag.lower() in GRAMMATICAL_TAGS:
data.tags.append(GRAMMATICAL_TAGS[raw_tag.lower()])
if raw_tag.lower() in ALL_TAGS:
data.tags.append(ALL_TAGS[raw_tag.lower()])
else:
raw_tags.append(raw_tag)
data.raw_tags = raw_tags
Expand Down
3 changes: 2 additions & 1 deletion src/wiktextract/extractor/zh/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from .models import Translation, WordEntry
from .section_titles import TRANSLATIONS_TITLES
from .tags import TEMPLATE_TAG_ARGS
from .tags import TEMPLATE_TAG_ARGS, translate_raw_tags


def extract_translation(
Expand Down Expand Up @@ -134,6 +134,7 @@ def process_translation_list_item(
tr_data.word = clean_node(wxr, None, child)

if len(tr_data.word) > 0:
translate_raw_tags(tr_data)
page_data[-1].translations.append(tr_data.model_copy(deep=True))


Expand Down
8 changes: 4 additions & 4 deletions tests/test_zh_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,19 +57,19 @@ def test_example_list(self) -> None:
{"glosses": ["好玩的:", "不合理的,不合邏輯的"]},
{
"glosses": ["有趣的:", "有趣的"],
"raw_tags": ["棄用"],
"tags": ["obsolete"],
},
{
"glosses": ["有趣的:", "美味的"],
"raw_tags": ["棄用"],
"tags": ["obsolete"],
},
{
"glosses": ["有趣的:", "漂亮的"],
"raw_tags": ["棄用"],
"tags": ["obsolete"],
},
{
"glosses": ["有趣的:", "很好的,卓越的"],
"raw_tags": ["棄用"],
"tags": ["obsolete"],
},
],
)
Expand Down
3 changes: 1 addition & 2 deletions tests/test_zh_translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,8 +212,7 @@ def test_l_template(self):
"lang_code": "cs",
"lang": "捷克语",
"word": "patližán",
"tags": ["masculine"],
"raw_tags": ["口"],
"tags": ["masculine", "colloquial"],
},
],
)

0 comments on commit 63d6a7a

Please sign in to comment.