From b8413ab1ceded05e62290ef3eede070d84e78783 Mon Sep 17 00:00:00 2001
From: xxyzz <gitpull@protonmail.com>
Date: Thu, 29 Feb 2024 17:08:57 +0800
Subject: [PATCH 1/7] Add tag translation data to zh edition

---
 src/wiktextract/extractor/zh/tags.py | 90 ++++++++++++++++++++++++++++
 1 file changed, 90 insertions(+)
 create mode 100644 src/wiktextract/extractor/zh/tags.py

diff --git a/src/wiktextract/extractor/zh/tags.py b/src/wiktextract/extractor/zh/tags.py
new file mode 100644
index 000000000..9e24dff6a
--- /dev/null
+++ b/src/wiktextract/extractor/zh/tags.py
@@ -0,0 +1,90 @@
+from .models import WordEntry
+
+GENDER_TAGS: dict[str, str] = {
+    "陰性": "feminine",
+    "陽性": "masculine",
+    "中性": "neuter",
+}
+
+NUMBER_TAGS: dict[str, str] = {
+    "單數": "singular",
+    "複數": "plural",
+    "定單數": "definite singular",
+    "不定複數": "indefinite plural",
+    "定複數": "definite plural",
+    "斜格複數": "oblique plural",
+    "主格單數": "nominative singular",
+    "主格複數": "nominative plural",
+    "屬格單數": "genitive singular",
+    "屬格複數": "genitive plural",
+    "陰性單數": "feminine singular",
+    "陽性單數": "masculine singular",
+    "陰性複數": "feminine plural",
+    "陽性複數": "masculine plural",
+    "中性複數": "neuter plural",
+    "中性單數": "neuter singular",
+}
+
+# https://en.wikipedia.org/wiki/Count_noun
+COUNT_TAGS: dict[str, str] = {
+    "可數": "countable",
+    "不可數": "uncountable",
+}
+
+OTHER_TAGS: dict[str, str] = {
+    "指小詞": "diminutive",
+    "變格類型": "declension pattern",
+}
+
+VERB_TAGS: dict[str, str] = {
+    "及物": "transitive",
+    "不及物": "intransitive",
+}
+
+# https://en.wikipedia.org/wiki/Japanese_grammar#Stem_forms
+JA_STEM_FORMS: dict[str, str] = {
+    "未然形": "imperfective",
+    "連用形": "continuative",
+    "終止形": "terminal",
+    "連體形": "attributive",
+    "連体形": "attributive",
+    "假定形": "hypothetical",
+    "仮定形": "hypothetical",
+    "命令形": "imperative",
+}
+
+# https://en.wikipedia.org/wiki/Voice_(grammar)
+VOICE_TAGS: dict[str, str] = {
+    "被動形": "passive",
+    "使役形": "causative",
+    "可能形": "potential",
+    "意志形": "volitional",
+    "否定形": "negative",
+    "否定連用形": "negative continuative",
+    "尊敬形": "formal",
+    "完成形": "perfective",
+    "接續形": "conjunctive",
+    "條件形": "hypothetical conditional",
+}
+
+
+GRAMMATICAL_TAGS: dict[str, str] = {
+    **GENDER_TAGS,
+    **NUMBER_TAGS,
+    **COUNT_TAGS,
+    **OTHER_TAGS,
+    **VERB_TAGS,
+    **JA_STEM_FORMS,
+    **VOICE_TAGS,
+}
+
+
+def translate_raw_tags(data: WordEntry) -> WordEntry:
+    raw_tags = []
+    for raw_tag in data.raw_tags:
+        if raw_tag.lower() in GRAMMATICAL_TAGS:
+            data.tags.append(GRAMMATICAL_TAGS[raw_tag.lower()])
+        else:
+            raw_tags.append(raw_tag)
+    data.raw_tags = raw_tags
+    return data

From 2f720546855e283152ba2b327a84e0cc55c3fd13 Mon Sep 17 00:00:00 2001
From: xxyzz <gitpull@protonmail.com>
Date: Thu, 29 Feb 2024 17:09:42 +0800
Subject: [PATCH 2/7] Translate zh edition headword line form tags

and update code to parse the new "head" template HTML layout
---
 src/wiktextract/extractor/zh/headword_line.py | 96 +++++++------------
 src/wiktextract/extractor/zh/tags.py          | 31 ++++++
 tests/test_zh_headword.py                     | 19 ++--
 3 files changed, 76 insertions(+), 70 deletions(-)

diff --git a/src/wiktextract/extractor/zh/headword_line.py b/src/wiktextract/extractor/zh/headword_line.py
index ed55bf55d..d14f90e63 100644
--- a/src/wiktextract/extractor/zh/headword_line.py
+++ b/src/wiktextract/extractor/zh/headword_line.py
@@ -9,33 +9,7 @@
 from ..ruby import extract_ruby
 from ..share import strip_nodes
 from .models import Form, WordEntry
-
-# https://zh.wiktionary.org/wiki/Module:Gender_and_number
-GENDERS = {
-    "f": "feminine",
-    "m": "masculine",
-    "n": "neuter",
-    "c": "common",
-    # Animacy
-    "an": "animate",
-    "in": "inanimate",
-    # Animal (for Ukrainian, Belarusian, Polish)
-    "anml": "animal",
-    # Personal (for Ukrainian, Belarusian, Polish)
-    "pr": "personal",
-    # Nonpersonal not currently used
-    "np": "nonpersonal",
-    # Virility (for Polish)
-    "vr": "virile",
-    "nv": "nonvirile",
-    # Numbers
-    "s": "singular number",
-    "d": "dual number",
-    "p": "plural number",
-    # Verb qualifiers
-    "impf": "imperfective aspect",
-    "pf": "perfective aspect",
-}
+from .tags import TEMPLATE_TAG_ARGS, translate_raw_tags
 
 
 def extract_headword_line(
@@ -55,32 +29,34 @@ def extract_headword_line(
         wxr.wtp.node_to_wikitext(node), expand_all=True
     )
     forms_start_index = 0
-    for index, child in expanded_node.find_child(NodeKind.HTML, True):
-        if child.tag == "strong" and "headword" in child.attrs.get("class", ""):
-            forms_start_index = index + 1
-        elif child.tag == "span":
-            class_names = child.attrs.get("class", "")
-            if "headword-tr" in class_names:
+    for span_node in expanded_node.find_html(
+        "span", attr_name="class", attr_value="headword-line"
+    ):
+        for index, span_child in span_node.find_child(NodeKind.HTML, True):
+            if span_child.tag == "span":
                 forms_start_index = index + 1
-
-                page_data[-1].forms.append(
-                    Form(
-                        form=clean_node(wxr, page_data[-1], child),
-                        tags=["romanization"],
+                class_names = span_child.attrs.get("class", "")
+                if "headword-tr" in class_names:
+                    page_data[-1].forms.append(
+                        Form(
+                            form=clean_node(wxr, page_data[-1], span_child),
+                            tags=["romanization"],
+                        )
                     )
-                )
-            elif "gender" in class_names:
+                elif "gender" in class_names:
+                    for abbr_tag in span_child.find_html("abbr"):
+                        gender = abbr_tag.children[0]
+                        if gender in TEMPLATE_TAG_ARGS:
+                            page_data[-1].tags.append(TEMPLATE_TAG_ARGS[gender])
+                        else:
+                            page_data[-1].raw_tags.append(gender)
+                            translate_raw_tags(page_data[-1])
+            elif (
+                span_child.tag == "strong"
+                and "headword" in span_child.attrs.get("class", "")
+            ):
                 forms_start_index = index + 1
-                for abbr_tag in child.find_html("abbr"):
-                    gender = abbr_tag.children[0]
-                    if gender in GENDERS:
-                        page_data[-1].tags.append(GENDERS[gender])
-                    else:
-                        page_data[-1].raw_tags.append(gender)
-            if lang_code == "ja":
-                for span_child in child.find_html(
-                    "strong", attr_name="class", attr_value="headword"
-                ):
+                if lang_code == "ja":
                     ruby_data, node_without_ruby = extract_ruby(wxr, span_child)
                     page_data[-1].forms.append(
                         Form(
@@ -91,13 +67,13 @@ def extract_headword_line(
                             tags=["canonical"],
                         )
                     )
-        elif child.tag == "b":
-            # this is a form <b> tag, already inside form parentheses
-            break
+            elif span_child.tag == "b":
+                # this is a form <b> tag, already inside form parentheses
+                break
 
-    extract_headword_forms(
-        wxr, page_data, expanded_node.children[forms_start_index:]
-    )
+        extract_headword_forms(
+            wxr, page_data, span_node.children[forms_start_index:]
+        )
 
 
 def extract_headword_forms(
@@ -150,8 +126,8 @@ def process_forms_text(
                         and "gender" in next_node.attrs.get("class", "")
                     ):
                         gender = clean_node(wxr, None, next_node)
-                        if gender in GENDERS:
-                            form_tags.append(GENDERS[gender])
+                        if gender in TEMPLATE_TAG_ARGS:
+                            form_tags.append(TEMPLATE_TAG_ARGS[gender])
                         else:
                             raw_form_tags.append(gender)
 
@@ -161,6 +137,7 @@ def process_forms_text(
                     tags=form_tags,
                     ruby=ruby_data,
                 )
+                translate_raw_tags(form_data)
                 page_data[-1].forms.append(form_data)
             elif (
                 node.tag == "span"
@@ -180,6 +157,7 @@ def process_forms_text(
         )
         if len(tags_list) > 0:
             page_data[-1].raw_tags.extend(tags_list)
+            translate_raw_tags(page_data[-1])
     else:
         clean_node(wxr, page_data[-1], tag_nodes)  # find categories
 
@@ -187,7 +165,7 @@ def process_forms_text(
 def extract_headword_tags(tags_str: str) -> list[str]:
     tags = []
     for tag_str in (
-        s.strip() for s in re.split("&|或", tags_str) if len(s.strip()) > 0
+        s.strip() for s in re.split("&|或|和", tags_str) if len(s.strip()) > 0
     ):
         tags.append(tag_str)
     return tags
diff --git a/src/wiktextract/extractor/zh/tags.py b/src/wiktextract/extractor/zh/tags.py
index 9e24dff6a..26bbe5d65 100644
--- a/src/wiktextract/extractor/zh/tags.py
+++ b/src/wiktextract/extractor/zh/tags.py
@@ -88,3 +88,34 @@ def translate_raw_tags(data: WordEntry) -> WordEntry:
             raw_tags.append(raw_tag)
     data.raw_tags = raw_tags
     return data
+
+
+# https://zh.wiktionary.org/wiki/Template:T
+# https://zh.wiktionary.org/wiki/Template:Head
+# https://zh.wiktionary.org/wiki/Module:Gender_and_number
+TEMPLATE_TAG_ARGS: dict[str, str] = {
+    "f": "feminine",
+    "m": "masculine",
+    "n": "neuter",
+    "c": "common",
+    # Animacy
+    "an": "animate",
+    "in": "inanimate",
+    # Animal (for Ukrainian, Belarusian, Polish)
+    "anml": "animal",
+    # Personal (for Ukrainian, Belarusian, Polish)
+    "pr": "personal",
+    # Nonpersonal not currently used
+    "np": "nonpersonal",
+    # Virility (for Polish)
+    "vr": "virile",
+    "nv": "nonvirile",
+    # Numbers
+    "s": "singular number",
+    "d": "dual number",
+    "p": "plural number",
+    # Verb qualifiers
+    "impf": "imperfective aspect",
+    "pf": "perfective aspect",
+    "mf": "masculine feminine",  # gender code, not a verb qualifier
+}
diff --git a/tests/test_zh_headword.py b/tests/test_zh_headword.py
index 60bffc110..aa6ada938 100644
--- a/tests/test_zh_headword.py
+++ b/tests/test_zh_headword.py
@@ -26,7 +26,7 @@ def test_english_headword(self) -> None:
         self.wxr.wtp.add_page(
             "Template:en-noun",
             10,
-            '<strong class="Latn headword" lang="en">manga</strong> ([[可數|可數]] & [[不可數|不可數]]，複數 <b class="Latn form-of lang-en p-form-of" lang="en"><strong class="selflink">manga</strong></b> <small>或</small> <b class="Latn form-of lang-en p-form-of" lang="en">[[mangas#英語|mangas]]</b>)',
+            '<span class="headword-line"><strong class="Latn headword" lang="en">-{manga}-</strong> ([[可數|可數]] <small>和</small> [[不可數|不可數]]-{}-，複數-{ <b lang="en"><strong class="selflink">manga</strong></b> <small>或</small> <b>[[mangas#英語|-{mangas}-]]</b>}-)</span>',
         )
         root = self.wxr.wtp.parse("{{en-noun|~|manga|s}}")
         page_data = [WordEntry(word="manga", lang_code="en", lang="英語")]
@@ -40,10 +40,10 @@ def test_english_headword(self) -> None:
                     "lang_code": "en",
                     "lang": "英語",
                     "forms": [
-                        {"form": "manga", "raw_tags": ["複數"]},
-                        {"form": "mangas", "raw_tags": ["複數"]},
+                        {"form": "manga", "tags": ["plural"]},
+                        {"form": "mangas", "tags": ["plural"]},
                     ],
-                    "raw_tags": ["可數", "不可數"],
+                    "tags": ["countable", "uncountable"],
                 }
             ],
         )
@@ -56,7 +56,7 @@ def test_headword_gender(self) -> None:
         self.wxr.wtp.add_page(
             "Template:nl-noun",
             10,
-            '<strong class="Latn headword" lang="nl">manga</strong>&nbsp;<span class="gender"><abbr title="陽性名詞">m</abbr></span> (複數 <b class="Latn form-of lang-nl p-form-of" lang="nl">[[manga\'s#荷蘭語|manga\'s]]</b>，指小詞 <b class="Latn form-of lang-nl 指小詞-form-of" lang="nl">[[mangaatje#荷蘭語|mangaatje]]</b>&nbsp;<span class="gender"><abbr title="中性名詞">n</abbr></span>)',
+            '<span class="headword-line"><strong class="Latn headword" lang="nl">-{manga}-</strong>&nbsp;<span class="gender"><abbr title="陽性名詞">m</abbr></span> (複數-{ <b>[[manga\'s#荷蘭語|-{manga\'s}-]]</b>}-，指小詞-{ <b>[[mangaatje#荷蘭語|-{mangaatje}-]]</b>&nbsp;<span class="gender"><abbr title="中性名詞">n</abbr></span>}-)</span>',
         )
         root = self.wxr.wtp.parse("{{nl-noun|m|-'s|mangaatje}}")
         page_data = [WordEntry(word="manga", lang_code="en", lang="英語")]
@@ -70,13 +70,10 @@ def test_headword_gender(self) -> None:
                     "lang_code": "en",
                     "lang": "英語",
                     "forms": [
-                        {"form": "manga's", "raw_tags": ["複數"]},
+                        {"form": "manga's", "tags": ["plural"]},
                         {
                             "form": "mangaatje",
-                            "raw_tags": [
-                                "指小詞",
-                            ],
-                            "tags": ["neuter"],
+                            "tags": ["neuter", "diminutive"],
                         },
                     ],
                     "tags": ["masculine"],
@@ -92,7 +89,7 @@ def test_headword_roman(self) -> None:
         self.wxr.wtp.add_page(
             "Template:head",
             10,
-            '<strong class="polytonic headword" lang="grc">-κρατίᾱς</strong> (<span lang="grc-Latn" class="headword-tr tr Latn" dir="ltr">-kratíās</span>)&nbsp;<span class="gender"><abbr title="陰性名詞">f</abbr></span>',
+            '<span class="headword-line"><strong class="Polyt headword" lang="grc">-{-κρατίᾱς}-</strong> (<span lang="grc-Latn" class="headword-tr tr Latn" dir="ltr">-kratíās</span>)&nbsp;<span class="gender"><abbr title="陰性名詞">f</abbr></span></span>',
         )
         root = self.wxr.wtp.parse("{{head|grc|後綴變格形|g=f|head=-κρατίᾱς}}")
         page_data = [

From 8ff624ef7a89f463f8bf575c8bec8594c5ccc2ae Mon Sep 17 00:00:00 2001
From: xxyzz <gitpull@protonmail.com>
Date: Thu, 29 Feb 2024 18:14:07 +0800
Subject: [PATCH 3/7] Translate some tags in zh edition's translation list

---
 src/wiktextract/extractor/zh/translation.py | 25 ++++++++-------------
 tests/test_zh_translation.py                |  8 ++++---
 2 files changed, 14 insertions(+), 19 deletions(-)

diff --git a/src/wiktextract/extractor/zh/translation.py b/src/wiktextract/extractor/zh/translation.py
index b086388ce..cebf47624 100644
--- a/src/wiktextract/extractor/zh/translation.py
+++ b/src/wiktextract/extractor/zh/translation.py
@@ -8,6 +8,7 @@
 
 from .models import Translation, WordEntry
 from .section_titles import TRANSLATIONS_TITLES
+from .tags import TEMPLATE_TAG_ARGS
 
 
 def extract_translation(
@@ -101,22 +102,14 @@ def process_translation_list_item(
                 tr_data.lit = clean_node(
                     wxr, None, child.template_parameters.get("lit", "")
                 )
-                # find gender tags
-                expanded_template = wxr.wtp.parse(
-                    wxr.wtp.node_to_wikitext(child), expand_all=True
-                )
-                for span_node in expanded_template.find_html("span"):
-                    class_str = span_node.attrs.get("class", "")
-                    if "gender" in class_str:
-                        for abbr_tag in span_node.find_html("abbr"):
-                            if len(abbr_tag.attrs.get("title")) > 0:
-                                tr_data.raw_tags.append(
-                                    clean_node(
-                                        wxr, None, abbr_tag.attrs.get("title")
-                                    )
-                                )
-                    elif tr_data.roman == "" and class_str.startswith("tr "):
-                        tr_data.roman = clean_node(wxr, None, span_node)
+                for arg_key, arg_value in child.template_parameters.items():
+                    if not isinstance(arg_value, str):
+                        continue  # only plain-text values carry tag codes
+                    is_tag_arg = isinstance(arg_key, int) and arg_key >= 3
+                    if is_tag_arg or arg_key == "g":  # "l" uses the "g" arg
+                        for tag_arg in arg_value.split("-"):
+                            if tag_arg in TEMPLATE_TAG_ARGS:
+                                tr_data.tags.append(TEMPLATE_TAG_ARGS[tag_arg])
             elif template_name == "t-needed":
                 # ignore empty translation
                 continue
diff --git a/tests/test_zh_translation.py b/tests/test_zh_translation.py
index 1b3ad9611..b08583e96 100644
--- a/tests/test_zh_translation.py
+++ b/tests/test_zh_translation.py
@@ -57,14 +57,15 @@ def test_t_template(self):
                     "sense": "太陽上層大氣射出的超高速電漿流",
                     "word": "רוח סולרית",
                     "roman": "ruakh solarit",
-                    "raw_tags": ["陰性名詞"],
+                    "tags": ["feminine"],
                 },
                 {
                     "lang_code": "sh",
                     "lang": "西里尔字母",
                     "sense": "太陽上層大氣射出的超高速電漿流",
                     "word": "сунчев ветар",
-                    "raw_tags": ["Ekavian", "陽性名詞"],
+                    "tags": ["masculine"],
+                    "raw_tags": ["Ekavian"],
                 },
             ],
         )
@@ -211,7 +212,8 @@ def test_l_template(self):
                     "lang_code": "cs",
                     "lang": "捷克语",
                     "word": "patližán",
-                    "raw_tags": ["陽性名詞", "口语词汇"],
+                    "tags": ["masculine"],
+                    "raw_tags": ["口语词汇"],
                 },
             ],
         )

From 4a5a06a409a60219b11fca35cf183b11284a956d Mon Sep 17 00:00:00 2001
From: xxyzz <gitpull@protonmail.com>
Date: Fri, 1 Mar 2024 13:36:58 +0800
Subject: [PATCH 4/7] Only add the "label" template before gloss text to
 `raw_tags`

This template is used to add grammatical information to gloss text.
---
 src/wiktextract/extractor/zh/gloss.py  | 70 ++++++++------------------
 src/wiktextract/extractor/zh/models.py |  1 -
 tests/test_zh_gloss.py                 |  7 +--
 3 files changed, 24 insertions(+), 54 deletions(-)

diff --git a/src/wiktextract/extractor/zh/gloss.py b/src/wiktextract/extractor/zh/gloss.py
index 3bd6a11a9..fdbec7c60 100644
--- a/src/wiktextract/extractor/zh/gloss.py
+++ b/src/wiktextract/extractor/zh/gloss.py
@@ -1,6 +1,7 @@
 import re
 
 from wikitextprocessor import NodeKind, WikiNode
+from wikitextprocessor.parser import TemplateNode
 from wiktextract.page import clean_node
 from wiktextract.wxr_context import WiktextractContext
 
@@ -8,6 +9,9 @@
 from .example import extract_examples
 from .models import Sense, WordEntry
 
+# https://zh.wiktionary.org/wiki/Template:Label
+LABEL_TEMPLATES = frozenset(["lb", "lbl", "label"])
+
 
 def extract_gloss(
     wxr: WiktextractContext,
@@ -17,11 +21,19 @@ def extract_gloss(
 ) -> None:
     lang_code = page_data[-1].lang_code
     for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
-        gloss_nodes = [
-            child
-            for child in list_item_node.children
-            if not isinstance(child, WikiNode) or child.kind != NodeKind.LIST
-        ]
+        gloss_nodes = []
+        raw_tags = []
+        for node in list_item_node.children:
+            if (
+                isinstance(node, TemplateNode)
+                and node.template_name in LABEL_TEMPLATES
+            ):
+                raw_tags.append(clean_node(wxr, None, node).strip("()"))
+            elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
+                continue
+            else:
+                gloss_nodes.append(node)
+
         if lang_code == "ja":
             expanded_node = wxr.wtp.parse(
                 wxr.wtp.node_to_wikitext(gloss_nodes), expand_all=True
@@ -29,13 +41,13 @@ def extract_gloss(
             ruby_data, nodes_without_ruby = extract_ruby(
                 wxr, expanded_node.children
             )
-            raw_gloss_text = clean_node(wxr, gloss_data, nodes_without_ruby)
+            gloss_text = clean_node(wxr, gloss_data, nodes_without_ruby)
         else:
             ruby_data = []
-            raw_gloss_text = clean_node(wxr, gloss_data, gloss_nodes)
-        new_gloss_data = merge_gloss_data(
-            gloss_data, extract_gloss_and_tags(raw_gloss_text)
-        )
+            gloss_text = clean_node(wxr, gloss_data, gloss_nodes)
+        new_gloss_data = gloss_data.model_copy(deep=True)
+        new_gloss_data.raw_tags.extend(raw_tags)
+        new_gloss_data.glosses.append(gloss_text)
         if len(ruby_data) > 0:
             new_gloss_data.ruby = ruby_data
 
@@ -50,41 +62,3 @@ def extract_gloss(
 
         if not has_nested_gloss:
             page_data[-1].senses.append(new_gloss_data)
-
-
-def merge_gloss_data(data_a: Sense, data_b: Sense) -> Sense:
-    new_data = Sense()
-    for data in data_a, data_b:
-        for field in data.model_fields:
-            pre_data = getattr(new_data, field)
-            pre_data.extend(getattr(data, field))
-    return new_data
-
-
-def extract_gloss_and_tags(raw_gloss: str) -> Sense:
-    left_brackets = ("(", "（")
-    right_brackets = (")", "）")
-    if raw_gloss.startswith(left_brackets) or raw_gloss.endswith(
-        right_brackets
-    ):
-        tags = []
-        split_tag_regex = r", ?|，|或"
-        front_tag_end = -1
-        rear_tag_start = len(raw_gloss)
-        for index, left_bracket in enumerate(left_brackets):
-            if raw_gloss.startswith(left_bracket):
-                front_tag_end = raw_gloss.find(right_brackets[index])
-                front_label = raw_gloss[1:front_tag_end]
-                tags += re.split(split_tag_regex, front_label)
-        for index, right_bracket in enumerate(right_brackets):
-            if raw_gloss.endswith(right_bracket):
-                rear_tag_start = raw_gloss.rfind(left_brackets[index])
-                rear_label = raw_gloss.rstrip("".join(right_brackets))[
-                    rear_tag_start + 1 :
-                ]
-                tags += re.split(split_tag_regex, rear_label)
-
-        gloss = raw_gloss[front_tag_end + 1 : rear_tag_start].strip()
-        return Sense(glosses=[gloss], raw_glosses=[raw_gloss], raw_tags=tags)
-    else:
-        return Sense(glosses=[raw_gloss])
diff --git a/src/wiktextract/extractor/zh/models.py b/src/wiktextract/extractor/zh/models.py
index 05276ec85..c4adc5959 100644
--- a/src/wiktextract/extractor/zh/models.py
+++ b/src/wiktextract/extractor/zh/models.py
@@ -31,7 +31,6 @@ class Example(ChineseBaseModel):
 
 class Sense(ChineseBaseModel):
     glosses: list[str] = []
-    raw_glosses: list[str] = Field([], description="Gloss text without tags")
     tags: list[str] = []
     raw_tags: list[str] = []
     categories: list[str] = []
diff --git a/tests/test_zh_gloss.py b/tests/test_zh_gloss.py
index 9d8959859..5beeed8a6 100644
--- a/tests/test_zh_gloss.py
+++ b/tests/test_zh_gloss.py
@@ -40,12 +40,13 @@ def test_example_list(self) -> None:
 ## 有趣的，滑稽的，可笑的
 ## 奇怪的，不正常的
 ## 不合理的，不合邏輯的
-# (棄用) [[有趣]]的：
+# {{lb|ja|棄用}} [[有趣]]的：
 ## [[有趣]]的
 ## [[美味]]的
 ## [[漂亮]]的
 ## [[很好]]的，[[卓越]]的"""
         self.wxr.wtp.start_page("test")
+        self.wxr.wtp.add_page("Template:lb", 10, "({{{2|}}})")
         node = self.wxr.wtp.parse(wikitext)
         extract_gloss(self.wxr, page_data, node.children[0], Sense())
         self.assertEqual(
@@ -56,22 +57,18 @@ def test_example_list(self) -> None:
                 {"glosses": ["好玩的：", "不合理的，不合邏輯的"]},
                 {
                     "glosses": ["有趣的：", "有趣的"],
-                    "raw_glosses": ["(棄用) 有趣的："],
                     "raw_tags": ["棄用"],
                 },
                 {
                     "glosses": ["有趣的：", "美味的"],
-                    "raw_glosses": ["(棄用) 有趣的："],
                     "raw_tags": ["棄用"],
                 },
                 {
                     "glosses": ["有趣的：", "漂亮的"],
-                    "raw_glosses": ["(棄用) 有趣的："],
                     "raw_tags": ["棄用"],
                 },
                 {
                     "glosses": ["有趣的：", "很好的，卓越的"],
-                    "raw_glosses": ["(棄用) 有趣的："],
                     "raw_tags": ["棄用"],
                 },
             ],

From 3ec084cbe27a0f74dfac96b060e56a31e2c56b68 Mon Sep 17 00:00:00 2001
From: xxyzz <gitpull@protonmail.com>
Date: Fri, 1 Mar 2024 14:57:44 +0800
Subject: [PATCH 5/7] Only add qualifier templates in gloss and translation
 list to "raw_tags"

---
 src/wiktextract/extractor/zh/gloss.py       | 14 +++++++++-----
 src/wiktextract/extractor/zh/translation.py | 22 ++++++++-------------
 tests/test_zh_translation.py                |  4 ++--
 3 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/src/wiktextract/extractor/zh/gloss.py b/src/wiktextract/extractor/zh/gloss.py
index fdbec7c60..c50a808c6 100644
--- a/src/wiktextract/extractor/zh/gloss.py
+++ b/src/wiktextract/extractor/zh/gloss.py
@@ -24,11 +24,15 @@ def extract_gloss(
         gloss_nodes = []
         raw_tags = []
         for node in list_item_node.children:
-            if (
-                isinstance(node, TemplateNode)
-                and node.template_name in LABEL_TEMPLATES
-            ):
-                raw_tags.append(clean_node(wxr, None, node).strip("()"))
+            if isinstance(node, TemplateNode):
+                raw_tag = clean_node(wxr, None, node)
+                if node.template_name in LABEL_TEMPLATES:
+                    raw_tags.append(raw_tag.strip("()"))
+                elif raw_tag.startswith("〈") and raw_tag.endswith("〉"):
+                    raw_tags.append(raw_tag.strip("〈〉"))
+                else:
+                    # any other template is part of the gloss text itself
+                    gloss_nodes.append(node)
             elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
                 continue
             else:
diff --git a/src/wiktextract/extractor/zh/translation.py b/src/wiktextract/extractor/zh/translation.py
index cebf47624..29a54fcab 100644
--- a/src/wiktextract/extractor/zh/translation.py
+++ b/src/wiktextract/extractor/zh/translation.py
@@ -113,21 +113,15 @@ def process_translation_list_item(
             elif template_name == "t-needed":
                 # ignore empty translation
                 continue
+            elif template_name in ("qualifier", "q"):
+                raw_tag = clean_node(wxr, None, child)
+                tr_data.raw_tags.append(raw_tag.strip("()"))
             else:
-                # qualifier template
-                expanded_template = wxr.wtp.parse(
-                    wxr.wtp.node_to_wikitext(child), expand_all=True
-                )
-                find_title = False
-                for span_node in expanded_template.find_html("span"):
-                    tag = span_node.attrs.get("title", "")
-                    if len(tag) > 0:
-                        tr_data.raw_tags.append(tag.strip())
-                        find_title = True
-                if not find_title:
-                    tag = clean_node(wxr, None, child)
-                    if len(tag) > 0:
-                        tr_data.raw_tags.append(tag.strip("()"))
+                # zh qualifier templates that use template "注释"
+                # https://zh.wiktionary.org/wiki/Template:注释
+                raw_tag = clean_node(wxr, None, child)
+                if raw_tag.startswith("〈") and raw_tag.endswith("〉"):
+                    tr_data.raw_tags.append(raw_tag.strip("〈〉"))
         elif isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
             if len(tr_data.word) > 0:
                 page_data[-1].translations.append(tr_data.model_copy(deep=True))
diff --git a/tests/test_zh_translation.py b/tests/test_zh_translation.py
index b08583e96..42cbcc1f6 100644
--- a/tests/test_zh_translation.py
+++ b/tests/test_zh_translation.py
@@ -189,7 +189,7 @@ def test_l_template(self):
 {{#if:{{{g|}}}|<span class="gender"><abbr title="陽性名詞">m</abbr></span>}}""",
         )
         self.wxr.wtp.add_page(
-            "Template:口", 10, '<span title="口语词汇">口</span>〉'
+            "Template:口", 10, '〈<span title="口语词汇">口</span>〉'
         )
         page_data = [WordEntry(word="茄子", lang_code="zh", lang="漢語")]
         node = self.wxr.wtp.parse(
@@ -213,7 +213,7 @@ def test_l_template(self):
                     "lang": "捷克语",
                     "word": "patližán",
                     "tags": ["masculine"],
-                    "raw_tags": ["口语词汇"],
+                    "raw_tags": ["口"],
                 },
             ],
         )

From 63d6a7a7a335414d9fddb0ce959b134cb295828c Mon Sep 17 00:00:00 2001
From: xxyzz <gitpull@protonmail.com>
Date: Fri, 1 Mar 2024 16:16:21 +0800
Subject: [PATCH 6/7] Translate some "label" and "qualifier" template raw tags

---
 src/wiktextract/extractor/zh/gloss.py       |  3 ++-
 src/wiktextract/extractor/zh/tags.py        | 28 +++++++++++++++++++--
 src/wiktextract/extractor/zh/translation.py |  3 ++-
 tests/test_zh_gloss.py                      |  8 +++---
 tests/test_zh_translation.py                |  3 +--
 5 files changed, 35 insertions(+), 10 deletions(-)

diff --git a/src/wiktextract/extractor/zh/gloss.py b/src/wiktextract/extractor/zh/gloss.py
index c50a808c6..8e9b98d8a 100644
--- a/src/wiktextract/extractor/zh/gloss.py
+++ b/src/wiktextract/extractor/zh/gloss.py
@@ -1,4 +1,3 @@
-import re
 
 from wikitextprocessor import NodeKind, WikiNode
 from wikitextprocessor.parser import TemplateNode
@@ -8,6 +7,7 @@
 from ..ruby import extract_ruby
 from .example import extract_examples
 from .models import Sense, WordEntry
+from .tags import translate_raw_tags
 
 # https://zh.wiktionary.org/wiki/Template:Label
 LABEL_TEMPLATES = frozenset(["lb", "lbl", "label"])
@@ -62,4 +62,5 @@ def extract_gloss(
                     extract_examples(wxr, new_gloss_data, child_node)
 
         if not has_nested_gloss:
+            translate_raw_tags(new_gloss_data)
             page_data[-1].senses.append(new_gloss_data)
diff --git a/src/wiktextract/extractor/zh/tags.py b/src/wiktextract/extractor/zh/tags.py
index 26bbe5d65..b380e1e03 100644
--- a/src/wiktextract/extractor/zh/tags.py
+++ b/src/wiktextract/extractor/zh/tags.py
@@ -78,12 +78,36 @@
     **VOICE_TAGS,
 }
 
+# https://zh.wiktionary.org/wiki/Template:Label
+# https://zh.wiktionary.org/wiki/Template:Qualifier
+# https://zh.wiktionary.org/wiki/Template:古
+# https://zh.wiktionary.org/wiki/Template:注释
+LABEL_TAGS: dict[str, str] = {
+    "棄用": "obsolete",
+    "古": "archaic",
+    "陽": "masculine",
+    "陰": "feminine",
+    "喻": "figuratively",
+    "書": "literary",
+    "口": "colloquial",
+    "俚": "slang",
+    "俗": "slang",
+    "方": "dialectal",
+    "废": "obsolete",
+    "貶": "derogatory",
+    "罕": "rare",
+    "引": "broadly",
+}
+
+
+ALL_TAGS: dict[str, str] = {**GRAMMATICAL_TAGS, **LABEL_TAGS}
+
 
 def translate_raw_tags(data: WordEntry) -> WordEntry:
     raw_tags = []
     for raw_tag in data.raw_tags:
-        if raw_tag.lower() in GRAMMATICAL_TAGS:
-            data.tags.append(GRAMMATICAL_TAGS[raw_tag.lower()])
+        if raw_tag.lower() in ALL_TAGS:
+            data.tags.append(ALL_TAGS[raw_tag.lower()])
         else:
             raw_tags.append(raw_tag)
     data.raw_tags = raw_tags
diff --git a/src/wiktextract/extractor/zh/translation.py b/src/wiktextract/extractor/zh/translation.py
index 29a54fcab..b069a5e88 100644
--- a/src/wiktextract/extractor/zh/translation.py
+++ b/src/wiktextract/extractor/zh/translation.py
@@ -8,7 +8,7 @@
 
 from .models import Translation, WordEntry
 from .section_titles import TRANSLATIONS_TITLES
-from .tags import TEMPLATE_TAG_ARGS
+from .tags import TEMPLATE_TAG_ARGS, translate_raw_tags
 
 
 def extract_translation(
@@ -134,6 +134,7 @@ def process_translation_list_item(
             tr_data.word = clean_node(wxr, None, child)
 
     if len(tr_data.word) > 0:
+        translate_raw_tags(tr_data)
         page_data[-1].translations.append(tr_data.model_copy(deep=True))
 
 
diff --git a/tests/test_zh_gloss.py b/tests/test_zh_gloss.py
index 5beeed8a6..28ac6147f 100644
--- a/tests/test_zh_gloss.py
+++ b/tests/test_zh_gloss.py
@@ -57,19 +57,19 @@ def test_example_list(self) -> None:
                 {"glosses": ["好玩的：", "不合理的，不合邏輯的"]},
                 {
                     "glosses": ["有趣的：", "有趣的"],
-                    "raw_tags": ["棄用"],
+                    "tags": ["obsolete"],
                 },
                 {
                     "glosses": ["有趣的：", "美味的"],
-                    "raw_tags": ["棄用"],
+                    "tags": ["obsolete"],
                 },
                 {
                     "glosses": ["有趣的：", "漂亮的"],
-                    "raw_tags": ["棄用"],
+                    "tags": ["obsolete"],
                 },
                 {
                     "glosses": ["有趣的：", "很好的，卓越的"],
-                    "raw_tags": ["棄用"],
+                    "tags": ["obsolete"],
                 },
             ],
         )
diff --git a/tests/test_zh_translation.py b/tests/test_zh_translation.py
index 42cbcc1f6..921ef3b4c 100644
--- a/tests/test_zh_translation.py
+++ b/tests/test_zh_translation.py
@@ -212,8 +212,7 @@ def test_l_template(self):
                     "lang_code": "cs",
                     "lang": "捷克语",
                     "word": "patližán",
-                    "tags": ["masculine"],
-                    "raw_tags": ["口"],
+                    "tags": ["masculine", "colloquial"],
                 },
             ],
         )

From 7296a7e3100b362ec539f7350e20246a4903a367 Mon Sep 17 00:00:00 2001
From: xxyzz <gitpull@protonmail.com>
Date: Fri, 1 Mar 2024 17:14:58 +0800
Subject: [PATCH 7/7] Translate "qualifier" template tags in linkage list

---
 src/wiktextract/extractor/zh/gloss.py   |  1 -
 src/wiktextract/extractor/zh/linkage.py |  2 ++
 src/wiktextract/extractor/zh/tags.py    | 10 ++++++++++
 tests/test_zh_linkage.py                | 17 +++++++++++++++++
 4 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/src/wiktextract/extractor/zh/gloss.py b/src/wiktextract/extractor/zh/gloss.py
index 8e9b98d8a..9074548e5 100644
--- a/src/wiktextract/extractor/zh/gloss.py
+++ b/src/wiktextract/extractor/zh/gloss.py
@@ -1,4 +1,3 @@
-
 from wikitextprocessor import NodeKind, WikiNode
 from wikitextprocessor.parser import TemplateNode
 from wiktextract.page import clean_node
diff --git a/src/wiktextract/extractor/zh/linkage.py b/src/wiktextract/extractor/zh/linkage.py
index 6885ada9c..266604381 100644
--- a/src/wiktextract/extractor/zh/linkage.py
+++ b/src/wiktextract/extractor/zh/linkage.py
@@ -13,6 +13,7 @@
 )
 from .descendant import DESCENDANT_TEMPLATES, extract_descendant_list_item
 from .models import Linkage, WordEntry
+from .tags import translate_raw_tags
 
 
 def extract_linkages(
@@ -54,6 +55,7 @@ def extract_linkages(
                             linkage_data.raw_tags.append(
                                 clean_node(wxr, None, item_child).strip("()")
                             )
+                            translate_raw_tags(linkage_data)
                         elif template_name.lower() in DESCENDANT_TEMPLATES:
                             not_term_indexes.add(index)
                             extract_descendant_list_item(
diff --git a/src/wiktextract/extractor/zh/tags.py b/src/wiktextract/extractor/zh/tags.py
index b380e1e03..d1c33f8d9 100644
--- a/src/wiktextract/extractor/zh/tags.py
+++ b/src/wiktextract/extractor/zh/tags.py
@@ -84,12 +84,22 @@
 # https://zh.wiktionary.org/wiki/Template:注释
 LABEL_TAGS = {
     "棄用": "obsolete",
+    "非標準": "nonstandard",
+    "非正式": "informal",
+    "古舊": "dated",
+    "新詞": "neologism",
+    "定語": "attributive",
+    "書面": "literary",
+    "貶義": "derogatory",
+    "比喻": "figuratively",
+    "俗語": "slang",
     "古": "archaic",
     "陽": "masculine",
     "陰": "feminine",
     "喻": "figuratively",
     "書": "literary",
     "口": "colloquial",
+    "口語": "colloquial",
     "俚": "slang",
     "俗": "slang",
     "方": "dialectal",
diff --git a/tests/test_zh_linkage.py b/tests/test_zh_linkage.py
index a2913bfd2..e2ba8295b 100644
--- a/tests/test_zh_linkage.py
+++ b/tests/test_zh_linkage.py
@@ -67,3 +67,20 @@ def test_ja_r_template(self):
                 "word": "家主",
             },
         )
+
+    def test_qual_tag(self):
+        """Text of a ``qual`` template in a linkage item becomes a tag."""
+        entries = [WordEntry(lang="漢語", lang_code="zh", word="駱駝")]
+        self.wxr.wtp.add_page("Template:qual", 10, "({{{1}}})")
+        self.wxr.wtp.add_page("Template:zh-l", 10, "{{{1}}}")
+        self.wxr.wtp.start_page("駱駝")
+        root = self.wxr.wtp.parse("* {{qual|比喻}} {{zh-l|沙漠之舟}}")
+        extract_linkages(self.wxr, entries, root.children, "synonyms", "")
+        dumped = [
+            synonym.model_dump(exclude_defaults=True)
+            for synonym in entries[0].synonyms
+        ]
+        self.assertEqual(
+            dumped,
+            [{"tags": ["figuratively"], "word": "沙漠之舟"}],
+        )