Skip to content

Commit

Permalink
Merge pull request #1006 from xxyzz/zh
Browse files: browse the repository at this point in the history
[de, zh] add "pos_title" field
xxyzz authored Jan 27, 2025
2 parents: c7bdf0e + 613a9c7; commit 1ed3d89
Showing 9 changed files with 29 additions and 2 deletions.
1 change: 1 addition & 0 deletions src/wiktextract/extractor/de/models.py
Original file line number Diff line number Diff line change
@@ -136,6 +136,7 @@ class WordEntry(BaseModelWrap):

word: str = Field(description="word string")
pos: str = Field(default="", description="Part of speech type")
pos_title: str = ""
other_pos: list[str] = []
# pos_title: str = Field(default=None, description="Original POS title")
lang_code: str = Field(
4 changes: 4 additions & 0 deletions src/wiktextract/extractor/de/page.py
Original file line number Diff line number Diff line change
@@ -131,9 +131,12 @@ def process_pos_section(
level_node: LevelNode,
) -> None:
pos_data_list = []
pos_title = ""
for template_node in level_node.find_content(NodeKind.TEMPLATE):
if template_node.template_name == "Wortart":
pos_argument = template_node.template_parameters.get(1, "").strip()
if pos_title == "":
pos_title = pos_argument
if pos_argument in IGNORE_POS:
continue
elif pos_argument in FORM_POS:
@@ -169,6 +172,7 @@ def process_pos_section(
page_data[-1].tags.append(tag)
if pos_index == 0:
page_data[-1].pos = pos
page_data[-1].pos_title = pos_title
elif pos != page_data[-1].pos and pos not in page_data[-1].other_pos:
page_data[-1].other_pos.append(pos)

1 change: 1 addition & 0 deletions src/wiktextract/extractor/zh/models.py
Original file line number Diff line number Diff line change
@@ -134,6 +134,7 @@ class WordEntry(ChineseBaseModel):
lang_code: str = Field(description="Wiktionary language code")
lang: str = Field(description="Localized language name")
pos: str = Field(description="Part of speech type")
pos_title: str = ""
etymology_text: str = ""
etymology_examples: list[Example] = []
senses: list[Sense] = Field(default=[], description="Sense list")
5 changes: 3 additions & 2 deletions src/wiktextract/extractor/zh/page.py
Original file line number Diff line number Diff line change
@@ -116,12 +116,13 @@ def process_pos_block(
page_data: list[WordEntry],
base_data: WordEntry,
level_node: LevelNode,
pos_text: str,
pos_title: str,
):
pos_data = POS_TITLES[pos_text]
pos_data = POS_TITLES[pos_title]
pos_type = pos_data["pos"]
base_data.pos = pos_type
page_data.append(base_data.model_copy(deep=True))
page_data[-1].pos_title = pos_title
page_data[-1].tags.extend(pos_data.get("tags", []))
for index, child in enumerate(level_node.filter_empty_str_child()):
if isinstance(child, WikiNode):
6 changes: 6 additions & 0 deletions tests/test_de_gloss.py
Original file line number Diff line number Diff line change
@@ -259,6 +259,7 @@ def test_form_of(self):
"lang": "Deutsch",
"lang_code": "de",
"pos": "adj",
"pos_title": "Deklinierte Form",
"senses": [
{
"form_of": [{"word": "konjugiert"}],
@@ -300,6 +301,7 @@ def test_no_bedeutungen_section(self):
"lang": "Litauisch",
"lang_code": "lt",
"pos": "unknown",
"pos_title": "Deklinierte Form",
"senses": [
{
"form_of": [{"word": "abakas"}],
@@ -330,6 +332,7 @@ def test_grammatische_merkmale_no_form_of_pos_title(self):
"lang": "Latein",
"lang_code": "la",
"pos": "verb",
"pos_title": "Infinitiv",
"senses": [
{
"form_of": [{"word": "abire"}],
@@ -363,6 +366,7 @@ def test_no_gloss_list(self):
"lang": "Interlingua",
"lang_code": "ia",
"pos": "unknown",
"pos_title": "Konjugierte Form",
"senses": [
{"glosses": ["Indikativ Präsens Aktiv des Verbs amar"]}
],
@@ -390,6 +394,7 @@ def test_unordered_list(self):
"lang": "Prußisch",
"lang_code": "prg",
"pos": "prep",
"pos_title": "Präposition",
"senses": [{"glosses": ["Nebenform der Präposition esse"]}],
"word": "assa",
}
@@ -413,6 +418,7 @@ def test_description_list_plus_unordered_list(self):
"lang": "Polnisch",
"lang_code": "pl",
"pos": "noun",
"pos_title": "Deklinierte Form",
"senses": [
{
"form_of": [{"word": "auto"}],
3 changes: 3 additions & 0 deletions tests/test_de_page.py
Original file line number Diff line number Diff line change
@@ -42,6 +42,7 @@ def test_de_parse_page(self):
"lang_code": "de",
"word": "Beispiel",
"pos": "noun",
"pos_title": "Substantiv",
"senses": [{"tags": ["no-gloss"]}],
}
],
@@ -69,6 +70,7 @@ def test_de_parse_page_skipping_head_templates(self):
"lang_code": "de",
"word": "Beispiel",
"pos": "noun",
"pos_title": "Substantiv",
"senses": [{"tags": ["no-gloss"]}],
}
],
@@ -91,6 +93,7 @@ def test_multiple_pos(self):
"lang": "Deutsch",
"lang_code": "de",
"pos": "noun",
"pos_title": "Substantiv",
"other_pos": ["name"],
"senses": [
{
3 changes: 3 additions & 0 deletions tests/test_zh_gloss.py
Original file line number Diff line number Diff line change
@@ -161,6 +161,7 @@ def test_gloss_text_only_page(self):
"lang": "漢語",
"lang_code": "zh",
"pos": "unknown",
"pos_title": "釋義",
"senses": [
{
"glosses": [
@@ -318,6 +319,7 @@ def test_inflection_of_template(self):
"lang": "冰島語",
"lang_code": "is",
"pos": "noun",
"pos_title": "名詞",
"senses": [
{
"form_of": [{"word": "lindi"}],
@@ -544,6 +546,7 @@ def test_erhua(self):
"lang": "漢語",
"lang_code": "zh",
"pos": "adv",
"pos_title": "副詞",
"senses": [
{
"categories": ["官話漢語", "官話兒化詞"],
3 changes: 3 additions & 0 deletions tests/test_zh_linkage.py
Original file line number Diff line number Diff line change
@@ -129,6 +129,7 @@ def test_linkage_above_pos(self):
"lang": "英語",
"lang_code": "en",
"pos": "name",
"pos_title": "專有名詞",
"senses": [{"glosses": ["偵探漫畫"]}],
"synonyms": [{"word": "Tec"}],
"word": "'Tec",
@@ -268,13 +269,15 @@ def test_level_3_linkage_section(self):
"lang": "漢語",
"lang_code": "zh",
"pos": "noun",
"pos_title": "名詞",
"senses": [{"glosses": ["愚昧的人民"]}],
"word": "愚民",
},
{
"lang": "漢語",
"lang_code": "zh",
"pos": "verb",
"pos_title": "動詞",
"descendants": [
{
"lang_code": "ja",
5 changes: 5 additions & 0 deletions tests/test_zh_pronunciation.py
Original file line number Diff line number Diff line change
@@ -147,6 +147,7 @@ def test_level3_pron_level3_pos(self):
"lang": "漢語",
"lang_code": "zh",
"pos": "verb",
"pos_title": "動詞",
"senses": [{"glosses": ["塞住耳朵"]}],
"sounds": [
{"zh_pron": "chōng'ěr", "tags": ["Mandarin", "Pinyin"]}
@@ -162,6 +163,7 @@ def test_level3_pron_level3_pos(self):
"lang": "漢語",
"lang_code": "zh",
"pos": "noun",
"pos_title": "名詞",
"senses": [
{"glosses": ["古冠冕旁的瑱玉,因其下垂及耳,而得名"]}
],
@@ -222,6 +224,7 @@ def test_level3_pron_level4_pos(self):
"lang": "漢語",
"lang_code": "zh",
"pos": "noun",
"pos_title": "名詞",
"senses": [{"glosses": ["眾人,某個範圍中所有的人"]}],
"sounds": [
{
@@ -235,6 +238,7 @@ def test_level3_pron_level4_pos(self):
"lang": "漢語",
"lang_code": "zh",
"pos": "noun",
"pos_title": "名詞",
"senses": [{"glosses": ["卿大夫之家"]}],
"sounds": [
{"zh_pron": "dàjiā", "tags": ["Mandarin", "Pinyin"]}
@@ -245,6 +249,7 @@ def test_level3_pron_level4_pos(self):
"lang": "漢語",
"lang_code": "zh",
"pos": "noun",
"pos_title": "名詞",
"senses": [{"glosses": ["對女子的尊稱"]}],
"sounds": [
{"zh_pron": "dàgū", "tags": ["Mandarin", "Pinyin"]}

0 comments on commit 1ed3d89

Please sign in to comment.