Skip to content

Commit

Permalink
Merge pull request #821 from xxyzz/zh
Browse files Browse the repository at this point in the history
[zh] improve extract gloss and example data code
  • Loading branch information
xxyzz authored Sep 16, 2024
2 parents 616f225 + 4468de7 commit 889635e
Show file tree
Hide file tree
Showing 7 changed files with 196 additions and 55 deletions.
99 changes: 55 additions & 44 deletions src/wiktextract/extractor/zh/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,88 +155,99 @@ def extract_template_zh_x(
has_dl_tag = False
results = []
for dl_tag in expanded_node.find_html_recursively("dl"):
example_data = parent_example.model_copy(deep=True)
has_dl_tag = True
ref = ""
roman = ""
translation = ""
roman_raw_tags = []
for dd_tag in dl_tag.find_html("dd"):
dd_text = clean_node(wxr, None, dd_tag)
if dd_text.startswith("出自:"):
ref = dd_text.removeprefix("出自:")
example_data.ref = dd_text.removeprefix("出自:")
else:
is_roman = False
for span_tag in dd_tag.find_html_recursively(
"span", attr_name="lang", attr_value="Latn"
):
roman = clean_node(wxr, None, span_tag)
example_data.roman = clean_node(wxr, None, span_tag)
is_roman = True
for span_tag in dd_tag.find_html_recursively("span"):
span_text = clean_node(wxr, None, span_tag)
if span_text.startswith("[") and span_text.endswith(
"]"
):
roman_raw_tags.append(span_text.strip("[]"))
example_data.raw_tags.append(span_text.strip("[]"))
if not is_roman:
translation = dd_text

example_text = ""
last_span_is_example = False
for span_tag in dl_tag.find_html_recursively("span"):
if span_tag.attrs.get("class", "") in ["Hant", "Hans"]:
example_text = clean_node(wxr, None, span_tag)
last_span_is_example = True
elif last_span_is_example:
last_span_is_example = False
if len(example_text) > 0:
raw_tag = clean_node(wxr, None, span_tag)
example = parent_example.model_copy(deep=True)
example.text = example_text
example.roman = roman
example.translation = translation
example.raw_tags.extend(raw_tag.strip("[]").split(","))
example.raw_tags.extend(roman_raw_tags)
if len(ref) > 0: # don't override parent quote-* template
example.ref = ref
translate_raw_tags(example)
results.append(example)
example_data.translation = dd_text
results.extend(extract_zh_x_dl_span_tag(wxr, dl_tag, example_data))

# no source, single line example
if not has_dl_tag:
roman = ""
raw_tags = []
example_data = parent_example.model_copy(deep=True)
for span_tag in expanded_node.find_html(
"span", attr_name="lang", attr_value="Latn"
):
roman = clean_node(wxr, None, span_tag)
example_data.roman = clean_node(wxr, None, span_tag)
for span_tag in expanded_node.find_html("span"):
span_text = clean_node(wxr, None, span_tag)
if span_text.startswith("[") and span_text.endswith("]"):
raw_tags.append(span_text.strip("[]"))
translation = clean_node(
example_data.raw_tags.append(span_text.strip("[]"))
example_data.translation = clean_node(
wxr, None, template_node.template_parameters.get(2, "")
)
literal_meaning = clean_node(
example_data.literal_meaning = clean_node(
wxr, None, template_node.template_parameters.get("lit", "")
)
for span_tag in expanded_node.find_html("span"):
span_lang = span_tag.attrs.get("lang", "")
if span_lang in ["zh-Hant", "zh-Hans"]:
example_text = clean_node(wxr, None, span_tag)
if len(example_text) > 0:
example_data = parent_example.model_copy(deep=True)
example_data.text = example_text
example_data.roman = roman
example_data.tags.append(
new_example = example_data.model_copy(deep=True)
new_example.text = example_text
new_example.tags.append(
"Traditional Chinese"
if span_lang == "zh-Hant"
else "Simplified Chinese"
)
example_data.translation = translation
example_data.literal_meaning = literal_meaning
example_data.raw_tags.extend(raw_tags)
translate_raw_tags(example_data)
results.append(example_data)
translate_raw_tags(new_example)
results.append(new_example)
return results


def extract_zh_x_dl_span_tag(
    wxr: WiktextractContext, dl_tag: HTMLNode, example: Example
) -> list[Example]:
    """Build examples from the span children of an expanded "zh-x" node.

    Three kinds of span are handled while walking ``dl_tag.find_html("span")``:

    * ``lang`` is "zh-Hant" or "zh-Hans": a deep copy of ``example`` is
      created, its ``text`` is set to the cleaned span text and the copy is
      appended to the result list.
    * ``class`` contains "vsHide" (template called with "collapsed=y"): the
      span is processed recursively and the examples it yields are appended.
      Only the first such span uses the most recently collected example as
      its base (when one exists); later ones use ``example``.
    * ``style`` contains "font-size:x-small": the cleaned text of every link
      child is appended to ``raw_tags`` of the most recently collected
      example, or to ``example`` itself when nothing was collected yet.

    :param wxr: extraction context passed through to ``clean_node``.
    :param dl_tag: the node to scan; the "dl" element on the initial call
        and a "vsHide" span on recursive calls.
    :param example: base example data that gets copied for each text span.
        Its ``raw_tags`` list may be mutated (see the third case above).
    :return: the collected examples.  ``translate_raw_tags`` is applied to
        them only when ``dl_tag.tag`` is "dl".
    """
    examples = []
    seen_collapsed = False
    for span in dl_tag.find_html("span"):
        attrs = span.attrs
        if attrs.get("lang", "") in ("zh-Hant", "zh-Hans"):
            # example text span
            copied = example.model_copy(deep=True)
            copied.text = clean_node(wxr, None, span)
            examples.append(copied)
        elif "vsHide" in attrs.get("class", ""):
            # template has arg "collapsed=y"
            if examples and not seen_collapsed:
                base = examples[-1]
            else:
                base = example
            examples.extend(extract_zh_x_dl_span_tag(wxr, span, base))
            seen_collapsed = True
        elif "font-size:x-small" in attrs.get("style", ""):
            # small-print span holding the tag links
            for link in span.find_child(NodeKind.LINK):
                label = clean_node(wxr, None, link)
                if not label:
                    continue
                target = examples[-1] if examples else example
                target.raw_tags.append(label)

    # nodes whose tag is not "dl" (the recursive span calls) skip this step
    if dl_tag.tag == "dl":
        for item in examples:
            translate_raw_tags(item)
    return examples


Expand Down
40 changes: 39 additions & 1 deletion src/wiktextract/extractor/zh/gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from ...wxr_context import WiktextractContext
from ..ruby import extract_ruby
from .example import extract_example_list_item
from .models import AltForm, Sense, WordEntry
from .models import AltForm, Classifier, Sense, WordEntry
from .tags import translate_raw_tags

# https://zh.wiktionary.org/wiki/Template:Label
Expand Down Expand Up @@ -39,6 +39,8 @@ def extract_gloss(
wxr, node, gloss_data, page_data
):
pass
elif node.template_name == "zh-mw":
process_zh_mw_template(wxr, node, gloss_data)
else:
gloss_nodes.append(node)
elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
Expand Down Expand Up @@ -188,3 +190,39 @@ def process_erhua_form_of_template(
"stand sp",
"sup sp",
}


def process_zh_mw_template(
    wxr: WiktextractContext, node: TemplateNode, sense: Sense
) -> None:
    """Collect classifiers from a Chinese inline classifier template.

    https://zh.wiktionary.org/wiki/Template:分類詞

    The template is expanded and every "span" element in the result is
    visited.  Spans whose class is "Hani", "Hant" or "Hans" hold either a
    classifier word or a "/" separator; words joined by "/" are buffered
    together.  A span that has a "title" attribute supplies a raw tag that
    is added to every classifier currently in the buffer.  The buffer is
    moved to ``sense.classifiers`` when a new word starts that does not
    follow a "/", and once more after the loop.

    :param wxr: extraction context used for expansion and ``clean_node``.
    :param node: the "zh-mw" template node.
    :param sense: sense whose ``classifiers`` list is extended in place;
        ``translate_raw_tags`` is then applied to every item in that list.
    """
    script_tags = {
        "Hant": "Traditional Chinese",
        "Hans": "Simplified Chinese",
    }
    wikitext = wxr.wtp.node_to_wikitext(node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    pending = []
    previous_text = ""
    for span in expanded.find_html_recursively("span"):
        css_class = span.attrs.get("class", "")
        if css_class in ("Hani", "Hant", "Hans"):
            text = clean_node(wxr, None, span)
            if text != "/":
                item = Classifier(classifier=text)
                if css_class in script_tags:
                    item.tags.append(script_tags[css_class])

                # a word not preceded by "/" starts a new group, so the
                # previous group is finished and can be stored
                if pending and previous_text != "/":
                    sense.classifiers.extend(pending)
                    pending.clear()
                pending.append(item)
            previous_text = text
            continue
        if "title" not in span.attrs:
            continue
        label = clean_node(wxr, None, span.attrs["title"])
        if label:
            for item in pending:
                item.raw_tags.append(label)
    sense.classifiers.extend(pending)
    for item in sense.classifiers:
        translate_raw_tags(item)
7 changes: 7 additions & 0 deletions src/wiktextract/extractor/zh/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,12 @@ class AltForm(ChineseBaseModel):
tags: list[str] = []


# One classifier (measure word) attached to a sense; instances are stored
# in `Sense.classifiers` and are created from the "zh-mw" template.
class Classifier(ChineseBaseModel):
    classifier: str = ""  # the classifier word itself
    tags: list[str] = []  # English tags, e.g. "Traditional Chinese"
    raw_tags: list[str] = []  # labels taken from the page as-is


class Sense(ChineseBaseModel):
glosses: list[str] = []
tags: list[str] = []
Expand All @@ -51,6 +57,7 @@ class Sense(ChineseBaseModel):
)
alt_of: list[AltForm] = []
form_of: list[AltForm] = []
classifiers: list[Classifier] = []


class Form(ChineseBaseModel):
Expand Down
17 changes: 16 additions & 1 deletion src/wiktextract/extractor/zh/tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,8 +274,23 @@
"吳語": "Wu",
}

# Classifier tags: maps the Chinese labels used with the "zh-mw"
# classifier template to their English tag names.  The mapping is merged
# into `ALL_TAGS` below.
# https://zh.wiktionary.org/wiki/Template:zh-mw
# https://zh.wiktionary.org/wiki/Module:Zh/templates
ZH_TAGS = {
    "官話": "Mandarin",
    "贛語": "Gan",
    "客家話": "Hakka",
    "晉語": "Jin",
    "閩北語": "Northern Min",
    "閩東語": "Eastern Min",
    "閩南語": "Southern Min",
    "潮州話": "Teochew",
    "湘語": "Xiang",
}


ALL_TAGS = {**GRAMMATICAL_TAGS, **LABEL_TAGS, **ZH_X_TAGS}
ALL_TAGS = {**GRAMMATICAL_TAGS, **LABEL_TAGS, **ZH_X_TAGS, **ZH_TAGS}


def translate_raw_tags(data: WordEntry) -> WordEntry:
Expand Down
8 changes: 4 additions & 4 deletions tests/test_zh_etymology.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ def test_zh_x(self):
{
"ref": "宋.蘇軾《春夜》",
"tags": [
"Pinyin",
"Classical Chinese",
"Traditional Chinese",
"Pinyin",
],
"text": "春宵一刻直千金,花有清香月有陰。",
"roman": "Chūnxiāo yīkè zhí qiānjīn, huā yǒu qīngxiāng yuè yǒu yīn.",
Expand All @@ -59,9 +59,9 @@ def test_zh_x(self):
{
"ref": "宋.蘇軾《春夜》",
"tags": [
"Pinyin",
"Classical Chinese",
"Simplified Chinese",
"Pinyin",
],
"text": "春宵一刻直千金,花有清香月有阴。",
"roman": "Chūnxiāo yīkè zhí qiānjīn, huā yǒu qīngxiāng yuè yǒu yīn.",
Expand Down Expand Up @@ -95,9 +95,9 @@ def test_zh_x_in_list(self):
{
"ref": "813年,韓愈《進學解》",
"tags": [
"Pinyin",
"Classical Chinese",
"Traditional Chinese",
"Pinyin",
],
"text": "焚膏油以繼晷,恆兀兀以窮年。",
"roman": "Fén gāoyóu yǐ jì guǐ, héng wùwù yǐ qióng nián.",
Expand All @@ -106,9 +106,9 @@ def test_zh_x_in_list(self):
{
"ref": "813年,韓愈《進學解》",
"tags": [
"Pinyin",
"Classical Chinese",
"Simplified Chinese",
"Pinyin",
],
"text": "焚膏油以继晷,恒兀兀以穷年。",
"roman": "Fén gāoyóu yǐ jì guǐ, héng wùwù yǐ qióng nián.",
Expand Down
42 changes: 37 additions & 5 deletions tests/test_zh_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,9 @@ def test_zh_x(self):
{
"ref": "《尚書·梓材》",
"tags": [
"Pinyin",
"Classical Chinese",
"Traditional Chinese",
"Pinyin",
],
"text": "王曰:「封,以厥庶民暨厥臣達大家,以厥臣達王惟邦君。」",
"roman": "Wáng yuē: “Fēng, yǐ jué shùmín jì jué chén dá dàjiā, yǐ jué chén dá wáng wéi bāngjūn.”",
Expand All @@ -71,9 +71,9 @@ def test_zh_x(self):
{
"ref": "《尚書·梓材》",
"tags": [
"Pinyin",
"Classical Chinese",
"Simplified Chinese",
"Pinyin",
],
"text": "王曰:「封,以厥庶民暨厥臣达大家,以厥臣达王惟邦君。」",
"roman": "Wáng yuē: “Fēng, yǐ jué shùmín jì jué chén dá dàjiā, yǐ jué chén dá wáng wéi bāngjūn.”",
Expand Down Expand Up @@ -169,7 +169,7 @@ def test_quote_book_above_zh_x(self):
self.wxr.wtp.add_page(
"Template:zh-x",
10,
"""<dl class="zhusex"><span lang="zh-Hant" class="Hant">-{<!-- -->[[如果#漢語|如果]][[唔係#漢語|唔係]][[今日#漢語|今日]][[拆穿#漢語|拆穿]][[你#漢語|你]][[槓嘢#漢語|槓野]],[[畀#漢語|俾]][[你#漢語|你]][[混#漢語|混]][[咗#漢語|左]][[入#漢語|入]][[稅局#漢語|稅局]][[重#漢語|重]][[死人#漢語|死人]][[呀#漢語|呀]]!<!-- -->}-</span> <span>&#91;[[w:廣州話|廣州話]],[[w:繁体中文|繁體]]&#93;</span><br><span lang="zh-Hans" class="Hans">-{<!-- -->[[如果#漢語|如果]][[唔系#漢語|唔系]][[今日#漢語|今日]][[拆穿#漢語|拆穿]][[你#漢語|你]][[杠嘢#漢語|杠野]],[[畀#漢語|俾]][[你#漢語|你]][[混#漢語|混]][[咗#漢語|左]][[入#漢語|入]][[税局#漢語|税局]][[重#漢語|重]][[死人#漢語|死人]][[呀#漢語|呀]]!<!-- -->}-</span> <span>&#91;[[w:廣州話|廣州話]],[[w:简体中文|簡體]]&#93;</span><dd><span lang="zh-Latn"><i>roman</i></span> <span>&#91;[[w:廣州話拼音方案|廣州話拼音]]&#93;</span></dd><dd>如果不是今天揭穿你的老底,給你混進稅務局就更'''糟糕'''了!</dd></dl>[[Category:有使用例的粵語詞]]""",
"""<dl class="zhusex"><span lang="zh-Hant" class="Hant">-{<!-- -->[[如果#漢語|如果]][[唔係#漢語|唔係]][[今日#漢語|今日]][[拆穿#漢語|拆穿]][[你#漢語|你]][[槓嘢#漢語|槓野]],[[畀#漢語|俾]][[你#漢語|你]][[混#漢語|混]][[咗#漢語|左]][[入#漢語|入]][[稅局#漢語|稅局]][[重#漢語|重]][[死人#漢語|死人]][[呀#漢語|呀]]!<!-- -->}-</span> <span style="color:darkgreen; font-size:x-small;">&#91;[[w:廣州話|廣州話]],[[w:繁体中文|繁體]]&#93;</span><br><span lang="zh-Hans" class="Hans">-{<!-- -->[[如果#漢語|如果]][[唔系#漢語|唔系]][[今日#漢語|今日]][[拆穿#漢語|拆穿]][[你#漢語|你]][[杠嘢#漢語|杠野]],[[畀#漢語|俾]][[你#漢語|你]][[混#漢語|混]][[咗#漢語|左]][[入#漢語|入]][[税局#漢語|税局]][[重#漢語|重]][[死人#漢語|死人]][[呀#漢語|呀]]!<!-- -->}-</span> <span style="color:darkgreen; font-size:x-small;">&#91;[[w:廣州話|廣州話]],[[w:简体中文|簡體]]&#93;</span><dd><span lang="zh-Latn"><i>roman</i></span> <span>&#91;[[w:廣州話拼音方案|廣州話拼音]]&#93;</span></dd><dd>如果不是今天揭穿你的老底,給你混進稅務局就更'''糟糕'''了!</dd></dl>[[Category:有使用例的粵語詞]]""",
)
sense_data = Sense()
root = self.wxr.wtp.parse("""#* {{quote-book|zh}}
Expand All @@ -181,15 +181,15 @@ def test_quote_book_above_zh_x(self):
[e.model_dump(exclude_defaults=True) for e in sense_data.examples],
[
{
"raw_tags": ["廣州話", "廣州話拼音"],
"raw_tags": ["廣州話拼音", "廣州話"],
"ref": "1957, 王力",
"text": "如果唔係今日拆穿你槓野,俾你混左入稅局重死人呀!",
"roman": "roman",
"tags": ["Traditional Chinese"],
"translation": "如果不是今天揭穿你的老底,給你混進稅務局就更糟糕了!",
},
{
"raw_tags": ["廣州話", "廣州話拼音"],
"raw_tags": ["廣州話拼音", "廣州話"],
"ref": "1957, 王力",
"text": "如果唔系今日拆穿你杠野,俾你混左入税局重死人呀!",
"roman": "roman",
Expand Down Expand Up @@ -351,3 +351,35 @@ def test_Q(self):
},
],
)

def test_ref_dd_span_tags(self):
    # The "zh-x" expansion used here has a source line ("出自:...") whose
    # <dd> contains extra <span class="Hant"/"Hani" lang="zh"> elements.
    # The expected result is exactly two examples (traditional and
    # simplified), both carrying the full source text in "ref".
    self.wxr.wtp.start_page("同志")
    # Register the expanded HTML that "Template:zh-x" should produce.
    self.wxr.wtp.add_page(
        "Template:zh-x",
        10,
        """<dl class="zhusex"><span lang="zh-Hant" class="Hant">-{<!-- -->[[同樣#漢語|同樣]][[受#漢語|受]][[到#漢語|到]][[歡迎#漢語|歡迎]][[的#漢語|的]]<!-- -->}-</span> <span style="color:darkgreen; font-size:x-small;">&#91;[[w:現代標準漢語|現代標準漢語]],[[w:繁体中文|繁體]]&#93;</span><br><span lang="zh-Hans" class="Hans">-{<!-- -->[[同样#漢語|同样]][[受#漢語|受]][[到#漢語|到]][[欢迎#漢語|欢迎]][[的#漢語|的]]<!-- -->}-</span> <span style="color:darkgreen; font-size:x-small;">&#91;[[w:現代標準漢語|現代標準漢語]],[[w:简体中文|簡體]]&#93;</span><dd><small>出自:'''2012'''年,-{<!----><span class="Hant" lang="zh">-{馬嘉蘭}-</span><!---->}-、-{<!----><span class="Hant" lang="zh">-{臺灣文學中的性越界}-</span><!---->}-,編輯-{<!----><span class="Hani" lang="zh">-{廖炳惠}-</span><!---->}-、-{<!----><span class="Hant" lang="zh">-{孫康宜}-</span><!---->}-、-{<!----><span class="Hani" lang="zh">-{王德威}-</span><!---->}-,-{<!----><span class="Hant" lang="zh">-{《臺灣及其脈絡》}-</span><!---->}-[https://books.google.com/books?id=cdCFqwRYNNwC&pg=PA329 第329頁]</small></dd><dd><span lang="zh-Latn" style="color:#404D52"><i>Tóngyàng shòudào huānyíng de</i></span> <span style="color:darkgreen; font-size:x-small;">&#91;[[w:漢語拼音|漢語拼音]]&#93;</span></dd></dl>""",
    )
    sense_data = Sense()
    # Example list item whose "ref" argument itself contains templates.
    root = self.wxr.wtp.parse(
        "#: {{zh-x|同樣 受-到 歡迎 的||ref='''2012'''年,{{lang|zh|馬嘉蘭}}、{{lang|zh|臺灣文學中的性越界}},編輯{{lang|zh|廖炳惠}}、{{lang|zh|孫康宜}}、{{lang|zh|王德威}},{{lang|zh|《臺灣及其脈絡》}}[https://books.google.com/books?id=cdCFqwRYNNwC&pg=PA329 第329頁]}}"
    )
    extract_example_list_item(
        self.wxr, sense_data, root.children[0].children[0], []
    )
    self.assertEqual(
        [e.model_dump(exclude_defaults=True) for e in sense_data.examples],
        [
            {
                "text": "同樣受到歡迎的",
                "roman": "Tóngyàng shòudào huānyíng de",
                "tags": ['Pinyin', 'Standard Chinese', 'Traditional Chinese'],
                "ref": "2012年,馬嘉蘭、臺灣文學中的性越界,編輯廖炳惠、孫康宜、王德威,《臺灣及其脈絡》第329頁",
            },
            {
                "text": "同样受到欢迎的",
                "roman": "Tóngyàng shòudào huānyíng de",
                "tags": ['Pinyin', 'Standard Chinese', 'Simplified Chinese'],
                "ref": "2012年,馬嘉蘭、臺灣文學中的性越界,編輯廖炳惠、孫康宜、王德威,《臺灣及其脈絡》第329頁",
            },
        ],
    )
Loading

0 comments on commit 889635e

Please sign in to comment.