Commit

Merge pull request #527 from xxyzz/zh
Translate some zh edition raw tags
xxyzz authored Mar 1, 2024
2 parents 278af03 + 7296a7e commit e5232d8
Showing 10 changed files with 273 additions and 164 deletions.
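Note: every extractor touched in this commit now routes collected raw tag strings through a translate_raw_tags helper imported from the zh tags module, and the old GENDERS dict in headword_line.py is replaced by a shared TEMPLATE_TAG_ARGS table from that same module. tags.py is among the 10 changed files but is not shown on this page, so the sketch below is only an assumed outline of the idea, with made-up table entries and a simplified function shape: map Chinese label text held in raw_tags to English tag names, and keep anything unrecognized as a raw tag.

# Minimal sketch (assumed shape, not the real src/wiktextract/extractor/zh/tags.py)
GLOSS_TAGS = {
    "及物": "transitive",      # illustrative entry, made up
    "不及物": "intransitive",  # illustrative entry, made up
    "書面": "literary",        # illustrative entry, made up
}


def translate_raw_tags(data) -> None:
    # Move raw tag strings with a known English equivalent into `tags`;
    # leave unrecognized strings in `raw_tags` so nothing is lost.
    remaining = []
    for raw_tag in data.raw_tags:
        if raw_tag in GLOSS_TAGS:
            data.tags.append(GLOSS_TAGS[raw_tag])
        else:
            remaining.append(raw_tag)
    data.raw_tags = remaining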
75 changes: 25 additions & 50 deletions src/wiktextract/extractor/zh/gloss.py
@@ -1,12 +1,15 @@
import re

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import TemplateNode
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

from ..ruby import extract_ruby
from .example import extract_examples
from .models import Sense, WordEntry
from .tags import translate_raw_tags

# https://zh.wiktionary.org/wiki/Template:Label
LABEL_TEMPLATES = frozenset(["lb", "lbl", "label"])


def extract_gloss(
@@ -17,25 +20,34 @@ def extract_gloss(
) -> None:
lang_code = page_data[-1].lang_code
for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
gloss_nodes = [
child
for child in list_item_node.children
if not isinstance(child, WikiNode) or child.kind != NodeKind.LIST
]
gloss_nodes = []
raw_tags = []
for node in list_item_node.children:
if isinstance(node, TemplateNode):
raw_tag = clean_node(wxr, None, node)
if node.template_name in LABEL_TEMPLATES:
raw_tags.append(raw_tag.strip("()"))
elif raw_tag.startswith("〈") and raw_tag.endswith("〉"):
raw_tags.append(raw_tag.strip("〈〉"))
elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
continue
else:
gloss_nodes.append(node)

if lang_code == "ja":
expanded_node = wxr.wtp.parse(
wxr.wtp.node_to_wikitext(gloss_nodes), expand_all=True
)
ruby_data, nodes_without_ruby = extract_ruby(
wxr, expanded_node.children
)
raw_gloss_text = clean_node(wxr, gloss_data, nodes_without_ruby)
gloss_text = clean_node(wxr, gloss_data, nodes_without_ruby)
else:
ruby_data = []
raw_gloss_text = clean_node(wxr, gloss_data, gloss_nodes)
new_gloss_data = merge_gloss_data(
gloss_data, extract_gloss_and_tags(raw_gloss_text)
)
gloss_text = clean_node(wxr, gloss_data, gloss_nodes)
new_gloss_data = gloss_data.model_copy(deep=True)
new_gloss_data.raw_tags.extend(raw_tags)
new_gloss_data.glosses.append(gloss_text)
if len(ruby_data) > 0:
new_gloss_data.ruby = ruby_data

@@ -49,42 +61,5 @@ def extract_gloss(
extract_examples(wxr, new_gloss_data, child_node)

if not has_nested_gloss:
translate_raw_tags(new_gloss_data)
page_data[-1].senses.append(new_gloss_data)


def merge_gloss_data(data_a: Sense, data_b: Sense) -> Sense:
new_data = Sense()
for data in data_a, data_b:
for field in data.model_fields:
pre_data = getattr(new_data, field)
pre_data.extend(getattr(data, field))
return new_data


def extract_gloss_and_tags(raw_gloss: str) -> Sense:
left_brackets = ("(", "(")
right_brackets = (")", ")")
if raw_gloss.startswith(left_brackets) or raw_gloss.endswith(
right_brackets
):
tags = []
split_tag_regex = r", ?|,|或"
front_tag_end = -1
rear_tag_start = len(raw_gloss)
for index, left_bracket in enumerate(left_brackets):
if raw_gloss.startswith(left_bracket):
front_tag_end = raw_gloss.find(right_brackets[index])
front_label = raw_gloss[1:front_tag_end]
tags += re.split(split_tag_regex, front_label)
for index, right_bracket in enumerate(right_brackets):
if raw_gloss.endswith(right_bracket):
rear_tag_start = raw_gloss.rfind(left_brackets[index])
rear_label = raw_gloss.rstrip("".join(right_brackets))[
rear_tag_start + 1 :
]
tags += re.split(split_tag_regex, rear_label)

gloss = raw_gloss[front_tag_end + 1 : rear_tag_start].strip()
return Sense(glosses=[gloss], raw_glosses=[raw_gloss], raw_tags=tags)
else:
return Sense(glosses=[raw_gloss])
96 changes: 37 additions & 59 deletions src/wiktextract/extractor/zh/headword_line.py
@@ -9,33 +9,7 @@
from ..ruby import extract_ruby
from ..share import strip_nodes
from .models import Form, WordEntry

# https://zh.wiktionary.org/wiki/Module:Gender_and_number
GENDERS = {
"f": "feminine",
"m": "masculine",
"n": "neuter",
"c": "common",
# Animacy
"an": "animate",
"in": "inanimate",
# Animal (for Ukrainian, Belarusian, Polish)
"anml": "animal",
# Personal (for Ukrainian, Belarusian, Polish)
"pr": "personal",
# Nonpersonal not currently used
"np": "nonpersonal",
# Virility (for Polish)
"vr": "virile",
"nv": "nonvirile",
# Numbers
"s": "singular number",
"d": "dual number",
"p": "plural number",
# Verb qualifiers
"impf": "imperfective aspect",
"pf": "perfective aspect",
}
from .tags import TEMPLATE_TAG_ARGS, translate_raw_tags


def extract_headword_line(
@@ -55,32 +29,34 @@ def extract_headword_line(
wxr.wtp.node_to_wikitext(node), expand_all=True
)
forms_start_index = 0
for index, child in expanded_node.find_child(NodeKind.HTML, True):
if child.tag == "strong" and "headword" in child.attrs.get("class", ""):
forms_start_index = index + 1
elif child.tag == "span":
class_names = child.attrs.get("class", "")
if "headword-tr" in class_names:
for span_node in expanded_node.find_html(
"span", attr_name="class", attr_value="headword-line"
):
for index, span_child in span_node.find_child(NodeKind.HTML, True):
if span_child.tag == "span":
forms_start_index = index + 1

page_data[-1].forms.append(
Form(
form=clean_node(wxr, page_data[-1], child),
tags=["romanization"],
class_names = span_child.attrs.get("class", "")
if "headword-tr" in class_names:
page_data[-1].forms.append(
Form(
form=clean_node(wxr, page_data[-1], span_child),
tags=["romanization"],
)
)
)
elif "gender" in class_names:
elif "gender" in class_names:
for abbr_tag in span_child.find_html("abbr"):
gender = abbr_tag.children[0]
if gender in TEMPLATE_TAG_ARGS:
page_data[-1].tags.append(TEMPLATE_TAG_ARGS[gender])
else:
page_data[-1].raw_tags.append(gender)
translate_raw_tags(page_data[-1])
elif (
span_child.tag == "strong"
and "headword" in span_child.attrs.get("class", "")
):
forms_start_index = index + 1
for abbr_tag in child.find_html("abbr"):
gender = abbr_tag.children[0]
if gender in GENDERS:
page_data[-1].tags.append(GENDERS[gender])
else:
page_data[-1].raw_tags.append(gender)
if lang_code == "ja":
for span_child in child.find_html(
"strong", attr_name="class", attr_value="headword"
):
if lang_code == "ja":
ruby_data, node_without_ruby = extract_ruby(wxr, span_child)
page_data[-1].forms.append(
Form(
@@ -91,13 +67,13 @@
tags=["canonical"],
)
)
elif child.tag == "b":
# this is a form <b> tag, already inside form parentheses
break
elif span_child.tag == "b":
# this is a form <b> tag, already inside form parentheses
break

extract_headword_forms(
wxr, page_data, expanded_node.children[forms_start_index:]
)
extract_headword_forms(
wxr, page_data, span_node.children[forms_start_index:]
)


def extract_headword_forms(
@@ -150,8 +126,8 @@ def process_forms_text(
and "gender" in next_node.attrs.get("class", "")
):
gender = clean_node(wxr, None, next_node)
if gender in GENDERS:
form_tags.append(GENDERS[gender])
if gender in TEMPLATE_TAG_ARGS:
form_tags.append(TEMPLATE_TAG_ARGS[gender])
else:
raw_form_tags.append(gender)

@@ -161,6 +137,7 @@
tags=form_tags,
ruby=ruby_data,
)
translate_raw_tags(form_data)
page_data[-1].forms.append(form_data)
elif (
node.tag == "span"
@@ -180,14 +157,15 @@
)
if len(tags_list) > 0:
page_data[-1].raw_tags.extend(tags_list)
translate_raw_tags(page_data[-1])
else:
clean_node(wxr, page_data[-1], tag_nodes) # find categories


def extract_headword_tags(tags_str: str) -> list[str]:
tags = []
for tag_str in (
s.strip() for s in re.split("&|或", tags_str) if len(s.strip()) > 0
s.strip() for s in re.split("&|或|和", tags_str) if len(s.strip()) > 0
):
tags.append(tag_str)
return tags
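Note: the separator regex in extract_headword_tags above now also splits tag strings on 和 ("and"), in addition to & and 或 ("or"). A quick illustration of the widened split; the sample tag strings are made up, not taken from this commit:

import re

# Split zh headword tag strings on "&", "或" ("or"), and the newly added "和" ("and").
for tags_str in ("及物和不及物", "陽性 & 陰性或中性"):
    print([s.strip() for s in re.split("&|或|和", tags_str) if len(s.strip()) > 0])
# ['及物', '不及物']
# ['陽性', '陰性', '中性']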
2 changes: 2 additions & 0 deletions src/wiktextract/extractor/zh/linkage.py
@@ -13,6 +13,7 @@
)
from .descendant import DESCENDANT_TEMPLATES, extract_descendant_list_item
from .models import Linkage, WordEntry
from .tags import translate_raw_tags


def extract_linkages(
@@ -54,6 +55,7 @@ def extract_linkages(
linkage_data.raw_tags.append(
clean_node(wxr, None, item_child).strip("()")
)
translate_raw_tags(linkage_data)
elif template_name.lower() in DESCENDANT_TEMPLATES:
not_term_indexes.add(index)
extract_descendant_list_item(
1 change: 0 additions & 1 deletion src/wiktextract/extractor/zh/models.py
@@ -31,7 +31,6 @@ class Example(ChineseBaseModel):

class Sense(ChineseBaseModel):
glosses: list[str] = []
raw_glosses: list[str] = Field([], description="Gloss text without tags")
tags: list[str] = []
raw_tags: list[str] = []
categories: list[str] = []

0 comments on commit e5232d8
