Merge pull request #476 from xxyzz/zh
Improve zh edition translation code
xxyzz authored Jan 29, 2024
2 parents 889e026 + f957217 commit 8c5a85c
Showing 7 changed files with 277 additions and 188 deletions.
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/fr/models.py
@@ -66,7 +66,7 @@ class Linkage(FrenchBaseModel):
word: str = ""
tags: list[str] = []
roman: str = ""
alt: str = Field("", description="ALternative form")
alt: str = Field("", description="Alternative form")
translation: str = Field("", description="French translation")
sense: str = Field("", description="Definition of the word")
sense_index: int = Field(
8 changes: 5 additions & 3 deletions src/wiktextract/extractor/zh/models.py
@@ -65,10 +65,12 @@ class Translation(ChineseBaseModel):
"", description="Wiktionary language code of the translation term"
)
lang: str = Field("", description="Translation language name")
word: str = Field("", description="Translation term")
word: str = Field(description="Translation term")
sense: str = Field("", description="Translation gloss")
tags: list[str] = []
roman: str = ""
roman: str = Field("", description="Roman script")
alt: str = Field("", description="Alternative form")
lit: str = Field("", description="Literal translation for the term")


class Linkage(ChineseBaseModel):
@@ -127,5 +129,5 @@ class WordEntry(ChineseBaseModel):
descendants: list[Descendant] = []
redirects: list[str] = Field(
[],
description="Soft redirect page, extracted from template zh-see and ja-see",
description="Soft redirect page, extracted from template zh-see ja-see",
)
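
Note: dropping the empty-string default from `word` makes it a required field in Pydantic, so every `Translation` must now be constructed with an explicit `word` (the rewritten extractor passes `word=""` up front and fills it in later). A minimal sketch of that behaviour, using a plain `pydantic.BaseModel` stand-in rather than the real `ChineseBaseModel`; only the field definitions mirror the diff, the rest is illustrative:

```python
from pydantic import BaseModel, Field, ValidationError


class Translation(BaseModel):
    lang_code: str = Field("", description="Wiktionary language code of the translation term")
    lang: str = Field("", description="Translation language name")
    word: str = Field(description="Translation term")  # no default -> required
    roman: str = Field("", description="Roman script")


Translation(word="chat", lang_code="fr", lang="法語")  # OK
try:
    Translation(lang_code="fr", lang="法語")  # missing `word`
except ValidationError as err:
    print(err)  # reports that `word` is a required field
```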
5 changes: 3 additions & 2 deletions src/wiktextract/extractor/zh/page.py
@@ -31,8 +31,6 @@
# Additional templates to be expanded in the pre-expand phase
ADDITIONAL_EXPAND_TEMPLATES = frozenset(
{
"multitrans",
"multitrans-nowiki",
"col1",
"col2",
"col3",
@@ -198,6 +196,9 @@ def extract_pronunciation(
def parse_page(
wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
# page layout documents
# https://zh.wiktionary.org/wiki/Wiktionary:佈局解釋
# https://zh.wiktionary.org/wiki/Wiktionary:体例说明
if wxr.config.verbose:
logging.info(f"Parsing page: {page_title}")

265 changes: 136 additions & 129 deletions src/wiktextract/extractor/zh/translation.py
@@ -1,144 +1,161 @@
import re
from typing import Optional, Union

from mediawiki_langcodes import name_to_code
from mediawiki_langcodes import code_to_name, name_to_code
from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import LEVEL_KIND_FLAGS
from wikitextprocessor.parser import LEVEL_KIND_FLAGS, TemplateNode
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

from ..share import capture_text_in_parentheses
from .models import Translation, WordEntry


def extract_translation(
wxr: WiktextractContext, page_data: list[WordEntry], node: WikiNode
wxr: WiktextractContext,
page_data: list[WordEntry],
level_node: WikiNode,
sense: str = "",
) -> None:
sense_text = ""
for child in node.children:
if isinstance(child, WikiNode):
if child.kind == NodeKind.TEMPLATE:
template_name = child.template_name.lower()
if (
template_name in {"trans-top", "翻譯-頂", "trans-top-also"}
and 1 in child.template_parameters
):
sense_text = clean_node(
wxr, None, child.template_parameters.get(1)
)
elif template_name == "checktrans-top":
return
elif template_name == "see translation subpage":
translation_subpage(
wxr, page_data, child.template_parameters
)
elif child.kind == NodeKind.LIST:
for list_item_node in child.find_child(NodeKind.LIST_ITEM):
if not list_item_node.contain_node(NodeKind.LIST):
process_translation_list_item(
wxr,
page_data,
clean_node(wxr, None, list_item_node.children),
sense_text,
)
else:
nested_list_index = 0
for index, item_child in enumerate(
list_item_node.children
):
if (
isinstance(item_child, WikiNode)
and item_child.kind == NodeKind.LIST
):
nested_list_index = index
break

process_translation_list_item(
wxr,
page_data,
clean_node(
wxr,
None,
list_item_node.children[:nested_list_index],
),
sense_text,
)
for nested_list_node in list_item_node.find_child(
NodeKind.LIST
):
for nested_list_item in nested_list_node.find_child(
NodeKind.LIST_ITEM
):
process_translation_list_item(
wxr,
page_data,
clean_node(
wxr, None, nested_list_item.children
),
sense_text,
)
for child in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LIST):
if isinstance(child, TemplateNode):
template_name = child.template_name.lower()
if (
template_name in {"trans-top", "翻譯-頂", "trans-top-also"}
and 1 in child.template_parameters
):
sense = clean_node(wxr, None, child.template_parameters.get(1))
elif template_name in {"see translation subpage", "trans-see"}:
translation_subpage(wxr, page_data, child)
elif template_name == "multitrans":
wikitext = "".join(
wxr.wtp.node_to_wikitext(c)
for c in child.template_parameters.get("data", [])
)
multitrans = wxr.wtp.parse(wikitext)
extract_translation(wxr, page_data, multitrans, sense)
else:
for list_item in child.find_child_recursively(NodeKind.LIST_ITEM):
process_translation_list_item(
wxr,
page_data,
list_item,
sense,
)


def process_translation_list_item(
wxr: WiktextractContext,
page_data: list[WordEntry],
expanded_text: str,
list_item: WikiNode,
sense: str,
) -> None:
from .headword_line import GENDERS

split_results = re.split(r":|:", expanded_text, maxsplit=1)
if len(split_results) != 2:
return
lang_text, words_text = split_results
lang_text = lang_text.strip()
words_text = words_text.strip()
if len(words_text) == 0:
return
lang_code = name_to_code(lang_text, "zh")

# split words by `,` or `;` that are not inside `()`
for word_and_tags in re.split(r"[,;、](?![^(]*\))\s*", words_text):
tags, word = capture_text_in_parentheses(word_and_tags)
tags = [tag for tag in tags if tag != lang_code] # rm Wiktionary link
translation_data = Translation(
lang_code=lang_code, lang=lang_text, word=word
)
tags_without_roman = []
for tag in tags:
if re.search(r"[a-z]", tag):
translation_data.roman = tag
tr_data = Translation(word="", sense=sense)

for child_index, child in enumerate(list_item.filter_empty_str_child()):
if child_index == 0:
lang_text = ""
if isinstance(child, str):
if ":" in child:
lang_text = child[: child.index(":")]
elif ":" in child:
lang_text = child[: child.index(":")]
else:
tags_without_roman.append(tag)

if len(tags_without_roman) > 0:
translation_data.tags = tags_without_roman

gender = word.split(" ")[-1]
if gender in GENDERS:
translation_data.word = word.removesuffix(f" {gender}")
translation_data.tags.append(GENDERS.get(gender))

if len(sense) > 0:
translation_data.sense = sense
page_data[-1].translations.append(translation_data)
lang_text = clean_node(wxr, None, child)
if len(lang_text) > 0:
tr_data.lang = lang_text.strip()
tr_data.lang_code = name_to_code(tr_data.lang, "zh")
elif isinstance(child, TemplateNode):
template_name = child.template_name
if template_name in {
"t",
"t+",
"tt",
"tt+",
"t-check",
"t+check",
}:
if len(tr_data.word) > 0:
page_data[-1].translations.append(
tr_data.model_copy(deep=True)
)
tr_data = Translation(
word="",
lang=tr_data.lang,
lang_code=tr_data.lang_code,
sense=sense,
)
if tr_data.lang_code == "":
tr_data.lang_code = child.template_parameters.get(1, "")
if tr_data.lang == "":
tr_data.lang = code_to_name(tr_data.lang_code, "zh")
tr_data.word = clean_node(
wxr, None, child.template_parameters.get(2, "")
)
tr_data.roman = clean_node(
wxr, None, child.template_parameters.get("tr", "")
)
tr_data.alt = clean_node(
wxr, None, child.template_parameters.get("alt", "")
)
tr_data.lit = clean_node(
wxr, None, child.template_parameters.get("lit", "")
)
# find gender tags
expanded_template = wxr.wtp.parse(
wxr.wtp.node_to_wikitext(child), expand_all=True
)
for span_node in expanded_template.find_html("span"):
class_str = span_node.attrs.get("class", "")
if "gender" in class_str:
for abbr_tag in span_node.find_html("abbr"):
if len(abbr_tag.attrs.get("title")) > 0:
tr_data.tags.append(
clean_node(
wxr, None, abbr_tag.attrs.get("title")
)
)
elif tr_data.roman == "" and class_str.startswith("tr "):
tr_data.roman = clean_node(wxr, None, span_node)
elif template_name == "t-needed":
# ignore empty translation
continue
else:
# qualifier template
tag = clean_node(wxr, None, child)
if len(tag) > 0:
tr_data.tags.append(tag.strip("()"))
elif isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
if len(tr_data.word) > 0:
page_data[-1].translations.append(tr_data.model_copy(deep=True))
tr_data = Translation(
word="",
lang=tr_data.lang,
lang_code=tr_data.lang_code,
sense=sense,
)
tr_data.word = clean_node(wxr, None, child)

if len(tr_data.word) > 0:
page_data[-1].translations.append(tr_data.model_copy(deep=True))


def translation_subpage(
wxr: WiktextractContext,
page_data: list[WordEntry],
template_args: dict[str, str],
template_node: TemplateNode,
) -> None:
# https://zh.wiktionary.org/wiki/Template:翻譯-見
# https://zh.wiktionary.org/wiki/Template:See_translation_subpage
from .page import ADDITIONAL_EXPAND_TEMPLATES

page_title = wxr.wtp.title
target_section = None
if len(template_args) > 0:
target_section = template_args.get(1)
if len(template_args) > 1:
page_title = template_args.get(2)
if template_node.template_name == "see translation subpage":
target_section = template_node.template_parameters.get(1)
page_title = template_node.template_parameters.get(2, wxr.wtp.title)

translation_subpage_title = f"{page_title}/翻譯"
translation_subpage_title = page_title
if page_title == wxr.wtp.title:
translation_subpage_title = f"{page_title}/翻譯"
subpage = wxr.wtp.get_page(translation_subpage_title)
if subpage is None:
return
@@ -165,22 +182,12 @@ def find_subpage_section(
node: Union[WikiNode, str],
target_section: Union[str, list[str]],
) -> Optional[WikiNode]:
if isinstance(node, WikiNode):
if node.kind in LEVEL_KIND_FLAGS:
section_title = clean_node(wxr, None, node.largs)
if (
isinstance(target_section, str)
and section_title == target_section
):
return node
if (
isinstance(target_section, list)
and section_title in target_section
):
return node

for child in node.children:
returned_node = find_subpage_section(wxr, child, target_section)
if returned_node is not None:
return returned_node
if not isinstance(node, WikiNode):
return None
for level_node in node.find_child_recursively(LEVEL_KIND_FLAGS):
section_title = clean_node(wxr, None, level_node.largs)
if isinstance(target_section, str) and section_title == target_section:
return level_node
if isinstance(target_section, list) and section_title in target_section:
return level_node
return None
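
For context, a minimal, self-contained sketch (not part of this diff) of how the rewritten `extract_translation()` might be driven on a single translation list item. The `WiktionaryConfig`, `WiktextractContext`, and `WordEntry` constructor arguments below are assumptions modelled on the test setup used elsewhere in this repository:

```python
from wikitextprocessor import Wtp
from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.zh.models import WordEntry
from wiktextract.extractor.zh.translation import extract_translation
from wiktextract.wxr_context import WiktextractContext

wxr = WiktextractContext(
    Wtp(lang_code="zh"), WiktionaryConfig(dump_file_lang_code="zh")
)
wxr.wtp.start_page("貓")
# A translation list item using the {{t+}} template, which is handled by the
# new TemplateNode branch of process_translation_list_item().
root = wxr.wtp.parse("* 英語:{{t+|en|cat}}")
page_data = [WordEntry(word="貓", lang_code="zh", lang="漢語")]
extract_translation(wxr, page_data, root)
for tr in page_data[-1].translations:
    print(tr.lang, tr.lang_code, tr.word)  # expected: 英語 en cat
```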
2 changes: 1 addition & 1 deletion tests/test_zh_gloss.py
@@ -6,8 +6,8 @@
from wiktextract.extractor.zh.models import Sense, WordEntry
from wiktextract.extractor.zh.page import (
extract_gloss,
parse_section,
parse_page,
parse_section,
)
from wiktextract.thesaurus import close_thesaurus_db
from wiktextract.wxr_context import WiktextractContext
2 changes: 1 addition & 1 deletion tests/test_zh_headword.py
@@ -1,5 +1,5 @@
from unittest import TestCase
from unittest.mock import Mock, patch
from unittest.mock import Mock

from wikitextprocessor import Wtp
from wiktextract.extractor.zh.headword_line import extract_headword_line