
Merge pull request #982 from xxyzz/th
[th] extract gloss tag and example templates
xxyzz authored Jan 8, 2025
2 parents 3839b34 + 1f1ff3e commit ef1c2aa
Showing 5 changed files with 428 additions and 6 deletions.
164 changes: 162 additions & 2 deletions src/wiktextract/extractor/th/example.py
@@ -1,17 +1,33 @@
from wikitextprocessor import NodeKind, TemplateNode, WikiNode
import re

from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ..ruby import extract_ruby
from .models import Example, Sense
from .tags import translate_raw_tags


def extract_example_list_item(
wxr: WiktextractContext, sense: Sense, list_item: WikiNode
wxr: WiktextractContext,
sense: Sense,
list_item: WikiNode,
ref: str = "",
) -> None:
for node in list_item.children:
if isinstance(node, TemplateNode):
if node.template_name in ["ux", "usex", "ko-usex"]:
extract_ux_template(wxr, sense, node)
elif node.template_name in ["zh-x", "zh-usex"]:
extract_template_zh_x(wxr, sense, node)
elif node.template_name in ["ja-x", "ja-usex"]:
extract_template_ja_usex(wxr, sense, node, ref)
elif node.template_name.startswith("quote-"):
ref = extract_quote_template(wxr, sense, node)
elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
for child_list_item in node.find_child(NodeKind.LIST_ITEM):
extract_example_list_item(wxr, sense, child_list_item, ref)


def extract_ux_template(
@@ -42,6 +58,150 @@ def extract_ux_template(
wxr, None, t_node.template_parameters.get("ref", "")
)
if e_data.text != "":
translate_raw_tags(e_data)
sense.examples.append(e_data)
for link_node in expanded_node.find_child(NodeKind.LINK):
clean_node(wxr, sense, link_node)


def extract_template_zh_x(
wxr: WiktextractContext,
sense: Sense,
t_node: TemplateNode,
) -> None:
expanded_node = wxr.wtp.parse(
wxr.wtp.node_to_wikitext(t_node), expand_all=True
)
examples = []
for dl_tag in expanded_node.find_html("dl"):
examples.extend(extract_zh_x_dl_tag(wxr, dl_tag))
if len(examples) == 0:
examples.extend(extract_zh_x_no_dl_tag(wxr, expanded_node))

translation = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
for e_data in examples:
e_data.translation = translation
translate_raw_tags(e_data)

for link_node in expanded_node.find_child(NodeKind.LINK):
clean_node(wxr, sense, link_node)

sense.examples.extend(examples)


def extract_zh_x_dl_tag(
wxr: WiktextractContext, dl_tag: HTMLNode
) -> list[Example]:
examples = []
for span_tag in dl_tag.find_html("span"):
if "lang" in span_tag.attrs:
e_text = clean_node(wxr, None, span_tag)
if e_text != "":
examples.append(Example(text=e_text))
else:
raw_tags = clean_node(wxr, None, span_tag).strip("[] ")
for raw_tag in re.split(r", | and ", raw_tags):
raw_tag = raw_tag.strip()
if raw_tag != "" and len(examples) > 0:
examples[-1].raw_tags.append(raw_tag)
for dd_tag in dl_tag.find_html("dd"):
for span_tag in dd_tag.find_html("span"):
if "Latn" in span_tag.attrs.get("lang", ""):
roman = clean_node(wxr, None, span_tag)
for e_data in examples:
e_data.roman = roman
else:
raw_tag = clean_node(wxr, None, span_tag).strip("[] ")
if raw_tag != "":
for e_data in examples:
e_data.raw_tags.append(raw_tag)
return examples


def extract_zh_x_no_dl_tag(
wxr: WiktextractContext, expanded_node: WikiNode
) -> list[Example]:
examples = []
for span_tag in expanded_node.find_html("span"):
lang = span_tag.attrs.get("lang", "")
match lang:
case "zh-Latn":
roman = clean_node(wxr, None, span_tag)
for e_data in examples:
e_data.roman = roman
case "zh-Hant" | "zh-Hans":
e_text = clean_node(wxr, None, span_tag)
example = Example(text=e_text)
example.tags.append(
"Traditional Chinese"
if lang == "zh-Hant"
else "Simplified Chinese"
)
if example.text != "":
examples.append(example)

return examples


def extract_quote_template(
wxr: WiktextractContext,
sense: Sense,
t_node: TemplateNode,
) -> str:
ref = ""
if all(
arg not in t_node.template_parameters for arg in ["text", "passage", 7]
):
ref = clean_node(wxr, sense, t_node)
else:
expanded_node = wxr.wtp.parse(
wxr.wtp.node_to_wikitext(t_node), expand_all=True
)
example = Example(text="")
for span_tag in expanded_node.find_html_recursively("span"):
span_class = span_tag.attrs.get("class", "")
if "cited-source" == span_class:
example.ref = clean_node(wxr, None, span_tag)
elif "e-quotation" in span_class:
example.text = clean_node(wxr, None, span_tag)
elif "e-translation" in span_class:
example.translation = clean_node(wxr, None, span_tag)
for i_tag in expanded_node.find_html_recursively(
"i", attr_name="class", attr_value="e-transliteration"
):
example.roman = clean_node(wxr, None, i_tag)
break
if example.text != "":
sense.examples.append(example)
clean_node(wxr, sense, expanded_node)

return ref


def extract_template_ja_usex(
wxr: WiktextractContext, sense: Sense, t_node: TemplateNode, ref: str
) -> None:
expanded_node = wxr.wtp.parse(
wxr.wtp.node_to_wikitext(t_node), expand_all=True
)
example = Example(text="", ref=ref)
for span_tag in expanded_node.find_html(
"span", attr_name="class", attr_value="Jpan"
):
ruby_data, node_without_ruby = extract_ruby(wxr, span_tag)
example.text = clean_node(wxr, None, node_without_ruby)
example.ruby = ruby_data
for span_tag in expanded_node.find_html_recursively(
"span", attr_name="class", attr_value="tr"
):
example.roman = clean_node(wxr, None, span_tag)
example.translation = clean_node(
wxr, None, t_node.template_parameters.get(3, "")
)
example.literal_meaning = clean_node(
wxr, None, t_node.template_parameters.get("lit", "")
)
if example.text != "":
sense.examples.append(example)
for link_node in expanded_node.find_child(NodeKind.LINK):
clean_node(wxr, sense, link_node)
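
The zh-x handling in example.py above turns bracketed label text (the dialect/script annotations rendered next to a Chinese example) into raw tags with `re.split(r", | and ", ...)` after stripping the brackets. A minimal standalone sketch of just that splitting step, using a made-up label string for illustration:

```python
import re

# Hypothetical label text as a zh-x template might render it, e.g. "[MSC, trad. and Pinyin]".
# The extractor strips the brackets/spaces and splits on ", " or " and " to get raw tags.
label = "[MSC, trad. and Pinyin]"
raw_tags = [
    tag.strip()
    for tag in re.split(r", | and ", label.strip("[] "))
    if tag.strip() != ""
]
print(raw_tags)  # ['MSC', 'trad.', 'Pinyin']
```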
31 changes: 27 additions & 4 deletions src/wiktextract/extractor/th/pos.py
@@ -1,10 +1,11 @@
from wikitextprocessor import LevelNode, NodeKind, WikiNode
from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .example import extract_example_list_item
from .models import Sense, WordEntry
from .section_titles import POS_DATA
from .tags import translate_raw_tags


def extract_pos_section(
@@ -32,9 +33,18 @@ def extract_gloss_list_item(
list_item: WikiNode,
) -> None:
sense = Sense()
gloss_str = clean_node(
wxr, sense, list(list_item.invert_find_child(NodeKind.LIST))
)
gloss_nodes = []
for node in list_item.children:
if isinstance(node, TemplateNode) and node.template_name in [
"label",
"lb",
"lbl",
]:
extract_label_template(wxr, sense, node)
elif not (isinstance(node, WikiNode) and node.kind == NodeKind.LIST):
gloss_nodes.append(node)

gloss_str = clean_node(wxr, sense, gloss_nodes)
for child_list in list_item.find_child(NodeKind.LIST):
if child_list.sarg.startswith("#") and child_list.sarg.endswith(
(":", "*")
@@ -44,4 +54,17 @@ def extract_gloss_list_item(

if gloss_str != "":
sense.glosses.append(gloss_str)
translate_raw_tags(sense)
word_entry.senses.append(sense)


def extract_label_template(
wxr: WiktextractContext,
sense: Sense,
t_node: TemplateNode,
) -> None:
raw_tag_str = clean_node(wxr, sense, t_node).strip("() ")
for raw_tag in raw_tag_str.split(","):
raw_tag = raw_tag.strip()
if raw_tag != "":
sense.raw_tags.append(raw_tag)
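
`extract_label_template` in pos.py reduces an expanded {{label}}/{{lb}} template to its rendered text, strips the surrounding parentheses, and splits on commas to collect raw tags on the sense. A small sketch of just that string handling, with a hypothetical rendered label (the real function gets the string from `clean_node` on the expanded template):

```python
# Hypothetical rendered output of a label template, e.g. "(informal, slang)".
rendered = "(informal, slang)"
raw_tags = [
    tag.strip()
    for tag in rendered.strip("() ").split(",")
    if tag.strip() != ""
]
print(raw_tags)  # ['informal', 'slang']
```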
26 changes: 26 additions & 0 deletions src/wiktextract/extractor/th/tags.py
@@ -0,0 +1,26 @@
from .models import WordEntry

EXAMPLE_TAGS = {
# แม่แบบ:zh-x, มอดูล:zh-usex/data
"MSC": "Modern Standard Chinese",
"Pinyin": "Pinyin",
"trad.": "Traditional Chinese",
"simp.": "Simplified Chinese",
}


TAGS = {**EXAMPLE_TAGS}


def translate_raw_tags(data: WordEntry) -> None:
raw_tags = []
for raw_tag in data.raw_tags:
if raw_tag in TAGS and hasattr(data, "tags"):
tr_tag = TAGS[raw_tag]
if isinstance(tr_tag, str):
data.tags.append(tr_tag)
elif isinstance(tr_tag, list):
data.tags.extend(tr_tag)
else:
raw_tags.append(raw_tag)
data.raw_tags = raw_tags
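
`translate_raw_tags` in tags.py moves any raw tag found in the TAGS table into the structured `tags` list and keeps unrecognized labels in `raw_tags`. A self-contained sketch of that behaviour, using a stand-in dataclass instead of the project's real pydantic models and a trimmed tag table (the real table can also map one raw tag to a list of tags):

```python
from dataclasses import dataclass, field

# Stand-in for the real Example/WordEntry models, just to exercise the mapping logic.
@dataclass
class FakeExample:
    tags: list[str] = field(default_factory=list)
    raw_tags: list[str] = field(default_factory=list)

TAGS = {"MSC": "Modern Standard Chinese", "trad.": "Traditional Chinese"}

def translate_raw_tags(data) -> None:
    leftovers = []
    for raw_tag in data.raw_tags:
        if raw_tag in TAGS and hasattr(data, "tags"):
            data.tags.append(TAGS[raw_tag])
        else:
            leftovers.append(raw_tag)
    data.raw_tags = leftovers

example = FakeExample(raw_tags=["MSC", "unmapped label"])
translate_raw_tags(example)
print(example.tags)      # ['Modern Standard Chinese']
print(example.raw_tags)  # ['unmapped label']
```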
