diff --git a/src/wiktextract/extractor/th/example.py b/src/wiktextract/extractor/th/example.py
new file mode 100644
index 000000000..a3387e58d
--- /dev/null
+++ b/src/wiktextract/extractor/th/example.py
@@ -0,0 +1,47 @@
+from wikitextprocessor import NodeKind, TemplateNode, WikiNode
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .models import Example, Sense
+
+
+def extract_example_list_item(
+    wxr: WiktextractContext, sense: Sense, list_item: WikiNode
+) -> None:
+    for node in list_item.children:
+        if isinstance(node, TemplateNode):
+            if node.template_name in ["ux", "usex", "ko-usex"]:
+                extract_ux_template(wxr, sense, node)
+
+
+def extract_ux_template(
+    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
+) -> None:
+    expanded_node = wxr.wtp.parse(
+        wxr.wtp.node_to_wikitext(t_node), expand_all=True
+    )
+    e_data = Example(text="")
+    for i_tag in expanded_node.find_html_recursively("i"):
+        i_class = i_tag.attrs.get("class", "")
+        if "e-example" in i_class:
+            e_data.text = clean_node(wxr, None, i_tag)
+        elif "e-transliteration" in i_class:
+            e_data.roman = clean_node(wxr, None, i_tag)
+    for span_tag in expanded_node.find_html_recursively("span"):
+        span_class = span_tag.attrs.get("class", "")
+        if "e-translation" in span_class:
+            e_data.translation = clean_node(wxr, None, span_tag)
+        elif "e-literally" in span_class:
+            e_data.literal_meaning = clean_node(wxr, None, span_tag)
+        elif "qualifier-content" in span_class:
+            raw_tag = clean_node(wxr, None, span_tag)
+            if raw_tag != "":
+                e_data.raw_tags.append(raw_tag)
+
+    e_data.ref = clean_node(
+        wxr, None, t_node.template_parameters.get("ref", "")
+    )
+    if e_data.text != "":
+        sense.examples.append(e_data)
+    for link_node in expanded_node.find_child(NodeKind.LINK):
+        clean_node(wxr, sense, link_node)
diff --git a/src/wiktextract/extractor/th/models.py b/src/wiktextract/extractor/th/models.py
index 969ea485d..3d3d0cf9a 100644
--- a/src/wiktextract/extractor/th/models.py
+++ b/src/wiktextract/extractor/th/models.py
@@ -10,11 +10,30 @@ class ThaiBaseModel(BaseModel):
     )
 
 
+class Example(ThaiBaseModel):
+    text: str
+    translation: str = ""
+    literal_meaning: str = ""
+    roman: str = Field(
+        default="", description="Romanization of the example sentence"
+    )
+    ref: str = Field(
+        default="",
+        description="Source of the sentence, like book title and page number",
+    )
+    ruby: list[tuple[str, ...]] = Field(
+        default=[], description="Japanese Kanji and furigana"
+    )
+    tags: list[str] = []
+    raw_tags: list[str] = []
+
+
 class Sense(ThaiBaseModel):
     glosses: list[str] = []
     tags: list[str] = []
     raw_tags: list[str] = []
     categories: list[str] = []
+    examples: list[Example] = []
 
 
 class WordEntry(ThaiBaseModel):
diff --git a/src/wiktextract/extractor/th/pos.py b/src/wiktextract/extractor/th/pos.py
index 47c8ac9d6..93120ae80 100644
--- a/src/wiktextract/extractor/th/pos.py
+++ b/src/wiktextract/extractor/th/pos.py
@@ -2,6 +2,7 @@
 
 from ...page import clean_node
 from ...wxr_context import WiktextractContext
+from .example import extract_example_list_item
 from .models import Sense, WordEntry
 from .section_titles import POS_DATA
 
@@ -20,8 +21,8 @@ def extract_pos_section(
     page_data[-1].tags.extend(pos_data.get("tags", []))
 
     for list_node in level_node.find_child(NodeKind.LIST):
-        if list_node.sarg.startswith("#") and list_node.sarg.endswith("#"):
-            for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+            if list_node.sarg.startswith("#") and list_node.sarg.endswith("#"):
                 extract_gloss_list_item(wxr, page_data[-1], list_item)
 
 
@@ -34,6 +35,13 @@ def extract_gloss_list_item(
     gloss_str = clean_node(
         wxr, sense, list(list_item.invert_find_child(NodeKind.LIST))
     )
+    for child_list in list_item.find_child(NodeKind.LIST):
+        if child_list.sarg.startswith("#") and child_list.sarg.endswith(
+            (":", "*")
+        ):
+            for e_list_item in child_list.find_child(NodeKind.LIST_ITEM):
+                extract_example_list_item(wxr, sense, e_list_item)
+
     if gloss_str != "":
         sense.glosses.append(gloss_str)
         word_entry.senses.append(sense)
diff --git a/tests/test_th_example.py b/tests/test_th_example.py
new file mode 100644
index 000000000..191024b1f
--- /dev/null
+++ b/tests/test_th_example.py
@@ -0,0 +1,50 @@
+from unittest import TestCase
+
+from wikitextprocessor import Wtp
+
+from wiktextract.config import WiktionaryConfig
+from wiktextract.extractor.th.page import parse_page
+from wiktextract.wxr_context import WiktextractContext
+
+
+class TestThExample(TestCase):
+    maxDiff = None
+
+    def setUp(self) -> None:
+        self.wxr = WiktextractContext(
+            Wtp(lang_code="th"),
+            WiktionaryConfig(
+                dump_file_lang_code="th", capture_language_codes=None
+            ),
+        )
+
+    def test_ux(self):
+        self.wxr.wtp.add_page(
+            "แม่แบบ:ko-usex",
+            10,
+            """