diff --git a/src/wiktextract/extractor/th/models.py b/src/wiktextract/extractor/th/models.py index ec1acbe0..77278964 100644 --- a/src/wiktextract/extractor/th/models.py +++ b/src/wiktextract/extractor/th/models.py @@ -28,6 +28,11 @@ class Example(ThaiBaseModel): raw_tags: list[str] = [] +class AltForm(ThaiBaseModel): + word: str + roman: str = "" + + class Sense(ThaiBaseModel): glosses: list[str] = [] tags: list[str] = [] @@ -35,6 +40,7 @@ class Sense(ThaiBaseModel): categories: list[str] = [] examples: list[Example] = [] classifiers: list[str] = [] + form_of: list[AltForm] = [] class Form(ThaiBaseModel): diff --git a/src/wiktextract/extractor/th/pos.py b/src/wiktextract/extractor/th/pos.py index 386ae2c6..5e08e9c8 100644 --- a/src/wiktextract/extractor/th/pos.py +++ b/src/wiktextract/extractor/th/pos.py @@ -1,11 +1,16 @@ import itertools -from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode +from wikitextprocessor import ( + LevelNode, + NodeKind, + TemplateNode, + WikiNode, +) from ...page import clean_node from ...wxr_context import WiktextractContext from .example import extract_example_list_item -from .models import Form, Sense, WordEntry +from .models import AltForm, Form, Sense, WordEntry from .section_titles import POS_DATA from .tags import translate_raw_tags @@ -57,6 +62,15 @@ def extract_gloss_list_item( extract_label_template(wxr, sense, node) elif isinstance(node, TemplateNode) and node.template_name == "cls": extract_cls_template(wxr, sense, node) + elif isinstance(node, TemplateNode) and ( + node.template_name.endswith(" of") + or node.template_name == "altform" + ): + expanded_node = wxr.wtp.parse( + wxr.wtp.node_to_wikitext(node), expand_all=True + ) + extract_form_of_template(wxr, sense, expanded_node) + gloss_nodes.append(expanded_node) elif not (isinstance(node, WikiNode) and node.kind == NodeKind.LIST): gloss_nodes.append(node) @@ -167,3 +181,22 @@ def extract_note_section( ) if note_str != "": word_entry.notes.append(note_str) + + +def extract_form_of_template( + wxr: WiktextractContext, + sense: Sense, + expanded_node: WikiNode, +) -> None: + form = AltForm(word="") + for i_tag in expanded_node.find_html_recursively("i"): + form.word = clean_node(wxr, None, i_tag) + break + for span_tag in expanded_node.find_html_recursively("span"): + if "mention-tr" in span_tag.attrs.get("class", ""): + form.roman = clean_node(wxr, None, span_tag) + break + if form.word != "": + sense.form_of.append(form) + if "form-of" not in sense.tags: + sense.tags.append("form-of") diff --git a/tests/test_th_example.py b/tests/test_th_example.py index d0a042c7..814a9674 100644 --- a/tests/test_th_example.py +++ b/tests/test_th_example.py @@ -187,7 +187,7 @@ def test_ja_x(self): self.wxr.wtp.add_page( "แม่แบบ:syn of", 10, - "คำพ้องความของ 北極熊 (ฮกเกียวกุงุมะ, “หมีขั้วโลก”)", + """คำพ้องความของ [[北極熊#ภาษาญี่ปุ่น|北極熊]] (ฮกเกียวกุงุมะ, หมีขั้วโลก)""", ) page_data = parse_page( self.wxr, @@ -236,5 +236,7 @@ def test_ja_x(self): "ref": "1990 มิถุนายน 15, Takahashi, Rumiko, “:แม่แบบ:jaru [PART.5 Snatching the Scroll of Secrets]”, in :แม่แบบ:wj [Ranma ½], volume 11 (fiction), Tokyo: Shogakukan, →ISBN, page 72:", } ], + "form_of": [{"word": "北極熊", "roman": "ฮกเกียวกุงุมะ"}], + "tags": ["form-of"], }, ) diff --git a/tests/test_th_gloss.py b/tests/test_th_gloss.py index bc10beec..1b106d01 100644 --- a/tests/test_th_gloss.py +++ b/tests/test_th_gloss.py @@ -234,3 +234,27 @@ def test_lo_alt(self): {"form": "ທຸຣຽນ", "raw_tags": ["ล้าสมัย"], "roman": "ทุรย̂น"}, ], ) + + def test_alt_form_template(self): + self.wxr.wtp.add_page( + "แม่แบบ:altform", + 10, + """อีกรูปหนึ่งของ [[ᨸᩣ᩠ᨠ#ภาษาคำเมือง|ᨸᩣ᩠ᨠ]] (ปาก)""", + ) + page_data = parse_page( + self.wxr, + "ปาก", + """== ภาษาคำเมือง == +=== คำนาม === +{{nod-noun}} + +# {{altform|nod|ᨸᩣ᩠ᨠ}}""", + ) + self.assertEqual( + page_data[0]["senses"][0], + { + "glosses": ["อีกรูปหนึ่งของ ᨸᩣ᩠ᨠ (ปาก)"], + "form_of": [{"word": "ᨸᩣ᩠ᨠ", "roman": "ปาก"}], + "tags": ["form-of"], + }, + )