From 3ee45d6be52941a94d239ab72d5d1aa4dab8686b Mon Sep 17 00:00:00 2001 From: xxyzz Date: Wed, 15 Jan 2025 14:12:05 +0800 Subject: [PATCH 1/3] [th] extract "lo-alt" form template --- src/wiktextract/extractor/th/alt_form.py | 38 ++++++++++++++++++++---- src/wiktextract/extractor/th/models.py | 1 + tests/test_th_gloss.py | 23 ++++++++++++++ 3 files changed, 57 insertions(+), 5 deletions(-) diff --git a/src/wiktextract/extractor/th/alt_form.py b/src/wiktextract/extractor/th/alt_form.py index 55e9febf..fff7052e 100644 --- a/src/wiktextract/extractor/th/alt_form.py +++ b/src/wiktextract/extractor/th/alt_form.py @@ -1,4 +1,4 @@ -from wikitextprocessor import LevelNode, NodeKind, TemplateNode +from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode from ...page import clean_node from ...wxr_context import WiktextractContext @@ -18,6 +18,10 @@ def extract_alt_form_section( ): extract_alt_template(wxr, word_entry, node) + for t_node in level_node.find_child(NodeKind.TEMPLATE): + if t_node.template_name == "lo-alt": + extract_lo_alt_template(wxr, word_entry, t_node) + def extract_alt_template( wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode @@ -25,9 +29,18 @@ def extract_alt_template( expanded_node = wxr.wtp.parse( wxr.wtp.node_to_wikitext(t_node), expand_all=True ) + lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, "")) + extract_alt_expanded_nodes(wxr, word_entry, expanded_node, lang_code) + +def extract_alt_expanded_nodes( + wxr: WiktextractContext, + word_entry: WordEntry, + root: WikiNode, + lang_code: str, +) -> None: raw_tags = [] - for italic_node in expanded_node.find_child(NodeKind.ITALIC): + for italic_node in root.find_child(NodeKind.ITALIC): raw_tags_str = clean_node(wxr, None, italic_node) for raw_tag in raw_tags_str.split(","): raw_tag = raw_tag.strip() @@ -35,9 +48,24 @@ def extract_alt_template( raw_tags.append(raw_tag) break - lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, "")) - for span_tag in expanded_node.find_html("span"): - if span_tag.attrs.get("lang", "") == lang_code: + for span_tag in root.find_html("span"): + span_lang = span_tag.attrs.get("lang", "") + if span_lang == lang_code: form = Form(form=clean_node(wxr, None, span_tag), raw_tags=raw_tags) translate_raw_tags(form) word_entry.forms.append(form) + elif span_lang.endswith("-Latn") and len(word_entry.forms) > 0: + word_entry.forms[-1].roman = clean_node(wxr, None, span_tag) + + clean_node(wxr, word_entry, root) + + +def extract_lo_alt_template( + wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode +) -> None: + expanded_node = wxr.wtp.parse( + wxr.wtp.node_to_wikitext(t_node), expand_all=True + ) + for list_node in expanded_node.find_child(NodeKind.LIST): + for list_item in list_node.find_child(NodeKind.LIST_ITEM): + extract_alt_expanded_nodes(wxr, word_entry, list_item, "lo") diff --git a/src/wiktextract/extractor/th/models.py b/src/wiktextract/extractor/th/models.py index c04492a9..d42347df 100644 --- a/src/wiktextract/extractor/th/models.py +++ b/src/wiktextract/extractor/th/models.py @@ -41,6 +41,7 @@ class Form(ThaiBaseModel): form: str tags: list[str] = [] raw_tags: list[str] = [] + roman: str = "" class Translation(ThaiBaseModel): diff --git a/tests/test_th_gloss.py b/tests/test_th_gloss.py index 11954325..0bb2c6ae 100644 --- a/tests/test_th_gloss.py +++ b/tests/test_th_gloss.py @@ -208,3 +208,26 @@ def test_alt_template(self): {"form": "เดิร", "raw_tags": ["เลิกใช้"]}, ], ) + + def test_lo_alt(self): + self.wxr.wtp.add_page( + "แม่แบบ:lo-alt", + 10, + """* (''ล้าสมัย'') [[ທຸຣຽນ#ภาษาลาว|ທຸຣຽນ]] (ทุรย̂น)""", + ) + page_data = parse_page( + self.wxr, + "ທຸລຽນ", + """== ภาษาลาว == +=== รูปแบบอื่น === +{{lo-alt|d=ທຸຣຽນ}} +=== คำนาม === +{{lo-noun}} +# [[ทุเรียน]]""", + ) + self.assertEqual( + page_data[0]["forms"], + [ + {"form": "ທຸຣຽນ", "raw_tags": ["ล้าสมัย"], "roman": "ทุรย̂น"}, + ], + ) From ef7147c2724ca18a1644f1f20cfa18358ced4817 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Wed, 15 Jan 2025 17:44:08 +0800 Subject: [PATCH 2/3] [th] extract notes section and add more section titles --- src/wiktextract/extractor/th/models.py | 5 +++++ src/wiktextract/extractor/th/page.py | 12 ++++++---- src/wiktextract/extractor/th/pos.py | 16 ++++++++++++++ .../extractor/th/section_titles.py | 22 +++++++++++++++++++ 4 files changed, 51 insertions(+), 4 deletions(-) diff --git a/src/wiktextract/extractor/th/models.py b/src/wiktextract/extractor/th/models.py index d42347df..82f5b626 100644 --- a/src/wiktextract/extractor/th/models.py +++ b/src/wiktextract/extractor/th/models.py @@ -96,3 +96,8 @@ class WordEntry(ThaiBaseModel): derived: list[Linkage] = [] related: list[Linkage] = [] descendants: list[Descendant] = [] + anagrams: list[Linkage] = [] + notes: list[str] = [] + hyponyms: list[Linkage] = [] + hypernyms: list[Linkage] = [] + idioms: list[Linkage] = [] diff --git a/src/wiktextract/extractor/th/page.py b/src/wiktextract/extractor/th/page.py index 03bf20d2..418dbaf2 100644 --- a/src/wiktextract/extractor/th/page.py +++ b/src/wiktextract/extractor/th/page.py @@ -11,7 +11,7 @@ from .etymology import extract_etymology_section from .linkage import extract_linkage_section from .models import Sense, WordEntry -from .pos import extract_pos_section +from .pos import extract_note_section, extract_pos_section from .section_titles import LINKAGE_SECTIONS, POS_DATA from .translation import extract_translation_section @@ -29,7 +29,7 @@ def parse_section( extract_pos_section(wxr, page_data, base_data, level_node, title_text) elif title_text == "รากศัพท์": extract_etymology_section(wxr, base_data, level_node) - elif title_text == "คำแปลภาษาอื่น": + elif title_text in ["คำแปลภาษาอื่น", "คำแปล"]: extract_translation_section( wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node ) @@ -44,13 +44,17 @@ def parse_section( extract_descendant_section( wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node ) - elif title_text == "การออกเสียง": + elif title_text.startswith(("การออกเสียง", "การอ่านออกเสียง")): pass # sounds elif title_text == "รูปแบบอื่น": extract_alt_form_section( wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node ) - elif title_text not in ["ดูเพิ่ม"]: + elif title_text == "การใช้": + extract_note_section( + wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node + ) + elif title_text not in ["ดูเพิ่ม", "อ้างอิง", "อ่านเพิ่ม", "อ่านเพิ่มเติม"]: wxr.wtp.debug(f"Unknown title: {title_text}") for next_level in level_node.find_child(LEVEL_KIND_FLAGS): diff --git a/src/wiktextract/extractor/th/pos.py b/src/wiktextract/extractor/th/pos.py index d5d272e5..fab32407 100644 --- a/src/wiktextract/extractor/th/pos.py +++ b/src/wiktextract/extractor/th/pos.py @@ -151,3 +151,19 @@ def extract_th_verb_adj_template( ) clean_node(wxr, word_entry, expanded_node) + + +def extract_note_section( + wxr: WiktextractContext, + word_entry: WordEntry, + level_node: LevelNode, +) -> None: + for list_node in level_node.find_child(NodeKind.LIST): + for list_item in list_node.find_child(NodeKind.LIST_ITEM): + note_str = clean_node( + wxr, + word_entry, + list(list_item.invert_find_child(NodeKind.LIST)), + ) + if note_str != "": + word_entry.notes.append(note_str) diff --git a/src/wiktextract/extractor/th/section_titles.py b/src/wiktextract/extractor/th/section_titles.py index ff14f847..5bd7a6f5 100644 --- a/src/wiktextract/extractor/th/section_titles.py +++ b/src/wiktextract/extractor/th/section_titles.py @@ -12,6 +12,20 @@ "คำอุทาน": {"pos": "intj"}, "วลี": {"pos": "phrase"}, "เลข": {"pos": "num", "tags": ["number"]}, + "อักษรจีน": {"pos": "character"}, + "การถอดเป็นอักษรโรมัน": {"pos": "romanization"}, + "สัญลักษณ์": {"pos": "symbol"}, + "คำวิเศษณ์": {"pos": "adj"}, + "ตัวอักษร": {"pos": "character"}, + "รูปผันคำกำกับนาม": {"pos": "noun", "tags": ["form-of"]}, + "รูปผันคำสรรพนาม": {"pos": "pron", "tags": ["form-of"]}, + "คำกำกับนาม": {"pos": "noun"}, + "ตัวเลข": {"pos": "num", "tags": ["number"]}, + "พาร์ทิซิเพิล": {"pos": "verb", "tags": ["participle"]}, + "พยางค์": {"pos": "syllable"}, + "คันจิ": {"pos": "character", "tags": ["kanji"]}, + "คำอาการนาม": {"pos": "adj_noun"}, + "อุปสรรค": {"pos": "prefix", "tags": ["morpheme"]}, } @@ -20,4 +34,12 @@ "คำพ้องความ": "synonyms", "ลูกคำ": "derived", "คำเกี่ยวข้อง": "related", + "คำที่เกี่ยวข้อง": "related", + "คำที่รับมา": "derived", + "คำตรงกันข้าม": "antonyms", + "คำสลับอักษร": "anagrams", + "การสลับอักษร": "anagrams", + "คำลูกกลุ่ม": "hyponyms", + "คำจ่ากลุ่ม": "hypernyms", + "สำนวน": "idioms", } From 4fd882ad95a3a6f1a2c5b06c8b2a43fa94b2536b Mon Sep 17 00:00:00 2001 From: xxyzz Date: Wed, 15 Jan 2025 18:17:51 +0800 Subject: [PATCH 3/3] [it,pt,th] delete temp db file in tests --- tests/test_it_etymology.py | 3 +++ tests/test_it_example.py | 3 +++ tests/test_it_forms.py | 3 +++ tests/test_it_gloss.py | 3 +++ tests/test_it_linkage.py | 3 +++ tests/test_it_sound.py | 3 +++ tests/test_it_translation.py | 3 +++ tests/test_pt_etymology.py | 3 +++ tests/test_pt_example.py | 3 +++ tests/test_pt_form.py | 3 +++ tests/test_pt_gloss.py | 3 +++ tests/test_pt_head_line.py | 3 +++ tests/test_pt_linkage.py | 3 +++ tests/test_pt_sound.py | 3 +++ tests/test_pt_translation.py | 3 +++ tests/test_th_desc.py | 3 +++ tests/test_th_example.py | 3 +++ tests/test_th_gloss.py | 3 +++ tests/test_th_linkage.py | 3 +++ tests/test_th_translation.py | 3 +++ 20 files changed, 60 insertions(+) diff --git a/tests/test_it_etymology.py b/tests/test_it_etymology.py index 110bc18a..1b9c8101 100644 --- a/tests/test_it_etymology.py +++ b/tests/test_it_etymology.py @@ -18,6 +18,9 @@ def setUp(self) -> None: ), ) + def tearDown(self): + self.wxr.wtp.close_db_conn() + def test_quote_template(self): self.wxr.wtp.add_page("Template:-it-", 10, "Italiano") data = parse_page( diff --git a/tests/test_it_example.py b/tests/test_it_example.py index e8079358..d367f3fe 100644 --- a/tests/test_it_example.py +++ b/tests/test_it_example.py @@ -18,6 +18,9 @@ def setUp(self) -> None: ), ) + def tearDown(self): + self.wxr.wtp.close_db_conn() + def test_list_example(self): self.wxr.wtp.add_page("Template:-br-", 10, "Bretone") data = parse_page( diff --git a/tests/test_it_forms.py b/tests/test_it_forms.py index 5ad323f0..4cd9dba9 100644 --- a/tests/test_it_forms.py +++ b/tests/test_it_forms.py @@ -18,6 +18,9 @@ def setUp(self) -> None: ), ) + def tearDown(self): + self.wxr.wtp.close_db_conn() + def test_tabs_template(self): self.wxr.wtp.add_page("Template:-it-", 10, "Italiano") data = parse_page( diff --git a/tests/test_it_gloss.py b/tests/test_it_gloss.py index b5d2a259..83f19e60 100644 --- a/tests/test_it_gloss.py +++ b/tests/test_it_gloss.py @@ -18,6 +18,9 @@ def setUp(self) -> None: ), ) + def tearDown(self): + self.wxr.wtp.close_db_conn() + def test_gloss_list(self): self.wxr.wtp.add_page("Template:-it-", 10, "Italiano") self.wxr.wtp.add_page( diff --git a/tests/test_it_linkage.py b/tests/test_it_linkage.py index f1cf5c23..d0ad5d53 100644 --- a/tests/test_it_linkage.py +++ b/tests/test_it_linkage.py @@ -18,6 +18,9 @@ def setUp(self) -> None: ), ) + def tearDown(self): + self.wxr.wtp.close_db_conn() + def test_synonyms(self): self.wxr.wtp.add_page("Template:-it-", 10, "Italiano") self.wxr.wtp.add_page( diff --git a/tests/test_it_sound.py b/tests/test_it_sound.py index 30ba4a95..d67736b5 100644 --- a/tests/test_it_sound.py +++ b/tests/test_it_sound.py @@ -18,6 +18,9 @@ def setUp(self) -> None: ), ) + def tearDown(self): + self.wxr.wtp.close_db_conn() + def test_hyphenation_single_list(self): self.wxr.wtp.add_page("Template:-it-", 10, "Italiano") data = parse_page( diff --git a/tests/test_it_translation.py b/tests/test_it_translation.py index 1ab8dfb7..60c39faa 100644 --- a/tests/test_it_translation.py +++ b/tests/test_it_translation.py @@ -18,6 +18,9 @@ def setUp(self) -> None: ), ) + def tearDown(self): + self.wxr.wtp.close_db_conn() + def test_common_lists(self): self.wxr.wtp.add_page("Template:-it-", 10, "Italiano") self.wxr.wtp.add_page("Template:ar", 10, "arabo") diff --git a/tests/test_pt_etymology.py b/tests/test_pt_etymology.py index de08b0f2..5045f16a 100644 --- a/tests/test_pt_etymology.py +++ b/tests/test_pt_etymology.py @@ -23,6 +23,9 @@ def setUp(self) -> None: conf, ) + def tearDown(self): + self.wxr.wtp.close_db_conn() + def test_list(self): self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português") self.wxr.wtp.add_page( diff --git a/tests/test_pt_example.py b/tests/test_pt_example.py index e106ce58..3cfe8150 100644 --- a/tests/test_pt_example.py +++ b/tests/test_pt_example.py @@ -23,6 +23,9 @@ def setUp(self) -> None: conf, ) + def tearDown(self): + self.wxr.wtp.close_db_conn() + def test_tradex_template(self): self.wxr.wtp.add_page("Predefinição:-ryu-", 10, "Okinawano") self.wxr.wtp.add_page("Predefinição:Substantivo", 10, "Substantivo") diff --git a/tests/test_pt_form.py b/tests/test_pt_form.py index 2b3fce00..3b7af273 100644 --- a/tests/test_pt_form.py +++ b/tests/test_pt_form.py @@ -23,6 +23,9 @@ def setUp(self) -> None: conf, ) + def tearDown(self): + self.wxr.wtp.close_db_conn() + def test_flex_pt_subst_completa(self): self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português") self.wxr.wtp.add_page( diff --git a/tests/test_pt_gloss.py b/tests/test_pt_gloss.py index 7458db65..a454a33f 100644 --- a/tests/test_pt_gloss.py +++ b/tests/test_pt_gloss.py @@ -23,6 +23,9 @@ def setUp(self) -> None: conf, ) + def tearDown(self): + self.wxr.wtp.close_db_conn() + def test_escopo(self): self.wxr.wtp.add_page( "Predefinição:-pt-", diff --git a/tests/test_pt_head_line.py b/tests/test_pt_head_line.py index 6e88274b..bdffc532 100644 --- a/tests/test_pt_head_line.py +++ b/tests/test_pt_head_line.py @@ -23,6 +23,9 @@ def setUp(self) -> None: conf, ) + def tearDown(self): + self.wxr.wtp.close_db_conn() + def test_gramática_template(self): self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português") self.wxr.wtp.add_page("Predefinição:g", 10, "''masculino''") diff --git a/tests/test_pt_linkage.py b/tests/test_pt_linkage.py index f260b4cd..09775fe7 100644 --- a/tests/test_pt_linkage.py +++ b/tests/test_pt_linkage.py @@ -23,6 +23,9 @@ def setUp(self) -> None: conf, ) + def tearDown(self): + self.wxr.wtp.close_db_conn() + def test_expression(self): self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português") self.wxr.wtp.add_page("Predefinição:g", 10, "''masculino''") diff --git a/tests/test_pt_sound.py b/tests/test_pt_sound.py index 2078c8df..2cc887cd 100644 --- a/tests/test_pt_sound.py +++ b/tests/test_pt_sound.py @@ -23,6 +23,9 @@ def setUp(self) -> None: conf, ) + def tearDown(self): + self.wxr.wtp.close_db_conn() + def test_subsection(self): self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português") self.wxr.wtp.add_page( diff --git a/tests/test_pt_translation.py b/tests/test_pt_translation.py index dff4a040..18ddc2c1 100644 --- a/tests/test_pt_translation.py +++ b/tests/test_pt_translation.py @@ -23,6 +23,9 @@ def setUp(self) -> None: conf, ) + def tearDown(self): + self.wxr.wtp.close_db_conn() + def test_subpage(self): self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português") self.wxr.wtp.add_page( diff --git a/tests/test_th_desc.py b/tests/test_th_desc.py index f9c47d58..584faa6f 100644 --- a/tests/test_th_desc.py +++ b/tests/test_th_desc.py @@ -18,6 +18,9 @@ def setUp(self) -> None: ), ) + def tearDown(self): + self.wxr.wtp.close_db_conn() + def test_desc_template(self): self.wxr.wtp.add_page( "แม่แบบ:desc", diff --git a/tests/test_th_example.py b/tests/test_th_example.py index 97f40da3..d0a042c7 100644 --- a/tests/test_th_example.py +++ b/tests/test_th_example.py @@ -18,6 +18,9 @@ def setUp(self) -> None: ), ) + def tearDown(self): + self.wxr.wtp.close_db_conn() + def test_ux(self): self.wxr.wtp.add_page( "แม่แบบ:ko-usex", diff --git a/tests/test_th_gloss.py b/tests/test_th_gloss.py index 0bb2c6ae..bc10beec 100644 --- a/tests/test_th_gloss.py +++ b/tests/test_th_gloss.py @@ -18,6 +18,9 @@ def setUp(self) -> None: ), ) + def tearDown(self): + self.wxr.wtp.close_db_conn() + def test_do_not_share_etymology_data(self): self.wxr.wtp.add_page( "แม่แบบ:inh+", diff --git a/tests/test_th_linkage.py b/tests/test_th_linkage.py index 5a3310b5..a22f03d9 100644 --- a/tests/test_th_linkage.py +++ b/tests/test_th_linkage.py @@ -18,6 +18,9 @@ def setUp(self) -> None: ), ) + def tearDown(self): + self.wxr.wtp.close_db_conn() + def test_col(self): self.wxr.wtp.add_page( "แม่แบบ:col2", diff --git a/tests/test_th_translation.py b/tests/test_th_translation.py index 2a43dd69..644ab46c 100644 --- a/tests/test_th_translation.py +++ b/tests/test_th_translation.py @@ -18,6 +18,9 @@ def setUp(self) -> None: ), ) + def tearDown(self): + self.wxr.wtp.close_db_conn() + def test_nested_list(self): self.wxr.wtp.add_page( "แม่แบบ:trans-top",