[th] extract gloss lists
xxyzz committed Jan 7, 2025
1 parent 9a96ef4 commit af15047
Showing 6 changed files with 239 additions and 0 deletions.
21 changes: 21 additions & 0 deletions src/wiktextract/extractor/th/etymology.py
@@ -0,0 +1,21 @@
from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import WordEntry


def extract_etymology_section(
    wxr: WiktextractContext,
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    base_data.etymology_text = ""
    base_data.categories.clear()
    # only the text before the first child subsection belongs to the etymology
    index = len(level_node.children)
    for node_index, _ in level_node.find_child(LEVEL_KIND_FLAGS, True):
        index = node_index
        break
    e_str = clean_node(wxr, base_data, level_node.children[:index])
    if e_str != "":
        base_data.etymology_text = e_str
31 changes: 31 additions & 0 deletions src/wiktextract/extractor/th/models.py
@@ -0,0 +1,31 @@
from pydantic import BaseModel, ConfigDict, Field


class ThaiBaseModel(BaseModel):
    model_config = ConfigDict(
        extra="forbid",
        strict=True,
        validate_assignment=True,
        validate_default=True,
    )


class Sense(ThaiBaseModel):
    glosses: list[str] = []
    tags: list[str] = []
    raw_tags: list[str] = []
    categories: list[str] = []


class WordEntry(ThaiBaseModel):
    model_config = ConfigDict(title="Thai Wiktionary")
    word: str = Field(description="Word string", min_length=1)
    lang_code: str = Field(description="Wiktionary language code", min_length=1)
    lang: str = Field(description="Localized language name", min_length=1)
    pos: str = Field(description="Part of speech type", min_length=1)
    pos_title: str = ""
    senses: list[Sense] = []
    categories: list[str] = []
    tags: list[str] = []
    raw_tags: list[str] = []
    etymology_text: str = ""
62 changes: 62 additions & 0 deletions src/wiktextract/extractor/th/page.py
@@ -0,0 +1,62 @@
import string
from typing import Any

from mediawiki_langcodes import name_to_code
from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .etymology import extract_etymology_section
from .models import Sense, WordEntry
from .pos import extract_pos_section
from .section_titles import POS_DATA


def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    title_text = clean_node(wxr, None, level_node.largs)
    # drop trailing digits from titles such as "รากศัพท์ 2"
    title_text = title_text.rstrip(string.digits + string.whitespace)
    wxr.wtp.start_subsection(title_text)
    if title_text in POS_DATA:
        extract_pos_section(wxr, page_data, base_data, level_node, title_text)
    elif title_text == "รากศัพท์":
        extract_etymology_section(wxr, base_data, level_node)

    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)


def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    # page layout
    # https://th.wiktionary.org/wiki/วิธีใช้:คู่มือในการเขียน
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text, pre_expand=True)
    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        # level-2 headings are language sections titled "ภาษา<language name>"
        lang_name = clean_node(wxr, None, level2_node.largs)
        lang_name = lang_name.removeprefix("ภาษา")
        lang_code = name_to_code(lang_name, "th")
        if lang_code == "":
            lang_code = "unknown"
        if lang_name == "":
            lang_name = "unknown"
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        for next_level_node in level2_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, page_data, base_data, next_level_node)

    for data in page_data:
        if len(data.senses) == 0:
            data.senses.append(Sense(tags=["no-gloss"]))
    return [m.model_dump(exclude_defaults=True) for m in page_data]
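
A minimal standalone sketch of driving this entry point, for orientation only: the context setup mirrors the test added in this commit, while the page text is a made-up example (no templates, so no add_page calls are needed).

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.th.page import parse_page
from wiktextract.wxr_context import WiktextractContext

wxr = WiktextractContext(
    Wtp(lang_code="th"),
    WiktionaryConfig(dump_file_lang_code="th", capture_language_codes=None),
)
# hypothetical page text; real input comes from the th.wiktionary dump
data = parse_page(wxr, "กบ", "== ภาษาไทย ==\n==== คำนาม ====\n# [[ชื่อ]]")
print(data)  # one dict per part-of-speech section, default fields excluded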
39 changes: 39 additions & 0 deletions src/wiktextract/extractor/th/pos.py
@@ -0,0 +1,39 @@
from wikitextprocessor import LevelNode, NodeKind, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Sense, WordEntry
from .section_titles import POS_DATA


def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    page_data.append(base_data.model_copy(deep=True))
    page_data[-1].pos_title = pos_title
    pos_data = POS_DATA[pos_title]
    page_data[-1].pos = pos_data["pos"]
    page_data[-1].tags.extend(pos_data.get("tags", []))

    # "#" lists under a POS section are gloss lists
    for list_node in level_node.find_child(NodeKind.LIST):
        if list_node.sarg.startswith("#") and list_node.sarg.endswith("#"):
            for list_item in list_node.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, page_data[-1], list_item)


def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
) -> None:
    sense = Sense()
    gloss_str = clean_node(
        wxr, sense, list(list_item.invert_find_child(NodeKind.LIST))
    )
    if gloss_str != "":
        sense.glosses.append(gloss_str)
        word_entry.senses.append(sense)
15 changes: 15 additions & 0 deletions src/wiktextract/extractor/th/section_titles.py
@@ -0,0 +1,15 @@
POS_DATA = {
"คำกริยา": {"pos": "verb"},
"คำกริยาวิเศษณ์": {"pos": "adv"},
"คำคุณศัพท์": {"pos": "adj"},
"คำนาม": {"pos": "noun"},
"คำบุพบท": {"pos": "prep"},
"คำลักษณนาม": {"pos": "classifier"},
"คำวิสามานยนาม": {"pos": "name"},
"คำสรรพนาม": {"pos": "pron"},
"คำสันธาน": {"pos": "conj"},
"คำอนุภาค": {"pos": "particle"},
"คำอุทาน": {"pos": "intj"},
"วลี": {"pos": "phrase"},
"เลข": {"pos": "num", "tags": ["number"]},
}
71 changes: 71 additions & 0 deletions tests/test_th_gloss.py
@@ -0,0 +1,71 @@
from unittest import TestCase

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.th.page import parse_page
from wiktextract.wxr_context import WiktextractContext


class TestThGloss(TestCase):
    maxDiff = None

    def setUp(self) -> None:
        self.wxr = WiktextractContext(
            Wtp(lang_code="th"),
            WiktionaryConfig(
                dump_file_lang_code="th", capture_language_codes=None
            ),
        )

    def test_do_not_share_etymology_data(self):
        self.wxr.wtp.add_page(
            "แม่แบบ:inh+",
            10,
            """สืบทอดจาก<span class="etyl">[[w:ภาษาไทดั้งเดิม|ไทดั้งเดิม]][[Category:ศัพท์ภาษาไทยที่สืบทอดจากภาษาไทดั้งเดิม|กบ]][[Category:ศัพท์ภาษาไทยที่รับมาจากภาษาไทดั้งเดิม|กบ]]</span> <i class="Latn mention" lang="tai-pro">[[การสืบสร้าง&#x3A;ไทดั้งเดิม&#x2F;kɤpᴰ|&#x2A;kɤpᴰ]]</i>""",
        )
        page_data = parse_page(
            self.wxr,
            "กบ",
            """== ภาษาไทย ==
=== รากศัพท์ 2 ===
{{inh+|th|tai-pro|*kɤpᴰ}}
==== คำนาม ====
{{th-noun|ตัว}}
# [[ชื่อ]]
=== รากศัพท์ 3 ===
==== คำนาม ====
{{th-noun|ตัว}}
# [[ปลา]]""",
        )
        self.assertEqual(
            page_data,
            [
                {
                    "categories": [
                        "ศัพท์ภาษาไทยที่สืบทอดจากภาษาไทดั้งเดิม",
                        "ศัพท์ภาษาไทยที่รับมาจากภาษาไทดั้งเดิม",
                    ],
                    "etymology_text": "สืบทอดจากไทดั้งเดิม *kɤpᴰ",
                    "senses": [{"glosses": ["ชื่อ"]}],
                    "pos": "noun",
                    "pos_title": "คำนาม",
                    "word": "กบ",
                    "lang": "ไทย",
                    "lang_code": "th",
                },
                {
                    "senses": [{"glosses": ["ปลา"]}],
                    "pos": "noun",
                    "pos_title": "คำนาม",
                    "word": "กบ",
                    "lang": "ไทย",
                    "lang_code": "th",
                },
            ],
        )
