-
Notifications
You must be signed in to change notification settings - Fork 92
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Showing
6 changed files
with
239 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode | ||
|
||
from ...page import clean_node | ||
from ...wxr_context import WiktextractContext | ||
from .models import WordEntry | ||
|
||
|
||
def extract_etymology_section(
    wxr: WiktextractContext,
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Store the etymology text of ``level_node`` on ``base_data``.

    Any etymology text and categories already held by ``base_data`` are
    discarded first, so data from a previous etymology section is not
    carried over into this one.

    Args:
        wxr: Extraction context, forwarded to ``clean_node``.
        base_data: Entry template that is updated in place.
        level_node: The etymology section node.
    """
    # Reset whatever an earlier etymology section left behind.
    base_data.etymology_text = ""
    base_data.categories.clear()

    # Only the nodes in front of the first nested section heading belong
    # to the etymology; with no nested heading, every child is used.
    first_subsection = next(
        iter(level_node.find_child(LEVEL_KIND_FLAGS, True)), None
    )
    if first_subsection is None:
        end = len(level_node.children)
    else:
        end = first_subsection[0]

    # ``base_data`` is handed to ``clean_node`` as well -- presumably so
    # category links found in the text are collected on it (TODO confirm).
    etymology = clean_node(wxr, base_data, level_node.children[:end])
    if etymology != "":
        base_data.etymology_text = etymology
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
from pydantic import BaseModel, ConfigDict, Field | ||
|
||
|
||
class ThaiBaseModel(BaseModel):
    # Common pydantic settings for the models of this extractor:
    # strict type checking, unknown fields rejected, and validation run
    # on default values and on attribute assignment.
    model_config = ConfigDict(
        strict=True,
        extra="forbid",
        validate_default=True,
        validate_assignment=True,
    )
|
||
|
||
class Sense(ThaiBaseModel):
    # A single sense of a word entry; every list starts out empty.

    # Gloss strings extracted from the definition list items.
    glosses: list[str] = []
    # Tags such as "no-gloss", which marks an entry without any gloss.
    tags: list[str] = []
    # NOTE(review): presumably tags that are not normalized yet -- confirm.
    raw_tags: list[str] = []
    # NOTE(review): presumably category names collected while cleaning
    # the gloss nodes -- confirm against ``clean_node``.
    categories: list[str] = []
|
||
|
||
class WordEntry(ThaiBaseModel):
    # One extracted word entry (a word in one language with one part of
    # speech).
    # NOTE(review): pydantic is expected to merge this config with the
    # one inherited from ThaiBaseModel -- confirm.
    model_config = ConfigDict(title="Thai Wiktionary")

    word: str = Field(
        description="Word string",
        min_length=1,
    )
    lang_code: str = Field(
        description="Wiktionary language code",
        min_length=1,
    )
    lang: str = Field(
        description="Localized language name",
        min_length=1,
    )
    pos: str = Field(
        description="Part of speech type",
        min_length=1,
    )
    # Section heading text the part of speech was recognized from.
    pos_title: str = ""
    senses: list[Sense] = []
    # Cleared and refilled whenever an etymology section is extracted.
    categories: list[str] = []
    # Extended with the optional "tags" of the matching POS_DATA item.
    tags: list[str] = []
    raw_tags: list[str] = []
    # Text of the etymology section this entry belongs to, if any.
    etymology_text: str = ""
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
import string | ||
from typing import Any | ||
|
||
from mediawiki_langcodes import name_to_code | ||
from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind | ||
|
||
from ...page import clean_node | ||
from ...wxr_context import WiktextractContext | ||
from .etymology import extract_etymology_section | ||
from .models import Sense, WordEntry | ||
from .pos import extract_pos_section | ||
from .section_titles import POS_DATA | ||
|
||
|
||
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch one section by its heading, then recurse into subsections.

    Args:
        wxr: Extraction context.
        page_data: Entries collected so far; part-of-speech sections
            append to it.
        base_data: Entry template shared by the sections of one language.
        level_node: The section node to process.
    """
    # Headings may carry a trailing number (e.g. "รากศัพท์ 2"); drop
    # trailing ASCII digits and whitespace before matching the title.
    heading = clean_node(wxr, None, level_node.largs).rstrip(
        string.digits + string.whitespace
    )
    wxr.wtp.start_subsection(heading)

    if heading in POS_DATA:
        extract_pos_section(wxr, page_data, base_data, level_node, heading)
    elif heading == "รากศัพท์":
        extract_etymology_section(wxr, base_data, level_node)

    # Nested sections are visited regardless of how this one was handled.
    for child_section in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_section)
|
||
|
||
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one Thai Wiktionary page into a list of word entry dicts.

    Args:
        wxr: Extraction context.
        page_title: Title of the page.
        page_text: Wikitext of the page.

    Returns:
        One dict per extracted entry, dumped with default values left out.
    """
    # page layout
    # https://th.wiktionary.org/wiki/วิธีใช้:คู่มือในการเขียน
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text, pre_expand=True)
    entries: list[WordEntry] = []

    # Every level-2 heading starts a language section.
    for lang_node in root.find_child(NodeKind.LEVEL2):
        lang_name = clean_node(wxr, None, lang_node.largs)
        lang_name = lang_name.removeprefix("ภาษา")
        # The code is looked up before the name falls back to "unknown".
        lang_code = name_to_code(lang_name, "th")
        lang_code = "unknown" if lang_code == "" else lang_code
        lang_name = "unknown" if lang_name == "" else lang_name
        wxr.wtp.start_section(lang_name)

        # Template that is copied for each part-of-speech section.
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        for section_node in lang_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, entries, base_data, section_node)

    # Entries that ended up without any sense get a placeholder sense.
    for entry in entries:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in entries]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
from wikitextprocessor import LevelNode, NodeKind, WikiNode | ||
|
||
from ...page import clean_node | ||
from ...wxr_context import WiktextractContext | ||
from .models import Sense, WordEntry | ||
from .section_titles import POS_DATA | ||
|
||
|
||
def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Append a new entry for a part-of-speech section and fill its senses.

    Args:
        wxr: Extraction context.
        page_data: Entry list; the new entry is appended to it.
        base_data: Template that is deep-copied for the new entry.
        level_node: The part-of-speech section node.
        pos_title: Section heading; must be a key of ``POS_DATA``.
    """
    # Deep copy so the new entry does not share lists with the template.
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title]
    entry.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))

    for list_node in level_node.find_child(NodeKind.LIST):
        marker = list_node.sarg
        # Glosses come from lists whose marker begins and ends with "#".
        if not (marker.startswith("#") and marker.endswith("#")):
            continue
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            extract_gloss_list_item(wxr, entry, list_item)
|
||
|
||
def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
) -> None:
    """Turn one definition list item into a sense of ``word_entry``.

    Nothing is added when the cleaned gloss text is empty.

    Args:
        wxr: Extraction context.
        word_entry: Entry that receives the new sense.
        list_item: The list item node holding the gloss.
    """
    sense = Sense()
    # Nested lists are left out of the gloss text.
    gloss_nodes = list(list_item.invert_find_child(NodeKind.LIST))
    gloss_text = clean_node(wxr, sense, gloss_nodes)
    if gloss_text == "":
        return
    sense.glosses.append(gloss_text)
    word_entry.senses.append(sense)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# Part-of-speech section titles of the Thai Wiktionary mapped to the data
# stored on an entry: a required "pos" value and an optional "tags" list.
POS_DATA = {
    "คำกริยา": {"pos": "verb"},
    "คำกริยาวิเศษณ์": {"pos": "adv"},
    "คำคุณศัพท์": {"pos": "adj"},
    "คำนาม": {"pos": "noun"},
    "คำบุพบท": {"pos": "prep"},
    "คำลักษณนาม": {"pos": "classifier"},
    "คำวิสามานยนาม": {"pos": "name"},
    "คำสรรพนาม": {"pos": "pron"},
    "คำสันธาน": {"pos": "conj"},
    "คำอนุภาค": {"pos": "particle"},
    "คำอุทาน": {"pos": "intj"},
    "วลี": {"pos": "phrase"},
    "เลข": {
        "pos": "num",
        "tags": ["number"],
    },
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
from unittest import TestCase | ||
|
||
from wikitextprocessor import Wtp | ||
|
||
from wiktextract.config import WiktionaryConfig | ||
from wiktextract.extractor.th.page import parse_page | ||
from wiktextract.wxr_context import WiktextractContext | ||
|
||
|
||
class TestThGloss(TestCase):
    # Gloss and etymology extraction tests for the Thai extractor.

    maxDiff = None

    def setUp(self) -> None:
        # Fresh context for each test, configured for the Thai edition.
        wtp = Wtp(lang_code="th")
        config = WiktionaryConfig(
            dump_file_lang_code="th", capture_language_codes=None
        )
        self.wxr = WiktextractContext(wtp, config)

    def test_do_not_share_etymology_data(self):
        # Two numbered etymology sections: only the first has etymology
        # text and categories, and they must not appear on the entry that
        # belongs to the second one.
        template_body = """สืบทอดจาก<span class="etyl">[[w:ภาษาไทดั้งเดิม|ไทดั้งเดิม]][[Category:ศัพท์ภาษาไทยที่สืบทอดจากภาษาไทดั้งเดิม|กบ]][[Category:ศัพท์ภาษาไทยที่รับมาจากภาษาไทดั้งเดิม|กบ]]</span> <i class="Latn mention" lang="tai-pro">[[การสืบสร้าง:ไทดั้งเดิม/kɤpᴰ|*kɤpᴰ]]</i>"""
        self.wxr.wtp.add_page("แม่แบบ:inh+", 10, template_body)

        wikitext = """== ภาษาไทย ==
=== รากศัพท์ 2 ===
{{inh+|th|tai-pro|*kɤpᴰ}}
==== คำนาม ====
{{th-noun|ตัว}}
# [[ชื่อ]]
=== รากศัพท์ 3 ===
==== คำนาม ====
{{th-noun|ตัว}}
# [[ปลา]]"""
        first_entry = {
            "categories": [
                "ศัพท์ภาษาไทยที่สืบทอดจากภาษาไทดั้งเดิม",
                "ศัพท์ภาษาไทยที่รับมาจากภาษาไทดั้งเดิม",
            ],
            "etymology_text": "สืบทอดจากไทดั้งเดิม *kɤpᴰ",
            "senses": [{"glosses": ["ชื่อ"]}],
            "pos": "noun",
            "pos_title": "คำนาม",
            "word": "กบ",
            "lang": "ไทย",
            "lang_code": "th",
        }
        second_entry = {
            "senses": [{"glosses": ["ปลา"]}],
            "pos": "noun",
            "pos_title": "คำนาม",
            "word": "กบ",
            "lang": "ไทย",
            "lang_code": "th",
        }

        page_data = parse_page(self.wxr, "กบ", wikitext)
        self.assertEqual(page_data, [first_entry, second_entry])