Skip to content

Commit

Permalink
[th] extract "syn", "ant" linkage templates
Browse files Browse the repository at this point in the history
  • Loading branch information
xxyzz committed Jan 17, 2025
1 parent 97d365e commit d4b786b
Show file tree
Hide file tree
Showing 5 changed files with 60 additions and 3 deletions.
13 changes: 11 additions & 2 deletions src/wiktextract/extractor/th/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,19 @@
from ...page import clean_node
from ...wxr_context import WiktextractContext
from ..ruby import extract_ruby
from .models import Example, Sense
from .models import Example, Sense, WordEntry
from .tags import translate_raw_tags


def extract_example_list_item(
wxr: WiktextractContext,
word_entry: WordEntry,
sense: Sense,
list_item: WikiNode,
ref: str = "",
) -> None:
from .linkage import LINKAGE_TEMPLATES, extract_syn_template

for node in list_item.children:
if isinstance(node, TemplateNode):
if node.template_name in ["ux", "usex", "ko-usex"]:
Expand All @@ -25,9 +28,15 @@ def extract_example_list_item(
extract_template_ja_usex(wxr, sense, node, ref)
elif node.template_name.startswith("quote-"):
ref = extract_quote_template(wxr, sense, node)
elif node.template_name in LINKAGE_TEMPLATES:
extract_syn_template(
wxr, word_entry, node, LINKAGE_TEMPLATES[node.template_name]
)
elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
for child_list_item in node.find_child(NodeKind.LIST_ITEM):
extract_example_list_item(wxr, sense, child_list_item, ref)
extract_example_list_item(
wxr, word_entry, sense, child_list_item, ref
)


def extract_ux_template(
Expand Down
33 changes: 33 additions & 0 deletions src/wiktextract/extractor/th/linkage.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from itertools import count

from wikitextprocessor.parser import (
LEVEL_KIND_FLAGS,
LevelNode,
Expand Down Expand Up @@ -129,3 +131,34 @@ def extract_ws_template(
if word != "":
l_data = Linkage(word=word, source=source)
getattr(word_entry, linkage_type).append(l_data)


# Maps a linkage template name (short alias or long form) to the name of the
# `WordEntry` list attribute that collects the extracted terms.
LINKAGE_TEMPLATES = {
    template_name: linkage_field
    for linkage_field, template_names in (
        ("synonyms", ("syn", "synonyms", "synsee")),
        ("antonyms", ("ant", "antonyms")),
        ("coordinate_terms", ("cot", "coordinate terms")),
        ("hypernyms", ("hyper", "hypernyms")),
        ("hyponyms", ("hypo", "hyponyms")),
    )
    for template_name in template_names
}


def extract_syn_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
) -> None:
    """Collect the terms listed in a linkage template into `word_entry`.

    Positional template parameters are read starting at index 2 and the
    scan stops at the first index that is absent from the template.
    Each value is cleaned with `clean_node`; a value beginning with
    "อรรถาภิธาน:" is handed to `extract_thesaurus_page`, any other
    non-empty value is appended as a `Linkage` to the `word_entry` list
    attribute named by `linkage_type`.

    NOTE(review): parameter 1 is skipped; presumably it is the language
    code (e.g. "th") - confirm against the template documentation.
    """
    target_list_name = linkage_type
    for position in count(2):
        if position not in t_node.template_parameters:
            # Positional arguments are contiguous; the first gap ends the
            # term list.
            break
        term = clean_node(wxr, None, t_node.template_parameters[position])
        if term.startswith("อรรถาภิธาน:"):
            extract_thesaurus_page(wxr, word_entry, target_list_name, term)
            continue
        if term != "":
            linkage_list = getattr(word_entry, target_list_name)
            linkage_list.append(Linkage(word=term))
1 change: 1 addition & 0 deletions src/wiktextract/extractor/th/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,4 @@ class WordEntry(ThaiBaseModel):
hyponyms: list[Linkage] = []
hypernyms: list[Linkage] = []
idioms: list[Linkage] = []
coordinate_terms: list[Linkage] = []
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/th/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def extract_gloss_list_item(
(":", "*")
):
for e_list_item in child_list.find_child(NodeKind.LIST_ITEM):
extract_example_list_item(wxr, sense, e_list_item)
extract_example_list_item(wxr, word_entry, sense, e_list_item)

if gloss_str != "":
sense.glosses.append(gloss_str)
Expand Down
14 changes: 14 additions & 0 deletions tests/test_th_linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,3 +83,17 @@ def test_theasurus_page(self):
page_data[0]["synonyms"],
[{"word": "ต่อมโลหิต", "source": "อรรถาภิธาน:ระดู"}],
)

def test_syn_template(self):
    """A `syn` template in a `#:` item below a gloss yields synonyms."""
    wikitext = """== ภาษาไทย ==
=== คำนาม ===
# กระบวนการถ่ายทอด
#: {{syn|th|ทีวี|โทรภาพ}}"""
    entries = parse_page(self.wxr, "โทรทัศน์", wikitext)
    expected_synonyms = [{"word": "ทีวี"}, {"word": "โทรภาพ"}]
    self.assertEqual(entries[0]["synonyms"], expected_synonyms)

0 comments on commit d4b786b

Please sign in to comment.