Merge pull request #840 from xxyzz/ko
[ko] add Korean edition extractor
xxyzz authored Sep 30, 2024
2 parents 01dd959 + dc5378e commit 3fd8a50
Showing 8 changed files with 295 additions and 1 deletion.
6 changes: 6 additions & 0 deletions src/wiktextract/data/ko/config.json
@@ -0,0 +1,6 @@
{
  "analyze_templates": false,
  "extract_thesaurus_pages": false,
  "save_ns_names": ["Main", "Template", "Module"],
  "extract_ns_names": ["Main"]
}
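
These flags keep the Korean extractor lean: template analysis and thesaurus pages are skipped, only the Main, Template, and Module namespaces are saved from the dump, and only Main-namespace pages are extracted. A hypothetical loader sketch (the path and function name are illustrative; wiktextract's actual config plumbing may differ):

import json
from pathlib import Path


def load_edition_config(lang_code: str) -> dict:
    # Illustrative only: read the per-edition config shipped with wiktextract.
    path = Path("src/wiktextract/data") / lang_code / "config.json"
    with path.open(encoding="utf-8") as f:
        return json.load(f)


cfg = load_edition_config("ko")
assert cfg["extract_ns_names"] == ["Main"]  # only mainspace pages are parsed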
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/ja/section_titles.py
@@ -1,5 +1,5 @@
 # List of templates
-# https://ja.wiktionary.org/wiki/テンプレートの一覧#品詞表記
+# https://ja.wiktionary.org/wiki/Wiktionary:テンプレートの一覧#品詞表記
 POS_DATA = {
     "名詞": {"pos": "noun"},
     "数詞": {"pos": "num"},
31 changes: 31 additions & 0 deletions src/wiktextract/extractor/ko/models.py
@@ -0,0 +1,31 @@
from pydantic import BaseModel, ConfigDict, Field


class KoreanBaseModel(BaseModel):
    model_config = ConfigDict(
        extra="forbid",
        strict=True,
        validate_assignment=True,
        validate_default=True,
    )


class Sense(KoreanBaseModel):
    glosses: list[str] = []
    tags: list[str] = []
    raw_tags: list[str] = []
    topics: list[str] = []
    categories: list[str] = []


class WordEntry(KoreanBaseModel):
    model_config = ConfigDict(title="Korean Wiktionary")
    word: str = Field(description="Word string", min_length=1)
    lang_code: str = Field(
        description="Wiktionary language code", min_length=1
    )
    lang: str = Field(description="Localized language name", min_length=1)
    pos: str = Field(description="Part of speech type", min_length=1)
    pos_title: str = ""
    senses: list[Sense] = []
    categories: list[str] = []
    tags: list[str] = []
    raw_tags: list[str] = []
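
A minimal sketch of how these models behave in practice (the sample values are illustrative): strict mode plus extra="forbid" rejects unknown or mistyped fields, and exclude_defaults=True is what keeps empty lists and the empty pos_title out of the JSON that parse_page emits.

from wiktextract.extractor.ko.models import Sense, WordEntry

entry = WordEntry(word="개", lang_code="ko", lang="한국어", pos="noun")
entry.senses.append(Sense(glosses=["가축으로 많이 기르는 갯과 포유류 동물."]))
print(entry.model_dump(exclude_defaults=True))
# {'word': '개', 'lang_code': 'ko', 'lang': '한국어', 'pos': 'noun',
#  'senses': [{'glosses': ['가축으로 많이 기르는 갯과 포유류 동물.']}]}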
75 changes: 75 additions & 0 deletions src/wiktextract/extractor/ko/page.py
@@ -0,0 +1,75 @@
import re
from typing import Any

from mediawiki_langcodes import name_to_code
from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Sense, WordEntry
from .pos import extract_pos_section
from .section_titles import POS_DATA

PANEL_TEMPLATES = set()
PANEL_PREFIXES = set()
ADDITIONAL_EXPAND_TEMPLATES = set()


def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    title_text = clean_node(wxr, None, level_node.largs)
    title_text = re.sub(r"\s*\d+$", "", title_text)
    if title_text in POS_DATA:
        extract_pos_section(wxr, page_data, base_data, level_node, title_text)

    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)


def parse_language_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level2_node: LevelNode
) -> None:
    lang_name = clean_node(wxr, None, level2_node.largs)
    lang_code = name_to_code(lang_name, "ko")
    if lang_code == "":
        lang_code = "unknown"
    if (
        wxr.config.capture_language_codes is not None
        and lang_code not in wxr.config.capture_language_codes
    ):
        return
    wxr.wtp.start_section(lang_name)
    base_data = WordEntry(
        word=wxr.wtp.title,
        lang_code=lang_code,
        lang=lang_name,
        pos="unknown",
    )
    for level3_node in level2_node.find_child(NodeKind.LEVEL3):
        parse_section(wxr, page_data, base_data, level3_node)

    # no POS section
    if not level2_node.contain_node(NodeKind.LEVEL3):
        extract_pos_section(wxr, page_data, base_data, level2_node, "")


def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    # page layout
    # https://ko.wiktionary.org/wiki/위키낱말사전:문서_양식
    # https://ko.wiktionary.org/wiki/위키낱말사전:한국어_편집부
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        parse_language_section(wxr, page_data, level2_node)

    for data in page_data:
        if len(data.senses) == 0:
            data.senses.append(Sense(tags=["no-gloss"]))
    return [m.model_dump(exclude_defaults=True) for m in page_data]
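
The re.sub call in parse_section is what lets numbered POS headings such as "명사 1" and "명사 2" (exercised by test_level_4_pos below) map onto the same POS_DATA key; a quick demonstration:

import re

for title in ["명사", "명사 1", "명사 2"]:
    print(re.sub(r"\s*\d+$", "", title))
# 명사
# 명사
# 명사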
74 changes: 74 additions & 0 deletions src/wiktextract/extractor/ko/pos.py
@@ -0,0 +1,74 @@
import re

from wikitextprocessor import LevelNode, NodeKind, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Sense, WordEntry
from .section_titles import POS_DATA


def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    page_data.append(base_data.model_copy(deep=True))
    if pos_title in POS_DATA:
        page_data[-1].pos_title = pos_title
        pos_data = POS_DATA[pos_title]
        page_data[-1].pos = pos_data["pos"]
        page_data[-1].tags.extend(pos_data.get("tags", []))

    for list_node in level_node.find_child(NodeKind.LIST):
        if list_node.sarg.endswith("#"):
            for list_item in list_node.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, page_data[-1], list_item)
        elif list_node.sarg.endswith("*"):
            for list_item in list_node.find_child(NodeKind.LIST_ITEM):
                extract_unorderd_list_item(wxr, page_data[-1], list_item)

    if len(page_data[-1].senses) == 0:
        page_data.pop()


def extract_gloss_list_item(
    wxr: WiktextractContext, word_entry: WordEntry, list_item: WikiNode
) -> None:
    gloss_nodes = []
    for node in list_item.children:
        if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            if node.sarg.endswith("*"):
                pass  # example
            continue
        else:
            gloss_nodes.append(node)

    sense = Sense()
    gloss_text = clean_node(wxr, sense, gloss_nodes)
    if len(gloss_text) > 0:
        sense.glosses.append(gloss_text)
        word_entry.senses.append(sense)


def extract_unorderd_list_item(
    wxr: WiktextractContext, word_entry: WordEntry, list_item: WikiNode
) -> None:
    is_first_bold = True
    for index, node in enumerate(list_item.children):
        if (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.BOLD
            and is_first_bold
        ):
            is_first_bold = False
            bold_text = clean_node(wxr, None, node)
            if re.fullmatch(r"\d+\.", bold_text):
                new_list_item = WikiNode(NodeKind.LIST_ITEM, 0)
                new_list_item.children = list_item.children[index + 1 :]
                extract_gloss_list_item(wxr, word_entry, new_list_item)
                break
        elif isinstance(node, str) and node.startswith("어원:"):
            break  # etymology
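
In extract_unorderd_list_item, a leading bold node counts as a sense index only when it matches \d+\. exactly; the remainder of the item is then wrapped in a synthetic LIST_ITEM and re-parsed as a gloss, while items beginning with "어원:" (etymology) are skipped. A quick check of the regex gate:

import re

for bold_text in ["1.", "12.", "1", "우리"]:
    print(repr(bold_text), bool(re.fullmatch(r"\d+\.", bold_text)))
# '1.' True
# '12.' True
# '1' False
# '우리' False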
11 changes: 11 additions & 0 deletions src/wiktextract/extractor/ko/section_titles.py
@@ -0,0 +1,11 @@
POS_DATA = {
    "명사": {"pos": "noun"},
    "형용사": {"pos": "adj"},
    "대명사": {"pos": "pron"},
    "수사": {"pos": "num"},
    "동사": {"pos": "verb"},
    "관용구": {"pos": "phrase", "tags": ["idiomatic"]},
    "기호": {"pos": "symbol"},
    "접미사": {"pos": "suffix", "tags": ["morpheme"]},
    "접두사": {"pos": "prefix", "tags": ["morpheme"]},
}
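
extract_pos_section in pos.py above consumes this table by copying pos onto the entry and extending its tags, so a heading lookup works like this:

from wiktextract.extractor.ko.section_titles import POS_DATA

pos_data = POS_DATA["관용구"]
print(pos_data["pos"])           # phrase
print(pos_data.get("tags", []))  # ['idiomatic']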
64 changes: 64 additions & 0 deletions tests/test_ko_gloss.py
@@ -0,0 +1,64 @@
from unittest import TestCase

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.ko.page import parse_page
from wiktextract.wxr_context import WiktextractContext


class TestKoGloss(TestCase):
    maxDiff = None

    def setUp(self) -> None:
        self.wxr = WiktextractContext(
            Wtp(lang_code="ko"),
            WiktionaryConfig(
                dump_file_lang_code="ko",
                capture_language_codes=None,
            ),
        )

    def tearDown(self) -> None:
        self.wxr.wtp.close_db_conn()

    def test_ignore_gloss_index_bold_node(self):
        data = parse_page(
            self.wxr,
            "我們",
            """== 중국어 ==
=== 대명사 ===
* '''1.''' [[우리]].""",
        )
        self.assertEqual(data[0]["senses"], [{"glosses": ["우리."]}])

    def test_no_pos_section(self):
        data = parse_page(
            self.wxr,
            "大家",
            """== 한국어 ==
* '''1.''' 모든""",
        )
        self.assertEqual(data[0]["senses"], [{"glosses": ["모든"]}])

    def test_level_4_pos(self):
        data = parse_page(
            self.wxr,
            "개",
            """== 한국어 ==
=== 명사 ===
==== 명사 1 ====
# 가축으로 많이 기르는 갯과 포유류 동물.
==== 명사 2 ====
# 강이나 내에 바닷물이 드나드는 곳.""",
        )
        self.assertEqual(data[0]["pos"], "noun")
        self.assertEqual(
            data[0]["senses"],
            [{"glosses": ["가축으로 많이 기르는 갯과 포유류 동물."]}],
        )
        self.assertEqual(data[1]["pos"], "noun")
        self.assertEqual(
            data[1]["senses"],
            [{"glosses": ["강이나 내에 바닷물이 드나드는 곳."]}],
        )
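
The suite is plain unittest, so it should run with the standard runner from the repository root:

python -m unittest tests.test_ko_gloss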
33 changes: 33 additions & 0 deletions tests/test_ru_sound.py
@@ -0,0 +1,33 @@
from unittest import TestCase

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.ru.models import WordEntry
from wiktextract.extractor.ru.pronunciation import extract_homophone_section
from wiktextract.wxr_context import WiktextractContext


class TestRUSound(TestCase):
    maxDiff = None

    def setUp(self) -> None:
        self.wxr = WiktextractContext(
            Wtp(lang_code="ru"),
            WiktionaryConfig(
                dump_file_lang_code="ru", capture_language_codes=None
            ),
        )

    def tearDown(self) -> None:
        self.wxr.wtp.close_db_conn()

    def test_homophone_section_list(self):
        self.wxr.wtp.start_page("ไทย")
        root = self.wxr.wtp.parse("* [[ไท]], [[ไถ]]")
        data = WordEntry(lang="Тайский", lang_code="th", word="ไทย")
        extract_homophone_section(self.wxr, data, root)
        self.assertEqual(
            [s.model_dump(exclude_defaults=True) for s in data.sounds],
            [{"homophones": ["ไท", "ไถ"]}],
        )
