-
Notifications
You must be signed in to change notification settings - Fork 88
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #840 from xxyzz/ko
[ko] add Korean edition extractor
- Loading branch information
Showing
8 changed files
with
295 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
{
    "analyze_templates": false,
    "extract_thesaurus_pages": false,
    "save_ns_names": ["Main", "Template", "Module"],
    "extract_ns_names": ["Main"]
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
from pydantic import BaseModel, ConfigDict, Field | ||
|
||
|
||
class KoreanBaseModel(BaseModel):
    """Shared pydantic configuration for all Korean-extractor models.

    Strict validation makes extractor bugs surface early: unknown fields
    are rejected, values are not coerced, and both assignments and
    defaults are validated.
    """

    model_config = ConfigDict(
        extra="forbid",
        strict=True,
        validate_assignment=True,
        validate_default=True,
    )
|
||
|
||
class Sense(KoreanBaseModel):
    """One sense (meaning) of a word entry."""

    glosses: list[str] = []
    tags: list[str] = []
    # Tags not yet mapped to canonical tag strings.
    raw_tags: list[str] = []
    topics: list[str] = []
    categories: list[str] = []
|
||
|
||
class WordEntry(KoreanBaseModel):
    """A single word entry extracted from Korean Wiktionary."""

    model_config = ConfigDict(title="Korean Wiktionary")
    word: str = Field(description="Word string", min_length=1)
    lang_code: str = Field(description="Wiktionary language code", min_length=1)
    lang: str = Field(description="Localized language name", min_length=1)
    pos: str = Field(description="Part of speech type", min_length=1)
    # Original (untranslated) POS section heading, e.g. "명사".
    pos_title: str = ""
    senses: list[Sense] = []
    categories: list[str] = []
    tags: list[str] = []
    raw_tags: list[str] = []
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
import re | ||
from typing import Any | ||
|
||
from mediawiki_langcodes import name_to_code | ||
from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind | ||
|
||
from ...page import clean_node | ||
from ...wxr_context import WiktextractContext | ||
from .models import Sense, WordEntry | ||
from .pos import extract_pos_section | ||
from .section_titles import POS_DATA | ||
|
||
# Panel/navigation template names to ignore; none identified yet for the
# Korean edition.  NOTE(review): presumably kept empty for API parity with
# the other per-language extractors — confirm.
PANEL_TEMPLATES = set()
# Template-name prefixes that mark panel templates; none yet.
PANEL_PREFIXES = set()
# Templates that must be pre-expanded before parsing; none yet.
ADDITIONAL_EXPAND_TEMPLATES = set()
|
||
|
||
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch one section by its cleaned title, then recurse into children.

    A POS heading may carry a trailing index ("명사 1", "명사 2"); the
    index is stripped before the title is looked up in POS_DATA.
    """
    section_title = clean_node(wxr, None, level_node.largs)
    section_title = re.sub(r"\s*\d+$", "", section_title)
    if section_title in POS_DATA:
        extract_pos_section(
            wxr, page_data, base_data, level_node, section_title
        )

    for child_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_level)
|
||
|
||
def parse_language_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level2_node: LevelNode
) -> None:
    """Process one language (level-2) section of a page."""
    lang_name = clean_node(wxr, None, level2_node.largs)
    # name_to_code() returns "" for unrecognized names.
    lang_code = name_to_code(lang_name, "ko") or "unknown"
    captured_codes = wxr.config.capture_language_codes
    if captured_codes is not None and lang_code not in captured_codes:
        return
    wxr.wtp.start_section(lang_name)
    base_data = WordEntry(
        word=wxr.wtp.title,
        lang_code=lang_code,
        lang=lang_name,
        pos="unknown",
    )
    for pos_level in level2_node.find_child(NodeKind.LEVEL3):
        parse_section(wxr, page_data, base_data, pos_level)

    if not level2_node.contain_node(NodeKind.LEVEL3):
        # Some pages put gloss lists directly under the language heading.
        extract_pos_section(wxr, page_data, base_data, level2_node, "")
|
||
|
||
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one Korean Wiktionary page into word-entry dicts.

    Page layout references:
    https://ko.wiktionary.org/wiki/위키낱말사전:문서_양식
    https://ko.wiktionary.org/wiki/위키낱말사전:한국어_편집부
    """
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []
    for lang_level in root.find_child(NodeKind.LEVEL2):
        parse_language_section(wxr, page_data, lang_level)

    # Mark entries that ended up without any gloss.
    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
import re | ||
|
||
from wikitextprocessor import LevelNode, NodeKind, WikiNode | ||
|
||
from ...page import clean_node | ||
from ...wxr_context import WiktextractContext | ||
from .models import Sense, WordEntry | ||
from .section_titles import POS_DATA | ||
|
||
|
||
def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Create a new entry for a POS section and extract its sense lists.

    The entry is removed again if the section produced no senses.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    if pos_title in POS_DATA:
        entry.pos_title = pos_title
        pos_info = POS_DATA[pos_title]
        entry.pos = pos_info["pos"]
        entry.tags.extend(pos_info.get("tags", []))

    for list_node in level_node.find_child(NodeKind.LIST):
        marker = list_node.sarg
        if marker.endswith("#"):
            # Ordinary numbered gloss list.
            for list_item in list_node.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, entry, list_item)
        elif marker.endswith("*"):
            # Bullet list; may hold manually numbered glosses.
            for list_item in list_node.find_child(NodeKind.LIST_ITEM):
                extract_unorderd_list_item(wxr, entry, list_item)

    if not entry.senses:
        page_data.pop()
|
||
|
||
def extract_gloss_list_item(
    wxr: WiktextractContext, word_entry: WordEntry, list_item: WikiNode
) -> None:
    """Convert one "#" list item into a Sense on *word_entry*.

    Nested lists are skipped: a nested "*" list holds examples (not yet
    handled) and other nested lists are not part of the gloss text.
    """
    gloss_parts = [
        child
        for child in list_item.children
        if not (isinstance(child, WikiNode) and child.kind == NodeKind.LIST)
    ]
    new_sense = Sense()
    gloss_text = clean_node(wxr, new_sense, gloss_parts)
    if gloss_text != "":
        new_sense.glosses.append(gloss_text)
        word_entry.senses.append(new_sense)
|
||
|
||
def extract_unorderd_list_item(
    wxr: WiktextractContext, word_entry: WordEntry, list_item: WikiNode
) -> None:
    """Handle a "*" list item that may carry a manually numbered gloss.

    Some pages write senses as "* '''1.''' gloss" instead of using "#"
    lists.  When the first bold node is a digit index like "1.", the
    remainder of the item is re-processed as an ordinary gloss list item.
    An item starting with "어원:" (etymology) is not a gloss and is skipped.
    """
    is_first_bold = True
    for index, node in enumerate(list_item.children):
        if (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.BOLD
            and is_first_bold
        ):
            # Only the first bold node can be a sense index.
            is_first_bold = False
            bold_text = clean_node(wxr, None, node)
            if re.fullmatch(r"\d+\.", bold_text):
                # Re-wrap the remaining nodes as a synthetic list item so
                # the ordinary gloss extractor can process them.
                new_list_item = WikiNode(NodeKind.LIST_ITEM, 0)
                new_list_item.children = list_item.children[index + 1:]
                extract_gloss_list_item(wxr, word_entry, new_list_item)
                break
        elif isinstance(node, str) and node.startswith("어원:"):
            break  # etymology line, not a gloss
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# Maps ko.wiktionary.org POS section headings to wiktextract POS codes and
# optional entry-level tags.
POS_DATA = {
    "명사": {"pos": "noun"},
    "형용사": {"pos": "adj"},
    "대명사": {"pos": "pron"},
    "수사": {"pos": "num"},
    "동사": {"pos": "verb"},
    "관용구": {"pos": "phrase", "tags": ["idiomatic"]},
    "기호": {"pos": "symbol"},
    "접미사": {"pos": "suffix", "tags": ["morpheme"]},
    "접두사": {"pos": "prefix", "tags": ["morpheme"]},
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
from unittest import TestCase | ||
|
||
from wikitextprocessor import Wtp | ||
|
||
from wiktextract.config import WiktionaryConfig | ||
from wiktextract.extractor.ko.page import parse_page | ||
from wiktextract.wxr_context import WiktextractContext | ||
|
||
|
||
class TestKoGloss(TestCase):
    """Tests for gloss extraction from Korean Wiktionary pages."""

    maxDiff = None

    def setUp(self) -> None:
        self.wxr = WiktextractContext(
            Wtp(lang_code="ko"),
            WiktionaryConfig(
                dump_file_lang_code="ko",
                capture_language_codes=None,
            ),
        )

    def tearDown(self) -> None:
        self.wxr.wtp.close_db_conn()

    def test_ignore_gloss_index_bold_node(self):
        # "* '''1.''' ..." uses a bold "1." as a manual sense index;
        # the index must not appear in the extracted gloss text.
        data = parse_page(
            self.wxr,
            "我們",
            """== 중국어 ==
=== 대명사 ===
* '''1.''' [[우리]].""",
        )
        self.assertEqual(data[0]["senses"], [{"glosses": ["우리."]}])

    def test_no_pos_section(self):
        # Gloss list placed directly under the language heading, with no
        # level-3 POS section.
        data = parse_page(
            self.wxr,
            "大家",
            """== 한국어 ==
* '''1.''' 모든""",
        )
        self.assertEqual(data[0]["senses"], [{"glosses": ["모든"]}])

    def test_level_4_pos(self):
        # Numbered POS sub-sections ("명사 1", "명사 2") should each produce
        # a separate entry with the index stripped from the POS lookup.
        data = parse_page(
            self.wxr,
            "개",
            """== 한국어 ==
=== 명사 ===
==== 명사 1 ====
# 가축으로 많이 기르는 갯과 포유류 동물.
==== 명사 2 ====
# 강이나 내에 바닷물이 드나드는 곳.""",
        )
        self.assertEqual(data[0]["pos"], "noun")
        self.assertEqual(
            data[0]["senses"],
            [{"glosses": ["가축으로 많이 기르는 갯과 포유류 동물."]}],
        )
        self.assertEqual(data[1]["pos"], "noun")
        self.assertEqual(
            data[1]["senses"],
            [{"glosses": ["강이나 내에 바닷물이 드나드는 곳."]}],
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
from unittest import TestCase | ||
|
||
from wikitextprocessor import Wtp | ||
|
||
from wiktextract.config import WiktionaryConfig | ||
from wiktextract.extractor.ru.models import WordEntry | ||
from wiktextract.extractor.ru.pronunciation import extract_homophone_section | ||
from wiktextract.wxr_context import WiktextractContext | ||
|
||
|
||
class TestRUSound(TestCase):
    """Tests for pronunciation extraction from Russian Wiktionary pages."""

    maxDiff = None

    def setUp(self) -> None:
        self.wxr = WiktextractContext(
            Wtp(lang_code="ru"),
            WiktionaryConfig(
                dump_file_lang_code="ru", capture_language_codes=None
            ),
        )

    def tearDown(self) -> None:
        self.wxr.wtp.close_db_conn()

    def test_homophone_section_list(self):
        self.wxr.wtp.start_page("ไทย")
        root = self.wxr.wtp.parse("* [[ไท]], [[ไถ]]")
        # Fix: "lang" and "lang_code" were swapped — "th" is the language
        # code and "Тайский" is the localized (Russian) language name.
        data = WordEntry(lang="Тайский", lang_code="th", word="ไทย")
        extract_homophone_section(self.wxr, data, root)
        self.assertEqual(
            [s.model_dump(exclude_defaults=True) for s in data.sounds],
            [{"homophones": ["ไท", "ไถ"]}],
        )