-
Notifications
You must be signed in to change notification settings - Fork 88
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #840 from xxyzz/ko
[ko] add Korean edition extractor
- Loading branch information
Showing
8 changed files
with
295 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
{
    "analyze_templates": false,
    "extract_thesaurus_pages": false,
    "save_ns_names": ["Main", "Template", "Module"],
    "extract_ns_names": ["Main"]
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
from pydantic import BaseModel, ConfigDict, Field | ||
|
||
|
||
class KoreanBaseModel(BaseModel):
    """Shared pydantic configuration for all Korean-extractor models.

    Strict validation makes extractor bugs surface early: unknown fields
    are rejected, values are not coerced, and both assignments and
    defaults are validated.
    """

    model_config = ConfigDict(
        extra="forbid",
        strict=True,
        validate_assignment=True,
        validate_default=True,
    )
|
||
|
||
class Sense(KoreanBaseModel):
    """One sense (meaning) of a word entry."""

    glosses: list[str] = []
    tags: list[str] = []
    # Tags not yet mapped to canonical tag strings.
    raw_tags: list[str] = []
    topics: list[str] = []
    categories: list[str] = []
|
||
|
||
class WordEntry(KoreanBaseModel):
    """A single word entry extracted from Korean Wiktionary."""

    model_config = ConfigDict(title="Korean Wiktionary")
    word: str = Field(description="Word string", min_length=1)
    lang_code: str = Field(description="Wiktionary language code", min_length=1)
    lang: str = Field(description="Localized language name", min_length=1)
    pos: str = Field(description="Part of speech type", min_length=1)
    # Original (untranslated) POS section heading, e.g. "명사".
    pos_title: str = ""
    senses: list[Sense] = []
    categories: list[str] = []
    tags: list[str] = []
    raw_tags: list[str] = []
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
import re | ||
from typing import Any | ||
|
||
from mediawiki_langcodes import name_to_code | ||
from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind | ||
|
||
from ...page import clean_node | ||
from ...wxr_context import WiktextractContext | ||
from .models import Sense, WordEntry | ||
from .pos import extract_pos_section | ||
from .section_titles import POS_DATA | ||
|
||
# Panel/navigation template names to ignore; none identified yet for the
# Korean edition.  NOTE(review): presumably kept empty for API parity with
# the other per-language extractors — confirm.
PANEL_TEMPLATES = set()
# Template-name prefixes that mark panel templates; none yet.
PANEL_PREFIXES = set()
# Templates that must be pre-expanded before parsing; none yet.
ADDITIONAL_EXPAND_TEMPLATES = set()
|
||
|
||
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch one section by its cleaned title, then recurse into children.

    A POS heading may carry a trailing index ("명사 1", "명사 2"); the
    index is stripped before the title is looked up in POS_DATA.
    """
    section_title = clean_node(wxr, None, level_node.largs)
    section_title = re.sub(r"\s*\d+$", "", section_title)
    if section_title in POS_DATA:
        extract_pos_section(
            wxr, page_data, base_data, level_node, section_title
        )

    for child_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_level)
|
||
|
||
def parse_language_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level2_node: LevelNode
) -> None:
    """Process one language (level-2) section of a page."""
    lang_name = clean_node(wxr, None, level2_node.largs)
    # name_to_code() returns "" for unrecognized names.
    lang_code = name_to_code(lang_name, "ko") or "unknown"
    captured_codes = wxr.config.capture_language_codes
    if captured_codes is not None and lang_code not in captured_codes:
        return
    wxr.wtp.start_section(lang_name)
    base_data = WordEntry(
        word=wxr.wtp.title,
        lang_code=lang_code,
        lang=lang_name,
        pos="unknown",
    )
    for pos_level in level2_node.find_child(NodeKind.LEVEL3):
        parse_section(wxr, page_data, base_data, pos_level)

    if not level2_node.contain_node(NodeKind.LEVEL3):
        # Some pages put gloss lists directly under the language heading.
        extract_pos_section(wxr, page_data, base_data, level2_node, "")
|
||
|
||
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one Korean Wiktionary page into word-entry dicts.

    Page layout references:
    https://ko.wiktionary.org/wiki/위키낱말사전:문서_양식
    https://ko.wiktionary.org/wiki/위키낱말사전:한국어_편집부
    """
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []
    for lang_level in root.find_child(NodeKind.LEVEL2):
        parse_language_section(wxr, page_data, lang_level)

    # Mark entries that ended up without any gloss.
    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
import re | ||
|
||
from wikitextprocessor import LevelNode, NodeKind, WikiNode | ||
|
||
from ...page import clean_node | ||
from ...wxr_context import WiktextractContext | ||
from .models import Sense, WordEntry | ||
from .section_titles import POS_DATA | ||
|
||
|
||
def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Create a new entry for a POS section and extract its sense lists.

    The entry is removed again if the section produced no senses.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    if pos_title in POS_DATA:
        entry.pos_title = pos_title
        pos_info = POS_DATA[pos_title]
        entry.pos = pos_info["pos"]
        entry.tags.extend(pos_info.get("tags", []))

    for list_node in level_node.find_child(NodeKind.LIST):
        marker = list_node.sarg
        if marker.endswith("#"):
            # Ordinary numbered gloss list.
            for list_item in list_node.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, entry, list_item)
        elif marker.endswith("*"):
            # Bullet list; may hold manually numbered glosses.
            for list_item in list_node.find_child(NodeKind.LIST_ITEM):
                extract_unorderd_list_item(wxr, entry, list_item)

    if not entry.senses:
        page_data.pop()
|
||
|
||
def extract_gloss_list_item(
    wxr: WiktextractContext, word_entry: WordEntry, list_item: WikiNode
) -> None:
    """Convert one "#" list item into a Sense on *word_entry*.

    Nested lists are skipped: a nested "*" list holds examples (not yet
    handled) and other nested lists are not part of the gloss text.
    """
    gloss_parts = [
        child
        for child in list_item.children
        if not (isinstance(child, WikiNode) and child.kind == NodeKind.LIST)
    ]
    new_sense = Sense()
    gloss_text = clean_node(wxr, new_sense, gloss_parts)
    if gloss_text != "":
        new_sense.glosses.append(gloss_text)
        word_entry.senses.append(new_sense)
|
||
|
||
def extract_unorderd_list_item(
    wxr: WiktextractContext, word_entry: WordEntry, list_item: WikiNode
) -> None:
    """Handle a "*" list item that may carry a manually numbered gloss.

    Some pages write senses as "* '''1.''' gloss" instead of using "#"
    lists.  When the first bold node is a digit index like "1.", the
    remainder of the item is re-processed as an ordinary gloss list item.
    An item starting with "어원:" (etymology) is not a gloss and is skipped.
    """
    is_first_bold = True
    for index, node in enumerate(list_item.children):
        if (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.BOLD
            and is_first_bold
        ):
            # Only the first bold node can be a sense index.
            is_first_bold = False
            bold_text = clean_node(wxr, None, node)
            if re.fullmatch(r"\d+\.", bold_text):
                # Re-wrap the remaining nodes as a synthetic list item so
                # the ordinary gloss extractor can process them.
                new_list_item = WikiNode(NodeKind.LIST_ITEM, 0)
                new_list_item.children = list_item.children[index + 1:]
                extract_gloss_list_item(wxr, word_entry, new_list_item)
                break
        elif isinstance(node, str) and node.startswith("어원:"):
            break  # etymology line, not a gloss
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# Maps ko.wiktionary.org POS section headings to wiktextract POS codes and
# optional entry-level tags.
POS_DATA = {
    "명사": {"pos": "noun"},
    "형용사": {"pos": "adj"},
    "대명사": {"pos": "pron"},
    "수사": {"pos": "num"},
    "동사": {"pos": "verb"},
    "관용구": {"pos": "phrase", "tags": ["idiomatic"]},
    "기호": {"pos": "symbol"},
    "접미사": {"pos": "suffix", "tags": ["morpheme"]},
    "접두사": {"pos": "prefix", "tags": ["morpheme"]},
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
from unittest import TestCase | ||
|
||
from wikitextprocessor import Wtp | ||
|
||
from wiktextract.config import WiktionaryConfig | ||
from wiktextract.extractor.ko.page import parse_page | ||
from wiktextract.wxr_context import WiktextractContext | ||
|
||
|
||
class TestKoGloss(TestCase):
    """Tests for gloss extraction from Korean Wiktionary pages."""

    maxDiff = None

    def setUp(self) -> None:
        self.wxr = WiktextractContext(
            Wtp(lang_code="ko"),
            WiktionaryConfig(
                dump_file_lang_code="ko",
                capture_language_codes=None,
            ),
        )

    def tearDown(self) -> None:
        self.wxr.wtp.close_db_conn()

    def test_ignore_gloss_index_bold_node(self):
        # "* '''1.''' ..." uses a bold "1." as a manual sense index;
        # the index must not appear in the extracted gloss text.
        data = parse_page(
            self.wxr,
            "我們",
            """== 중국어 ==
=== 대명사 ===
* '''1.''' [[우리]].""",
        )
        self.assertEqual(data[0]["senses"], [{"glosses": ["우리."]}])

    def test_no_pos_section(self):
        # Gloss list placed directly under the language heading, with no
        # level-3 POS section.
        data = parse_page(
            self.wxr,
            "大家",
            """== 한국어 ==
* '''1.''' 모든""",
        )
        self.assertEqual(data[0]["senses"], [{"glosses": ["모든"]}])

    def test_level_4_pos(self):
        # Numbered POS sub-sections ("명사 1", "명사 2") should each produce
        # a separate entry with the index stripped from the POS lookup.
        data = parse_page(
            self.wxr,
            "개",
            """== 한국어 ==
=== 명사 ===
==== 명사 1 ====
# 가축으로 많이 기르는 갯과 포유류 동물.
==== 명사 2 ====
# 강이나 내에 바닷물이 드나드는 곳.""",
        )
        self.assertEqual(data[0]["pos"], "noun")
        self.assertEqual(
            data[0]["senses"],
            [{"glosses": ["가축으로 많이 기르는 갯과 포유류 동물."]}],
        )
        self.assertEqual(data[1]["pos"], "noun")
        self.assertEqual(
            data[1]["senses"],
            [{"glosses": ["강이나 내에 바닷물이 드나드는 곳."]}],
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
from unittest import TestCase | ||
|
||
from wikitextprocessor import Wtp | ||
|
||
from wiktextract.config import WiktionaryConfig | ||
from wiktextract.extractor.ru.models import WordEntry | ||
from wiktextract.extractor.ru.pronunciation import extract_homophone_section | ||
from wiktextract.wxr_context import WiktextractContext | ||
|
||
|
||
class TestRUSound(TestCase):
    """Tests for pronunciation extraction from Russian Wiktionary pages."""

    maxDiff = None

    def setUp(self) -> None:
        self.wxr = WiktextractContext(
            Wtp(lang_code="ru"),
            WiktionaryConfig(
                dump_file_lang_code="ru", capture_language_codes=None
            ),
        )

    def tearDown(self) -> None:
        self.wxr.wtp.close_db_conn()

    def test_homophone_section_list(self):
        self.wxr.wtp.start_page("ไทย")
        root = self.wxr.wtp.parse("* [[ไท]], [[ไถ]]")
        # Fix: "lang" and "lang_code" were swapped — "th" is the language
        # code and "Тайский" is the localized (Russian) language name.
        data = WordEntry(lang="Тайский", lang_code="th", word="ไทย")
        extract_homophone_section(self.wxr, data, root)
        self.assertEqual(
            [s.model_dump(exclude_defaults=True) for s in data.sounds],
            [{"homophones": ["ไท", "ไถ"]}],
        )