Merge pull request #411 from empiriker/ru

Parse page for Russian Wiktionary
tatuylonen · Dec 5, 2023 · b0c038f · b0c038f
2 parents fee414a + 9da9fa9
commit b0c038f
Show file tree

Hide file tree

Showing 6 changed files with 348 additions and 0 deletions.
diff --git a/json_schema/ru.json b/json_schema/ru.json
@@ -0,0 +1,56 @@
+{
+  "$id": "https://kaikki.org/ru.json",
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "description": "WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract.",
+  "properties": {
+    "categories": {
+      "default": [],
+      "description": "list of non-disambiguated categories for the word",
+      "items": {
+        "type": "string"
+      },
+      "title": "Categories",
+      "type": "array"
+    },
+    "lang_code": {
+      "description": "Wiktionary language code",
+      "examples": [
+        "ru"
+      ],
+      "title": "Lang Code",
+      "type": "string"
+    },
+    "lang_name": {
+      "description": "Localized language name of the word",
+      "examples": [
+        "Русский"
+      ],
+      "title": "Lang Name",
+      "type": "string"
+    },
+    "pos": {
+      "default": null,
+      "description": "Part of speech type",
+      "title": "Pos",
+      "type": "string"
+    },
+    "pos_title": {
+      "default": null,
+      "description": "Original POS title",
+      "title": "Pos Title",
+      "type": "string"
+    },
+    "word": {
+      "description": "word string",
+      "title": "Word",
+      "type": "string"
+    }
+  },
+  "required": [
+    "word",
+    "lang_code",
+    "lang_name"
+  ],
+  "title": "Russian Wiktionary",
+  "type": "object"
+}
diff --git a/src/wiktextract/data/ru/config.json b/src/wiktextract/data/ru/config.json
@@ -0,0 +1,4 @@
+{
+  "analyze_templates": false,
+  "extract_thesaurus_pages": false
+}
diff --git a/src/wiktextract/data/ru/pos_subtitles.json b/src/wiktextract/data/ru/pos_subtitles.json
@@ -0,0 +1,14 @@
+{
+  "аббревиатура": { "pos": "abbrev" },
+  "глагол": { "pos": "verb" },
+  "деепричастие": { "pos": "gerund" },
+  "имя собственное": { "pos": "name" },
+  "имя, собственное": { "pos": "name" },
+  "междометие": { "pos": "interj" },
+  "префикс": { "pos": "prefix" },
+  "префиксоид": { "pos": "prefix" },
+  "прилагательное": { "pos": "adj" },
+  "суффикс": { "pos": "suffix" },
+  "существительное": { "pos": "noun" },
+  "устойчивое сочетание": { "pos": "phrase" }
+}
diff --git a/src/wiktextract/extractor/ru/models.py b/src/wiktextract/extractor/ru/models.py
@@ -0,0 +1,46 @@
+import json
+
+from pydantic import BaseModel, ConfigDict, Field, model_validator
+from pydantic.json_schema import GenerateJsonSchema
+
+
+class BaseModelWrap(BaseModel):
+    # Shared pydantic configuration for the models in this module:
+    # fields not declared on the model are rejected ("forbid") and values
+    # assigned to attributes after construction are validated as well.
+    model_config = ConfigDict(
+        extra="forbid",
+        validate_assignment=True,
+    )
+
+
+class WordEntry(BaseModelWrap):
+    """WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract."""
+
+    # The docstring above appears word-for-word as the "description" of the
+    # generated JSON schema, so keep the two in sync when editing it.
+
+    # Required: the headword.
+    word: str = Field(description="word string")
+
+    # Both default to None when not set.
+    # NOTE(review): annotated as plain ``str`` although the default is None;
+    # confirm whether Optional[str] (and a regenerated schema) is intended.
+    pos: str = Field(description="Part of speech type", default=None)
+    pos_title: str = Field(description="Original POS title", default=None)
+
+    # Required: language of the entry.
+    lang_code: str = Field(
+        examples=["ru"],
+        description="Wiktionary language code",
+    )
+    lang_name: str = Field(
+        examples=["Русский"],
+        description="Localized language name of the word",
+    )
+
+    # Defaults to an empty list.
+    categories: list[str] = Field(
+        description="list of non-disambiguated categories for the word",
+        default=[],
+    )
+
+
+if __name__ == "__main__":
+    # Running this module as a script regenerates json_schema/ru.json from
+    # the WordEntry model (path is relative to the current directory).
+
+    class JsonSchemaGenerator(GenerateJsonSchema):
+        """Schema generator that stamps title, $id and $schema on the result."""
+
+        def generate(self, schema, mode="validation"):
+            json_schema = super().generate(schema, mode=mode)
+            json_schema["title"] = "Russian Wiktionary"
+            json_schema["$id"] = "https://kaikki.org/ru.json"
+            json_schema["$schema"] = self.schema_dialect
+            return json_schema
+
+    # ensure_ascii=False writes raw Cyrillic (e.g. the "Русский" example), so
+    # the file is opened explicitly as UTF-8 instead of relying on the
+    # platform default encoding, which may be unable to encode it.
+    with open("json_schema/ru.json", "w", encoding="utf-8") as f:
+        json.dump(
+            WordEntry.model_json_schema(schema_generator=JsonSchemaGenerator),
+            f,
+            indent=2,
+            ensure_ascii=False,
+            sort_keys=True,
+        )
diff --git a/src/wiktextract/extractor/ru/page.py b/src/wiktextract/extractor/ru/page.py
@@ -0,0 +1,216 @@
+import copy
+import logging
+from typing import Optional
+
+from wikitextprocessor import NodeKind, WikiNode
+
+from wiktextract.extractor.ru.models import WordEntry
+from wiktextract.extractor.ru.pronunciation import extract_pronunciation
+from wiktextract.page import clean_node
+from wiktextract.wxr_context import WiktextractContext
+
+# Names of templates that build panels on pages and should be ignored
+# wherever they appear. Currently empty.
+PANEL_TEMPLATES: set[str] = set()
+
+# Name prefixes of language-specific panel templates, i.e. templates that
+# create side boxes or notice boxes or that should generally be ignored.
+# Currently empty.
+PANEL_PREFIXES: set[str] = set()
+
+# Extra templates to expand in the pre-expand phase; handed to the parser
+# as ``additional_expand``. Currently empty.
+ADDITIONAL_EXPAND_TEMPLATES: set[str] = set()
+
+
+def process_semantic_section(
+    wxr: WiktextractContext,
+    page_data: list[WordEntry],
+    semantic_level_node: WikiNode,
+):
+    """Placeholder for handling a semantic-properties section.
+
+    Not implemented yet: the arguments are accepted and nothing is done.
+    """
+    return None
+
+
+# Maps a fragment of a (lower-cased) template name to the part of speech it
+# signals. Built once at import time instead of once per template.
+_TEMPLATE_POS_MAP = {
+    "abbrev": "abbrev",
+    "adv": "adv",
+    "affix": "affix",
+    "conj": "conj",
+    "interj": "interj",
+    "noun": "noun",
+    "onomatop": "onomatopeia",
+    "part": "particle",
+    "phrase": "phrase",
+    "predic": "adj",
+    "prep": "prep",
+    "suffix": "suffix",
+    "буква": "character",
+    "гидроним": "name",
+    "гл": "verb",
+    "дее": "gerund",
+    "деепр": "gerund",
+    "мест": "pronoun",
+    "нар": "adv",
+    "падежи": "noun",
+    "послелог": "postp",
+    "прил": "adj",
+    "прич": "participle",
+    "союз": "conj",
+    "сущ": "noun",
+    "существительное": "noun",
+    "топоним": "name",
+    "фам": "name",
+    "част": "particle",
+    "числ": "number",
+}
+
+
+def get_pos(
+    wxr: WiktextractContext,
+    level_node: WikiNode,
+) -> Optional[str]:
+    """Determine the part of speech described by a section node.
+
+    Two strategies are tried in order:
+
+    1. For each template child, the lower-cased template name is split on
+       whitespace (first two words) and each word on "-" (first two pieces);
+       the first piece found in the template POS map decides the result.
+    2. Otherwise the cleaned text of the node's children is searched for any
+       key of ``wxr.config.POS_SUBTITLES`` as a case-insensitive substring.
+
+    Returns the POS string, or None when nothing matched. Text mentioning
+    "форма" (form-of entries) returns None silently; any other non-empty
+    text that could not be classified is reported via ``wxr.wtp.debug``.
+    """
+    # Search for POS in template names
+    for template_node in level_node.find_child(NodeKind.TEMPLATE):
+        template_name = template_node.template_name.lower()
+        for part in template_name.split()[:2]:
+            for subpart in part.split("-")[:2]:
+                if subpart in _TEMPLATE_POS_MAP:
+                    return _TEMPLATE_POS_MAP[subpart]
+
+    # Search for POS in clean_text
+    text = clean_node(wxr, {}, level_node.children)
+    # Lower-case once; it is compared against every subtitle below
+    lowered_text = text.lower()
+
+    for pos_string, pos_data in wxr.config.POS_SUBTITLES.items():
+        if pos_string in lowered_text:
+            return pos_data["pos"]
+
+    if "форма" in lowered_text:
+        # XXX: Decide what to do with form entries
+        return None
+
+    if text.strip():
+        wxr.wtp.debug(
+            f"No part of speech found in children: {level_node.children} with clean text {text}",
+            sortid="wiktextract/extractor/ru/page/get_pos/98",
+        )
+
+    return None
+
+
+def parse_section(
+    wxr: WiktextractContext,
+    page_data: list[WordEntry],
+    level3_node: WikiNode,
+):
+    """Dispatch one level-3 section according to its cleaned title.
+
+    The title is taken from ``level3_node.largs`` and registered as the
+    current subsection. The morphology sections set the part of speech on
+    the last entry of ``page_data`` (which therefore must not be empty), the
+    pronunciation and semantic sections are forwarded to their handlers, and
+    the remaining known titles are recognised but not processed yet. Any
+    other title is reported through ``wxr.wtp.debug``.
+    """
+    section_title = clean_node(wxr, {}, level3_node.largs).strip()
+    wxr.wtp.start_subsection(section_title)
+    if section_title in [
+        "Морфологические и синтаксические свойства",  # Morphological and syntactic properties
+        "Тип и синтаксические свойства сочетания",  # Type and syntactic properties of the word combination
+    ]:
+        pos = get_pos(wxr, level3_node)
+        if pos:
+            page_data[-1].pos = pos
+        # XXX: Extract forms from Russian Wiktionary
+        # XXX: Extract grammatical tags (gender, etc.) from Russian Wiktionary
+
+    elif section_title == "Произношение":  # Pronunciation
+        if wxr.config.capture_pronunciation:
+            extract_pronunciation(wxr, page_data, level3_node)
+    elif section_title == "Семантические свойства":  # Semantic properties
+        process_semantic_section(wxr, page_data, level3_node)
+    elif section_title == "Значение":  # Meaning
+        pass
+    elif section_title == "Родственные слова":  # Word family
+        if wxr.config.capture_linkages:
+            pass
+    elif section_title == "Этимология":  # Etymology
+        if wxr.config.capture_etymologies:
+            # XXX: Extract etymology
+            pass
+    elif section_title == "Фразеологизмы и устойчивые сочетания":
+        if wxr.config.capture_linkages:
+            pass
+    elif section_title == "Перевод":  # Translation
+        if wxr.config.capture_translations:
+            pass
+    elif section_title in ["Анаграммы", "Метаграммы", "Синонимы", "Антонимы"]:
+        pass
+    elif section_title == "Библиография":  # Bibliography
+        pass
+    elif section_title in ["Латиница (Latinça)", "Латиница (Latinca)"]:
+        pass
+    elif section_title == "Иноязычные аналоги":
+        pass
+    elif section_title == "Прочее":  # Miscellaneous
+        pass
+    else:
+        # sortid prefix spelled "wiktextract" to match the other debug
+        # messages of this module (it previously read "wixtextract")
+        wxr.wtp.debug(
+            f"Unprocessed section {section_title}",
+            sortid="wiktextract/extractor/ru/page/parse_section/66",
+        )
+
+
+def parse_page(
+    wxr: WiktextractContext, page_title: str, page_text: str
+) -> list[dict[str, str]]:
+    """Parse one Russian Wiktionary page into a list of word-entry dicts.
+
+    The page is parsed with templates pre-expanded. Every template found in
+    the heading of a level-1 node is taken as a language subtitle: its name,
+    stripped of one leading and one trailing "-", is the language code and
+    its cleaned expansion is the language name. Languages excluded by
+    ``wxr.config.capture_language_codes`` are skipped.
+
+    One entry is created per level-2 node, plus one more for the level-3
+    sections that sit directly under the level-1 node; each level-3 section
+    is handed to ``parse_section``.
+
+    Returns the entries dumped to plain dicts with default values omitted.
+    """
+    # Help site describing page structure: https://ru.wiktionary.org/wiki/Викисловарь:Правила_оформления_статей
+
+    # Templates allowed directly under a language heading without a debug
+    # message; defined once rather than for every inspected child node.
+    ignored_templates = ("wikipedia", "Омонимы", "improve")
+
+    if wxr.config.verbose:
+        logging.info(f"Parsing page: {page_title}")
+
+    wxr.config.word = page_title
+    wxr.wtp.start_page(page_title)
+
+    # Parse the page, pre-expanding those templates that are likely to
+    # influence parsing
+    tree = wxr.wtp.parse(
+        page_text,
+        pre_expand=True,
+        additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
+    )
+
+    page_data: list[WordEntry] = []
+    for level1_node in tree.find_child(NodeKind.LEVEL1):
+        for subtitle_template in level1_node.find_content(NodeKind.TEMPLATE):
+            lang_code = (
+                subtitle_template.template_name.strip()
+                .removeprefix("-")
+                .removesuffix("-")
+            )
+
+            if (
+                wxr.config.capture_language_codes is not None
+                and lang_code not in wxr.config.capture_language_codes
+            ):
+                continue
+
+            # clean_node collects categories into this dict as a side effect
+            categories = {"categories": []}
+
+            lang_name = clean_node(wxr, categories, subtitle_template)
+            wxr.wtp.start_section(lang_name)
+
+            base_data = WordEntry(
+                lang_name=lang_name, lang_code=lang_code, word=wxr.wtp.title
+            )
+            base_data.categories.extend(categories["categories"])
+
+            # Report anything other than sections and known templates that
+            # sits directly under the language heading
+            for non_level23_node in level1_node.invert_find_child(
+                NodeKind.LEVEL2 | NodeKind.LEVEL3
+            ):
+                if not (
+                    isinstance(non_level23_node, WikiNode)
+                    and non_level23_node.kind == NodeKind.TEMPLATE
+                    and non_level23_node.template_name in ignored_templates
+                ):
+                    # sortid names this (ru) extractor; it previously
+                    # pointed at the "es" extractor
+                    wxr.wtp.debug(
+                        f"Found unexpected child in level node {level1_node.largs}: {non_level23_node}",
+                        sortid="wiktextract/extractor/ru/page/parse_page/80",
+                    )
+
+            for level2_node in level1_node.find_child(NodeKind.LEVEL2):
+                page_data.append(copy.deepcopy(base_data))
+                for level3_node in level2_node.find_child(NodeKind.LEVEL3):
+                    parse_section(wxr, page_data, level3_node)
+
+            # NOTE(review): this entry is appended even when the level-1 node
+            # has level-2 children and no direct level-3 sections, which
+            # leaves a bare entry in the output - confirm this is intended.
+            page_data.append(copy.deepcopy(base_data))
+            for level3_node in level1_node.find_child(NodeKind.LEVEL3):
+                parse_section(wxr, page_data, level3_node)
+
+    return [d.model_dump(exclude_defaults=True) for d in page_data]
diff --git a/src/wiktextract/extractor/ru/pronunciation.py b/src/wiktextract/extractor/ru/pronunciation.py
@@ -0,0 +1,12 @@
+from wikitextprocessor.parser import LevelNode
+
+from wiktextract.extractor.ru.models import WordEntry
+from wiktextract.wxr_context import WiktextractContext
+
+
+def extract_pronunciation(
+    wxr: WiktextractContext,
+    page_data: list[WordEntry],
+    level_node: LevelNode,
+):
+    """Placeholder for extracting pronunciation data from a section.
+
+    Not implemented yet: the arguments are accepted and nothing is done.
+    """
+    return None