diff --git a/json_schema/ru.json b/json_schema/ru.json new file mode 100644 index 000000000..8dfcd3e3e --- /dev/null +++ b/json_schema/ru.json @@ -0,0 +1,56 @@ +{ + "$id": "https://kaikki.org/ru.json", + "$schema": "https://json-schema.org/draft/2020-12/schema", + "description": "WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract.", + "properties": { + "categories": { + "default": [], + "description": "list of non-disambiguated categories for the word", + "items": { + "type": "string" + }, + "title": "Categories", + "type": "array" + }, + "lang_code": { + "description": "Wiktionary language code", + "examples": [ + "ru" + ], + "title": "Lang Code", + "type": "string" + }, + "lang_name": { + "description": "Localized language name of the word", + "examples": [ + "Русский" + ], + "title": "Lang Name", + "type": "string" + }, + "pos": { + "default": null, + "description": "Part of speech type", + "title": "Pos", + "type": "string" + }, + "pos_title": { + "default": null, + "description": "Original POS title", + "title": "Pos Title", + "type": "string" + }, + "word": { + "description": "word string", + "title": "Word", + "type": "string" + } + }, + "required": [ + "word", + "lang_code", + "lang_name" + ], + "title": "Russian Wiktionary", + "type": "object" +} \ No newline at end of file diff --git a/src/wiktextract/data/ru/config.json b/src/wiktextract/data/ru/config.json new file mode 100644 index 000000000..91a7ba446 --- /dev/null +++ b/src/wiktextract/data/ru/config.json @@ -0,0 +1,4 @@ +{ + "analyze_templates": false, + "extract_thesaurus_pages": false +} diff --git a/src/wiktextract/data/ru/pos_subtitles.json b/src/wiktextract/data/ru/pos_subtitles.json new file mode 100644 index 000000000..6f94a43b9 --- /dev/null +++ b/src/wiktextract/data/ru/pos_subtitles.json @@ -0,0 +1,14 @@ +{ + "аббревиатура": { "pos": "abbrev" }, + "глагол": { "pos": "verb" }, + "деепричастие": { "pos": "gerund" }, + "имя 
собственное": { "pos": "name" }, + "имя, собственное": { "pos": "name" }, + "междометие": { "pos": "interj" }, + "префикс": { "pos": "prefix" }, + "префиксоид": { "pos": "prefix" }, + "прилагательное": { "pos": "adj" }, + "суффикс": { "pos": "suffix" }, + "существительное": { "pos": "noun" }, + "устойчивое сочетание": { "pos": "phrase" } +} diff --git a/src/wiktextract/extractor/ru/models.py b/src/wiktextract/extractor/ru/models.py new file mode 100644 index 000000000..3ad7b06ac --- /dev/null +++ b/src/wiktextract/extractor/ru/models.py @@ -0,0 +1,46 @@ +import json + +from pydantic import BaseModel, ConfigDict, Field, model_validator +from pydantic.json_schema import GenerateJsonSchema + + +class BaseModelWrap(BaseModel): + model_config = ConfigDict(validate_assignment=True, extra="forbid") + + +class WordEntry(BaseModelWrap): + """WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract.""" + + word: str = Field(description="word string") + pos: str = Field(default=None, description="Part of speech type") + pos_title: str = Field(default=None, description="Original POS title") + lang_code: str = Field( + description="Wiktionary language code", examples=["ru"] + ) + lang_name: str = Field( + description="Localized language name of the word", examples=["Русский"] + ) + categories: list[str] = Field( + default=[], + description="list of non-disambiguated categories for the word", + ) + + +if __name__ == "__main__": + + class JsonSchemaGenerator(GenerateJsonSchema): + def generate(self, schema, mode="validation"): + json_schema = super().generate(schema, mode=mode) + json_schema["title"] = "Russian Wiktionary" + json_schema["$id"] = "https://kaikki.org/ru.json" + json_schema["$schema"] = self.schema_dialect + return json_schema + + with open("json_schema/ru.json", "w") as f: + json.dump( + WordEntry.model_json_schema(schema_generator=JsonSchemaGenerator), + f, + indent=2, + ensure_ascii=False, + 
sort_keys=True, ) diff --git a/src/wiktextract/extractor/ru/page.py b/src/wiktextract/extractor/ru/page.py new file mode 100644 index 000000000..429df0ca0 --- /dev/null +++ b/src/wiktextract/extractor/ru/page.py @@ -0,0 +1,216 @@ +import copy +import logging +from typing import Optional + +from wikitextprocessor import NodeKind, WikiNode + +from wiktextract.extractor.ru.models import WordEntry +from wiktextract.extractor.ru.pronunciation import extract_pronunciation +from wiktextract.page import clean_node +from wiktextract.wxr_context import WiktextractContext + +# Templates that are used to form panels on pages and that +# should be ignored in various positions +PANEL_TEMPLATES = set() + +# Template name prefixes used for language-specific panel templates (i.e., +# templates that create side boxes or notice boxes or that should generally +# be ignored). +PANEL_PREFIXES = set() + +# Additional templates to be expanded in the pre-expand phase +ADDITIONAL_EXPAND_TEMPLATES = set() + + +def process_semantic_section( + wxr: WiktextractContext, + page_data: list[WordEntry], + semantic_level_node: WikiNode, +): + pass + + +def get_pos( + wxr: WiktextractContext, + level_node: WikiNode, +) -> Optional[str]: + # Search for POS in template names + for template_node in level_node.find_child(NodeKind.TEMPLATE): + POS_MAP = { + "abbrev": "abbrev", + "adv": "adv", + "affix": "affix", + "conj": "conj", + "interj": "interj", + "noun": "noun", + "onomatop": "onomatopoeia", + "part": "particle", + "phrase": "phrase", + "predic": "adj", + "prep": "prep", + "suffix": "suffix", + "буква": "character", + "гидроним": "name", + "гл": "verb", + "дее": "gerund", + "деепр": "gerund", + "мест": "pronoun", + "нар": "adv", + "падежи": "noun", + "послелог": "postp", + "прил": "adj", + "прич": "participle", + "союз": "conj", + "сущ": "noun", + "существительное": "noun", + "топоним": "name", + "фам": "name", + "част": "particle", + "числ": "number", + } + template_name = 
template_node.template_name.lower() + for part in template_name.split()[:2]: + for subpart in part.split("-")[:2]: + if subpart in POS_MAP: + return POS_MAP[subpart] + + # Search for POS in clean_text + text = clean_node(wxr, {}, level_node.children) + + for POS_string in wxr.config.POS_SUBTITLES.keys(): + if POS_string in text.lower(): + return wxr.config.POS_SUBTITLES[POS_string]["pos"] + + if "форма" in text.lower(): + # XXX: Decide what to do with form entries + return + + if text.strip(): + wxr.wtp.debug( + f"No part of speech found in children: {level_node.children} with clean text {text}", + sortid="wiktextract/extractor/ru/page/get_pos/98", + ) + + +def parse_section( + wxr: WiktextractContext, + page_data: list[WordEntry], + level3_node: WikiNode, +): + section_title = clean_node(wxr, {}, level3_node.largs).strip() + wxr.wtp.start_subsection(section_title) + if section_title in [ + "Морфологические и синтаксические свойства", # Morphological and syntactic properties + "Тип и синтаксические свойства сочетания", # Type and syntactic properties of the word combination + ]: + pos = get_pos(wxr, level3_node) + if pos: + page_data[-1].pos = pos + # XXX: Extract forms from Russian Wiktionary + # XXX: Extract grammatical tags (gender, etc.) 
from Russian Wiktionary + + elif section_title == "Произношение": + if wxr.config.capture_pronunciation: + extract_pronunciation(wxr, page_data, level3_node) + elif section_title == "Семантические свойства": # Semantic properties + process_semantic_section(wxr, page_data, level3_node) + elif section_title == "Значение": + pass + elif section_title == "Родственные слова": # Word family + if wxr.config.capture_linkages: + pass + elif section_title == "Этимология": + if wxr.config.capture_etymologies: + # XXX: Extract etymology + pass + elif section_title == "Фразеологизмы и устойчивые сочетания": + if wxr.config.capture_linkages: + pass + elif section_title == "Перевод": + if wxr.config.capture_translations: + pass + elif section_title in ["Анаграммы", "Метаграммы", "Синонимы", "Антонимы"]: + pass + elif section_title == "Библиография": + pass + elif section_title in ["Латиница (Latinça)", "Латиница (Latinca)"]: + pass + elif section_title == "Иноязычные аналоги": + pass + elif section_title == "Прочее": + pass + else: + wxr.wtp.debug( + f"Unprocessed section {section_title}", + sortid="wiktextract/extractor/ru/page/parse_section/66", + ) + + +def parse_page( + wxr: WiktextractContext, page_title: str, page_text: str +) -> list[dict[str, str]]: + # Help site describing page structure: https://ru.wiktionary.org/wiki/Викисловарь:Правила_оформления_статей + + if wxr.config.verbose: + logging.info(f"Parsing page: {page_title}") + + wxr.config.word = page_title + wxr.wtp.start_page(page_title) + + # Parse the page, pre-expanding those templates that are likely to + # influence parsing + tree = wxr.wtp.parse( + page_text, + pre_expand=True, + additional_expand=ADDITIONAL_EXPAND_TEMPLATES, + ) + + page_data: list[WordEntry] = [] + for level1_node in tree.find_child(NodeKind.LEVEL1): + for subtitle_template in level1_node.find_content(NodeKind.TEMPLATE): + lang_code = ( + subtitle_template.template_name.strip() + .removeprefix("-") + .removesuffix("-") + ) + + if ( + 
wxr.config.capture_language_codes is not None + and lang_code not in wxr.config.capture_language_codes + ): + continue + + categories = {"categories": []} + + lang_name = clean_node(wxr, categories, subtitle_template) + wxr.wtp.start_section(lang_name) + + base_data = WordEntry( + lang_name=lang_name, lang_code=lang_code, word=wxr.wtp.title + ) + base_data.categories.extend(categories["categories"]) + + for non_level23_node in level1_node.invert_find_child( + NodeKind.LEVEL2 | NodeKind.LEVEL3 + ): + IGNORED_TEMPLATES = ["wikipedia", "Омонимы", "improve"] + if not ( + isinstance(non_level23_node, WikiNode) + and non_level23_node.kind == NodeKind.TEMPLATE + and non_level23_node.template_name in IGNORED_TEMPLATES + ): + wxr.wtp.debug( + f"Found unexpected child in level node {level1_node.largs}: {non_level23_node}", + sortid="extractor/ru/page/parse_page/80", + ) + + for level2_node in level1_node.find_child(NodeKind.LEVEL2): + page_data.append(copy.deepcopy(base_data)) + for level3_node in level2_node.find_child(NodeKind.LEVEL3): + parse_section(wxr, page_data, level3_node) + + page_data.append(copy.deepcopy(base_data)) + for level3_node in level1_node.find_child(NodeKind.LEVEL3): + parse_section(wxr, page_data, level3_node) + + return [d.model_dump(exclude_defaults=True) for d in page_data] diff --git a/src/wiktextract/extractor/ru/pronunciation.py b/src/wiktextract/extractor/ru/pronunciation.py new file mode 100644 index 000000000..4f9b99b77 --- /dev/null +++ b/src/wiktextract/extractor/ru/pronunciation.py @@ -0,0 +1,12 @@ +from wikitextprocessor.parser import LevelNode + +from wiktextract.extractor.ru.models import WordEntry +from wiktextract.wxr_context import WiktextractContext + + +def extract_pronunciation( + wxr: WiktextractContext, + page_data: list[WordEntry], + level_node: LevelNode, +): + pass