From fce99d2f67e72f0425b9ebbaf72f2f0e92f31b46 Mon Sep 17 00:00:00 2001 From: Empiriker Date: Wed, 29 Nov 2023 09:22:50 +0100 Subject: [PATCH 1/4] Parse page for Russian Wiktionary --- json_schema/ru.json | 56 +++++ src/wiktextract/data/ru/config.json | 4 + src/wiktextract/data/ru/pos_subtitles.json | 14 ++ src/wiktextract/extractor/ru/models.py | 81 +++++++ src/wiktextract/extractor/ru/page.py | 218 ++++++++++++++++++ src/wiktextract/extractor/ru/pronunciation.py | 14 ++ 6 files changed, 387 insertions(+) create mode 100644 json_schema/ru.json create mode 100644 src/wiktextract/data/ru/config.json create mode 100644 src/wiktextract/data/ru/pos_subtitles.json create mode 100644 src/wiktextract/extractor/ru/models.py create mode 100644 src/wiktextract/extractor/ru/page.py create mode 100644 src/wiktextract/extractor/ru/pronunciation.py diff --git a/json_schema/ru.json b/json_schema/ru.json new file mode 100644 index 000000000..8dfcd3e3e --- /dev/null +++ b/json_schema/ru.json @@ -0,0 +1,56 @@ +{ + "$id": "https://kaikki.org/ru.json", + "$schema": "https://json-schema.org/draft/2020-12/schema", + "description": "WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract.", + "properties": { + "categories": { + "default": [], + "description": "list of non-disambiguated categories for the word", + "items": { + "type": "string" + }, + "title": "Categories", + "type": "array" + }, + "lang_code": { + "description": "Wiktionary language code", + "examples": [ + "ru" + ], + "title": "Lang Code", + "type": "string" + }, + "lang_name": { + "description": "Localized language name of the word", + "examples": [ + "Русский" + ], + "title": "Lang Name", + "type": "string" + }, + "pos": { + "default": null, + "description": "Part of speech type", + "title": "Pos", + "type": "string" + }, + "pos_title": { + "default": null, + "description": "Original POS title", + "title": "Pos Title", + "type": "string" + }, + "word": { + 
"description": "word string", + "title": "Word", + "type": "string" + } + }, + "required": [ + "word", + "lang_code", + "lang_name" + ], + "title": "Russian Wiktionary", + "type": "object" +} \ No newline at end of file diff --git a/src/wiktextract/data/ru/config.json b/src/wiktextract/data/ru/config.json new file mode 100644 index 000000000..91a7ba446 --- /dev/null +++ b/src/wiktextract/data/ru/config.json @@ -0,0 +1,4 @@ +{ + "analyze_templates": false, + "extract_thesaurus_pages": false +} diff --git a/src/wiktextract/data/ru/pos_subtitles.json b/src/wiktextract/data/ru/pos_subtitles.json new file mode 100644 index 000000000..6f94a43b9 --- /dev/null +++ b/src/wiktextract/data/ru/pos_subtitles.json @@ -0,0 +1,14 @@ +{ + "аббревиатура": { "pos": "abbrev" }, + "глагол": { "pos": "verb" }, + "деепричастие": { "pos": "gerund" }, + "имя собственное": { "pos": "name" }, + "имя, собственное": { "pos": "name" }, + "междометие": { "pos": "interj" }, + "префикс": { "pos": "prefix" }, + "префиксоид": { "pos": "prefix" }, + "прилагательное": { "pos": "adj" }, + "суффикс": { "pos": "suffix" }, + "существительное": { "pos": "noun" }, + "устойчивое сочетание": { "pos": "phrase" } +} diff --git a/src/wiktextract/extractor/ru/models.py b/src/wiktextract/extractor/ru/models.py new file mode 100644 index 000000000..1179fc832 --- /dev/null +++ b/src/wiktextract/extractor/ru/models.py @@ -0,0 +1,81 @@ +import json +import logging +from typing import Optional + +from pydantic import BaseModel, ConfigDict, Field, model_validator +from pydantic.json_schema import GenerateJsonSchema + +from wiktextract.wxr_context import WiktextractContext + + +class PydanticLogger: + wxr: Optional[WiktextractContext] = None + + @classmethod + def debug( + cls, msg: str, trace: Optional[str] = None, sortid: str = "XYZunsorted" + ): + if cls.wxr: + cls.wxr.wtp.debug(msg, trace=trace, sortid=sortid) + else: + logging.debug(msg) + + +class BaseModelWrap(BaseModel): + model_config = 
ConfigDict(validate_assignment=True) + + +class LoggingExtraFieldsModel(BaseModelWrap): + @model_validator(mode="before") + def log_extra_fields(cls, values): + all_allowed_field_names = cls.model_fields.keys() + extra_fields = { + name: str(value) + for name, value in values.items() + if name not in all_allowed_field_names + } + if extra_fields: + class_full_name = cls.__name__ + PydanticLogger.debug( + msg=f"Pydantic - Got extra fields in {class_full_name}: {extra_fields}", + sortid="wiktextract/extractor/es/pydantic/extra_fields/33", + ) + return values + + +class WordEntry(LoggingExtraFieldsModel): + """WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract.""" + + word: str = Field(description="word string") + pos: str = Field(default=None, description="Part of speech type") + pos_title: str = Field(default=None, description="Original POS title") + lang_code: str = Field( + description="Wiktionary language code", examples=["ru"] + ) + lang_name: str = Field( + description="Localized language name of the word", examples=["Русский"] + ) + categories: list[str] = Field( + default=[], + description="list of non-disambiguated categories for the word", + ) + + +if __name__ == "__main__": + + class JsonSchemaGenerator(GenerateJsonSchema): + def generate(self, schema, mode="validation"): + json_schema = super().generate(schema, mode=mode) + json_schema["title"] = "Russian Wiktionary" + json_schema["$id"] = "https://kaikki.org/ru.json" + json_schema["$schema"] = self.schema_dialect + return json_schema + + with open("json_schema/ru.json", "w") as f: + json.dump( + WordEntry.model_json_schema(schema_generator=JsonSchemaGenerator), + f, + indent=2, + ensure_ascii=False, + sort_keys=True, + ) diff --git a/src/wiktextract/extractor/ru/page.py b/src/wiktextract/extractor/ru/page.py new file mode 100644 index 000000000..1f0eb8409 --- /dev/null +++ b/src/wiktextract/extractor/ru/page.py @@ -0,0 +1,218 @@ +import 
copy +import logging +from typing import Dict, List, Optional + +from wikitextprocessor import NodeKind, WikiNode + +from wiktextract.extractor.ru.models import PydanticLogger, WordEntry +from wiktextract.extractor.ru.pronunciation import extract_pronunciation +from wiktextract.page import clean_node +from wiktextract.wxr_context import WiktextractContext + +# Templates that are used to form panels on pages and that +# should be ignored in various positions +PANEL_TEMPLATES = set() + +# Template name prefixes used for language-specific panel templates (i.e., +# templates that create side boxes or notice boxes or that should generally +# be ignored). +PANEL_PREFIXES = set() + +# Additional templates to be expanded in the pre-expand phase +ADDITIONAL_EXPAND_TEMPLATES = set() + + +def process_semantic_section( + wxr: WiktextractContext, + page_data: List[WordEntry], + semantic_level_node: WikiNode, +): + pass + + +def get_pos( + wxr: WiktextractContext, + level_node: WikiNode, +) -> Optional[str]: + # Search for POS in template names + for template_node in level_node.find_child(NodeKind.TEMPLATE): + POS_MAP = { + "abbrev": "abbrev", + "adv": "adv", + "affix": "affix", + "conj": "conj", + "interj": "interj", + "noun": "noun", + "onomatop": "onomatopeia", + "part": "particle", + "phrase": "phrase", + "predic": "adj", + "prep": "prep", + "suffix": "suffix", + "буква": "character", + "гидроним": "name", + "гл": "verb", + "дее": "gerund", + "деепр": "gerund", + "мест": "pronoun", + "нар": "adv", + "падежи": "noun", + "послелог": "postp", + "прил": "adj", + "прич": "participle", + "союз": "conj", + "сущ": "noun", + "существительное": "noun", + "топоним": "name", + "фам": "name", + "част": "particle", + "числ": "number", + } + template_name = template_node.template_name.lower() + for part in template_name.split(): + for subpart in part.split("-"): + if subpart in POS_MAP: + return POS_MAP[subpart] + + # Search for POS in clean_text + text = clean_node(wxr, {}, 
level_node.children) + + for POS_string in wxr.config.POS_SUBTITLES.keys(): + if POS_string in text.lower(): + return wxr.config.POS_SUBTITLES[POS_string]["pos"] + + if "форма" in text.lower(): + # XXX: Decide what to do with form entries + return + + if text.strip(): + wxr.wtp.debug( + f"No part of speech found in children: {level_node.children} with clean text {text}", + sortid="wiktextract/extractor/ru/page/get_pos/98", + ) + + +def parse_section( + wxr: WiktextractContext, + page_data: List[WordEntry], + level3_node: WikiNode, +): + section_title = clean_node(wxr, {}, level3_node.largs).strip() + wxr.wtp.start_subsection(section_title) + if section_title in [ + "Морфологические и синтаксические свойства", # Morphological and syntactic properties + "Тип и синтаксические свойства сочетания", # Type and syntactic properties of the word combination + ]: + pos = get_pos(wxr, level3_node) + if pos: + page_data[-1].pos = pos + # XXX: Extract forms from Russian Wiktionary + # XXX: Extract grammatical tags (gender, etc.) 
from Russian Wiktionary + + elif section_title == "Произношение": + if wxr.config.capture_pronunciation: + extract_pronunciation(wxr, page_data, level3_node) + elif section_title == "Семантические свойства": # Semantic properties + process_semantic_section(wxr, page_data, level3_node) + elif section_title == "Значение": + pass + elif section_title == "Родственные слова": # Word family + if wxr.config.capture_linkages: + pass + elif section_title == "Этимология": + if wxr.config.capture_etymologies: + # XXX: Extract etymology + pass + elif section_title == "Фразеологизмы и устойчивые сочетания": + if wxr.config.capture_linkages: + pass + elif section_title == "Перевод": + if wxr.config.capture_translations: + pass + elif section_title in ["Анаграммы", "Метаграммы", "Синонимы", "Антонимы"]: + pass + elif section_title == "Библиография": + pass + elif section_title in ["Латиница (Latinça)", "Латиница (Latinca)"]: + pass + elif section_title == "Иноязычные аналоги": + pass + elif section_title == "Прочее": + pass + else: + wxr.wtp.debug( + f"Unprocessed section {section_title}", + sortid="wiktextract/extractor/ru/page/parse_section/66", + ) + + +def parse_page( + wxr: WiktextractContext, page_title: str, page_text: str +) -> List[Dict[str, str]]: + # Help site describing page structure: https://ru.wiktionary.org/wiki/Викисловарь:Правила_оформления_статей + + if wxr.config.verbose: + logging.info(f"Parsing page: {page_title}") + # Pass current wiktextractcontext to pydantic for more better logging + PydanticLogger.wxr = wxr + + wxr.config.word = page_title + wxr.wtp.start_page(page_title) + + # Parse the page, pre-expanding those templates that are likely to + # influence parsing + tree = wxr.wtp.parse( + page_text, + pre_expand=True, + additional_expand=ADDITIONAL_EXPAND_TEMPLATES, + ) + + page_data: List[WordEntry] = [] + for level1_node in tree.find_child(NodeKind.LEVEL1): + for subtitle_template in level1_node.find_content(NodeKind.TEMPLATE): + lang_code = ( + 
subtitle_template.template_name.strip() + .removeprefix("-") + .removesuffix("-") + ) + + if ( + wxr.config.capture_language_codes is not None + and lang_code not in wxr.config.capture_language_codes + ): + continue + + categories = {"categories": []} + + lang_name = clean_node(wxr, categories, subtitle_template) + wxr.wtp.start_section(lang_name) + + base_data = WordEntry( + lang_name=lang_name, lang_code=lang_code, word=wxr.wtp.title + ) + base_data.categories.extend(categories["categories"]) + + for non_level23_node in level1_node.invert_find_child( + NodeKind.LEVEL2 | NodeKind.LEVEL3 + ): + IGNORED_TEMPLATES = ["wikipedia", "Омонимы", "improve"] + if not ( + isinstance(non_level23_node, WikiNode) + and non_level23_node.kind == NodeKind.TEMPLATE + and non_level23_node.template_name in IGNORED_TEMPLATES + ): + wxr.wtp.debug( + f"Found unexpected child in level node {level1_node.largs}: {non_level23_node}", + sortid="extractor/ru/page/parse_page/80", + ) + + for level2_node in level1_node.find_child(NodeKind.LEVEL2): + page_data.append(copy.deepcopy(base_data)) + for level3_node in level2_node.find_child(NodeKind.LEVEL3): + parse_section(wxr, page_data, level3_node) + + page_data.append(copy.deepcopy(base_data)) + for level3_node in level1_node.find_child(NodeKind.LEVEL3): + parse_section(wxr, page_data, level3_node) + + return [d.model_dump(exclude_defaults=True) for d in page_data] diff --git a/src/wiktextract/extractor/ru/pronunciation.py b/src/wiktextract/extractor/ru/pronunciation.py new file mode 100644 index 000000000..a7bf28bde --- /dev/null +++ b/src/wiktextract/extractor/ru/pronunciation.py @@ -0,0 +1,14 @@ +from typing import List + +from wikitextprocessor.parser import LevelNode + +from wiktextract.extractor.ru.models import WordEntry +from wiktextract.wxr_context import WiktextractContext + + +def extract_pronunciation( + wxr: WiktextractContext, + page_data: List[WordEntry], + level_node: LevelNode, +): + pass From 
200a7876d23a436be1918b9be51de97c4af3d1e1 Mon Sep 17 00:00:00 2001 From: Empiriker Date: Mon, 4 Dec 2023 21:32:41 +0100 Subject: [PATCH 2/4] Consider only first two elements in POS template_name splits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France. --- src/wiktextract/extractor/ru/page.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/wiktextract/extractor/ru/page.py b/src/wiktextract/extractor/ru/page.py index 1f0eb8409..da7880006 100644 --- a/src/wiktextract/extractor/ru/page.py +++ b/src/wiktextract/extractor/ru/page.py @@ -69,8 +69,8 @@ def get_pos( "числ": "number", } template_name = template_node.template_name.lower() - for part in template_name.split(): - for subpart in part.split("-"): + for part in template_name.split()[:2]: + for subpart in part.split("-")[:2]: if subpart in POS_MAP: return POS_MAP[subpart] From 52441a609f2eecce57cd69ed1b4ea1060bf3436b Mon Sep 17 00:00:00 2001 From: Empiriker Date: Mon, 4 Dec 2023 21:35:27 +0100 Subject: [PATCH 3/4] Replace typings List->list, Dict-> dict for Russian extractor --- src/wiktextract/extractor/ru/page.py | 10 +++++----- src/wiktextract/extractor/ru/pronunciation.py | 4 +--- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/wiktextract/extractor/ru/page.py b/src/wiktextract/extractor/ru/page.py index da7880006..c27e5084c 100644 --- a/src/wiktextract/extractor/ru/page.py +++ b/src/wiktextract/extractor/ru/page.py @@ -1,6 +1,6 @@ import copy import logging -from typing import Dict, List, Optional +from typing import Optional from wikitextprocessor import NodeKind, WikiNode @@ -24,7 +24,7 @@ def process_semantic_section( wxr: WiktextractContext, - page_data: 
List[WordEntry], + page_data: list[WordEntry], semantic_level_node: WikiNode, ): pass @@ -94,7 +94,7 @@ def get_pos( def parse_section( wxr: WiktextractContext, - page_data: List[WordEntry], + page_data: list[WordEntry], level3_node: WikiNode, ): section_title = clean_node(wxr, {}, level3_node.largs).strip() @@ -148,7 +148,7 @@ def parse_section( def parse_page( wxr: WiktextractContext, page_title: str, page_text: str -) -> List[Dict[str, str]]: +) -> list[dict[str, str]]: # Help site describing page structure: https://ru.wiktionary.org/wiki/Викисловарь:Правила_оформления_статей if wxr.config.verbose: @@ -167,7 +167,7 @@ def parse_page( additional_expand=ADDITIONAL_EXPAND_TEMPLATES, ) - page_data: List[WordEntry] = [] + page_data: list[WordEntry] = [] for level1_node in tree.find_child(NodeKind.LEVEL1): for subtitle_template in level1_node.find_content(NodeKind.TEMPLATE): lang_code = ( diff --git a/src/wiktextract/extractor/ru/pronunciation.py b/src/wiktextract/extractor/ru/pronunciation.py index a7bf28bde..4f9b99b77 100644 --- a/src/wiktextract/extractor/ru/pronunciation.py +++ b/src/wiktextract/extractor/ru/pronunciation.py @@ -1,5 +1,3 @@ -from typing import List - from wikitextprocessor.parser import LevelNode from wiktextract.extractor.ru.models import WordEntry @@ -8,7 +6,7 @@ def extract_pronunciation( wxr: WiktextractContext, - page_data: List[WordEntry], + page_data: list[WordEntry], level_node: LevelNode, ): pass From 9da9fa9bb06d55ac872e37b1e5532d39aa9d4793 Mon Sep 17 00:00:00 2001 From: Empiriker Date: Tue, 5 Dec 2023 08:39:59 +0100 Subject: [PATCH 4/4] Set extra='forbid' and remove pydantic logging in Russian Wiktionary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche 
(ANR) in France. --- src/wiktextract/extractor/ru/models.py | 39 ++------------------------ src/wiktextract/extractor/ru/page.py | 4 +-- 2 files changed, 3 insertions(+), 40 deletions(-) diff --git a/src/wiktextract/extractor/ru/models.py b/src/wiktextract/extractor/ru/models.py index 1179fc832..3ad7b06ac 100644 --- a/src/wiktextract/extractor/ru/models.py +++ b/src/wiktextract/extractor/ru/models.py @@ -1,49 +1,14 @@ import json -import logging -from typing import Optional from pydantic import BaseModel, ConfigDict, Field, model_validator from pydantic.json_schema import GenerateJsonSchema -from wiktextract.wxr_context import WiktextractContext - - -class PydanticLogger: - wxr: Optional[WiktextractContext] = None - - @classmethod - def debug( - cls, msg: str, trace: Optional[str] = None, sortid: str = "XYZunsorted" - ): - if cls.wxr: - cls.wxr.wtp.debug(msg, trace=trace, sortid=sortid) - else: - logging.debug(msg) - class BaseModelWrap(BaseModel): - model_config = ConfigDict(validate_assignment=True) - - -class LoggingExtraFieldsModel(BaseModelWrap): - @model_validator(mode="before") - def log_extra_fields(cls, values): - all_allowed_field_names = cls.model_fields.keys() - extra_fields = { - name: str(value) - for name, value in values.items() - if name not in all_allowed_field_names - } - if extra_fields: - class_full_name = cls.__name__ - PydanticLogger.debug( - msg=f"Pydantic - Got extra fields in {class_full_name}: {extra_fields}", - sortid="wiktextract/extractor/es/pydantic/extra_fields/33", - ) - return values + model_config = ConfigDict(validate_assignment=True, extra="forbid") -class WordEntry(LoggingExtraFieldsModel): +class WordEntry(BaseModelWrap): """WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract.""" word: str = Field(description="word string") diff --git a/src/wiktextract/extractor/ru/page.py b/src/wiktextract/extractor/ru/page.py index c27e5084c..429df0ca0 100644 --- 
a/src/wiktextract/extractor/ru/page.py +++ b/src/wiktextract/extractor/ru/page.py @@ -4,7 +4,7 @@ from wikitextprocessor import NodeKind, WikiNode -from wiktextract.extractor.ru.models import PydanticLogger, WordEntry +from wiktextract.extractor.ru.models import WordEntry from wiktextract.extractor.ru.pronunciation import extract_pronunciation from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext @@ -153,8 +153,6 @@ def parse_page( if wxr.config.verbose: logging.info(f"Parsing page: {page_title}") - # Pass current wiktextractcontext to pydantic for more better logging - PydanticLogger.wxr = wxr wxr.config.word = page_title wxr.wtp.start_page(page_title)