-
Notifications
You must be signed in to change notification settings - Fork 89
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #411 from empiriker/ru
Parse page for Russian Wiktionary
- Loading branch information
Showing
6 changed files
with
348 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
{ | ||
"$id": "https://kaikki.org/ru.json", | ||
"$schema": "https://json-schema.org/draft/2020-12/schema", | ||
"description": "WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract.", | ||
"properties": { | ||
"categories": { | ||
"default": [], | ||
"description": "list of non-disambiguated categories for the word", | ||
"items": { | ||
"type": "string" | ||
}, | ||
"title": "Categories", | ||
"type": "array" | ||
}, | ||
"lang_code": { | ||
"description": "Wiktionary language code", | ||
"examples": [ | ||
"ru" | ||
], | ||
"title": "Lang Code", | ||
"type": "string" | ||
}, | ||
"lang_name": { | ||
"description": "Localized language name of the word", | ||
"examples": [ | ||
"Русский" | ||
], | ||
"title": "Lang Name", | ||
"type": "string" | ||
}, | ||
"pos": { | ||
"default": null, | ||
"description": "Part of speech type", | ||
"title": "Pos", | ||
"type": "string" | ||
}, | ||
"pos_title": { | ||
"default": null, | ||
"description": "Original POS title", | ||
"title": "Pos Title", | ||
"type": "string" | ||
}, | ||
"word": { | ||
"description": "word string", | ||
"title": "Word", | ||
"type": "string" | ||
} | ||
}, | ||
"required": [ | ||
"word", | ||
"lang_code", | ||
"lang_name" | ||
], | ||
"title": "Russian Wiktionary", | ||
"type": "object" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
{ | ||
"analyze_templates": false, | ||
"extract_thesaurus_pages": false | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
{ | ||
"аббревиатура": { "pos": "abbrev" }, | ||
"глагол": { "pos": "verb" }, | ||
"деепричастие": { "pos": "gerund" }, | ||
"имя собственное": { "pos": "name" }, | ||
"имя, собственное": { "pos": "name" }, | ||
"междометие": { "pos": "interj" }, | ||
"префикс": { "pos": "prefix" }, | ||
"префиксоид": { "pos": "prefix" }, | ||
"прилагательное": { "pos": "adj" }, | ||
"суффикс": { "pos": "suffix" }, | ||
"существительное": { "pos": "noun" }, | ||
"устойчивое сочетание": { "pos": "phrase" } | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
import json | ||
|
||
from pydantic import BaseModel, ConfigDict, Field, model_validator | ||
from pydantic.json_schema import GenerateJsonSchema | ||
|
||
|
||
class BaseModelWrap(BaseModel):
    # Shared pydantic configuration for the extractor's data models:
    #   * extra="forbid"           -> unknown field names are rejected
    #   * validate_assignment=True -> attribute assignments are validated too,
    #                                 not only constructor arguments
    # A comment is used here instead of a docstring on purpose, so that no
    # text from this base class can end up in a generated JSON schema.
    model_config = ConfigDict(extra="forbid", validate_assignment=True)
|
||
|
||
class WordEntry(BaseModelWrap):
    """WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract."""

    # NOTE: the docstring above and every ``description`` below are emitted
    # verbatim into the generated JSON schema, so treat them as data.

    # --- required: no default is given --------------------------------------
    word: str = Field(description="word string")

    # --- optional -----------------------------------------------------------
    # NOTE(review): both fields are annotated ``str`` yet default to ``None``;
    # the generated schema therefore reports type "string" with default null.
    pos: str = Field(
        description="Part of speech type",
        default=None,
    )
    pos_title: str = Field(
        description="Original POS title",
        default=None,
    )

    # --- required: language identification ----------------------------------
    lang_code: str = Field(
        examples=["ru"],
        description="Wiktionary language code",
    )
    lang_name: str = Field(
        examples=["Русский"],
        description="Localized language name of the word",
    )

    # --- optional -----------------------------------------------------------
    categories: list[str] = Field(
        description="list of non-disambiguated categories for the word",
        default=[],
    )
|
||
|
||
if __name__ == "__main__":
    # Running this module as a script regenerates the JSON schema file for
    # the Russian extractor from the WordEntry model.

    class JsonSchemaGenerator(GenerateJsonSchema):
        """Schema generator that stamps the kaikki.org title, $id and dialect."""

        def generate(self, schema, mode="validation"):
            json_schema = super().generate(schema, mode=mode)
            json_schema["title"] = "Russian Wiktionary"
            json_schema["$id"] = "https://kaikki.org/ru.json"
            json_schema["$schema"] = self.schema_dialect
            return json_schema

    # The schema contains non-ASCII text (e.g. the "Русский" example) and is
    # dumped with ensure_ascii=False, so the file encoding must be explicit:
    # relying on the locale default can fail or produce a non-UTF-8 file on
    # platforms whose default encoding is not UTF-8.
    with open("json_schema/ru.json", "w", encoding="utf-8") as f:
        json.dump(
            WordEntry.model_json_schema(schema_generator=JsonSchemaGenerator),
            f,
            indent=2,
            ensure_ascii=False,
            sort_keys=True,
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,216 @@ | ||
import copy | ||
import logging | ||
from typing import Optional | ||
|
||
from wikitextprocessor import NodeKind, WikiNode | ||
|
||
from wiktextract.extractor.ru.models import WordEntry | ||
from wiktextract.extractor.ru.pronunciation import extract_pronunciation | ||
from wiktextract.page import clean_node | ||
from wiktextract.wxr_context import WiktextractContext | ||
|
||
# Names of templates that build panels on a page; such templates are meant
# to be ignored wherever they occur.  Currently empty.
PANEL_TEMPLATES: set[str] = set()

# Name prefixes identifying language-specific panel templates (side boxes,
# notice boxes, or anything else that should generally be ignored).
# Currently empty.
PANEL_PREFIXES: set[str] = set()

# Extra templates to expand during the pre-expand phase of parsing.
# Currently empty.
ADDITIONAL_EXPAND_TEMPLATES: set[str] = set()
|
||
|
||
def process_semantic_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    semantic_level_node: WikiNode,
):
    """Placeholder for handling a semantic-properties section.

    Not implemented yet: the arguments are accepted and ignored, nothing in
    ``page_data`` is modified, and ``None`` is returned.
    """
|
||
|
||
def get_pos(
    wxr: WiktextractContext,
    level_node: WikiNode,
) -> Optional[str]:
    """Determine the part of speech described by a section node.

    Two strategies are tried in order:

    1. Template names.  For each template child of ``level_node`` the
       lower-cased template name is split on whitespace (first two parts
       only) and each part on ``-`` (first two pieces only); the first piece
       found in the abbreviation table wins.
    2. Section text.  The cleaned text of the node's children is searched
       (case-insensitively, as a substring) for each key of
       ``wxr.config.POS_SUBTITLES``; the first matching key wins.

    Returns the POS string, or ``None`` when nothing matched.  When nothing
    matched and the section contains non-blank text that does not mention
    "форма", a debug message is recorded.
    """
    # Template-name fragment -> POS.  Built once per call rather than once
    # per template, since it never changes.
    # NOTE(review): "onomatopeia" looks like a misspelling of "onomatopoeia";
    # left untouched because the value is part of the emitted data.
    pos_by_template_part = {
        "abbrev": "abbrev",
        "adv": "adv",
        "affix": "affix",
        "conj": "conj",
        "interj": "interj",
        "noun": "noun",
        "onomatop": "onomatopeia",
        "part": "particle",
        "phrase": "phrase",
        "predic": "adj",
        "prep": "prep",
        "suffix": "suffix",
        "буква": "character",
        "гидроним": "name",
        "гл": "verb",
        "дее": "gerund",
        "деепр": "gerund",
        "мест": "pronoun",
        "нар": "adv",
        "падежи": "noun",
        "послелог": "postp",
        "прил": "adj",
        "прич": "participle",
        "союз": "conj",
        "сущ": "noun",
        "существительное": "noun",
        "топоним": "name",
        "фам": "name",
        "част": "particle",
        "числ": "number",
    }

    # Search for POS in template names
    for template_node in level_node.find_child(NodeKind.TEMPLATE):
        template_name = template_node.template_name.lower()
        for part in template_name.split()[:2]:
            for subpart in part.split("-")[:2]:
                if subpart in pos_by_template_part:
                    return pos_by_template_part[subpart]

    # Search for POS in the cleaned section text
    text = clean_node(wxr, {}, level_node.children)
    lowered_text = text.lower()  # computed once, reused for every lookup

    for pos_title, pos_data in wxr.config.POS_SUBTITLES.items():
        if pos_title in lowered_text:
            return pos_data["pos"]

    if "форма" in lowered_text:
        # XXX: Decide what to do with form entries
        return None

    if text.strip():
        wxr.wtp.debug(
            f"No part of speech found in children: {level_node.children} with clean text {text}",
            sortid="wiktextract/extractor/ru/page/get_pos/98",
        )

    return None
|
||
|
||
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level3_node: WikiNode,
):
    """Process one level-3 section and update the most recent word entry.

    The section title is obtained by cleaning ``level3_node.largs`` and is
    registered as the current subsection on ``wxr.wtp``.  Depending on the
    title:

    * morphological/syntactic sections: the part of speech is looked up with
      ``get_pos`` and, if found, stored on ``page_data[-1]``;
    * the pronunciation section: handed to ``extract_pronunciation`` when
      ``wxr.config.capture_pronunciation`` is set;
    * the semantic-properties section: handed to
      ``process_semantic_section``;
    * other known sections: recognised but not extracted yet;
    * anything else: reported through ``wxr.wtp.debug``.

    ``page_data`` must already contain at least one entry.
    """
    section_title = clean_node(wxr, {}, level3_node.largs).strip()
    wxr.wtp.start_subsection(section_title)

    pos_section_titles = (
        # Morphological and syntactic properties
        "Морфологические и синтаксические свойства",
        # Type and syntactic properties of the word combination
        "Тип и синтаксические свойства сочетания",
    )

    # Known section titles for which nothing is extracted yet.  Listing them
    # keeps them out of the "Unprocessed section" debug output.
    known_unhandled_titles = {
        "Значение",  # Meaning
        "Родственные слова",  # Word family (linkages; XXX: not extracted)
        "Этимология",  # Etymology (XXX: Extract etymology)
        # Phraseologisms and set expressions (linkages; XXX: not extracted)
        "Фразеологизмы и устойчивые сочетания",
        "Перевод",  # Translation (XXX: not extracted)
        "Анаграммы",  # Anagrams
        "Метаграммы",  # Metagrams
        "Синонимы",  # Synonyms
        "Антонимы",  # Antonyms
        "Библиография",  # Bibliography
        "Латиница (Latinça)",  # Latin script
        "Латиница (Latinca)",  # Latin script
        "Иноязычные аналоги",  # Foreign-language analogues
        "Прочее",  # Miscellaneous
    }

    if section_title in pos_section_titles:
        pos = get_pos(wxr, level3_node)
        if pos:
            page_data[-1].pos = pos
        # XXX: Extract forms from Russian Wiktionary
        # XXX: Extract grammatical tags (gender, etc.) from Russian Wiktionary
    elif section_title == "Произношение":  # Pronunciation
        if wxr.config.capture_pronunciation:
            extract_pronunciation(wxr, page_data, level3_node)
    elif section_title == "Семантические свойства":  # Semantic properties
        process_semantic_section(wxr, page_data, level3_node)
    elif section_title not in known_unhandled_titles:
        wxr.wtp.debug(
            f"Unprocessed section {section_title}",
            # Spelling corrected from "wixtextract" to match the sortid
            # prefix used by get_pos.
            sortid="wiktextract/extractor/ru/page/parse_section/66",
        )
|
||
|
||
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, str]]:
    """Parse one Russian Wiktionary page into a list of word-entry dicts.

    The page is parsed with templates pre-expanded.  For every level-1 node
    and every template in its heading, the template name with a single
    leading/trailing ``-`` removed is taken as the language code (skipped if
    ``wxr.config.capture_language_codes`` is set and does not contain it),
    and the cleaned template text as the language name.

    Entries are created as follows:

    * one entry per level-2 child, filled from that child's level-3
      sections;
    * one further entry filled from the level-3 sections sitting directly
      under the level-1 node.  This entry is skipped when level-2 entries
      were created and there are no such direct sections, so that pages
      organised by level-2 headings no longer get an empty trailing entry.

    Returns each entry dumped with ``model_dump(exclude_defaults=True)``.
    NOTE(review): the declared ``dict[str, str]`` value type is loose, since
    ``categories`` is a list.
    """
    # Help site describing page structure: https://ru.wiktionary.org/wiki/Викисловарь:Правила_оформления_статей

    if wxr.config.verbose:
        logging.info("Parsing page: %s", page_title)

    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)

    # Parse the page, pre-expanding those templates that are likely to
    # influence parsing
    tree = wxr.wtp.parse(
        page_text,
        pre_expand=True,
        additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
    )

    # Templates allowed outside level-2/level-3 sections without a debug
    # message.  Loop-invariant, so defined once.
    ignored_templates = ("wikipedia", "Омонимы", "improve")

    page_data: list[WordEntry] = []
    for level1_node in tree.find_child(NodeKind.LEVEL1):
        for subtitle_template in level1_node.find_content(NodeKind.TEMPLATE):
            lang_code = (
                subtitle_template.template_name.strip()
                .removeprefix("-")
                .removesuffix("-")
            )

            if (
                wxr.config.capture_language_codes is not None
                and lang_code not in wxr.config.capture_language_codes
            ):
                continue

            # clean_node is given this dict so that categories found while
            # cleaning the template can be copied onto the entry below.
            categories = {"categories": []}

            lang_name = clean_node(wxr, categories, subtitle_template)
            wxr.wtp.start_section(lang_name)

            base_data = WordEntry(
                lang_name=lang_name, lang_code=lang_code, word=wxr.wtp.title
            )
            base_data.categories.extend(categories["categories"])

            # Report anything directly under the level-1 node that is
            # neither a section nor one of the ignored templates.
            for non_level23_node in level1_node.invert_find_child(
                NodeKind.LEVEL2 | NodeKind.LEVEL3
            ):
                is_ignored_template = (
                    isinstance(non_level23_node, WikiNode)
                    and non_level23_node.kind == NodeKind.TEMPLATE
                    and non_level23_node.template_name in ignored_templates
                )
                if not is_ignored_template:
                    wxr.wtp.debug(
                        f"Found unexpected child in level node {level1_node.largs}: {non_level23_node}",
                        # Was "extractor/es/..." (copied from another
                        # extractor); this is the Russian extractor.
                        sortid="extractor/ru/page/parse_page/80",
                    )

            has_level2_entries = False
            for level2_node in level1_node.find_child(NodeKind.LEVEL2):
                has_level2_entries = True
                page_data.append(copy.deepcopy(base_data))
                for level3_node in level2_node.find_child(NodeKind.LEVEL3):
                    parse_section(wxr, page_data, level3_node)

            # Level-3 sections placed directly under the level-1 node get
            # their own entry.  Only skip that entry when level-2 entries
            # already exist and there is nothing to put into it.
            direct_sections = list(level1_node.find_child(NodeKind.LEVEL3))
            if direct_sections or not has_level2_entries:
                page_data.append(copy.deepcopy(base_data))
                for level3_node in direct_sections:
                    parse_section(wxr, page_data, level3_node)

    return [d.model_dump(exclude_defaults=True) for d in page_data]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
from wikitextprocessor.parser import LevelNode | ||
|
||
from wiktextract.extractor.ru.models import WordEntry | ||
from wiktextract.wxr_context import WiktextractContext | ||
|
||
|
||
def extract_pronunciation(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: LevelNode,
):
    """Placeholder for extracting pronunciation data from a section.

    Not implemented yet: the arguments are accepted and ignored, nothing in
    ``page_data`` is modified, and ``None`` is returned.
    """