Merge pull request #411 from empiriker/ru
Parse page for Russian Wiktionary
xxyzz authored Dec 5, 2023
2 parents fee414a + 9da9fa9 commit b0c038f
Showing 6 changed files with 348 additions and 0 deletions.
56 changes: 56 additions & 0 deletions json_schema/ru.json
@@ -0,0 +1,56 @@
{
"$id": "https://kaikki.org/ru.json",
"$schema": "https://json-schema.org/draft/2020-12/schema",
"description": "WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract.",
"properties": {
"categories": {
"default": [],
"description": "list of non-disambiguated categories for the word",
"items": {
"type": "string"
},
"title": "Categories",
"type": "array"
},
"lang_code": {
"description": "Wiktionary language code",
"examples": [
"ru"
],
"title": "Lang Code",
"type": "string"
},
"lang_name": {
"description": "Localized language name of the word",
"examples": [
"Русский"
],
"title": "Lang Name",
"type": "string"
},
"pos": {
"default": null,
"description": "Part of speech type",
"title": "Pos",
"type": "string"
},
"pos_title": {
"default": null,
"description": "Original POS title",
"title": "Pos Title",
"type": "string"
},
"word": {
"description": "word string",
"title": "Word",
"type": "string"
}
},
"required": [
"word",
"lang_code",
"lang_name"
],
"title": "Russian Wiktionary",
"type": "object"
}
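
For illustration only, here is a hypothetical object (not actual extractor output) that would validate against this schema; the word, POS, and category shown are made up:

{
  "word": "слово",
  "lang_code": "ru",
  "lang_name": "Русский",
  "pos": "noun",
  "pos_title": "существительное",
  "categories": ["Русский язык"]
}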
4 changes: 4 additions & 0 deletions src/wiktextract/data/ru/config.json
@@ -0,0 +1,4 @@
{
"analyze_templates": false,
"extract_thesaurus_pages": false
}
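
Presumably these two switches keep the new extractor minimal: wiktextract's template-analysis pass and the extraction of thesaurus pages are both disabled for the Russian edition.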
14 changes: 14 additions & 0 deletions src/wiktextract/data/ru/pos_subtitles.json
@@ -0,0 +1,14 @@
{
"аббревиатура": { "pos": "abbrev" },
"глагол": { "pos": "verb" },
"деепричастие": { "pos": "gerund" },
"имя собственное": { "pos": "name" },
"имя, собственное": { "pos": "name" },
"междометие": { "pos": "interj" },
"префикс": { "pos": "prefix" },
"префиксоид": { "pos": "prefix" },
"прилагательное": { "pos": "adj" },
"суффикс": { "pos": "suffix" },
"существительное": { "pos": "noun" },
"устойчивое сочетание": { "pos": "phrase" }
}
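
For readers without Russian, the keys above are the POS section titles used on ru.wiktionary.org: аббревиатура "abbreviation", глагол "verb", деепричастие "adverbial participle", имя собственное "proper noun", междометие "interjection", префикс "prefix", префиксоид "prefixoid", прилагательное "adjective", суффикс "suffix", существительное "noun", and устойчивое сочетание "set phrase".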
46 changes: 46 additions & 0 deletions src/wiktextract/extractor/ru/models.py
@@ -0,0 +1,46 @@
import json

from pydantic import BaseModel, ConfigDict, Field, model_validator
from pydantic.json_schema import GenerateJsonSchema


class BaseModelWrap(BaseModel):
model_config = ConfigDict(validate_assignment=True, extra="forbid")


class WordEntry(BaseModelWrap):
"""WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract."""

word: str = Field(description="word string")
pos: str = Field(default=None, description="Part of speech type")
pos_title: str = Field(default=None, description="Original POS title")
lang_code: str = Field(
description="Wiktionary language code", examples=["ru"]
)
lang_name: str = Field(
description="Localized language name of the word", examples=["Русский"]
)
categories: list[str] = Field(
default=[],
description="list of non-disambiguated categories for the word",
)


if __name__ == "__main__":

class JsonSchemaGenerator(GenerateJsonSchema):
def generate(self, schema, mode="validation"):
json_schema = super().generate(schema, mode=mode)
json_schema["title"] = "Russian Wiktionary"
json_schema["$id"] = "https://kaikki.org/ru.json"
json_schema["$schema"] = self.schema_dialect
return json_schema

with open("json_schema/ru.json", "w") as f:
json.dump(
WordEntry.model_json_schema(schema_generator=JsonSchemaGenerator),
f,
indent=2,
ensure_ascii=False,
sort_keys=True,
)
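
A minimal usage sketch of this model (the word is hypothetical; model_dump and validate_assignment are standard pydantic v2 behavior, and the import path is the one used elsewhere in this commit):

from wiktextract.extractor.ru.models import WordEntry

entry = WordEntry(word="слово", lang_code="ru", lang_name="Русский")
entry.pos = "noun"  # validate_assignment=True re-validates this assignment
print(entry.model_dump(exclude_defaults=True))
# {'word': 'слово', 'pos': 'noun', 'lang_code': 'ru', 'lang_name': 'Русский'}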
216 changes: 216 additions & 0 deletions src/wiktextract/extractor/ru/page.py
@@ -0,0 +1,216 @@
import copy
import logging
from typing import Any, Optional

from wikitextprocessor import NodeKind, WikiNode

from wiktextract.extractor.ru.models import WordEntry
from wiktextract.extractor.ru.pronunciation import extract_pronunciation
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

# Templates that are used to form panels on pages and that
# should be ignored in various positions
PANEL_TEMPLATES = set()

# Template name prefixes used for language-specific panel templates (i.e.,
# templates that create side boxes or notice boxes or that should generally
# be ignored).
PANEL_PREFIXES = set()

# Additional templates to be expanded in the pre-expand phase
ADDITIONAL_EXPAND_TEMPLATES = set()


def process_semantic_section(
wxr: WiktextractContext,
page_data: list[WordEntry],
semantic_level_node: WikiNode,
):
pass


def get_pos(
wxr: WiktextractContext,
level_node: WikiNode,
) -> Optional[str]:
# Search for POS in template names
for template_node in level_node.find_child(NodeKind.TEMPLATE):
POS_MAP = {
"abbrev": "abbrev",
"adv": "adv",
"affix": "affix",
"conj": "conj",
"interj": "interj",
"noun": "noun",
"onomatop": "onomatopeia",
"part": "particle",
"phrase": "phrase",
"predic": "adj",
"prep": "prep",
"suffix": "suffix",
"буква": "character",
"гидроним": "name",
"гл": "verb",
"дее": "gerund",
"деепр": "gerund",
"мест": "pronoun",
"нар": "adv",
"падежи": "noun",
"послелог": "postp",
"прил": "adj",
"прич": "participle",
"союз": "conj",
"сущ": "noun",
"существительное": "noun",
"топоним": "name",
"фам": "name",
"част": "particle",
"числ": "number",
}
template_name = template_node.template_name.lower()
for part in template_name.split()[:2]:
for subpart in part.split("-")[:2]:
if subpart in POS_MAP:
return POS_MAP[subpart]

# Search for POS in clean_text
text = clean_node(wxr, {}, level_node.children)

for POS_string in wxr.config.POS_SUBTITLES.keys():
if POS_string in text.lower():
return wxr.config.POS_SUBTITLES[POS_string]["pos"]

if "форма" in text.lower():
# XXX: Decide what to do with form entries
return

if text.strip():
wxr.wtp.debug(
f"No part of speech found in children: {level_node.children} with clean text {text}",
sortid="wiktextract/extractor/ru/page/get_pos/98",
)


def parse_section(
wxr: WiktextractContext,
page_data: list[WordEntry],
level3_node: WikiNode,
):
section_title = clean_node(wxr, {}, level3_node.largs).strip()
wxr.wtp.start_subsection(section_title)
if section_title in [
"Морфологические и синтаксические свойства", # Morphological and syntactic properties
"Тип и синтаксические свойства сочетания", # Type and syntactic properties of the word combination
]:
pos = get_pos(wxr, level3_node)
if pos:
page_data[-1].pos = pos
# XXX: Extract forms from Russian Wiktionary
# XXX: Extract grammatical tags (gender, etc.) from Russian Wiktionary

elif section_title == "Произношение":
if wxr.config.capture_pronunciation:
extract_pronunciation(wxr, page_data, level3_node)
elif section_title == "Семантические свойства": # Semantic properties
process_semantic_section(wxr, page_data, level3_node)
elif section_title == "Значение":
pass
elif section_title == "Родственные слова": # Word family
if wxr.config.capture_linkages:
pass
elif section_title == "Этимология":
if wxr.config.capture_etymologies:
# XXX: Extract etymology
pass
elif section_title == "Фразеологизмы и устойчивые сочетания":
if wxr.config.capture_linkages:
pass
elif section_title == "Перевод":
if wxr.config.capture_translations:
pass
elif section_title in ["Анаграммы", "Метаграммы", "Синонимы", "Антонимы"]:
pass
elif section_title == "Библиография":
pass
elif section_title in ["Латиница (Latinça)", "Латиница (Latinca)"]:
pass
elif section_title == "Иноязычные аналоги":
pass
elif section_title == "Прочее":
pass
else:
wxr.wtp.debug(
f"Unprocessed section {section_title}",
sortid="wixtextract/extractor/ru/page/parse_section/66",
)


def parse_page(
wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
# Help site describing page structure: https://ru.wiktionary.org/wiki/Викисловарь:Правила_оформления_статей

if wxr.config.verbose:
logging.info(f"Parsing page: {page_title}")

wxr.config.word = page_title
wxr.wtp.start_page(page_title)

# Parse the page, pre-expanding those templates that are likely to
# influence parsing
tree = wxr.wtp.parse(
page_text,
pre_expand=True,
additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
)

page_data: list[WordEntry] = []
for level1_node in tree.find_child(NodeKind.LEVEL1):
for subtitle_template in level1_node.find_content(NodeKind.TEMPLATE):
lang_code = (
subtitle_template.template_name.strip()
.removeprefix("-")
.removesuffix("-")
)

if (
wxr.config.capture_language_codes is not None
and lang_code not in wxr.config.capture_language_codes
):
continue

categories = {"categories": []}

lang_name = clean_node(wxr, categories, subtitle_template)
wxr.wtp.start_section(lang_name)

base_data = WordEntry(
lang_name=lang_name, lang_code=lang_code, word=wxr.wtp.title
)
base_data.categories.extend(categories["categories"])

for non_level23_node in level1_node.invert_find_child(
NodeKind.LEVEL2 | NodeKind.LEVEL3
):
IGNORED_TEMPLATES = ["wikipedia", "Омонимы", "improve"]
if not (
isinstance(non_level23_node, WikiNode)
and non_level23_node.kind == NodeKind.TEMPLATE
and non_level23_node.template_name in IGNORED_TEMPLATES
):
wxr.wtp.debug(
f"Found unexpected child in level node {level1_node.largs}: {non_level23_node}",
sortid="extractor/es/page/parse_page/80",
)

for level2_node in level1_node.find_child(NodeKind.LEVEL2):
page_data.append(copy.deepcopy(base_data))
for level3_node in level2_node.find_child(NodeKind.LEVEL3):
parse_section(wxr, page_data, level3_node)

page_data.append(copy.deepcopy(base_data))
for level3_node in level1_node.find_child(NodeKind.LEVEL3):
parse_section(wxr, page_data, level3_node)

return [d.model_dump(exclude_defaults=True) for d in page_data]
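
In outline, then, parse_page expects one level-1 section per language, with a heading template such as {{-ru-}} whose name, stripped of leading and trailing hyphens, yields lang_code; optional level-2 homonym sections inside it each produce their own WordEntry, and the level-3 sections ("Морфологические и синтаксические свойства", "Произношение", "Семантические свойства", and so on) are dispatched through parse_section for each entry.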
12 changes: 12 additions & 0 deletions src/wiktextract/extractor/ru/pronunciation.py
@@ -0,0 +1,12 @@
from wikitextprocessor.parser import LevelNode

from wiktextract.extractor.ru.models import WordEntry
from wiktextract.wxr_context import WiktextractContext


def extract_pronunciation(
wxr: WiktextractContext,
page_data: list[WordEntry],
level_node: LevelNode,
):
pass
