Skip to content

Commit

Permalink
[ru] extract stressed form in "з" template
Browse files Browse the repository at this point in the history
  • Loading branch information
xxyzz committed Dec 27, 2024
1 parent e1aaf33 commit c1ed206
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 4 deletions.
27 changes: 24 additions & 3 deletions src/wiktextract/extractor/ru/page.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import re
from typing import Any, Optional
from typing import Any

from wikitextprocessor.parser import (
LEVEL_KIND_FLAGS,
LevelNode,
NodeKind,
TemplateNode,
WikiNode,
Expand Down Expand Up @@ -71,7 +72,7 @@ def process_semantic_section(

def get_pos_from_template(
wxr: WiktextractContext, template_node: TemplateNode
) -> Optional[POSSubtitleData]:
) -> POSSubtitleData | None:
# Search for POS in template names
template_name = template_node.template_name.lower()
if template_name == "morph":
Expand Down Expand Up @@ -103,7 +104,7 @@ def get_pos_from_template(

def get_pos(
wxr: WiktextractContext, level_node: WikiNode
) -> Optional[POSSubtitleData]:
) -> POSSubtitleData | None:
for template_node in level_node.find_child(NodeKind.TEMPLATE):
pos_data = get_pos_from_template(wxr, template_node)
if pos_data is not None:
Expand Down Expand Up @@ -287,6 +288,7 @@ def parse_page(
base_data.pos = pos_data["pos"]
base_data.tags.extend(pos_data.get("tags", []))
page_data.append(base_data.model_copy(deep=True))
extract_level2_node_contents(wxr, page_data[-1], level2_node)
has_level3 = False
for level3_node in level2_node.find_child(NodeKind.LEVEL3):
parse_section(wxr, page_data, level3_node)
Expand Down Expand Up @@ -436,3 +438,22 @@ def extract_zh_forms_template(
base_data.forms.append(form_data)
elif p_name == "lit":
base_data.literal_meaning = clean_node(wxr, None, p_value)


def extract_level2_node_contents(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Add the stressed spelling given by a heading template to the entry.

    Walks the template nodes yielded by
    ``level_node.find_content(NodeKind.TEMPLATE)`` and, for each
    "заголовок" / "з" template, reads its "ударение" argument.  Anything
    from the first "(" onwards is dropped (e.g. a trailing
    "(существительное I)" disambiguation note).  If the remaining text is
    non-empty and differs from the page title, it is appended to
    ``word_entry.forms`` with the "stressed" tag.
    """
    heading_template_names = ("заголовок", "з")
    for template in level_node.find_content(NodeKind.TEMPLATE):
        if template.template_name not in heading_template_names:
            continue
        # https://ru.wiktionary.org/wiki/Шаблон:з
        raw_value = template.template_parameters.get("ударение", "")
        stressed = clean_node(wxr, None, raw_value)
        # Keep only the text before the first opening parenthesis.
        before_paren, paren, _ = stressed.partition("(")
        if paren:
            stressed = before_paren.strip()
        # Nothing to record when the value is empty or equals the title.
        if stressed == "" or stressed == wxr.wtp.title:
            continue
        word_entry.forms.append(Form(form=stressed, tags=["stressed"]))
16 changes: 16 additions & 0 deletions tests/test_ru_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,3 +348,19 @@ def test_no_level3_title(self):
}
],
)

def test_stressed_form(self):
    """Check that the `ударение` argument of the heading template is
    extracted as a form tagged "stressed", without its parenthesised note.
    """
    self.wxr.wtp.add_page("Шаблон:-ru-", 10, "Русский")
    wikitext = """= {{-ru-}} =
== {{заголовок|ударение=коса́ (существительное I)}} ==
{{сущ-ru|коса́|f ina 1d|слоги={{по-слогам|ко|са́}}}}
=== Семантические свойства ===
==== Значение ====
# [[вид]] укладки [[волос]]"""
    page_data = parse_page(self.wxr, "-ful", wikitext)
    expected_forms = [{"form": "коса́", "tags": ["stressed"]}]
    self.assertEqual(page_data[0]["forms"], expected_forms)
2 changes: 1 addition & 1 deletion tests/test_ru_sound.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,5 @@ def test_homophone_section_list(self):
extract_homophone_section(self.wxr, data, root)
self.assertEqual(
[s.model_dump(exclude_defaults=True) for s in data.sounds],
[{"homophones": ["ไท", "ไถ"]}]
[{"homophones": ["ไท", "ไถ"]}],
)

0 comments on commit c1ed206

Please sign in to comment.