Skip to content

Commit

Permalink
[de] add some tags in "Flexion" level 2 section title
Browse files (browse the repository at this point in the history)
  • Loading branch information
xxyzz committed Jan 8, 2025
1 parent b4b8b9f commit 1d25ac8
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 14 deletions.
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/de/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def match_sense_index(sense_idx: str, sense: Sense) -> bool:

if sense_idx == sense.sense_index:
return True
first_number_str = re.split(r",|\.|-", sense.sense_index, 1)[0]
first_number_str = re.split(r",|\.|-", sense.sense_index, maxsplit=1)[0]
first_number = 0
if first_number_str.isdigit():
first_number = int(first_number_str)
Expand Down
40 changes: 27 additions & 13 deletions src/wiktextract/extractor/de/flexion.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
from dataclasses import dataclass

from wikitextprocessor import NodeKind
from wikitextprocessor.parser import HTMLNode, TemplateNode
from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Form, WordEntry
from .tags import translate_raw_tags


# Raw tags searched for (by substring match) in the title text of each
# level-2 section while parsing a "Flexion" page. Every tag found in a title
# is collected and passed along to the "Deutsch Verb" templates processed
# under that section.
# NOTE(review): a frozenset has no defined iteration order, so when a title
# contains both strings the order in which they are collected may differ
# between runs — confirm downstream tag handling does not depend on order.
LEVEL2_TAGS = frozenset(["untrennbar", "unregelmäßig"])


def parse_flexion_page(
wxr: WiktextractContext, word_entry: WordEntry, page_title: str
) -> None:
Expand All @@ -19,17 +21,26 @@ def parse_flexion_page(
if flexion_page is None:
return
flexion_root = wxr.wtp.parse(flexion_page)
for flexion_template in flexion_root.find_child_recursively(
NodeKind.TEMPLATE
shared_raw_tags = []
for node in flexion_root.find_child_recursively(
NodeKind.TEMPLATE | NodeKind.LEVEL2
):
if flexion_template.template_name.startswith("Deklinationsseite"):
process_deklinationsseite_template(
wxr, word_entry, flexion_template, page_title
)
elif flexion_template.template_name.startswith("Deutsch Verb"):
process_deutsch_verb_template(
wxr, word_entry, flexion_template, page_title
)
match node.kind:
case NodeKind.LEVEL2:
shared_raw_tags.clear()
section_str = clean_node(wxr, None, node.largs)
for raw_tag in LEVEL2_TAGS:
if raw_tag in section_str:
shared_raw_tags.append(raw_tag)
case NodeKind.TEMPLATE:
if node.template_name.startswith("Deklinationsseite"):
process_deklinationsseite_template(
wxr, word_entry, node, page_title
)
elif node.template_name.startswith("Deutsch Verb"):
process_deutsch_verb_template(
wxr, word_entry, node, page_title, shared_raw_tags
)


@dataclass
Expand Down Expand Up @@ -113,6 +124,7 @@ def process_deutsch_verb_template(
word_entry: WordEntry,
template_node: TemplateNode,
page_tite: str,
shared_raw_tags: list[str],
) -> None:
# Vorlage:Deutsch Verb regelmäßig
expanded_template = wxr.wtp.parse(
Expand Down Expand Up @@ -201,7 +213,9 @@ def process_deutsch_verb_template(
":", 1
)
form = Form(
form=form_text.strip(), source=page_tite
form=form_text.strip(),
source=page_tite,
raw_tags=shared_raw_tags,
)
if form_raw_tag != "":
form.raw_tags.append(form_raw_tag)
Expand Down
1 change: 1 addition & 0 deletions src/wiktextract/extractor/de/tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,7 @@
"Infinitive": "infinitive",
"Infinitiv": "infinitive",
"Partizipien": "participle",
"unregelmäßig": "irregular",
}

VOICE_TAGS = {
Expand Down

0 comments on commit 1d25ac8

Please sign in to comment.