Skip to content

Commit

Permalink
[it] extract "form-of" words
Browse files Browse the repository at this point in the history
  • Loading branch information
xxyzz committed Jan 13, 2025
1 parent 2cbb6cf commit 12c0a11
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 1 deletion.
5 changes: 5 additions & 0 deletions src/wiktextract/extractor/it/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,18 @@ class Example(ItalianBaseModel):
raw_tags: list[str] = []


# Reference to another word, identified only by its text.
# Used as the element type of `Sense.form_of`.
class AltForm(ItalianBaseModel):
    word: str  # the referenced word; required, no default


# One sense (numbered definition) of a word entry.
# Every field defaults to an empty list.
# NOTE(review): the shared-looking `[]` defaults are presumably safe because
# ItalianBaseModel is a pydantic model, which copies defaults per instance —
# confirm against the base class definition.
class Sense(ItalianBaseModel):
    # Gloss strings; `extract_gloss_list_item` appends the non-empty gloss text.
    glosses: list[str] = []
    tags: list[str] = []
    # Presumably the unnormalised labels that `translate_raw_tags` maps into
    # `tags` — TODO confirm in tags.py.
    raw_tags: list[str] = []
    categories: list[str] = []
    examples: list[Example] = []
    topics: list[str] = []
    # Target words of a "form-of" sense; filled by `extract_form_of_word`
    # when the word entry carries the "form-of" tag.
    form_of: list[AltForm] = []


class Translation(ItalianBaseModel):
Expand Down
16 changes: 15 additions & 1 deletion src/wiktextract/extractor/it/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from ...page import clean_node
from ...wxr_context import WiktextractContext
from .example import extract_example_list_item
from .models import Sense, WordEntry
from .models import AltForm, Sense, WordEntry
from .section_titles import POS_DATA
from .tag_form_line import extract_tag_form_line_nodes
from .tags import translate_raw_tags
Expand Down Expand Up @@ -119,4 +119,18 @@ def extract_gloss_list_item(
if gloss_str != "":
sense.glosses.append(gloss_str)
translate_raw_tags(sense)
if "form-of" in word_entry.tags:
extract_form_of_word(wxr, sense, list_item)
word_entry.senses.append(sense)


def extract_form_of_word(
    wxr: WiktextractContext,
    sense: Sense,
    list_item: WikiNode,
) -> None:
    """Record the word that a "form-of" gloss list item points to.

    The text of every link node yielded by ``list_item.find_child`` is
    cleaned with ``clean_node``; if the text of the last one is non-empty
    it is appended to ``sense.form_of`` as an ``AltForm``. Nothing is
    appended when the item has no link nodes.

    Args:
        wxr: Extraction context passed through to ``clean_node``.
        sense: Sense being built; mutated in place.
        list_item: Gloss list item node to search for links.
    """
    link_texts = [
        clean_node(wxr, None, link_node)
        for link_node in list_item.find_child(NodeKind.LINK)
    ]
    # Only the final link is used — presumably because the lemma comes last
    # in glosses such as "plurale di [[cane]]"; confirm for other patterns.
    if link_texts and link_texts[-1] != "":
        sense.form_of.append(AltForm(word=link_texts[-1]))
14 changes: 14 additions & 0 deletions tests/test_it_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,3 +158,17 @@ def test_subsecton_template_add_new_word_entry(self):
},
],
)

def test_form_of(self):
    """An inflected-form section whose gloss links a word yields `form_of`."""
    self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
    # The wikitext lines stay at column 0: they are part of the string literal.
    wikitext = """== {{-it-}} ==
===Sostantivo, forma flessa===
# plurale di [[cane]]"""
    page_data = parse_page(self.wxr, "cani", wikitext)
    expected_senses = [
        {"glosses": ["plurale di cane"], "form_of": [{"word": "cane"}]}
    ]
    self.assertEqual(page_data[0]["senses"], expected_senses)

0 comments on commit 12c0a11

Please sign in to comment.