Skip to content

Commit

Permalink
Merge pull request #985 from xxyzz/pt
Browse files Browse the repository at this point in the history
[pt, it] extract "form-of" words
  • Loading branch information
xxyzz authored Jan 13, 2025
2 parents 4ba5975 + 12c0a11 commit debb661
Show file tree
Hide file tree
Showing 7 changed files with 180 additions and 85 deletions.
5 changes: 5 additions & 0 deletions src/wiktextract/extractor/it/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,18 @@ class Example(ItalianBaseModel):
raw_tags: list[str] = []


class AltForm(ItalianBaseModel):
# Reference to another word, identified only by its text.
# Used as the element type of `Sense.form_of`.
word: str


class Sense(ItalianBaseModel):
# Data collected for one sense of a word entry.
# NOTE(review): the `= []` defaults rely on the base model copying
# defaults per instance (presumably a pydantic model) - confirm.
# Gloss strings; the gloss extractor appends each non-empty gloss here.
glosses: list[str] = []
tags: list[str] = []
# Raw labels as found on the page; translate_raw_tags(sense) is called
# after the gloss is stored (presumably to map these onto `tags`).
raw_tags: list[str] = []
categories: list[str] = []
examples: list[Example] = []
topics: list[str] = []
# Words this sense is a form of; filled by extract_form_of_word when the
# word entry carries the "form-of" tag.
form_of: list[AltForm] = []


class Translation(ItalianBaseModel):
Expand Down
16 changes: 15 additions & 1 deletion src/wiktextract/extractor/it/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from ...page import clean_node
from ...wxr_context import WiktextractContext
from .example import extract_example_list_item
from .models import Sense, WordEntry
from .models import AltForm, Sense, WordEntry
from .section_titles import POS_DATA
from .tag_form_line import extract_tag_form_line_nodes
from .tags import translate_raw_tags
Expand Down Expand Up @@ -119,4 +119,18 @@ def extract_gloss_list_item(
if gloss_str != "":
sense.glosses.append(gloss_str)
translate_raw_tags(sense)
if "form-of" in word_entry.tags:
extract_form_of_word(wxr, sense, list_item)
word_entry.senses.append(sense)


def extract_form_of_word(
    wxr: WiktextractContext, sense: Sense, list_item: WikiNode
) -> None:
    """Record the word that a "form-of" gloss refers to.

    Each link node yielded by ``list_item.find_child(NodeKind.LINK)`` is
    converted to text with ``clean_node``.  The text of the last link is
    stored in ``sense.form_of`` as an ``AltForm``, unless there is no link
    or that text is empty.
    """
    link_texts = [
        clean_node(wxr, None, link_node)
        for link_node in list_item.find_child(NodeKind.LINK)
    ]
    # Only the final link counts; earlier ones are cleaned but not kept.
    if link_texts and link_texts[-1] != "":
        sense.form_of.append(AltForm(word=link_texts[-1]))
97 changes: 97 additions & 0 deletions src/wiktextract/extractor/pt/example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import re

from wikitextprocessor import (
HTMLNode,
NodeKind,
TemplateNode,
WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Example, Sense


def extract_example_list_item(
wxr: WiktextractContext,
sense: Sense,
list_item: WikiNode,
) -> None:
# Parse one example list item into an Example and store it on `sense`.
# Each child node of `list_item` is classified by type/kind: italic text,
# a <small> HTML element, a template (OESP / tradex / Ex.), a bold node,
# a nested list, or anything else (kept in `ref_nodes` as candidate
# source-reference content).
# NOTE(review): indentation is lost in this view, so the nesting implied
# by the comments below is inferred from syntax - confirm against the
# real file before relying on it.
# NOTE(review): clean_node is called with `None` in some places and with
# `sense` in others; presumably the second argument decides whether side
# data (e.g. categories) is recorded on the sense - confirm.
example = Example()
ref_nodes = []

# `index` from enumerate() is not referenced anywhere in the loop body.
for index, node in enumerate(list_item.children):
# Italic node while no text has been found yet: it becomes the example
# text. Later italic nodes fail this test and fall to the branches below.
if (
isinstance(node, WikiNode)
and node.kind == NodeKind.ITALIC
and example.text == ""
):
example.text = clean_node(wxr, None, node)
# <small> element: its text is the translation.
elif isinstance(node, HTMLNode) and node.tag == "small":
example.translation = clean_node(wxr, None, node)
# When the text is wrapped in parentheses, strip("()") removes every
# leading and trailing "(" / ")" character, not just one pair.
if example.translation.startswith(
"("
) and example.translation.endswith(")"):
example.translation = example.translation.strip("()")
# Templates are dispatched on their name; unlisted names match no case.
elif isinstance(node, TemplateNode):
match node.template_name:
case "OESP":
# Cleaned template output, minus surrounding parentheses, is the ref.
example.ref = clean_node(wxr, sense, node).strip("()")
case "tradex":
# Positional parameter 2 is the text, parameter 3 the translation.
example.text = clean_node(
wxr, None, node.template_parameters.get(2, "")
)
example.translation = clean_node(
wxr, None, node.template_parameters.get(3, "")
)
# Return value discarded: called only for its effect on `sense`.
clean_node(wxr, sense, node)
case "Ex.":
# Positional parameter 1 is the example text.
example.text = clean_node(
wxr, sense, node.template_parameters.get(1, "")
)
# Bold node: only acted on when its cleaned text is all digits.
elif isinstance(node, WikiNode) and node.kind == NodeKind.BOLD:
bold_str = clean_node(wxr, None, node)
if re.fullmatch(r"\d+", bold_str) is not None:
# Text of the list item built from invert_find_child(NodeKind.LIST)
# (presumably every child that is not a nested list - confirm).
list_item_str = clean_node(
wxr, None, list(list_item.invert_find_child(NodeKind.LIST))
)
# A trailing ":" marks the item text as the reference; the example text
# is then taken from the items of the nested list(s).
if list_item_str.endswith(":"):
ref_nodes.clear()
example.ref = list_item_str
for child_list in list_item.find_child(NodeKind.LIST):
for child_list_item in child_list.find_child(
NodeKind.LIST_ITEM
):
example.text = clean_node(
wxr, None, child_list_item.children
)
# NOTE(review): which loop this `break` leaves (inner, outer, or the main
# child loop) cannot be determined from this flattened view.
break
# Nested list: drop what was collected so far and keep the children of
# each nested list item as reference material instead.
elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
ref_nodes.clear()
for child_list_item in node.find_child(NodeKind.LIST_ITEM):
ref_nodes.append(child_list_item.children)
# Anything else is kept as possible reference content.
else:
ref_nodes.append(node)

# With example text found: a still-empty ref is filled from the collected
# ref_nodes (trimmed of ":", "(", ")", spaces and newlines) and the example
# is appended to sense.examples. Without text, fall back to treating the
# whole list item as plain text via extract_example_text_list.
if example.text != "":
if example.ref == "":
example.ref = clean_node(wxr, sense, ref_nodes).strip(":() \n")
sense.examples.append(example)
else:
extract_example_text_list(wxr, sense, list_item)


def extract_example_text_list(
    wxr: WiktextractContext, sense: Sense, list_item: WikiNode
) -> None:
    """Store the plain text of a list item as an example on ``sense``.

    The text is built from the nodes yielded by
    ``list_item.invert_find_child(NodeKind.LIST)``.  If it contains a
    hyphen, the part before the first ``-`` is kept as the example text and
    the part after it becomes the translation, both stripped of surrounding
    whitespace.  The example is appended to ``sense.examples`` either way.
    """
    source_nodes = list(list_item.invert_find_child(NodeKind.LIST))
    example = Example(text=clean_node(wxr, sense, source_nodes))
    # NOTE(review): the split is on the first "-" of any kind, so a
    # hyphenated word in the example text is split as well.
    before, hyphen, after = example.text.partition("-")
    if hyphen:
        example.translation = after.strip()
        example.text = before.strip()
    sense.examples.append(example)
5 changes: 5 additions & 0 deletions src/wiktextract/extractor/pt/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,18 @@ class Example(PortugueseBaseModel):
ref: str = ""


class AltForm(PortugueseBaseModel):
# Reference to another word, identified only by its text.
# Used as the element type of `Sense.form_of`.
word: str


class Sense(PortugueseBaseModel):
# Data collected for one sense of a word entry.
# NOTE(review): the `= []` defaults rely on the base model copying
# defaults per instance (presumably a pydantic model) - confirm.
# Gloss strings; the gloss extractor appends each non-empty gloss here.
glosses: list[str] = []
tags: list[str] = []
# Raw labels as found on the page; translate_raw_tags(sense) is called
# after the gloss is stored (presumably to map these onto `tags`).
raw_tags: list[str] = []
categories: list[str] = []
topics: list[str] = []
# Examples appended by the example extractors in example.py.
examples: list[Example] = []
# Words this sense is a form of; filled by extract_form_of_word when the
# word entry carries the "form-of" tag.
form_of: list[AltForm] = []


class Translation(PortugueseBaseModel):
Expand Down
95 changes: 11 additions & 84 deletions src/wiktextract/extractor/pt/pos.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import re

from wikitextprocessor import (
HTMLNode,
LevelNode,
NodeKind,
TemplateNode,
Expand All @@ -10,9 +9,10 @@

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .example import extract_example_list_item
from .head_line import extract_head_line_nodes
from .inflection import extract_flex_template
from .models import Example, Linkage, Sense, WordEntry
from .models import AltForm, Linkage, Sense, WordEntry
from .section_titles import POS_DATA
from .tags import translate_raw_tags

Expand Down Expand Up @@ -75,6 +75,8 @@ def extract_gloss_list_item(
if len(gloss_str) > 0:
sense.glosses.append(gloss_str)
translate_raw_tags(sense)
if "form-of" in word_entry.tags:
extract_form_of_word(wxr, sense, list_item)
word_entry.senses.append(sense)

for child_list in list_item.find_child(NodeKind.LIST):
Expand Down Expand Up @@ -112,86 +114,11 @@ def extract_escopo2_template(
return raw_tags


def extract_example_list_item(
wxr: WiktextractContext,
sense: Sense,
list_item: WikiNode,
def extract_form_of_word(
wxr: WiktextractContext, sense: Sense, list_item: WikiNode
) -> None:
example = Example()
ref_nodes = []

for index, node in enumerate(list_item.children):
if (
isinstance(node, WikiNode)
and node.kind == NodeKind.ITALIC
and example.text == ""
):
example.text = clean_node(wxr, None, node)
elif isinstance(node, HTMLNode) and node.tag == "small":
example.translation = clean_node(wxr, None, node)
if example.translation.startswith(
"("
) and example.translation.endswith(")"):
example.translation = example.translation.strip("()")
elif isinstance(node, TemplateNode):
match node.template_name:
case "OESP":
example.ref = clean_node(wxr, sense, node).strip("()")
case "tradex":
example.text = clean_node(
wxr, None, node.template_parameters.get(2, "")
)
example.translation = clean_node(
wxr, None, node.template_parameters.get(3, "")
)
clean_node(wxr, sense, node)
case "Ex.":
example.text = clean_node(
wxr, sense, node.template_parameters.get(1, "")
)
elif isinstance(node, WikiNode) and node.kind == NodeKind.BOLD:
bold_str = clean_node(wxr, None, node)
if re.fullmatch(r"\d+", bold_str) is not None:
list_item_str = clean_node(
wxr, None, list(list_item.invert_find_child(NodeKind.LIST))
)
if list_item_str.endswith(":"):
ref_nodes.clear()
example.ref = list_item_str
for child_list in list_item.find_child(NodeKind.LIST):
for child_list_item in child_list.find_child(
NodeKind.LIST_ITEM
):
example.text = clean_node(
wxr, None, child_list_item.children
)
break
elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
ref_nodes.clear()
for child_list_item in node.find_child(NodeKind.LIST_ITEM):
ref_nodes.append(child_list_item.children)
else:
ref_nodes.append(node)

if example.text != "":
if example.ref == "":
example.ref = clean_node(wxr, sense, ref_nodes).strip(":() \n")
sense.examples.append(example)
else:
extract_example_text_list(wxr, sense, list_item)


def extract_example_text_list(
wxr: WiktextractContext,
sense: Sense,
list_item: WikiNode,
) -> None:
list_item_text = clean_node(
wxr, sense, list(list_item.invert_find_child(NodeKind.LIST))
)
example = Example(text=list_item_text)
if "-" in example.text:
tr_start = example.text.index("-")
example.translation = example.text[tr_start + 1 :].strip()
example.text = example.text[:tr_start].strip()
sense.examples.append(example)
form_of = ""
for link_node in list_item.find_child_recursively(NodeKind.LINK):
form_of = clean_node(wxr, None, link_node)
if form_of != "":
sense.form_of.append(AltForm(word=form_of))
14 changes: 14 additions & 0 deletions tests/test_it_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,3 +158,17 @@ def test_subsecton_template_add_new_word_entry(self):
},
],
)

def test_form_of(self):
    # An Italian inflected-form section ("forma flessa") whose gloss links
    # to the lemma must yield that lemma in the sense's "form_of" list.
    self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
    wikitext = """== {{-it-}} ==
===Sostantivo, forma flessa===
# plurale di [[cane]]"""
    page_data = parse_page(self.wxr, "cani", wikitext)
    expected_senses = [
        {"glosses": ["plurale di cane"], "form_of": [{"word": "cane"}]}
    ]
    self.assertEqual(page_data[0]["senses"], expected_senses)
33 changes: 33 additions & 0 deletions tests/test_pt_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,3 +93,36 @@ def test_nested_list(self):
{"glosses": ["médio", "relativo à média;"]},
],
)

def test_form_of_bold(self):
    # The lemma link is wrapped in bold markup here, so it is not a direct
    # child of the list item; it must still be found.
    self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
    wikitext = """={{-pt-}}=
==Forma de substantivo==
# plural de '''[[cão]]'''"""
    page_data = parse_page(self.wxr, "cães", wikitext)
    expected_senses = [
        {"glosses": ["plural de cão"], "form_of": [{"word": "cão"}]}
    ]
    self.assertEqual(page_data[0]["senses"], expected_senses)

def test_form_of_link(self):
    # A plain link followed by a parenthesised remark: the remark stays in
    # the gloss text, and only the linked word goes into "form_of".
    self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
    wikitext = """={{-pt-}}=
==Forma de substantivo==
# feminino plural de [[cão]] (cruel, brutal)"""
    page_data = parse_page(self.wxr, "cãs", wikitext)
    expected_sense = {
        "glosses": ["feminino plural de cão (cruel, brutal)"],
        "form_of": [{"word": "cão"}],
    }
    self.assertEqual(page_data[0]["senses"], [expected_sense])

0 comments on commit debb661

Please sign in to comment.