Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add pydantic models and parse_page for Spanish Wiktionary #392

Merged
merged 7 commits into the base branch from the contributor's branch
Dec 1, 2023
Prev Previous commit
Next Next commit
Extract glosses from Spanish Wiktionary
This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France.
empiriker committed Nov 28, 2023
commit aed70aa7322fd5dc2f560bf6bbb9496507867fd8
60 changes: 60 additions & 0 deletions src/wiktextract/extractor/es/gloss.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import re
from typing import List
from wiktextract.extractor.es.models import Sense, WordEntry
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext
from wikitextprocessor import WikiNode, NodeKind
from wikitextprocessor.parser import WikiNodeChildrenList


def extract_gloss(
    wxr: WiktextractContext,
    page_data: List[WordEntry],
    list_node: WikiNode,
) -> None:
    """Extract sense data from a Spanish Wiktionary definition list.

    Each list item of *list_node* produces one ``Sense`` appended to the
    last entry in *page_data*. The item head (before the colon, e.g.
    ";1 {{sentimientos}}:") carries the sense number and optional tag
    templates; the item definition carries the gloss text itself.
    """
    for list_item in list_node.find_child(NodeKind.LIST_ITEM):
        gloss_data = Sense(glosses=[])

        # Nested sub-lists are not part of the gloss text proper;
        # collect them separately so they can be reported below.
        definition: WikiNodeChildrenList = []
        other: WikiNodeChildrenList = []

        for node in list_item.definition:
            if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
                other.append(node)
            else:
                definition.append(node)

        gloss = clean_node(wxr, gloss_data, definition)
        gloss_data.glosses.append(gloss)

        # The cleaned item head, e.g. "1 Humanidades." — number first,
        # then any tag text contributed by templates.
        gloss_note = clean_node(wxr, gloss_data, list_item.children)

        match = re.match(r"^(\d+)", gloss_note)
        if match:
            # Pydantic models take attribute assignment, not item
            # assignment ("gloss_data[...]" raises TypeError).
            gloss_data.senseid = int(match.group(1))
            tag_string = gloss_note[match.end() :].strip()
        else:
            tag_string = gloss_note.strip()

        # Split tags on commas or the Spanish conjunction "y";
        # "\b...\b" keeps a "y" inside a word (e.g. "Zoología")
        # from becoming a split point.
        for tag in re.split(r",|\by\b", tag_string):
            tag = (
                tag.strip()
                .removesuffix(".")
                .removesuffix("Main")
                .removeprefix("Main")
            )
            if tag:
                gloss_data.tags.append(tag)

        if other:
            wxr.wtp.debug(
                f"Found nodes that are not part of definition: {other}",
                sortid="extractor/es/gloss/extract_gloss/46",
            )

        page_data[-1].senses.append(gloss_data)
3 changes: 3 additions & 0 deletions src/wiktextract/extractor/es/models.py
Original file line number Diff line number Diff line change
@@ -79,6 +79,9 @@ class Sense(LoggingExtraFieldsModel):
subsenses: list["Sense"] = Field(
default=[], description="List of subsenses"
)
senseid: Optional[int] = Field(
default=None, description="Sense number used in Wiktionary"
)


class WordEntry(LoggingExtraFieldsModel):
11 changes: 8 additions & 3 deletions src/wiktextract/extractor/es/page.py
Original file line number Diff line number Diff line change
@@ -5,6 +5,7 @@

from wikitextprocessor import NodeKind, WikiNode
from wiktextract.datautils import append_base_data
from wiktextract.extractor.es.gloss import extract_gloss
from wiktextract.extractor.es.pronunciation import extract_pronunciation
from wiktextract.extractor.es.models import WordEntry, PydanticLogger

@@ -76,9 +77,13 @@ def process_pos_block(
):
# XXX: Extract forms
pass
elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
# XXX: Extract data
pass
elif (
isinstance(child, WikiNode)
and child.kind == NodeKind.LIST
and child.sarg == ";"
):
extract_gloss(wxr, page_data, child)

else:
# XXX: Extract data
pass
88 changes: 88 additions & 0 deletions tests/test_es_gloss.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
from typing import List
import unittest

from wikitextprocessor import Wtp
from wiktextract.extractor.es.gloss import extract_gloss
from wiktextract.extractor.es.models import WordEntry

from wiktextract.config import WiktionaryConfig
from wiktextract.wxr_context import WiktextractContext


class TestESGloss(unittest.TestCase):
    """Tests for gloss extraction from Spanish Wiktionary list nodes."""

    def setUp(self) -> None:
        # Fresh parser context per test keeps the page database isolated.
        self.wxr = WiktextractContext(
            Wtp(lang_code="es"),
            WiktionaryConfig(dump_file_lang_code="es"),
        )

    def tearDown(self) -> None:
        self.wxr.wtp.close_db_conn()

    def get_default_page_data(self) -> List[WordEntry]:
        # Minimal one-entry page data for extract_gloss to append to.
        return [WordEntry(word="test", lang_code="es", lang_name="Language")]

    def test_es_extract_glosses(self):
        # https://es.wiktionary.org/wiki/ayudar
        self.wxr.wtp.add_page("Plantilla:plm", 10, "Contribuir")
        self.wxr.wtp.start_page("")

        tree = self.wxr.wtp.parse(
            """;1: {{plm|contribuir}} [[esfuerzo]] o [[recurso]]s para la [[realización]] de algo.
;2: Por antonomasia, [[cooperar]] a que alguno [[salir|salga]] de una [[situación]] [[dificultoso|dificultosa]]"""
        )

        data = self.get_default_page_data()
        extract_gloss(self.wxr, data, tree.children[0])

        expected = [
            {
                "glosses": [
                    "Contribuir esfuerzo o recursos para la realización de algo."
                ],
                "senseid": 1,
            },
            {
                "glosses": [
                    "Por antonomasia, cooperar a que alguno salga de una situación dificultosa"
                ],
                "senseid": 2,
            },
        ]
        self.assertEqual(
            data[0].model_dump(exclude_defaults=True)["senses"], expected
        )

    def test_es_extract_gloss_categories(self):
        # https://es.wiktionary.org/wiki/amor
        self.wxr.wtp.add_page("Plantilla:plm", 10, "Sentimiento")
        self.wxr.wtp.add_page(
            "Plantilla:sentimientos",
            10,
            "Humanidades. [[Categoría:ES:Sentimientos]]",
        )
        self.wxr.wtp.start_page("")

        tree = self.wxr.wtp.parse(
            ";1 {{sentimientos}}: {{plm|sentimiento}} [[afectivo]] de [[atracción]], [[unión]] y [[afinidad]] que se experimenta hacia una persona, animal o cosa"
        )

        data = self.get_default_page_data()
        extract_gloss(self.wxr, data, tree.children[0])

        expected = [
            {
                "glosses": [
                    "Sentimiento afectivo de atracción, unión y afinidad que se experimenta hacia una persona, animal o cosa"
                ],
                "senseid": 1,
                "tags": ["Humanidades"],
                "categories": ["ES:Sentimientos"],
            }
        ]
        self.assertEqual(
            data[0].model_dump(exclude_defaults=True)["senses"], expected
        )