Skip to content

Commit

Permalink
[it] extract translation section
Browse files Browse the repository at this point in the history
  • Loading branch information
xxyzz committed Dec 12, 2024
1 parent 049ee8d commit 22ee0bf
Show file tree
Hide file tree
Showing 4 changed files with 156 additions and 0 deletions.
14 changes: 14 additions & 0 deletions src/wiktextract/extractor/it/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,19 @@ class Sense(ItalianBaseModel):
examples: list[Example] = []


class Translation(ItalianBaseModel):
    # One translated term together with its language and optional details.
    # Every field has an empty default.
    lang_code: str = Field(
        "", description="Wiktionary language code of the translation term"
    )
    lang: str = Field("", description="Translation language name")
    word: str = Field("", description="Translation term")
    sense: str = Field("", description="Translation gloss")
    # Tag lists and romanization carry no schema description.
    tags: list[str] = Field(default=[])
    raw_tags: list[str] = Field(default=[])
    roman: str = Field(default="")


class WordEntry(ItalianBaseModel):
model_config = ConfigDict(title="Italian Wiktionary")
word: str = Field(description="Word string", min_length=1)
Expand All @@ -41,3 +54,4 @@ class WordEntry(ItalianBaseModel):
categories: list[str] = []
tags: list[str] = []
raw_tags: list[str] = []
translations: list[Translation] = []
3 changes: 3 additions & 0 deletions src/wiktextract/extractor/it/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from .models import Sense, WordEntry
from .pos import extract_pos_section
from .section_titles import POS_DATA
from .translation import extract_translation_section


def parse_section(
Expand All @@ -18,6 +19,8 @@ def parse_section(
title_text = clean_node(wxr, None, level_node.largs)
if title_text in POS_DATA:
extract_pos_section(wxr, page_data, base_data, level_node, title_text)
elif title_text == "Traduzione":
extract_translation_section(wxr, page_data, level_node)

for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
parse_section(wxr, page_data, base_data, next_level)
Expand Down
85 changes: 85 additions & 0 deletions src/wiktextract/extractor/it/translation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import re

from mediawiki_langcodes import name_to_code
from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Translation, WordEntry


def extract_translation_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: LevelNode,
) -> None:
    """Extract the translations of a section and attach them to entries.

    Direct children of ``level_node`` are scanned in order: a ``Trad1``
    template sets the gloss used for the lists that follow it, and every
    list item of a list node is parsed into translations.  The collected
    translations and categories are appended to each entry in ``page_data``
    whose ``lang_code`` equals that of the last entry.

    :param wxr: Extraction context passed through to ``clean_node``.
    :param page_data: Entries extracted so far; modified in place.
    :param level_node: The translation section node.
    """
    current_sense = ""
    found = []
    category_data = {}
    for child in level_node.children:
        if not isinstance(child, WikiNode):
            # Plain strings carry nothing at this level.
            continue
        if isinstance(child, TemplateNode) and child.template_name == "Trad1":
            first_arg = child.template_parameters.get(1, "")
            current_sense = clean_node(wxr, category_data, first_arg)
        elif child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                found += extract_translation_list_item(
                    wxr, item, current_sense
                )

    new_categories = category_data.get("categories", [])
    for entry in page_data:
        # Only entries in the same language as the newest entry are updated.
        if entry.lang_code == page_data[-1].lang_code:
            entry.translations.extend(found)
            entry.categories.extend(new_categories)


# Single-letter gender abbreviations and the tag each one maps to.
TR_GENDER_TAGS = dict(
    c="common",
    f="feminine",
    m="masculine",
    n="neuter",
)


def extract_translation_list_item(
wxr: WiktextractContext, list_item: WikiNode, sense: str
) -> list[Translation]:
translations = []
lang_name = "unknown"
lang_code = "unknown"
before_colon = True
for index, node in enumerate(list_item.children):
if before_colon and isinstance(node, str) and ":" in node:
before_colon = False
lang_name = clean_node(wxr, None, list_item.children[:index])
for n in list_item.children[:index]:
if isinstance(n, TemplateNode):
lang_code = n.template_name
break
if lang_code == "unknown":
new_lang_code = name_to_code(lang_name, "it")
if new_lang_code != "":
lang_code = new_lang_code
elif not before_colon and isinstance(node, WikiNode):
match node.kind:
case NodeKind.LINK:
word = clean_node(wxr, None, node)
if word != "":
translations.append(
Translation(
word=word,
sense=sense,
lang=lang_name,
lang_code=lang_code,
)
)
case NodeKind.ITALIC:
raw_tag = clean_node(wxr, None, node)
if raw_tag in TR_GENDER_TAGS and len(translations) > 0:
translations[-1].tags.append(TR_GENDER_TAGS[raw_tag])
elif raw_tag != "" and len(translations) > 0:
translations[-1].raw_tags.append(raw_tag)
elif not before_colon and isinstance(node, str):
m = re.search(r"\((.+)\)", node)
if m is not None and len(translations) > 0:
translations[-1].roman = m.group(1)

return translations
54 changes: 54 additions & 0 deletions tests/test_it_translation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from unittest import TestCase

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.it.page import parse_page
from wiktextract.wxr_context import WiktextractContext


class TestItGloss(TestCase):
    """Tests for translation extraction from Italian Wiktionary pages."""

    maxDiff = None

    def setUp(self) -> None:
        wtp = Wtp(lang_code="it")
        config = WiktionaryConfig(
            dump_file_lang_code="it", capture_language_codes=None
        )
        self.wxr = WiktextractContext(wtp, config)

    def test_common_lists(self):
        # One list item names its language with a template, the other with
        # a link; both carry a gender marker and share the same gloss.
        self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
        self.wxr.wtp.add_page("Template:ar", 10, "arabo")
        wikitext = """== {{-it-}} ==
===Sostantivo===
# [[animale]]
===Traduzione===
{{Trad1|animale}}
:*{{ar}}: [[كَلْب]] (kalb) ''m''
:*[[romagnolo]]: [[chèn]] ''m''"""
        page_data = parse_page(self.wxr, "cane", wikitext)
        arabic = {
            "word": "كَلْب",
            "lang_code": "ar",
            "lang": "arabo",
            "roman": "kalb",
            "tags": ["masculine"],
            "sense": "animale",
        }
        romagnol = {
            "word": "chèn",
            "lang_code": "rgn",
            "lang": "romagnolo",
            "tags": ["masculine"],
            "sense": "animale",
        }
        self.assertEqual(page_data[0]["translations"], [arabic, romagnol])

0 comments on commit 22ee0bf

Please sign in to comment.