[it] extract translation section

tatuylonen · Dec 12, 2024 · 22ee0bf · 22ee0bf
1 parent 049ee8d
commit 22ee0bf
Show file tree

Hide file tree

Showing 4 changed files with 156 additions and 0 deletions.
diff --git a/src/wiktextract/extractor/it/models.py b/src/wiktextract/extractor/it/models.py
@@ -30,6 +30,19 @@ class Sense(ItalianBaseModel):
     examples: list[Example] = []
 
 
+class Translation(ItalianBaseModel):  # one translated term of an entry
+    lang_code: str = Field(
+        default="",
+        description="Wiktionary language code of the translation term",
+    )
+    lang: str = Field(default="", description="Translation language name")
+    word: str = Field(default="", description="Translation term")
+    sense: str = Field(default="", description="Translation gloss")
+    tags: list[str] = []  # normalized tags, e.g. "masculine"
+    raw_tags: list[str] = []  # labels kept as written in the page
+    roman: str = ""  # romanization of the term
+
+
 class WordEntry(ItalianBaseModel):
     model_config = ConfigDict(title="Italian Wiktionary")
     word: str = Field(description="Word string", min_length=1)
@@ -41,3 +54,4 @@ class WordEntry(ItalianBaseModel):
     categories: list[str] = []
     tags: list[str] = []
     raw_tags: list[str] = []
+    translations: list[Translation] = []
diff --git a/src/wiktextract/extractor/it/page.py b/src/wiktextract/extractor/it/page.py
@@ -7,6 +7,7 @@
 from .models import Sense, WordEntry
 from .pos import extract_pos_section
 from .section_titles import POS_DATA
+from .translation import extract_translation_section
 
 
 def parse_section(
@@ -18,6 +19,8 @@ def parse_section(
     title_text = clean_node(wxr, None, level_node.largs)
     if title_text in POS_DATA:
         extract_pos_section(wxr, page_data, base_data, level_node, title_text)
+    elif title_text == "Traduzione":
+        extract_translation_section(wxr, page_data, level_node)
 
     for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
         parse_section(wxr, page_data, base_data, next_level)

diff --git a/src/wiktextract/extractor/it/translation.py b/src/wiktextract/extractor/it/translation.py
@@ -0,0 +1,85 @@
+import re
+
+from mediawiki_langcodes import name_to_code
+from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .models import Translation, WordEntry
+
+
+def extract_translation_section(
+    wxr: WiktextractContext,
+    page_data: list[WordEntry],
+    level_node: LevelNode,
+) -> None:
+    """Gather the translations under `level_node` into `page_data`.
+
+    A "Trad1" template sets the gloss that is attached to the list items
+    following it. The collected translations, plus whatever `clean_node`
+    stored under "categories" while cleaning the gloss, are appended to
+    every entry whose language code equals that of the last entry in
+    `page_data`.
+    """
+    current_gloss = ""
+    collected = []
+    category_data = {}
+    for child in level_node.children:
+        if isinstance(child, TemplateNode) and child.template_name == "Trad1":
+            gloss_arg = child.template_parameters.get(1, "")
+            current_gloss = clean_node(wxr, category_data, gloss_arg)
+            continue
+        if not (isinstance(child, WikiNode) and child.kind == NodeKind.LIST):
+            continue
+        for item in child.find_child(NodeKind.LIST_ITEM):
+            collected += extract_translation_list_item(
+                wxr, item, current_gloss
+            )
+
+    # Nothing to attach the results to.
+    if len(page_data) == 0:
+        return
+    target_lang_code = page_data[-1].lang_code
+    for entry in page_data:
+        if entry.lang_code != target_lang_code:
+            continue
+        entry.translations.extend(collected)
+        entry.categories.extend(category_data.get("categories", []))
+
+
+TR_GENDER_TAGS = {  # italic gender abbreviation -> tag
+    "c": "common",
+    "f": "feminine",
+    "m": "masculine",
+    "n": "neuter",
+}
+
+
+def extract_translation_list_item(
+    wxr: WiktextractContext, list_item: WikiNode, sense: str
+) -> list[Translation]:
+    """Extract the translations found in one list item.
+
+    The part before the first string node containing ":" names the
+    language. The first template in that part supplies the language code
+    through its template name; otherwise the code is looked up from the
+    language name. After the colon, each link adds a term, italic text is
+    a gender abbreviation or a raw tag for the latest term, and
+    parenthesized plain text is stored as the latest term's romanization.
+
+    Args:
+        wxr: Context passed through to `clean_node`.
+        list_item: List item node taken from the translation list.
+        sense: Gloss copied to every translation created here.
+
+    Returns:
+        The translations in source order; empty when no link follows the
+        colon.
+    """
+    translations = []
+    lang_name = "unknown"
+    lang_code = "unknown"
+    before_colon = True
+    for index, node in enumerate(list_item.children):
+        if before_colon and isinstance(node, str) and ":" in node:
+            before_colon = False
+            lang_nodes = list_item.children[:index]
+            cleaned_name = clean_node(wxr, None, lang_nodes)
+            if cleaned_name == "":
+                # The language can be plain text sharing this string node
+                # with the colon (e.g. "inglese: "); without this fallback
+                # such a name would be lost.
+                cleaned_name = clean_node(wxr, None, node[: node.index(":")])
+            if cleaned_name != "":
+                # Otherwise keep the "unknown" placeholder instead of
+                # emitting an empty language name.
+                lang_name = cleaned_name
+            for n in lang_nodes:
+                if isinstance(n, TemplateNode):
+                    lang_code = n.template_name
+                    break
+            if lang_code == "unknown" and cleaned_name != "":
+                new_lang_code = name_to_code(cleaned_name, "it")
+                if new_lang_code != "":
+                    lang_code = new_lang_code
+        elif not before_colon and isinstance(node, WikiNode):
+            match node.kind:
+                case NodeKind.LINK:
+                    word = clean_node(wxr, None, node)
+                    if word != "":
+                        translations.append(
+                            Translation(
+                                word=word,
+                                sense=sense,
+                                lang=lang_name,
+                                lang_code=lang_code,
+                            )
+                        )
+                case NodeKind.ITALIC:
+                    # Italic text qualifies the term that precedes it, so it
+                    # is dropped when no term has been seen yet.
+                    raw_tag = clean_node(wxr, None, node)
+                    if raw_tag in TR_GENDER_TAGS and len(translations) > 0:
+                        translations[-1].tags.append(TR_GENDER_TAGS[raw_tag])
+                    elif raw_tag != "" and len(translations) > 0:
+                        translations[-1].raw_tags.append(raw_tag)
+        elif not before_colon and isinstance(node, str):
+            # Parenthesized plain text after a term is its romanization.
+            m = re.search(r"\((.+)\)", node)
+            if m is not None and len(translations) > 0:
+                translations[-1].roman = m.group(1)
+
+    return translations
diff --git a/tests/test_it_translation.py b/tests/test_it_translation.py
@@ -0,0 +1,54 @@
+from unittest import TestCase
+
+from wikitextprocessor import Wtp
+
+from wiktextract.config import WiktionaryConfig
+from wiktextract.extractor.it.page import parse_page
+from wiktextract.wxr_context import WiktextractContext
+
+
+class TestItGloss(TestCase):
+    """Checks translation section extraction for the Italian edition."""
+
+    maxDiff = None
+
+    def setUp(self) -> None:
+        wtp = Wtp(lang_code="it")
+        config = WiktionaryConfig(
+            dump_file_lang_code="it", capture_language_codes=None
+        )
+        self.wxr = WiktextractContext(wtp, config)
+
+    def test_common_lists(self):
+        # One language is given by a template, the other by a plain link.
+        for title, body in (
+            ("Template:-it-", "Italiano"),
+            ("Template:ar", "arabo"),
+        ):
+            self.wxr.wtp.add_page(title, 10, body)
+        wikitext = """== {{-it-}} ==
+===Sostantivo===
+# [[animale]]
+===Traduzione===
+{{Trad1|animale}}
+:*{{ar}}: [[كَلْب]] (kalb) ''m''
+:*[[romagnolo]]: [[chèn]] ''m''"""
+        entries = parse_page(self.wxr, "cane", wikitext)
+        expected = [
+            {
+                "word": "كَلْب",
+                "lang_code": "ar",
+                "lang": "arabo",
+                "roman": "kalb",
+                "tags": ["masculine"],
+                "sense": "animale",
+            },
+            {
+                "word": "chèn",
+                "lang_code": "rgn",
+                "lang": "romagnolo",
+                "tags": ["masculine"],
+                "sense": "animale",
+            },
+        ]
+        self.assertEqual(entries[0]["translations"], expected)