Merge pull request #985 from xxyzz/pt

[pt, it] extract "form-of" words
tatuylonen · Jan 13, 2025 · debb661 · debb661
2 parents 4ba5975 + 12c0a11
commit debb661
Show file tree

Hide file tree

Showing 7 changed files with 180 additions and 85 deletions.
diff --git a/src/wiktextract/extractor/it/models.py b/src/wiktextract/extractor/it/models.py
@@ -22,13 +22,18 @@ class Example(ItalianBaseModel):
     raw_tags: list[str] = []
 
 
+class AltForm(ItalianBaseModel):
+    word: str
+
+
 class Sense(ItalianBaseModel):
     glosses: list[str] = []
     tags: list[str] = []
     raw_tags: list[str] = []
     categories: list[str] = []
     examples: list[Example] = []
     topics: list[str] = []
+    form_of: list[AltForm] = []
 
 
 class Translation(ItalianBaseModel):

diff --git a/src/wiktextract/extractor/it/pos.py b/src/wiktextract/extractor/it/pos.py
@@ -3,7 +3,7 @@
 from ...page import clean_node
 from ...wxr_context import WiktextractContext
 from .example import extract_example_list_item
-from .models import Sense, WordEntry
+from .models import AltForm, Sense, WordEntry
 from .section_titles import POS_DATA
 from .tag_form_line import extract_tag_form_line_nodes
 from .tags import translate_raw_tags
@@ -119,4 +119,18 @@ def extract_gloss_list_item(
     if gloss_str != "":
         sense.glosses.append(gloss_str)
         translate_raw_tags(sense)
+        if "form-of" in word_entry.tags:
+            extract_form_of_word(wxr, sense, list_item)
         word_entry.senses.append(sense)
+
+
+def extract_form_of_word(
+    wxr: WiktextractContext,
+    sense: Sense,
+    list_item: WikiNode,
+) -> None:
+    word = ""
+    for node in list_item.find_child(NodeKind.LINK):
+        word = clean_node(wxr, None, node)
+    if word != "":
+        sense.form_of.append(AltForm(word=word))
diff --git a/src/wiktextract/extractor/pt/example.py b/src/wiktextract/extractor/pt/example.py
@@ -0,0 +1,97 @@
+import re
+
+from wikitextprocessor import (
+    HTMLNode,
+    NodeKind,
+    TemplateNode,
+    WikiNode,
+)
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .models import Example, Sense
+
+
+def extract_example_list_item(
+    wxr: WiktextractContext,
+    sense: Sense,
+    list_item: WikiNode,
+) -> None:
+    example = Example()
+    ref_nodes = []
+
+    for index, node in enumerate(list_item.children):
+        if (
+            isinstance(node, WikiNode)
+            and node.kind == NodeKind.ITALIC
+            and example.text == ""
+        ):
+            example.text = clean_node(wxr, None, node)
+        elif isinstance(node, HTMLNode) and node.tag == "small":
+            example.translation = clean_node(wxr, None, node)
+            if example.translation.startswith(
+                "("
+            ) and example.translation.endswith(")"):
+                example.translation = example.translation.strip("()")
+        elif isinstance(node, TemplateNode):
+            match node.template_name:
+                case "OESP":
+                    example.ref = clean_node(wxr, sense, node).strip("()")
+                case "tradex":
+                    example.text = clean_node(
+                        wxr, None, node.template_parameters.get(2, "")
+                    )
+                    example.translation = clean_node(
+                        wxr, None, node.template_parameters.get(3, "")
+                    )
+                    clean_node(wxr, sense, node)
+                case "Ex.":
+                    example.text = clean_node(
+                        wxr, sense, node.template_parameters.get(1, "")
+                    )
+        elif isinstance(node, WikiNode) and node.kind == NodeKind.BOLD:
+            bold_str = clean_node(wxr, None, node)
+            if re.fullmatch(r"\d+", bold_str) is not None:
+                list_item_str = clean_node(
+                    wxr, None, list(list_item.invert_find_child(NodeKind.LIST))
+                )
+                if list_item_str.endswith(":"):
+                    ref_nodes.clear()
+                    example.ref = list_item_str
+                    for child_list in list_item.find_child(NodeKind.LIST):
+                        for child_list_item in child_list.find_child(
+                            NodeKind.LIST_ITEM
+                        ):
+                            example.text = clean_node(
+                                wxr, None, child_list_item.children
+                            )
+                    break
+        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
+            ref_nodes.clear()
+            for child_list_item in node.find_child(NodeKind.LIST_ITEM):
+                ref_nodes.append(child_list_item.children)
+        else:
+            ref_nodes.append(node)
+
+    if example.text != "":
+        if example.ref == "":
+            example.ref = clean_node(wxr, sense, ref_nodes).strip(":() \n")
+        sense.examples.append(example)
+    else:
+        extract_example_text_list(wxr, sense, list_item)
+
+
+def extract_example_text_list(
+    wxr: WiktextractContext,
+    sense: Sense,
+    list_item: WikiNode,
+) -> None:
+    list_item_text = clean_node(
+        wxr, sense, list(list_item.invert_find_child(NodeKind.LIST))
+    )
+    example = Example(text=list_item_text)
+    if "-" in example.text:
+        tr_start = example.text.index("-")
+        example.translation = example.text[tr_start + 1 :].strip()
+        example.text = example.text[:tr_start].strip()
+        sense.examples.append(example)
diff --git a/src/wiktextract/extractor/pt/models.py b/src/wiktextract/extractor/pt/models.py
@@ -16,13 +16,18 @@ class Example(PortugueseBaseModel):
     ref: str = ""
 
 
+class AltForm(PortugueseBaseModel):
+    word: str
+
+
 class Sense(PortugueseBaseModel):
     glosses: list[str] = []
     tags: list[str] = []
     raw_tags: list[str] = []
     categories: list[str] = []
     topics: list[str] = []
     examples: list[Example] = []
+    form_of: list[AltForm] = []
 
 
 class Translation(PortugueseBaseModel):

diff --git a/src/wiktextract/extractor/pt/pos.py b/src/wiktextract/extractor/pt/pos.py
@@ -1,7 +1,6 @@
 import re
 
 from wikitextprocessor import (
-    HTMLNode,
     LevelNode,
     NodeKind,
     TemplateNode,
@@ -10,9 +9,10 @@
 
 from ...page import clean_node
 from ...wxr_context import WiktextractContext
+from .example import extract_example_list_item
 from .head_line import extract_head_line_nodes
 from .inflection import extract_flex_template
-from .models import Example, Linkage, Sense, WordEntry
+from .models import AltForm, Linkage, Sense, WordEntry
 from .section_titles import POS_DATA
 from .tags import translate_raw_tags
 
@@ -75,6 +75,8 @@ def extract_gloss_list_item(
     if len(gloss_str) > 0:
         sense.glosses.append(gloss_str)
         translate_raw_tags(sense)
+        if "form-of" in word_entry.tags:
+            extract_form_of_word(wxr, sense, list_item)
         word_entry.senses.append(sense)
 
     for child_list in list_item.find_child(NodeKind.LIST):
@@ -112,86 +114,11 @@ def extract_escopo2_template(
     return raw_tags
 
 
-def extract_example_list_item(
-    wxr: WiktextractContext,
-    sense: Sense,
-    list_item: WikiNode,
+def extract_form_of_word(
+    wxr: WiktextractContext, sense: Sense, list_item: WikiNode
 ) -> None:
-    example = Example()
-    ref_nodes = []
-
-    for index, node in enumerate(list_item.children):
-        if (
-            isinstance(node, WikiNode)
-            and node.kind == NodeKind.ITALIC
-            and example.text == ""
-        ):
-            example.text = clean_node(wxr, None, node)
-        elif isinstance(node, HTMLNode) and node.tag == "small":
-            example.translation = clean_node(wxr, None, node)
-            if example.translation.startswith(
-                "("
-            ) and example.translation.endswith(")"):
-                example.translation = example.translation.strip("()")
-        elif isinstance(node, TemplateNode):
-            match node.template_name:
-                case "OESP":
-                    example.ref = clean_node(wxr, sense, node).strip("()")
-                case "tradex":
-                    example.text = clean_node(
-                        wxr, None, node.template_parameters.get(2, "")
-                    )
-                    example.translation = clean_node(
-                        wxr, None, node.template_parameters.get(3, "")
-                    )
-                    clean_node(wxr, sense, node)
-                case "Ex.":
-                    example.text = clean_node(
-                        wxr, sense, node.template_parameters.get(1, "")
-                    )
-        elif isinstance(node, WikiNode) and node.kind == NodeKind.BOLD:
-            bold_str = clean_node(wxr, None, node)
-            if re.fullmatch(r"\d+", bold_str) is not None:
-                list_item_str = clean_node(
-                    wxr, None, list(list_item.invert_find_child(NodeKind.LIST))
-                )
-                if list_item_str.endswith(":"):
-                    ref_nodes.clear()
-                    example.ref = list_item_str
-                    for child_list in list_item.find_child(NodeKind.LIST):
-                        for child_list_item in child_list.find_child(
-                            NodeKind.LIST_ITEM
-                        ):
-                            example.text = clean_node(
-                                wxr, None, child_list_item.children
-                            )
-                    break
-        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
-            ref_nodes.clear()
-            for child_list_item in node.find_child(NodeKind.LIST_ITEM):
-                ref_nodes.append(child_list_item.children)
-        else:
-            ref_nodes.append(node)
-
-    if example.text != "":
-        if example.ref == "":
-            example.ref = clean_node(wxr, sense, ref_nodes).strip(":() \n")
-        sense.examples.append(example)
-    else:
-        extract_example_text_list(wxr, sense, list_item)
-
-
-def extract_example_text_list(
-    wxr: WiktextractContext,
-    sense: Sense,
-    list_item: WikiNode,
-) -> None:
-    list_item_text = clean_node(
-        wxr, sense, list(list_item.invert_find_child(NodeKind.LIST))
-    )
-    example = Example(text=list_item_text)
-    if "-" in example.text:
-        tr_start = example.text.index("-")
-        example.translation = example.text[tr_start + 1 :].strip()
-        example.text = example.text[:tr_start].strip()
-        sense.examples.append(example)
+    form_of = ""
+    for link_node in list_item.find_child_recursively(NodeKind.LINK):
+        form_of = clean_node(wxr, None, link_node)
+    if form_of != "":
+        sense.form_of.append(AltForm(word=form_of))
diff --git a/tests/test_it_gloss.py b/tests/test_it_gloss.py
@@ -158,3 +158,17 @@ def test_subsecton_template_add_new_word_entry(self):
                 },
             ],
         )
+
+    def test_form_of(self):
+        self.wxr.wtp.add_page("Template:-it-", 10, "Italiano")
+        data = parse_page(
+            self.wxr,
+            "cani",
+            """== {{-it-}} ==
+===Sostantivo, forma flessa===
+# plurale di [[cane]]""",
+        )
+        self.assertEqual(
+            data[0]["senses"],
+            [{"glosses": ["plurale di cane"], "form_of": [{"word": "cane"}]}],
+        )
diff --git a/tests/test_pt_gloss.py b/tests/test_pt_gloss.py
@@ -93,3 +93,36 @@ def test_nested_list(self):
                 {"glosses": ["médio", "relativo à média;"]},
             ],
         )
+
+    def test_form_of_bold(self):
+        self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
+        data = parse_page(
+            self.wxr,
+            "cães",
+            """={{-pt-}}=
+==Forma de substantivo==
+# plural de '''[[cão]]'''""",
+        )
+        self.assertEqual(
+            data[0]["senses"],
+            [{"glosses": ["plural de cão"], "form_of": [{"word": "cão"}]}],
+        )
+
+    def test_form_of_link(self):
+        self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
+        data = parse_page(
+            self.wxr,
+            "cãs",
+            """={{-pt-}}=
+==Forma de substantivo==
+# feminino plural de [[cão]] (cruel, brutal)""",
+        )
+        self.assertEqual(
+            data[0]["senses"],
+            [
+                {
+                    "glosses": ["feminino plural de cão (cruel, brutal)"],
+                    "form_of": [{"word": "cão"}],
+                }
+            ],
+        )