diff --git a/src/wiktextract/extractor/it/models.py b/src/wiktextract/extractor/it/models.py index 1cb73f8cb..70e34abae 100644 --- a/src/wiktextract/extractor/it/models.py +++ b/src/wiktextract/extractor/it/models.py @@ -22,6 +22,10 @@ class Example(ItalianBaseModel): raw_tags: list[str] = [] +class AltForm(ItalianBaseModel): + word: str + + class Sense(ItalianBaseModel): glosses: list[str] = [] tags: list[str] = [] @@ -29,6 +33,7 @@ class Sense(ItalianBaseModel): categories: list[str] = [] examples: list[Example] = [] topics: list[str] = [] + form_of: list[AltForm] = [] class Translation(ItalianBaseModel): diff --git a/src/wiktextract/extractor/it/pos.py b/src/wiktextract/extractor/it/pos.py index 87c80324f..16da323a7 100644 --- a/src/wiktextract/extractor/it/pos.py +++ b/src/wiktextract/extractor/it/pos.py @@ -3,7 +3,7 @@ from ...page import clean_node from ...wxr_context import WiktextractContext from .example import extract_example_list_item -from .models import Sense, WordEntry +from .models import AltForm, Sense, WordEntry from .section_titles import POS_DATA from .tag_form_line import extract_tag_form_line_nodes from .tags import translate_raw_tags @@ -119,4 +119,18 @@ def extract_gloss_list_item( if gloss_str != "": sense.glosses.append(gloss_str) translate_raw_tags(sense) + if "form-of" in word_entry.tags: + extract_form_of_word(wxr, sense, list_item) word_entry.senses.append(sense) + + +def extract_form_of_word( + wxr: WiktextractContext, + sense: Sense, + list_item: WikiNode, +) -> None: + word = "" + for node in list_item.find_child(NodeKind.LINK): + word = clean_node(wxr, None, node) + if word != "": + sense.form_of.append(AltForm(word=word)) diff --git a/src/wiktextract/extractor/pt/example.py b/src/wiktextract/extractor/pt/example.py new file mode 100644 index 000000000..e49a3da30 --- /dev/null +++ b/src/wiktextract/extractor/pt/example.py @@ -0,0 +1,97 @@ +import re + +from wikitextprocessor import ( + HTMLNode, + NodeKind, + TemplateNode, + WikiNode, +) + +from ...page import clean_node +from ...wxr_context import WiktextractContext +from .models import Example, Sense + + +def extract_example_list_item( + wxr: WiktextractContext, + sense: Sense, + list_item: WikiNode, +) -> None: + example = Example() + ref_nodes = [] + + for index, node in enumerate(list_item.children): + if ( + isinstance(node, WikiNode) + and node.kind == NodeKind.ITALIC + and example.text == "" + ): + example.text = clean_node(wxr, None, node) + elif isinstance(node, HTMLNode) and node.tag == "small": + example.translation = clean_node(wxr, None, node) + if example.translation.startswith( + "(" + ) and example.translation.endswith(")"): + example.translation = example.translation.strip("()") + elif isinstance(node, TemplateNode): + match node.template_name: + case "OESP": + example.ref = clean_node(wxr, sense, node).strip("()") + case "tradex": + example.text = clean_node( + wxr, None, node.template_parameters.get(2, "") + ) + example.translation = clean_node( + wxr, None, node.template_parameters.get(3, "") + ) + clean_node(wxr, sense, node) + case "Ex.": + example.text = clean_node( + wxr, sense, node.template_parameters.get(1, "") + ) + elif isinstance(node, WikiNode) and node.kind == NodeKind.BOLD: + bold_str = clean_node(wxr, None, node) + if re.fullmatch(r"\d+", bold_str) is not None: + list_item_str = clean_node( + wxr, None, list(list_item.invert_find_child(NodeKind.LIST)) + ) + if list_item_str.endswith(":"): + ref_nodes.clear() + example.ref = list_item_str + for child_list in list_item.find_child(NodeKind.LIST): + for child_list_item in child_list.find_child( + NodeKind.LIST_ITEM + ): + example.text = clean_node( + wxr, None, child_list_item.children + ) + break + elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST: + ref_nodes.clear() + for child_list_item in node.find_child(NodeKind.LIST_ITEM): + ref_nodes.append(child_list_item.children) + else: + ref_nodes.append(node) + + if example.text != "": + if example.ref == "": + example.ref = clean_node(wxr, sense, ref_nodes).strip(":() \n") + sense.examples.append(example) + else: + extract_example_text_list(wxr, sense, list_item) + + +def extract_example_text_list( + wxr: WiktextractContext, + sense: Sense, + list_item: WikiNode, +) -> None: + list_item_text = clean_node( + wxr, sense, list(list_item.invert_find_child(NodeKind.LIST)) + ) + example = Example(text=list_item_text) + if "-" in example.text: + tr_start = example.text.index("-") + example.translation = example.text[tr_start + 1 :].strip() + example.text = example.text[:tr_start].strip() + sense.examples.append(example) diff --git a/src/wiktextract/extractor/pt/models.py b/src/wiktextract/extractor/pt/models.py index 01ff3aaaa..b21367818 100644 --- a/src/wiktextract/extractor/pt/models.py +++ b/src/wiktextract/extractor/pt/models.py @@ -16,6 +16,10 @@ class Example(PortugueseBaseModel): ref: str = "" +class AltForm(PortugueseBaseModel): + word: str + + class Sense(PortugueseBaseModel): glosses: list[str] = [] tags: list[str] = [] @@ -23,6 +27,7 @@ class Sense(PortugueseBaseModel): categories: list[str] = [] topics: list[str] = [] examples: list[Example] = [] + form_of: list[AltForm] = [] class Translation(PortugueseBaseModel): diff --git a/src/wiktextract/extractor/pt/pos.py b/src/wiktextract/extractor/pt/pos.py index 03fbd0c09..86b29b6ea 100644 --- a/src/wiktextract/extractor/pt/pos.py +++ b/src/wiktextract/extractor/pt/pos.py @@ -1,7 +1,6 @@ import re from wikitextprocessor import ( - HTMLNode, LevelNode, NodeKind, TemplateNode, @@ -10,9 +9,10 @@ from ...page import clean_node from ...wxr_context import WiktextractContext +from .example import extract_example_list_item from .head_line import extract_head_line_nodes from .inflection import extract_flex_template -from .models import Example, Linkage, Sense, WordEntry +from .models import AltForm, Linkage, Sense, WordEntry from .section_titles import POS_DATA from .tags import translate_raw_tags @@ -75,6 +75,8 @@ def extract_gloss_list_item( if len(gloss_str) > 0: sense.glosses.append(gloss_str) translate_raw_tags(sense) + if "form-of" in word_entry.tags: + extract_form_of_word(wxr, sense, list_item) word_entry.senses.append(sense) for child_list in list_item.find_child(NodeKind.LIST): @@ -112,86 +114,11 @@ def extract_escopo2_template( return raw_tags -def extract_example_list_item( - wxr: WiktextractContext, - sense: Sense, - list_item: WikiNode, +def extract_form_of_word( + wxr: WiktextractContext, sense: Sense, list_item: WikiNode ) -> None: - example = Example() - ref_nodes = [] - - for index, node in enumerate(list_item.children): - if ( - isinstance(node, WikiNode) - and node.kind == NodeKind.ITALIC - and example.text == "" - ): - example.text = clean_node(wxr, None, node) - elif isinstance(node, HTMLNode) and node.tag == "small": - example.translation = clean_node(wxr, None, node) - if example.translation.startswith( - "(" - ) and example.translation.endswith(")"): - example.translation = example.translation.strip("()") - elif isinstance(node, TemplateNode): - match node.template_name: - case "OESP": - example.ref = clean_node(wxr, sense, node).strip("()") - case "tradex": - example.text = clean_node( - wxr, None, node.template_parameters.get(2, "") - ) - example.translation = clean_node( - wxr, None, node.template_parameters.get(3, "") - ) - clean_node(wxr, sense, node) - case "Ex.": - example.text = clean_node( - wxr, sense, node.template_parameters.get(1, "") - ) - elif isinstance(node, WikiNode) and node.kind == NodeKind.BOLD: - bold_str = clean_node(wxr, None, node) - if re.fullmatch(r"\d+", bold_str) is not None: - list_item_str = clean_node( - wxr, None, list(list_item.invert_find_child(NodeKind.LIST)) - ) - if list_item_str.endswith(":"): - ref_nodes.clear() - example.ref = list_item_str - for child_list in list_item.find_child(NodeKind.LIST): - for child_list_item in child_list.find_child( - NodeKind.LIST_ITEM - ): - example.text = clean_node( - wxr, None, child_list_item.children - ) - break - elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST: - ref_nodes.clear() - for child_list_item in node.find_child(NodeKind.LIST_ITEM): - ref_nodes.append(child_list_item.children) - else: - ref_nodes.append(node) - - if example.text != "": - if example.ref == "": - example.ref = clean_node(wxr, sense, ref_nodes).strip(":() \n") - sense.examples.append(example) - else: - extract_example_text_list(wxr, sense, list_item) - - -def extract_example_text_list( - wxr: WiktextractContext, - sense: Sense, - list_item: WikiNode, -) -> None: - list_item_text = clean_node( - wxr, sense, list(list_item.invert_find_child(NodeKind.LIST)) - ) - example = Example(text=list_item_text) - if "-" in example.text: - tr_start = example.text.index("-") - example.translation = example.text[tr_start + 1 :].strip() - example.text = example.text[:tr_start].strip() - sense.examples.append(example) + form_of = "" + for link_node in list_item.find_child_recursively(NodeKind.LINK): + form_of = clean_node(wxr, None, link_node) + if form_of != "": + sense.form_of.append(AltForm(word=form_of)) diff --git a/tests/test_it_gloss.py b/tests/test_it_gloss.py index 325012e59..b5d2a2594 100644 --- a/tests/test_it_gloss.py +++ b/tests/test_it_gloss.py @@ -158,3 +158,17 @@ def test_subsecton_template_add_new_word_entry(self): }, ], ) + + def test_form_of(self): + self.wxr.wtp.add_page("Template:-it-", 10, "Italiano") + data = parse_page( + self.wxr, + "cani", + """== {{-it-}} == +===Sostantivo, forma flessa=== +# plurale di [[cane]]""", + ) + self.assertEqual( + data[0]["senses"], + [{"glosses": ["plurale di cane"], "form_of": [{"word": "cane"}]}], + ) diff --git a/tests/test_pt_gloss.py b/tests/test_pt_gloss.py index 50e2313c8..7458db651 100644 --- a/tests/test_pt_gloss.py +++ b/tests/test_pt_gloss.py @@ -93,3 +93,36 @@ def test_nested_list(self): {"glosses": ["médio", "relativo à média;"]}, ], ) + + def test_form_of_bold(self): + self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português") + data = parse_page( + self.wxr, + "cães", + """={{-pt-}}= +==Forma de substantivo== +# plural de '''[[cão]]'''""", + ) + self.assertEqual( + data[0]["senses"], + [{"glosses": ["plural de cão"], "form_of": [{"word": "cão"}]}], + ) + + def test_form_of_link(self): + self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português") + data = parse_page( + self.wxr, + "cãs", + """={{-pt-}}= +==Forma de substantivo== +# feminino plural de [[cão]] (cruel, brutal)""", + ) + self.assertEqual( + data[0]["senses"], + [ + { + "glosses": ["feminino plural de cão (cruel, brutal)"], + "form_of": [{"word": "cão"}], + } + ], + )