From a8880847042061a1febd6ddbb611ab03c19a538c Mon Sep 17 00:00:00 2001 From: xxyzz Date: Thu, 7 Nov 2024 12:08:03 +0800 Subject: [PATCH 1/7] [nl] add sense data for pages that don't have gloss text but have tags or examples --- src/wiktextract/extractor/nl/pos.py | 10 +++++++++- tests/test_nl_gloss.py | 31 +++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/src/wiktextract/extractor/nl/pos.py b/src/wiktextract/extractor/nl/pos.py index 4ce46523..65456fbb 100644 --- a/src/wiktextract/extractor/nl/pos.py +++ b/src/wiktextract/extractor/nl/pos.py @@ -40,7 +40,7 @@ def extract_pos_section( forms_data.forms.clear() forms_data.categories.clear() extract_pos_section_nodes(wxr, page_data, base_data, forms_data, level_node) - if len(page_data[-1].senses) == 0: + if len(page_data[-1].senses) == 0 and pos_title in LINKAGE_SECTIONS: page_data.pop() @@ -163,7 +163,15 @@ def extract_gloss_list_item( sense.raw_tags.append(m.group(1)) if len(gloss_text) > 0: sense.glosses.append(gloss_text) + if ( + len(sense.glosses) > 0 + or len(sense.tags) > 0 + or len(sense.raw_tags) > 0 + or len(sense.examples) > 0 + ): translate_raw_tags(sense) + if len(sense.glosses) == 0: + sense.tags.append("no-gloss") word_entry.senses.append(sense) diff --git a/tests/test_nl_gloss.py b/tests/test_nl_gloss.py index 28cea48b..e07b51fa 100644 --- a/tests/test_nl_gloss.py +++ b/tests/test_nl_gloss.py @@ -290,3 +290,34 @@ def test_eng_onv_d(self): self.assertEqual( data[1]["categories"], ["Zelfstandig naamwoord in het Engels"] ) + + def test_no_gloss_but_has_tag_example(self): + self.wxr.wtp.add_page( + "Sjabloon:naam-m", + 10, + """([[mannelijk]]e [[naam]])[[Categorie:Mannelijke naam_in_het_Engels]] """, + ) + data = parse_page( + self.wxr, + "Clark", + """==Engels== +====Eigennaam==== +'''Clark''' +#{{naam-m|eng}} +{{bijv-2|'''Clark''' Gable was a popular movie star|'''Clark''' Gable was een bekende filmster.}}""", + ) + self.assertEqual( + data[0]["senses"], + [ + { +
"categories": ["Mannelijke naam_in_het_Engels"], + "tags": ["masculine", "name", "no-gloss"], + "examples": [ + { + "text": "Clark Gable was a popular movie star", + "translation": "Clark Gable was een bekende filmster.", + } + ], + } + ], + ) From 27f9ba758617e4128bfda4e38a88c274acd966f7 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Thu, 7 Nov 2024 13:40:33 +0800 Subject: [PATCH 2/7] [nl] extract "-nlname-" forms table template --- src/wiktextract/extractor/nl/inflection.py | 2 +- src/wiktextract/extractor/nl/tags.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/wiktextract/extractor/nl/inflection.py b/src/wiktextract/extractor/nl/inflection.py index 00e6b435..2e744d5b 100644 --- a/src/wiktextract/extractor/nl/inflection.py +++ b/src/wiktextract/extractor/nl/inflection.py @@ -12,7 +12,7 @@ def extract_inflection_template( wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode ) -> None: - if t_node.template_name in ["-nlnoun-", "adjcomp"]: + if t_node.template_name in ["-nlnoun-", "adjcomp", "-nlname-"]: extract_noun_adj_table(wxr, word_entry, t_node) elif t_node.template_name == "-nlstam-": extract_nlstam_template(wxr, word_entry, t_node) diff --git a/src/wiktextract/extractor/nl/tags.py b/src/wiktextract/extractor/nl/tags.py index e46eea13..0fba1fea 100644 --- a/src/wiktextract/extractor/nl/tags.py +++ b/src/wiktextract/extractor/nl/tags.py @@ -123,6 +123,9 @@ "derde": "third-person", "verleden": "past", "voorwaardelijk": "conditional", + # Sjabloon:-nlname- + "nominatief": "nominative", + "genitief": "genitive", } From 58dd4c21f537ee0b02b74ee02c5b0efaf9a64979 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Thu, 7 Nov 2024 14:44:45 +0800 Subject: [PATCH 3/7] [nl] extract gloss text in double colons list and add it to the gloss list extract from above list or template --- src/wiktextract/extractor/nl/example.py | 4 +-- src/wiktextract/extractor/nl/pos.py | 16 ++++++---- tests/test_nl_gloss.py | 40 ++++++++++++++++++++++++- 3 files 
changed, 52 insertions(+), 8 deletions(-) diff --git a/src/wiktextract/extractor/nl/example.py b/src/wiktextract/extractor/nl/example.py index 5aaac894..59729700 100644 --- a/src/wiktextract/extractor/nl/example.py +++ b/src/wiktextract/extractor/nl/example.py @@ -4,7 +4,7 @@ from ...wxr_context import WiktextractContext from .models import Example, Sense -EXAMPLE_TEMPLATES = frozenset({"bijv-1", "bijv-2", "citeer"}) +EXAMPLE_TEMPLATES = frozenset({"bijv-1", "bijv-2", "bijv-e", "citeer"}) def extract_example_list_item( @@ -28,7 +28,7 @@ def extract_example_template( e_text = clean_node(wxr, None, node.template_parameters.get(1, "")) if len(e_text) > 0: sense.examples.append(Example(text=e_text)) - elif node.template_name == "bijv-2": + elif node.template_name in ["bijv-2", "bijv-e"]: e_text = clean_node(wxr, None, node.template_parameters.get(1, "")) if len(e_text) > 0: e_trans = clean_node(wxr, None, node.template_parameters.get(2, "")) diff --git a/src/wiktextract/extractor/nl/pos.py b/src/wiktextract/extractor/nl/pos.py index 65456fbb..600dc69b 100644 --- a/src/wiktextract/extractor/nl/pos.py +++ b/src/wiktextract/extractor/nl/pos.py @@ -56,9 +56,9 @@ def extract_pos_section_nodes( if ( isinstance(node, WikiNode) and node.kind == NodeKind.LIST - and node.sarg.endswith("#") + and node.sarg.endswith(("#", "::")) ): - if gloss_list_start == 0: + if gloss_list_start == 0 and node.sarg.endswith("#"): gloss_list_start = index extract_pos_header_line_nodes( wxr, page_data[-1], level_node.children[:index] @@ -123,9 +123,14 @@ def extract_pos_section_nodes( def extract_gloss_list_item( - wxr: WiktextractContext, word_entry: WordEntry, list_item: WikiNode + wxr: WiktextractContext, + word_entry: WordEntry, + list_item: WikiNode, ) -> None: - sense = Sense() + create_new_sense = ( + False if list_item.sarg == "::" and len(word_entry.senses) > 0 else True + ) + sense = Sense() if create_new_sense else word_entry.senses[-1] gloss_nodes = [] for child in list_item.children: 
if isinstance(child, TemplateNode): @@ -172,7 +177,8 @@ def extract_gloss_list_item( translate_raw_tags(sense) if len(sense.glosses) == 0: sense.tags.append("no-gloss") - word_entry.senses.append(sense) + if create_new_sense: + word_entry.senses.append(sense) def extract_pos_header_line_nodes( diff --git a/tests/test_nl_gloss.py b/tests/test_nl_gloss.py index e07b51fa..7d3302ac 100644 --- a/tests/test_nl_gloss.py +++ b/tests/test_nl_gloss.py @@ -295,7 +295,7 @@ def test_no_gloss_but_has_tag_example(self): self.wxr.wtp.add_page( "Sjabloon:naam-m", 10, - """([[mannelijk]]e [[naam]])[[Categorie:Mannelijke naam_in_het_Engels]] """, + """([[mannelijk]]e [[naam]])[[Categorie:Mannelijke naam_in_het_Engels]]""", ) data = parse_page( self.wxr, @@ -321,3 +321,41 @@ def test_no_gloss_but_has_tag_example(self): } ], ) + + def test_double_colons_list(self): + self.wxr.wtp.add_page( + "Sjabloon:oudeschrijfwijze", + 10, + """'''Ehstland''' +# verouderde spelling of vorm van [[Estland#Duits|Estland]][[Categorie:Oude spelling van het Duits]]""", + ) + self.wxr.wtp.add_page( + "Sjabloon:verouderd", + 10, + "([[verouderd]])[[Categorie:Verouderd_in_het_Duits]]", + ) + data = parse_page( + self.wxr, + "Ehstland", + """==Duits== +====Eigennaam==== +{{oudeschrijfwijze|Estland||deu}} +::{{verouderd|deu}} nominatief enkelvoud van [[Ehstland#Duits|Ehstland]]""", + ) + self.assertEqual( + data[0]["senses"], + [ + { + "categories": [ + "Oude spelling van het Duits", + "Verouderd_in_het_Duits", + ], + "glosses": [ + "verouderde spelling of vorm van Estland", + "nominatief enkelvoud van Ehstland", + ], + "tags": ["form-of", "obsolete"], + "form_of": [{"word": "Estland"}], + } + ], + ) From c33a8bbeac1befd8cc4cddf3eae98a6fde161dd7 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Thu, 7 Nov 2024 15:14:08 +0800 Subject: [PATCH 4/7] [nl] extract tag template after form-of template --- src/wiktextract/extractor/nl/pos.py | 16 ++++++++++++++++ tests/test_nl_gloss.py | 18 ++++++++++++++++++ 2 files changed, 
34 insertions(+) diff --git a/src/wiktextract/extractor/nl/pos.py b/src/wiktextract/extractor/nl/pos.py index 600dc69b..6819695e 100644 --- a/src/wiktextract/extractor/nl/pos.py +++ b/src/wiktextract/extractor/nl/pos.py @@ -120,6 +120,22 @@ def extract_pos_section_nodes( extract_verb_form_of_template( wxr, page_data, base_data, forms_data, node ) + elif isinstance(node, TemplateNode): + # tag template after form-of template + cats = {} + expanded_text = clean_node(wxr, cats, node) + if ( + expanded_text.startswith("(") + and expanded_text.endswith(")") + and len(page_data[-1].senses) > 0 + ): + page_data[-1].senses[-1].raw_tags.append( + expanded_text.strip("() ") + ) + page_data[-1].senses[-1].categories.extend( + cats.get("categories", []) + ) + translate_raw_tags(page_data[-1].senses[-1]) def extract_gloss_list_item( diff --git a/tests/test_nl_gloss.py b/tests/test_nl_gloss.py index 7d3302ac..b98a7d49 100644 --- a/tests/test_nl_gloss.py +++ b/tests/test_nl_gloss.py @@ -359,3 +359,21 @@ def test_double_colons_list(self): } ], ) + + def test_tag_template_after_form_of_template(self): + self.wxr.wtp.add_page( + "Sjabloon:geologie", + 10, + "([[geologie]])[[Categorie:Geologie_in_het_Nederlands]]", + ) + data = parse_page( + self.wxr, + "Fanerozoïcum", + """==Nederlands== +====Zelfstandig naamwoord==== +{{oudeschrijfwijze|fanerozoïcum|2006|nld|g=n}} {{geologie|nld}}""", + ) + self.assertEqual(data[0]["senses"][0]["topics"], ["geology"]) + self.assertEqual( + data[0]["senses"][0]["categories"], ["Geologie_in_het_Nederlands"] + ) From d8bf994c5e46f6a078acf77d10b82396aaa9a5e1 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Thu, 7 Nov 2024 15:27:48 +0800 Subject: [PATCH 5/7] [nl] extract "-denoun-", "-denoun1-" forms table templates --- src/wiktextract/extractor/nl/inflection.py | 8 +++++++- src/wiktextract/extractor/nl/tags.py | 3 +++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/wiktextract/extractor/nl/inflection.py 
b/src/wiktextract/extractor/nl/inflection.py index 2e744d5b..9c8804db 100644 --- a/src/wiktextract/extractor/nl/inflection.py +++ b/src/wiktextract/extractor/nl/inflection.py @@ -12,7 +12,13 @@ def extract_inflection_template( wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode ) -> None: - if t_node.template_name in ["-nlnoun-", "adjcomp", "-nlname-"]: + if t_node.template_name in [ + "-nlnoun-", + "adjcomp", + "-nlname-", + "-denoun-", + "-denoun1-", + ]: extract_noun_adj_table(wxr, word_entry, t_node) elif t_node.template_name == "-nlstam-": extract_nlstam_template(wxr, word_entry, t_node) diff --git a/src/wiktextract/extractor/nl/tags.py b/src/wiktextract/extractor/nl/tags.py index 0fba1fea..ffcc0bce 100644 --- a/src/wiktextract/extractor/nl/tags.py +++ b/src/wiktextract/extractor/nl/tags.py @@ -126,6 +126,9 @@ # Sjabloon:-nlname- "nominatief": "nominative", "genitief": "genitive", + # Sjabloon:-denoun- + "datief": "dative", + "accusatief": "accusative", } From cf9ff7e6013af25439c55f223770a905c6917304 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Thu, 7 Nov 2024 15:45:43 +0800 Subject: [PATCH 6/7] [nl] don't add gloss text in "::" list as raw tag --- src/wiktextract/extractor/nl/pos.py | 12 ++++++++--- tests/test_nl_gloss.py | 31 +++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/src/wiktextract/extractor/nl/pos.py b/src/wiktextract/extractor/nl/pos.py index 6819695e..c43885a4 100644 --- a/src/wiktextract/extractor/nl/pos.py +++ b/src/wiktextract/extractor/nl/pos.py @@ -179,9 +179,15 @@ def extract_gloss_list_item( while gloss_text.startswith(","): # between qualifier templates gloss_text = gloss_text.removeprefix(",").strip() m = re.match(r"\(([^()]+)\)", gloss_text) - if m is not None: # expanded "verouderd" template in "2ps" template - gloss_text = gloss_text[m.end() :].strip() - sense.raw_tags.append(m.group(1)) + if m is not None: + new_gloss_text = gloss_text[m.end() :].strip() + if new_gloss_text != 
"": + # expanded "verouderd" template in "2ps" template + gloss_text = new_gloss_text + sense.raw_tags.append(m.group(1)) + else: # gloss text after form-of template + gloss_text = m.group(1) + if len(gloss_text) > 0: sense.glosses.append(gloss_text) if ( diff --git a/tests/test_nl_gloss.py b/tests/test_nl_gloss.py index b98a7d49..c73702f7 100644 --- a/tests/test_nl_gloss.py +++ b/tests/test_nl_gloss.py @@ -377,3 +377,34 @@ def test_tag_template_after_form_of_template(self): self.assertEqual( data[0]["senses"][0]["categories"], ["Geologie_in_het_Nederlands"] ) + + def test_double_colons_list_in_parentheses(self): + self.wxr.wtp.add_page( + "Sjabloon:oudeschrijfwijze", + 10, + """'''Haafer''' +# verouderde spelling of vorm van [[Hafer#Duits|Hafer]] tot 1876[[Categorie:Oude spelling van het Duits van voor 1876]]""", + ) + self.wxr.wtp.add_page("Sjabloon:Q", 10, "[[Haafer#Duits|Haafer]]") + data = parse_page( + self.wxr, + "Haafer", + """==Duits== +====Zelfstandig naamwoord==== +{{oudeschrijfwijze|Hafer|1876|deu}} +::(nominatief mannelijk enkelvoud van {{Q|Haafer|deu}})""", + ) + self.assertEqual( + data[0]["senses"], + [ + { + "glosses": [ + "verouderde spelling of vorm van Hafer tot 1876", + "nominatief mannelijk enkelvoud van Haafer", + ], + "categories": ["Oude spelling van het Duits van voor 1876"], + "tags": ["form-of"], + "form_of": [{"word": "Hafer"}], + } + ], + ) From 8cfabac3893271ae2e93e165caa16ceb8417ceb6 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Thu, 7 Nov 2024 16:34:12 +0800 Subject: [PATCH 7/7] [nl] remove magic word before level node in section templates should be done in wikitextprocessor code --- src/wiktextract/extractor/nl/analyze_template.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/wiktextract/extractor/nl/analyze_template.py b/src/wiktextract/extractor/nl/analyze_template.py index a9f8a40f..727d10a4 100644 --- a/src/wiktextract/extractor/nl/analyze_template.py +++ 
b/src/wiktextract/extractor/nl/analyze_template.py @@ -104,9 +104,16 @@ def analyze_template(wtp: Wtp, page: Page) -> tuple[set[str], bool]: # pre-expand section templates, like "=nld=", "-pron-" # don't expand "==" # don't expand inflection table templates like "-nlnoun-" - return ( - set(), + need_pre_expand = ( re.fullmatch(r"Sjabloon:=.+=", page.title) is not None or page.title in POS_TEMPLATES - or page.title in SECTION_TEMPLATES, + or page.title in SECTION_TEMPLATES ) + + # magic word breaks level2 node in "=qtu=" template + if need_pre_expand and page.body.startswith("__NOEDITSECTION__"): + wtp.add_page( + page.title, 10, page.body.removeprefix("__NOEDITSECTION__").strip() + ) + + return set(), need_pre_expand