diff --git a/src/wiktextract/extractor/nl/analyze_template.py b/src/wiktextract/extractor/nl/analyze_template.py index a9f8a40f..727d10a4 100644 --- a/src/wiktextract/extractor/nl/analyze_template.py +++ b/src/wiktextract/extractor/nl/analyze_template.py @@ -104,9 +104,16 @@ def analyze_template(wtp: Wtp, page: Page) -> tuple[set[str], bool]: # pre-expand section templates, like "=nld=", "-pron-" # don't expand "==" # don't expand inflection table templates like "-nlnoun-" - return ( - set(), + need_pre_expand = ( re.fullmatch(r"Sjabloon:=.+=", page.title) is not None or page.title in POS_TEMPLATES - or page.title in SECTION_TEMPLATES, + or page.title in SECTION_TEMPLATES ) + + # magic word breaks level2 node in "=qtu=" template + if need_pre_expand and page.body.startswith("__NOEDITSECTION__"): + wtp.add_page( + page.title, 10, page.body.removeprefix("__NOEDITSECTION__").strip() + ) + + return set(), need_pre_expand diff --git a/src/wiktextract/extractor/nl/example.py b/src/wiktextract/extractor/nl/example.py index 5aaac894..59729700 100644 --- a/src/wiktextract/extractor/nl/example.py +++ b/src/wiktextract/extractor/nl/example.py @@ -4,7 +4,7 @@ from ...wxr_context import WiktextractContext from .models import Example, Sense -EXAMPLE_TEMPLATES = frozenset({"bijv-1", "bijv-2", "citeer"}) +EXAMPLE_TEMPLATES = frozenset({"bijv-1", "bijv-2", "bijv-e", "citeer"}) def extract_example_list_item( @@ -28,7 +28,7 @@ def extract_example_template( e_text = clean_node(wxr, None, node.template_parameters.get(1, "")) if len(e_text) > 0: sense.examples.append(Example(text=e_text)) - elif node.template_name == "bijv-2": + elif node.template_name in ["bijv-2", "bijv-e"]: e_text = clean_node(wxr, None, node.template_parameters.get(1, "")) if len(e_text) > 0: e_trans = clean_node(wxr, None, node.template_parameters.get(2, "")) diff --git a/src/wiktextract/extractor/nl/inflection.py b/src/wiktextract/extractor/nl/inflection.py index 00e6b435..9c8804db 100644 --- a/src/wiktextract/extractor/nl/inflection.py +++ b/src/wiktextract/extractor/nl/inflection.py @@ -12,7 +12,13 @@ def extract_inflection_template( wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode ) -> None: - if t_node.template_name in ["-nlnoun-", "adjcomp"]: + if t_node.template_name in [ + "-nlnoun-", + "adjcomp", + "-nlname-", + "-denoun-", + "-denoun1-", + ]: extract_noun_adj_table(wxr, word_entry, t_node) elif t_node.template_name == "-nlstam-": extract_nlstam_template(wxr, word_entry, t_node) diff --git a/src/wiktextract/extractor/nl/pos.py b/src/wiktextract/extractor/nl/pos.py index 4ce46523..c43885a4 100644 --- a/src/wiktextract/extractor/nl/pos.py +++ b/src/wiktextract/extractor/nl/pos.py @@ -40,7 +40,7 @@ def extract_pos_section( forms_data.forms.clear() forms_data.categories.clear() extract_pos_section_nodes(wxr, page_data, base_data, forms_data, level_node) - if len(page_data[-1].senses) == 0: + if len(page_data[-1].senses) == 0 and pos_title in LINKAGE_SECTIONS: page_data.pop() @@ -56,9 +56,9 @@ def extract_pos_section_nodes( if ( isinstance(node, WikiNode) and node.kind == NodeKind.LIST - and node.sarg.endswith("#") + and node.sarg.endswith(("#", "::")) ): - if gloss_list_start == 0: + if gloss_list_start == 0 and node.sarg.endswith("#"): gloss_list_start = index extract_pos_header_line_nodes( wxr, page_data[-1], level_node.children[:index] @@ -120,12 +120,33 @@ def extract_pos_section_nodes( extract_verb_form_of_template( wxr, page_data, base_data, forms_data, node ) + elif isinstance(node, TemplateNode): + # tag template after form-of template + cats = {} + expanded_text = clean_node(wxr, cats, node) + if ( + expanded_text.startswith("(") + and expanded_text.endswith(")") + and len(page_data[-1].senses) > 0 + ): + page_data[-1].senses[-1].raw_tags.append( + expanded_text.strip("() ") + ) + page_data[-1].senses[-1].categories.extend( + cats.get("categories", []) + ) + translate_raw_tags(page_data[-1].senses[-1]) def extract_gloss_list_item( - wxr: WiktextractContext, word_entry: WordEntry, list_item: WikiNode + wxr: WiktextractContext, + word_entry: WordEntry, + list_item: WikiNode, ) -> None: - sense = Sense() + create_new_sense = ( + False if list_item.sarg == "::" and len(word_entry.senses) > 0 else True + ) + sense = Sense() if create_new_sense else word_entry.senses[-1] gloss_nodes = [] for child in list_item.children: if isinstance(child, TemplateNode): @@ -158,13 +179,28 @@ def extract_gloss_list_item( while gloss_text.startswith(","): # between qualifier templates gloss_text = gloss_text.removeprefix(",").strip() m = re.match(r"\(([^()]+)\)", gloss_text) - if m is not None: # expanded "verouderd" template in "2ps" template - gloss_text = gloss_text[m.end() :].strip() - sense.raw_tags.append(m.group(1)) + if m is not None: + new_gloss_text = gloss_text[m.end() :].strip() + if new_gloss_text != "": + # expanded "verouderd" template in "2ps" template + gloss_text = new_gloss_text + sense.raw_tags.append(m.group(1)) + else: # gloss text after form-of template + gloss_text = m.group(1) + if len(gloss_text) > 0: sense.glosses.append(gloss_text) + if ( + len(sense.glosses) > 0 + or len(sense.tags) > 0 + or len(sense.raw_tags) > 0 + or len(sense.examples) > 0 + ): translate_raw_tags(sense) - word_entry.senses.append(sense) + if len(sense.glosses) == 0: + sense.tags.append("no-gloss") + if create_new_sense: + word_entry.senses.append(sense) def extract_pos_header_line_nodes( diff --git a/src/wiktextract/extractor/nl/tags.py b/src/wiktextract/extractor/nl/tags.py index e46eea13..ffcc0bce 100644 --- a/src/wiktextract/extractor/nl/tags.py +++ b/src/wiktextract/extractor/nl/tags.py @@ -123,6 +123,12 @@ "derde": "third-person", "verleden": "past", "voorwaardelijk": "conditional", + # Sjabloon:-nlname- + "nominatief": "nominative", + "genitief": "genitive", + # Sjabloon:-denoun- + "datief": "dative", + "accusatief": "accusative", } diff --git a/tests/test_nl_gloss.py b/tests/test_nl_gloss.py index 28cea48b..c73702f7 100644 --- a/tests/test_nl_gloss.py +++ b/tests/test_nl_gloss.py @@ -290,3 +290,121 @@ def test_eng_onv_d(self): self.assertEqual( data[1]["categories"], ["Zelfstandig naamwoord in het Engels"] ) + + def test_no_gloss_but_has_tag_example(self): + self.wxr.wtp.add_page( + "Sjabloon:naam-m", + 10, + """([[mannelijk]]e [[naam]])[[Categorie:Mannelijke naam_in_het_Engels]]""", + ) + data = parse_page( + self.wxr, + "Clark", + """==Engels== +====Eigennaam==== +'''Clark''' +#{{naam-m|eng}} +{{bijv-2|'''Clark''' Gable was a popular movie star|'''Clark''' Gable was een bekende filmster.}}""", + ) + self.assertEqual( + data[0]["senses"], + [ + { + "categories": ["Mannelijke naam_in_het_Engels"], + "tags": ["masculine", "name", "no-gloss"], + "examples": [ + { + "text": "Clark Gable was a popular movie star", + "translation": "Clark Gable was een bekende filmster.", + } + ], + } + ], + ) + + def test_double_colons_list(self): + self.wxr.wtp.add_page( + "Sjabloon:oudeschrijfwijze", + 10, + """'''Ehstland''' +# verouderde spelling of vorm van [[Estland#Duits|Estland]][[Categorie:Oude spelling van het Duits]]""", + ) + self.wxr.wtp.add_page( + "Sjabloon:verouderd", + 10, + "([[verouderd]])[[Categorie:Verouderd_in_het_Duits]]", + ) + data = parse_page( + self.wxr, + "Ehstland", + """==Duits== +====Eigennaam==== +{{oudeschrijfwijze|Estland||deu}} +::{{verouderd|deu}} nominatief enkelvoud van [[Ehstland#Duits|Ehstland]]""", + ) + self.assertEqual( + data[0]["senses"], + [ + { + "categories": [ + "Oude spelling van het Duits", + "Verouderd_in_het_Duits", + ], + "glosses": [ + "verouderde spelling of vorm van Estland", + "nominatief enkelvoud van Ehstland", + ], + "tags": ["form-of", "obsolete"], + "form_of": [{"word": "Estland"}], + } + ], + ) + + def test_tag_template_after_form_of_template(self): + self.wxr.wtp.add_page( + "Sjabloon:geologie", + 10, + "([[geologie]])[[Categorie:Geologie_in_het_Nederlands]]", + ) + data = parse_page( + self.wxr, + "Fanerozoïcum", + """==Nederlands== +====Zelfstandig naamwoord==== +{{oudeschrijfwijze|fanerozoïcum|2006|nld|g=n}} {{geologie|nld}}""", + ) + self.assertEqual(data[0]["senses"][0]["topics"], ["geology"]) + self.assertEqual( + data[0]["senses"][0]["categories"], ["Geologie_in_het_Nederlands"] + ) + + def test_double_colons_list_in_parentheses(self): + self.wxr.wtp.add_page( + "Sjabloon:oudeschrijfwijze", + 10, + """'''Haafer''' +# verouderde spelling of vorm van [[Hafer#Duits|Hafer]] tot 1876[[Categorie:Oude spelling van het Duits van voor 1876]]""", + ) + self.wxr.wtp.add_page("Sjabloon:Q", 10, "[[Haafer#Duits|Haafer]]") + data = parse_page( + self.wxr, + "Haafer", + """==Duits== +====Zelfstandig naamwoord==== +{{oudeschrijfwijze|Hafer|1876|deu}} +::(nominatief mannelijk enkelvoud van {{Q|Haafer|deu}})""", + ) + self.assertEqual( + data[0]["senses"], + [ + { + "glosses": [ + "verouderde spelling of vorm van Hafer tot 1876", + "nominatief mannelijk enkelvoud van Haafer", + ], + "categories": ["Oude spelling van het Duits van voor 1876"], + "tags": ["form-of"], + "form_of": [{"word": "Hafer"}], + } + ], + )