diff --git a/src/wiktextract/extractor/nl/inflection.py b/src/wiktextract/extractor/nl/inflection.py index e1ce78a0f..01d8d091e 100644 --- a/src/wiktextract/extractor/nl/inflection.py +++ b/src/wiktextract/extractor/nl/inflection.py @@ -22,6 +22,7 @@ "-denoun1-", "-nlstam-", "-csadjc-comp-", + "-dumstam-", ] ) @@ -41,6 +42,8 @@ def extract_inflection_template( extract_nlstam_template(wxr, word_entry, t_node) elif t_node.template_name.startswith("-csadjc-comp-"): extract_csadjc_comp_template(wxr, word_entry, t_node) + elif t_node.template_name == "-dumstam-": + extract_dumstam_template(wxr, word_entry, t_node) def extract_noun_adj_table( @@ -122,9 +125,19 @@ def extract_vervoeging_page( if t_node.template_name in table_templates: extract_nlverb_template(wxr, word_entry, t_node, "") sense = "" - for level_node in root.find_child_recursively(LEVEL_KIND_FLAGS): - sense = clean_node(wxr, None, level_node.largs) - for t_node in level_node.find_child(NodeKind.TEMPLATE): + for lang_level_node in root.find_child(NodeKind.LEVEL2): + lang_name = clean_node(wxr, None, lang_level_node.largs) + if lang_name != word_entry.lang: + continue + for sense_level_node in lang_level_node.find_child_recursively( + LEVEL_KIND_FLAGS + ): + sense = clean_node(wxr, None, sense_level_node.largs) + for t_node in sense_level_node.find_child(NodeKind.TEMPLATE): + if t_node.template_name in table_templates: + extract_nlverb_template(wxr, word_entry, t_node, sense) + # only have language level node + for t_node in lang_level_node.find_child(NodeKind.TEMPLATE): if t_node.template_name in table_templates: extract_nlverb_template(wxr, word_entry, t_node, sense) @@ -333,3 +346,23 @@ def extract_csadjc_comp_template( form.raw_tags.append(row_header) translate_raw_tags(form) word_entry.forms.append(form) + + +def extract_dumstam_template( + wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode +) -> None: + # https://nl.wiktionary.org/wiki/Sjabloon:-dumstam- + tags = [ + ["infinitive"], + ["past", "singular"], + ["past", "plural"], + ["past", "participle"], + ] + for arg_name in range(1, 5): + word = clean_node( + wxr, None, t_node.template_parameters.get(arg_name, "") + ) + if word not in ["", word_entry.word]: + form = Form(form=word, tags=tags[arg_name - 1]) + word_entry.forms.append(form) + clean_node(wxr, word_entry, t_node) diff --git a/src/wiktextract/extractor/nl/pos.py b/src/wiktextract/extractor/nl/pos.py index c43885a4a..1d9e886a1 100644 --- a/src/wiktextract/extractor/nl/pos.py +++ b/src/wiktextract/extractor/nl/pos.py @@ -260,6 +260,37 @@ def extract_l_template( } +def extract_oudeschrijfwijze_template_g_arg( + wxr: WiktextractContext, g_arg: str, sense: Sense +) -> bool: + for tags_dict in [ + NOUN_FORM_OF_TEMPLATE_GENDER_TAGS, + NOUN_FORM_OF_TEMPLATE_NUM_TAGS, + ]: + if g_arg in tags_dict: + tag = tags_dict[g_arg] + if isinstance(tag, str): + sense.tags.append(tag) + elif isinstance(tag, list): + sense.tags.extend(tag) + return True + return False + + +def extract_oudeschrijfwijze_template( + wxr: WiktextractContext, t_node: TemplateNode, sense: Sense +) -> None: + g_arg_str = clean_node(wxr, None, t_node.template_parameters.get("g", "")) + if not extract_oudeschrijfwijze_template_g_arg(wxr, g_arg_str, sense): + g_args = t_node.template_parameters.get("g", "") + if isinstance(g_args, list): + for g_arg in g_args: + if isinstance(g_arg, TemplateNode): + extract_oudeschrijfwijze_template_g_arg( + wxr, g_arg.template_name, sense + ) + + def extract_noun_form_of_template( wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode ) -> None: @@ -268,11 +299,15 @@ def extract_noun_form_of_template( if t_node.template_name.endswith("-pl"): sense.tags.append("plural") else: - num_arg = t_node.template_parameters.get("getal", "") + num_arg = clean_node( + wxr, None, t_node.template_parameters.get("getal", "") + ) if num_arg in NOUN_FORM_OF_TEMPLATE_NUM_TAGS: sense.tags.append(NOUN_FORM_OF_TEMPLATE_NUM_TAGS[num_arg]) - gender_arg = t_node.template_parameters.get("gesl", "") + gender_arg = clean_node( + wxr, None, t_node.template_parameters.get("gesl", "") + ) if gender_arg in NOUN_FORM_OF_TEMPLATE_GENDER_TAGS: gender_tag = NOUN_FORM_OF_TEMPLATE_GENDER_TAGS[gender_arg] if isinstance(gender_tag, str): @@ -281,17 +316,8 @@ def extract_noun_form_of_template( sense.tags.extend(gender_tag) # Sjabloon:oudeschrijfwijze - g_arg = t_node.template_parameters.get("g", "") - for tags_dict in [ - NOUN_FORM_OF_TEMPLATE_GENDER_TAGS, - NOUN_FORM_OF_TEMPLATE_NUM_TAGS, - ]: - if g_arg in tags_dict: - tag = tags_dict[g_arg] - if isinstance(tag, str): - sense.tags.append(tag) - elif isinstance(tag, list): - sense.tags.extend(tag) + if t_node.template_name == "oudeschrijfwijze": + extract_oudeschrijfwijze_template(wxr, t_node, sense) form_of = clean_node(wxr, None, t_node.template_parameters.get(1, "")) if form_of != "": diff --git a/src/wiktextract/extractor/nl/spelling_form.py b/src/wiktextract/extractor/nl/spelling_form.py index 539a3cdee..6d8cd391b 100644 --- a/src/wiktextract/extractor/nl/spelling_form.py +++ b/src/wiktextract/extractor/nl/spelling_form.py @@ -18,7 +18,9 @@ def extract_spelling_form_section( note_str = new_note_str.strip("() ") else: form_nodes.append(new_note_str) - else: + elif isinstance(node, str) or ( + isinstance(node, WikiNode) and node.kind == NodeKind.LINK + ): form_nodes.append(node) form_str = clean_node(wxr, None, form_nodes) if len(form_str) > 0: diff --git a/src/wiktextract/extractor/nl/tags.py b/src/wiktextract/extractor/nl/tags.py index 4e029a3bb..e1f01749b 100644 --- a/src/wiktextract/extractor/nl/tags.py +++ b/src/wiktextract/extractor/nl/tags.py @@ -66,7 +66,7 @@ # "stopwoord": "filled pause", "straattaal": "slang", "streektaal": "regiolectal", - # "taal": "language", + "taal": "linguistics", "toponiem": "toponymic", "verkorting": "clipping", "verouderd": "obsolete", @@ -92,6 +92,8 @@ "alleen meervoud": "plural-only", # Sjabloon:plurt "geen meervoud": "no-plural", # Sjabloon:singt "versterkend voorvoegsel": ["intensifier", "prefix"], + "in een bijzin": "with-subordinate-clause", # Sjabloon:ovt-mv-bijz + "bij inversie": "inversion", # Sjabloon:1ps } TABLE_TAGS = { @@ -99,6 +101,7 @@ "enkelvoud": "singular", "meervoud": "plural", "verkleinwoord": "diminutive", + "bezitsvorm": "possessive", # Sjabloon:adjcomp "stellend": "positive", "vergrotend": "comparative", @@ -123,6 +126,8 @@ "derde": "third-person", "verleden": "past", "voorwaardelijk": "conditional", + "hoofdzin": "main-clause", + "bijzin": "subordinate-clause", # Sjabloon:-nlname- "nominatief": "nominative", "genitief": "genitive", @@ -202,7 +207,7 @@ "ecologie": "ecology", "economie": "economics", # "eendvogels": "anseriform", - # "eenheid": "", + "eenheid": "units-of-measure", "effectenhandel": "trading", "egyptologie": "Egyptology", # "toponiem: eiland": "", @@ -211,7 +216,7 @@ # "element": "element", "emotie": "emotion", # "evenhoevigen": "", - # "familie": "family", + "familie": "familiar", "farmacologie": "pharmacology", # "feest": "party", "fietsen": "cycling", diff --git a/src/wiktextract/extractor/nl/translation.py b/src/wiktextract/extractor/nl/translation.py index 474b8475f..0c5341c95 100644 --- a/src/wiktextract/extractor/nl/translation.py +++ b/src/wiktextract/extractor/nl/translation.py @@ -52,17 +52,19 @@ def extract_translation_list_item( elif not before_colon: if brackets == 0 and isinstance(node, TemplateNode): if node.template_name == "trad": - word_entry.translations.append( - Translation( - lang=lang_name, - lang_code=node.template_parameters.get(1, ""), - word=clean_node( - wxr, None, node.template_parameters.get(2, "") - ), - sense=sense, - sense_index=sense_index, - ) + tr_word = clean_node( + wxr, None, node.template_parameters.get(2, "") ) + if tr_word != "": + word_entry.translations.append( + Translation( + lang=lang_name, + lang_code=node.template_parameters.get(1, ""), + word=tr_word, + sense=sense, + sense_index=sense_index, + ) + ) elif ( node.template_name in LIST_ITEM_TAG_TEMPLATES and len(word_entry.translations) > 0 diff --git a/tests/test_nl_gloss.py b/tests/test_nl_gloss.py index c73702f76..505d4d489 100644 --- a/tests/test_nl_gloss.py +++ b/tests/test_nl_gloss.py @@ -408,3 +408,34 @@ def test_double_colons_list_in_parentheses(self): } ], ) + + def test_template_arg_in_oudeschrijfwijze(self): + self.wxr.wtp.add_page( + "Sjabloon:oudeschrijfwijze", + 10, + """'''Jura''' [[WikiWoordenboek:Genus|v]] / [[WikiWoordenboek:Genus|m]], soms ook: [[WikiWoordenboek:Genus|o]][[Categorie:WikiWoordenboek:Test/Bijzonder genus]] +# verouderde spelling of vorm van [[jura#Nederlands|jura]] tot 2006[[Categorie:Oude spelling van het Nederlands van voor 2006]]""", + ) + data = parse_page( + self.wxr, + "Jura", + """==Nederlands== +====Zelfstandig naamwoord==== +{{oudeschrijfwijze|jura|2006|nld|g={{f}} / {{m}}, soms ook: {{n}}}}""", + ) + self.assertEqual( + data[0]["senses"], + [ + { + "glosses": [ + "verouderde spelling of vorm van jura tot 2006", + ], + "categories": [ + "WikiWoordenboek:Test/Bijzonder genus", + "Oude spelling van het Nederlands van voor 2006", + ], + "tags": ["form-of", "feminine", "masculine", "neuter"], + "form_of": [{"word": "jura"}], + } + ], + )