Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[nl] fix exception and check json error, extract "-dumstam-" template #923

Merged
merged 6 commits into from
Nov 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 36 additions & 3 deletions src/wiktextract/extractor/nl/inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
"-denoun1-",
"-nlstam-",
"-csadjc-comp-",
"-dumstam-",
]
)

Expand All @@ -41,6 +42,8 @@ def extract_inflection_template(
extract_nlstam_template(wxr, word_entry, t_node)
elif t_node.template_name.startswith("-csadjc-comp-"):
extract_csadjc_comp_template(wxr, word_entry, t_node)
elif t_node.template_name == "-dumstam-":
extract_dumstam_template(wxr, word_entry, t_node)


def extract_noun_adj_table(
Expand Down Expand Up @@ -122,9 +125,19 @@ def extract_vervoeging_page(
if t_node.template_name in table_templates:
extract_nlverb_template(wxr, word_entry, t_node, "")
sense = ""
for level_node in root.find_child_recursively(LEVEL_KIND_FLAGS):
sense = clean_node(wxr, None, level_node.largs)
for t_node in level_node.find_child(NodeKind.TEMPLATE):
for lang_level_node in root.find_child(NodeKind.LEVEL2):
lang_name = clean_node(wxr, None, lang_level_node.largs)
if lang_name != word_entry.lang:
continue
for sense_level_node in lang_level_node.find_child_recursively(
LEVEL_KIND_FLAGS
):
sense = clean_node(wxr, None, sense_level_node.largs)
for t_node in sense_level_node.find_child(NodeKind.TEMPLATE):
if t_node.template_name in table_templates:
extract_nlverb_template(wxr, word_entry, t_node, sense)
# only have language level node
for t_node in lang_level_node.find_child(NodeKind.TEMPLATE):
if t_node.template_name in table_templates:
extract_nlverb_template(wxr, word_entry, t_node, sense)

Expand Down Expand Up @@ -333,3 +346,23 @@ def extract_csadjc_comp_template(
form.raw_tags.append(row_header)
translate_raw_tags(form)
word_entry.forms.append(form)


def extract_dumstam_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Collect inflected forms from a "-dumstam-" template.

    Positional arguments 1 through 4 are read in order and tagged as
    infinitive, past singular, past plural and past participle.  A value
    that cleans to the empty string, or that equals ``word_entry.word``,
    is not added to ``word_entry.forms``.

    Template page: https://nl.wiktionary.org/wiki/Sjabloon:-dumstam-
    """
    tags_by_position = [
        ["infinitive"],
        ["past", "singular"],
        ["past", "plural"],
        ["past", "participle"],
    ]
    for position, form_tags in enumerate(tags_by_position, start=1):
        raw_value = t_node.template_parameters.get(position, "")
        form_text = clean_node(wxr, None, raw_value)
        if form_text == "" or form_text == word_entry.word:
            continue
        word_entry.forms.append(Form(form=form_text, tags=form_tags))
    # Clean the whole template once more with the entry attached;
    # presumably this records data the template emits (e.g. categories)
    # on the entry -- confirm against clean_node.
    clean_node(wxr, word_entry, t_node)
52 changes: 39 additions & 13 deletions src/wiktextract/extractor/nl/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,37 @@ def extract_l_template(
}


def extract_oudeschrijfwijze_template_g_arg(
    wxr: WiktextractContext, g_arg: str, sense: Sense
) -> bool:
    """Translate one "g" argument value into tags on ``sense``.

    The gender table is consulted first, then the number table.  On the
    first table that contains ``g_arg`` the mapped value is added to
    ``sense.tags`` (appended when it is a string, extended when it is a
    list) and ``True`` is returned.  ``False`` means neither table knows
    the value and ``sense`` was left untouched.
    """
    lookup_tables = (
        NOUN_FORM_OF_TEMPLATE_GENDER_TAGS,
        NOUN_FORM_OF_TEMPLATE_NUM_TAGS,
    )
    for table in lookup_tables:
        if g_arg not in table:
            continue
        mapped = table[g_arg]
        if isinstance(mapped, str):
            sense.tags.append(mapped)
        elif isinstance(mapped, list):
            sense.tags.extend(mapped)
        # A known key counts as handled even if nothing was added.
        return True
    return False


def extract_oudeschrijfwijze_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense
) -> None:
    """Add tags to ``sense`` from the "g" argument of the template.

    The cleaned text of the argument is tried as a lookup key first.
    When that does not match and the raw argument is a list, the
    template name of every ``TemplateNode`` in that list is tried
    instead; other list items are ignored.
    """
    raw_g = t_node.template_parameters.get("g", "")
    cleaned_g = clean_node(wxr, None, raw_g)
    if extract_oudeschrijfwijze_template_g_arg(wxr, cleaned_g, sense):
        return
    if not isinstance(raw_g, list):
        return
    # Fall back to the names of templates nested inside the argument.
    for part in raw_g:
        if isinstance(part, TemplateNode):
            extract_oudeschrijfwijze_template_g_arg(
                wxr, part.template_name, sense
            )


def extract_noun_form_of_template(
wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
Expand All @@ -268,11 +299,15 @@ def extract_noun_form_of_template(
if t_node.template_name.endswith("-pl"):
sense.tags.append("plural")
else:
num_arg = t_node.template_parameters.get("getal", "")
num_arg = clean_node(
wxr, None, t_node.template_parameters.get("getal", "")
)
if num_arg in NOUN_FORM_OF_TEMPLATE_NUM_TAGS:
sense.tags.append(NOUN_FORM_OF_TEMPLATE_NUM_TAGS[num_arg])

gender_arg = t_node.template_parameters.get("gesl", "")
gender_arg = clean_node(
wxr, None, t_node.template_parameters.get("gesl", "")
)
if gender_arg in NOUN_FORM_OF_TEMPLATE_GENDER_TAGS:
gender_tag = NOUN_FORM_OF_TEMPLATE_GENDER_TAGS[gender_arg]
if isinstance(gender_tag, str):
Expand All @@ -281,17 +316,8 @@ def extract_noun_form_of_template(
sense.tags.extend(gender_tag)

# Sjabloon:oudeschrijfwijze
g_arg = t_node.template_parameters.get("g", "")
for tags_dict in [
NOUN_FORM_OF_TEMPLATE_GENDER_TAGS,
NOUN_FORM_OF_TEMPLATE_NUM_TAGS,
]:
if g_arg in tags_dict:
tag = tags_dict[g_arg]
if isinstance(tag, str):
sense.tags.append(tag)
elif isinstance(tag, list):
sense.tags.extend(tag)
if t_node.template_name == "oudeschrijfwijze":
extract_oudeschrijfwijze_template(wxr, t_node, sense)

form_of = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
if form_of != "":
Expand Down
4 changes: 3 additions & 1 deletion src/wiktextract/extractor/nl/spelling_form.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@ def extract_spelling_form_section(
note_str = new_note_str.strip("() ")
else:
form_nodes.append(new_note_str)
else:
elif isinstance(node, str) or (
isinstance(node, WikiNode) and node.kind == NodeKind.LINK
):
form_nodes.append(node)
form_str = clean_node(wxr, None, form_nodes)
if len(form_str) > 0:
Expand Down
11 changes: 8 additions & 3 deletions src/wiktextract/extractor/nl/tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@
# "stopwoord": "filled pause",
"straattaal": "slang",
"streektaal": "regiolectal",
# "taal": "language",
"taal": "linguistics",
"toponiem": "toponymic",
"verkorting": "clipping",
"verouderd": "obsolete",
Expand All @@ -92,13 +92,16 @@
"alleen meervoud": "plural-only", # Sjabloon:plurt
"geen meervoud": "no-plural", # Sjabloon:singt
"versterkend voorvoegsel": ["intensifier", "prefix"],
"in een bijzin": "with-subordinate-clause", # Sjabloon:ovt-mv-bijz
"bij inversie": "inversion", # Sjabloon:1ps
}

TABLE_TAGS = {
# Sjabloon:-nlnoun-
"enkelvoud": "singular",
"meervoud": "plural",
"verkleinwoord": "diminutive",
"bezitsvorm": "possessive",
# Sjabloon:adjcomp
"stellend": "positive",
"vergrotend": "comparative",
Expand All @@ -123,6 +126,8 @@
"derde": "third-person",
"verleden": "past",
"voorwaardelijk": "conditional",
"hoofdzin": "main-clause",
"bijzin": "subordinate-clause",
# Sjabloon:-nlname-
"nominatief": "nominative",
"genitief": "genitive",
Expand Down Expand Up @@ -202,7 +207,7 @@
"ecologie": "ecology",
"economie": "economics",
# "eendvogels": "anseriform",
# "eenheid": "",
"eenheid": "units-of-measure",
"effectenhandel": "trading",
"egyptologie": "Egyptology",
# "toponiem: eiland": "",
Expand All @@ -211,7 +216,7 @@
# "element": "element",
"emotie": "emotion",
# "evenhoevigen": "",
# "familie": "family",
"familie": "familiar",
"farmacologie": "pharmacology",
# "feest": "party",
"fietsen": "cycling",
Expand Down
22 changes: 12 additions & 10 deletions src/wiktextract/extractor/nl/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,17 +52,19 @@ def extract_translation_list_item(
elif not before_colon:
if brackets == 0 and isinstance(node, TemplateNode):
if node.template_name == "trad":
word_entry.translations.append(
Translation(
lang=lang_name,
lang_code=node.template_parameters.get(1, ""),
word=clean_node(
wxr, None, node.template_parameters.get(2, "")
),
sense=sense,
sense_index=sense_index,
)
tr_word = clean_node(
wxr, None, node.template_parameters.get(2, "")
)
if tr_word != "":
word_entry.translations.append(
Translation(
lang=lang_name,
lang_code=node.template_parameters.get(1, ""),
word=tr_word,
sense=sense,
sense_index=sense_index,
)
)
elif (
node.template_name in LIST_ITEM_TAG_TEMPLATES
and len(word_entry.translations) > 0
Expand Down
31 changes: 31 additions & 0 deletions tests/test_nl_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -408,3 +408,34 @@ def test_double_colons_list_in_parentheses(self):
}
],
)

def test_template_arg_in_oudeschrijfwijze(self):
    """Templates nested in the "g" argument end up as sense tags."""
    # Registered as the content of the "oudeschrijfwijze" template page.
    template_body = """'''Jura''' [[WikiWoordenboek:Genus|<span>v</span>]] / [[WikiWoordenboek:Genus|<span>m</span>]], soms ook: [[WikiWoordenboek:Genus|<span>o</span>]][[Categorie:WikiWoordenboek:Test/Bijzonder genus]]
# verouderde spelling of vorm van [[jura#Nederlands|jura]]&#32;tot 2006[[Categorie:Oude spelling van het Nederlands van voor 2006]]"""
    self.wxr.wtp.add_page("Sjabloon:oudeschrijfwijze", 10, template_body)

    # Page under test: "g" holds three templates separated by plain text.
    page_text = """==Nederlands==
====Zelfstandig naamwoord====
{{oudeschrijfwijze|jura|2006|nld|g={{f}} / {{m}}, soms ook: {{n}}}}"""
    data = parse_page(self.wxr, "Jura", page_text)

    expected_sense = {
        "glosses": [
            "verouderde spelling of vorm van jura tot 2006",
        ],
        "categories": [
            "WikiWoordenboek:Test/Bijzonder genus",
            "Oude spelling van het Nederlands van voor 2006",
        ],
        "tags": ["form-of", "feminine", "masculine", "neuter"],
        "form_of": [{"word": "jura"}],
    }
    self.assertEqual(data[0]["senses"], [expected_sense])