Skip to content

Commit

Permalink
Merge pull request #923 from xxyzz/nl
Browse files Browse the repository at this point in the history
[nl] fix exception and check json error, extract "-dumstam-" template
xxyzz authored Nov 26, 2024
2 parents cfb637d + d0db24a commit 22970c4
Showing 6 changed files with 129 additions and 30 deletions.
39 changes: 36 additions & 3 deletions src/wiktextract/extractor/nl/inflection.py
Original file line number Diff line number Diff line change
@@ -22,6 +22,7 @@
"-denoun1-",
"-nlstam-",
"-csadjc-comp-",
"-dumstam-",
]
)

@@ -41,6 +42,8 @@ def extract_inflection_template(
extract_nlstam_template(wxr, word_entry, t_node)
elif t_node.template_name.startswith("-csadjc-comp-"):
extract_csadjc_comp_template(wxr, word_entry, t_node)
elif t_node.template_name == "-dumstam-":
extract_dumstam_template(wxr, word_entry, t_node)


def extract_noun_adj_table(
@@ -122,9 +125,19 @@ def extract_vervoeging_page(
if t_node.template_name in table_templates:
extract_nlverb_template(wxr, word_entry, t_node, "")
sense = ""
for level_node in root.find_child_recursively(LEVEL_KIND_FLAGS):
sense = clean_node(wxr, None, level_node.largs)
for t_node in level_node.find_child(NodeKind.TEMPLATE):
for lang_level_node in root.find_child(NodeKind.LEVEL2):
lang_name = clean_node(wxr, None, lang_level_node.largs)
if lang_name != word_entry.lang:
continue
for sense_level_node in lang_level_node.find_child_recursively(
LEVEL_KIND_FLAGS
):
sense = clean_node(wxr, None, sense_level_node.largs)
for t_node in sense_level_node.find_child(NodeKind.TEMPLATE):
if t_node.template_name in table_templates:
extract_nlverb_template(wxr, word_entry, t_node, sense)
# only have language level node
for t_node in lang_level_node.find_child(NodeKind.TEMPLATE):
if t_node.template_name in table_templates:
extract_nlverb_template(wxr, word_entry, t_node, sense)

@@ -333,3 +346,23 @@ def extract_csadjc_comp_template(
form.raw_tags.append(row_header)
translate_raw_tags(form)
word_entry.forms.append(form)


def extract_dumstam_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Collect forms from positional arguments 1-4 of ``-dumstam-``.

    Template page: https://nl.wiktionary.org/wiki/Sjabloon:-dumstam-

    Each argument whose cleaned text is non-empty and differs from
    ``word_entry.word`` is appended to ``word_entry.forms`` together with
    the tags assigned to its position.  Afterwards the whole template node
    is passed through ``clean_node`` with ``word_entry`` as the collector
    (presumably to pick up categories; confirm against ``clean_node``).
    """
    # Tags by argument position: index 0 belongs to template argument 1.
    position_tags = (
        ["infinitive"],
        ["past", "singular"],
        ["past", "plural"],
        ["past", "participle"],
    )
    for position, form_tags in enumerate(position_tags, start=1):
        raw_value = t_node.template_parameters.get(position, "")
        cleaned = clean_node(wxr, None, raw_value)
        if cleaned == "" or cleaned == word_entry.word:
            continue
        word_entry.forms.append(Form(form=cleaned, tags=form_tags))
    clean_node(wxr, word_entry, t_node)
52 changes: 39 additions & 13 deletions src/wiktextract/extractor/nl/pos.py
Original file line number Diff line number Diff line change
@@ -260,6 +260,37 @@ def extract_l_template(
}


def extract_oudeschrijfwijze_template_g_arg(
    wxr: WiktextractContext, g_arg: str, sense: Sense
) -> bool:
    """Look up ``g_arg`` in the gender and number tag tables.

    The gender table is consulted before the number table.  On the first
    table that contains ``g_arg``, the mapped value is added to
    ``sense.tags`` (appended when it is a string, extended when it is a
    list) and ``True`` is returned.  ``False`` is returned when neither
    table has the key.  ``wxr`` is not used by this function.
    """
    lookup_tables = (
        NOUN_FORM_OF_TEMPLATE_GENDER_TAGS,
        NOUN_FORM_OF_TEMPLATE_NUM_TAGS,
    )
    for table in lookup_tables:
        if g_arg not in table:
            continue
        mapped = table[g_arg]
        if isinstance(mapped, list):
            sense.tags.extend(mapped)
        elif isinstance(mapped, str):
            sense.tags.append(mapped)
        # A hit counts as handled even if the mapped value added no tag.
        return True
    return False


def extract_oudeschrijfwijze_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense
) -> None:
    """Add tags to ``sense`` from the ``g`` argument of the template.

    The cleaned text of ``g`` is tried first as a single lookup key.  If
    that finds nothing and the raw argument is a list, the name of every
    template node in that list is tried as a key instead.
    """
    raw_g = t_node.template_parameters.get("g", "")
    g_text = clean_node(wxr, None, raw_g)
    if extract_oudeschrijfwijze_template_g_arg(wxr, g_text, sense):
        return
    if not isinstance(raw_g, list):
        return
    for child in raw_g:
        if isinstance(child, TemplateNode):
            extract_oudeschrijfwijze_template_g_arg(
                wxr, child.template_name, sense
            )


def extract_noun_form_of_template(
wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
@@ -268,11 +299,15 @@ def extract_noun_form_of_template(
if t_node.template_name.endswith("-pl"):
sense.tags.append("plural")
else:
num_arg = t_node.template_parameters.get("getal", "")
num_arg = clean_node(
wxr, None, t_node.template_parameters.get("getal", "")
)
if num_arg in NOUN_FORM_OF_TEMPLATE_NUM_TAGS:
sense.tags.append(NOUN_FORM_OF_TEMPLATE_NUM_TAGS[num_arg])

gender_arg = t_node.template_parameters.get("gesl", "")
gender_arg = clean_node(
wxr, None, t_node.template_parameters.get("gesl", "")
)
if gender_arg in NOUN_FORM_OF_TEMPLATE_GENDER_TAGS:
gender_tag = NOUN_FORM_OF_TEMPLATE_GENDER_TAGS[gender_arg]
if isinstance(gender_tag, str):
@@ -281,17 +316,8 @@ def extract_noun_form_of_template(
sense.tags.extend(gender_tag)

# Sjabloon:oudeschrijfwijze
g_arg = t_node.template_parameters.get("g", "")
for tags_dict in [
NOUN_FORM_OF_TEMPLATE_GENDER_TAGS,
NOUN_FORM_OF_TEMPLATE_NUM_TAGS,
]:
if g_arg in tags_dict:
tag = tags_dict[g_arg]
if isinstance(tag, str):
sense.tags.append(tag)
elif isinstance(tag, list):
sense.tags.extend(tag)
if t_node.template_name == "oudeschrijfwijze":
extract_oudeschrijfwijze_template(wxr, t_node, sense)

form_of = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
if form_of != "":
4 changes: 3 additions & 1 deletion src/wiktextract/extractor/nl/spelling_form.py
Original file line number Diff line number Diff line change
@@ -18,7 +18,9 @@ def extract_spelling_form_section(
note_str = new_note_str.strip("() ")
else:
form_nodes.append(new_note_str)
else:
elif isinstance(node, str) or (
isinstance(node, WikiNode) and node.kind == NodeKind.LINK
):
form_nodes.append(node)
form_str = clean_node(wxr, None, form_nodes)
if len(form_str) > 0:
11 changes: 8 additions & 3 deletions src/wiktextract/extractor/nl/tags.py
Original file line number Diff line number Diff line change
@@ -66,7 +66,7 @@
# "stopwoord": "filled pause",
"straattaal": "slang",
"streektaal": "regiolectal",
# "taal": "language",
"taal": "linguistics",
"toponiem": "toponymic",
"verkorting": "clipping",
"verouderd": "obsolete",
@@ -92,13 +92,16 @@
"alleen meervoud": "plural-only", # Sjabloon:plurt
"geen meervoud": "no-plural", # Sjabloon:singt
"versterkend voorvoegsel": ["intensifier", "prefix"],
"in een bijzin": "with-subordinate-clause", # Sjabloon:ovt-mv-bijz
"bij inversie": "inversion", # Sjabloon:1ps
}

TABLE_TAGS = {
# Sjabloon:-nlnoun-
"enkelvoud": "singular",
"meervoud": "plural",
"verkleinwoord": "diminutive",
"bezitsvorm": "possessive",
# Sjabloon:adjcomp
"stellend": "positive",
"vergrotend": "comparative",
@@ -123,6 +126,8 @@
"derde": "third-person",
"verleden": "past",
"voorwaardelijk": "conditional",
"hoofdzin": "main-clause",
"bijzin": "subordinate-clause",
# Sjabloon:-nlname-
"nominatief": "nominative",
"genitief": "genitive",
@@ -202,7 +207,7 @@
"ecologie": "ecology",
"economie": "economics",
# "eendvogels": "anseriform",
# "eenheid": "",
"eenheid": "units-of-measure",
"effectenhandel": "trading",
"egyptologie": "Egyptology",
# "toponiem: eiland": "",
@@ -211,7 +216,7 @@
# "element": "element",
"emotie": "emotion",
# "evenhoevigen": "",
# "familie": "family",
"familie": "familiar",
"farmacologie": "pharmacology",
# "feest": "party",
"fietsen": "cycling",
22 changes: 12 additions & 10 deletions src/wiktextract/extractor/nl/translation.py
Original file line number Diff line number Diff line change
@@ -52,17 +52,19 @@ def extract_translation_list_item(
elif not before_colon:
if brackets == 0 and isinstance(node, TemplateNode):
if node.template_name == "trad":
word_entry.translations.append(
Translation(
lang=lang_name,
lang_code=node.template_parameters.get(1, ""),
word=clean_node(
wxr, None, node.template_parameters.get(2, "")
),
sense=sense,
sense_index=sense_index,
)
tr_word = clean_node(
wxr, None, node.template_parameters.get(2, "")
)
if tr_word != "":
word_entry.translations.append(
Translation(
lang=lang_name,
lang_code=node.template_parameters.get(1, ""),
word=tr_word,
sense=sense,
sense_index=sense_index,
)
)
elif (
node.template_name in LIST_ITEM_TAG_TEMPLATES
and len(word_entry.translations) > 0
31 changes: 31 additions & 0 deletions tests/test_nl_gloss.py
Original file line number Diff line number Diff line change
@@ -408,3 +408,34 @@ def test_double_colons_list_in_parentheses(self):
}
],
)

def test_template_arg_in_oudeschrijfwijze(self):
    """``g=`` made of gender templates yields one tag per template."""
    # Pre-expanded output registered for Sjabloon:oudeschrijfwijze.
    template_body = """'''Jura''' [[WikiWoordenboek:Genus|<span>v</span>]] / [[WikiWoordenboek:Genus|<span>m</span>]], soms ook: [[WikiWoordenboek:Genus|<span>o</span>]][[Categorie:WikiWoordenboek:Test/Bijzonder genus]]
# verouderde spelling of vorm van [[jura#Nederlands|jura]]&#32;tot 2006[[Categorie:Oude spelling van het Nederlands van voor 2006]]"""
    self.wxr.wtp.add_page("Sjabloon:oudeschrijfwijze", 10, template_body)

    page_text = """==Nederlands==
====Zelfstandig naamwoord====
{{oudeschrijfwijze|jura|2006|nld|g={{f}} / {{m}}, soms ook: {{n}}}}"""
    data = parse_page(self.wxr, "Jura", page_text)

    expected_sense = {
        "glosses": [
            "verouderde spelling of vorm van jura tot 2006",
        ],
        "categories": [
            "WikiWoordenboek:Test/Bijzonder genus",
            "Oude spelling van het Nederlands van voor 2006",
        ],
        "tags": ["form-of", "feminine", "masculine", "neuter"],
        "form_of": [{"word": "jura"}],
    }
    self.assertEqual(data[0]["senses"], [expected_sense])

0 comments on commit 22970c4

Please sign in to comment.