Skip to content

Commit

Permalink
Merge pull request #901 from xxyzz/nl
Browse files Browse the repository at this point in the history
[nl] improve extract pos section code
  • Loading branch information
xxyzz authored Nov 7, 2024
2 parents d4e9874 + 8cfabac commit b666add
Show file tree
Hide file tree
Showing 6 changed files with 188 additions and 15 deletions.
13 changes: 10 additions & 3 deletions src/wiktextract/extractor/nl/analyze_template.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,16 @@ def analyze_template(wtp: Wtp, page: Page) -> tuple[set[str], bool]:
# pre-expand section templates, like "=nld=", "-pron-"
# don't expand "=="
# don't expand inflection table templates like "-nlnoun-"
return (
set(),
need_pre_expand = (
re.fullmatch(r"Sjabloon:=.+=", page.title) is not None
or page.title in POS_TEMPLATES
or page.title in SECTION_TEMPLATES,
or page.title in SECTION_TEMPLATES
)

# magic word breaks level2 node in "=qtu=" template
if need_pre_expand and page.body.startswith("__NOEDITSECTION__"):
wtp.add_page(
page.title, 10, page.body.removeprefix("__NOEDITSECTION__").strip()
)

return set(), need_pre_expand
4 changes: 2 additions & 2 deletions src/wiktextract/extractor/nl/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from ...wxr_context import WiktextractContext
from .models import Example, Sense

EXAMPLE_TEMPLATES = frozenset({"bijv-1", "bijv-2", "citeer"})
EXAMPLE_TEMPLATES = frozenset({"bijv-1", "bijv-2", "bijv-e", "citeer"})


def extract_example_list_item(
Expand All @@ -28,7 +28,7 @@ def extract_example_template(
e_text = clean_node(wxr, None, node.template_parameters.get(1, ""))
if len(e_text) > 0:
sense.examples.append(Example(text=e_text))
elif node.template_name == "bijv-2":
elif node.template_name in ["bijv-2", "bijv-e"]:
e_text = clean_node(wxr, None, node.template_parameters.get(1, ""))
if len(e_text) > 0:
e_trans = clean_node(wxr, None, node.template_parameters.get(2, ""))
Expand Down
8 changes: 7 additions & 1 deletion src/wiktextract/extractor/nl/inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,13 @@
def extract_inflection_template(
wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
if t_node.template_name in ["-nlnoun-", "adjcomp"]:
if t_node.template_name in [
"-nlnoun-",
"adjcomp",
"-nlname-",
"-denoun-",
"-denoun1-",
]:
extract_noun_adj_table(wxr, word_entry, t_node)
elif t_node.template_name == "-nlstam-":
extract_nlstam_template(wxr, word_entry, t_node)
Expand Down
54 changes: 45 additions & 9 deletions src/wiktextract/extractor/nl/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def extract_pos_section(
forms_data.forms.clear()
forms_data.categories.clear()
extract_pos_section_nodes(wxr, page_data, base_data, forms_data, level_node)
if len(page_data[-1].senses) == 0:
if len(page_data[-1].senses) == 0 and pos_title in LINKAGE_SECTIONS:
page_data.pop()


Expand All @@ -56,9 +56,9 @@ def extract_pos_section_nodes(
if (
isinstance(node, WikiNode)
and node.kind == NodeKind.LIST
and node.sarg.endswith("#")
and node.sarg.endswith(("#", "::"))
):
if gloss_list_start == 0:
if gloss_list_start == 0 and node.sarg.endswith("#"):
gloss_list_start = index
extract_pos_header_line_nodes(
wxr, page_data[-1], level_node.children[:index]
Expand Down Expand Up @@ -120,12 +120,33 @@ def extract_pos_section_nodes(
extract_verb_form_of_template(
wxr, page_data, base_data, forms_data, node
)
elif isinstance(node, TemplateNode):
# tag template after form-of template
cats = {}
expanded_text = clean_node(wxr, cats, node)
if (
expanded_text.startswith("(")
and expanded_text.endswith(")")
and len(page_data[-1].senses) > 0
):
page_data[-1].senses[-1].raw_tags.append(
expanded_text.strip("() ")
)
page_data[-1].senses[-1].categories.extend(
cats.get("categories", [])
)
translate_raw_tags(page_data[-1].senses[-1])


def extract_gloss_list_item(
wxr: WiktextractContext, word_entry: WordEntry, list_item: WikiNode
wxr: WiktextractContext,
word_entry: WordEntry,
list_item: WikiNode,
) -> None:
sense = Sense()
create_new_sense = (
False if list_item.sarg == "::" and len(word_entry.senses) > 0 else True
)
sense = Sense() if create_new_sense else word_entry.senses[-1]
gloss_nodes = []
for child in list_item.children:
if isinstance(child, TemplateNode):
Expand Down Expand Up @@ -158,13 +179,28 @@ def extract_gloss_list_item(
while gloss_text.startswith(","): # between qualifier templates
gloss_text = gloss_text.removeprefix(",").strip()
m = re.match(r"\(([^()]+)\)", gloss_text)
if m is not None: # expanded "verouderd" template in "2ps" template
gloss_text = gloss_text[m.end() :].strip()
sense.raw_tags.append(m.group(1))
if m is not None:
new_gloss_text = gloss_text[m.end() :].strip()
if new_gloss_text != "":
# expanded "verouderd" template in "2ps" template
gloss_text = new_gloss_text
sense.raw_tags.append(m.group(1))
else: # gloss text after form-of template
gloss_text = m.group(1)

if len(gloss_text) > 0:
sense.glosses.append(gloss_text)
if (
len(sense.glosses) > 0
or len(sense.tags) > 0
or len(sense.raw_tags) > 0
or len(sense.examples) > 0
):
translate_raw_tags(sense)
word_entry.senses.append(sense)
if len(sense.glosses) == 0:
sense.tags.append("no-gloss")
if create_new_sense:
word_entry.senses.append(sense)


def extract_pos_header_line_nodes(
Expand Down
6 changes: 6 additions & 0 deletions src/wiktextract/extractor/nl/tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,12 @@
"derde": "third-person",
"verleden": "past",
"voorwaardelijk": "conditional",
# Sjabloon:-nlname-
"nominatief": "nominative",
"genitief": "genitive",
# Sjabloon:-denoun-
"datief": "dative",
"accusatief": "accusative",
}


Expand Down
118 changes: 118 additions & 0 deletions tests/test_nl_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,3 +290,121 @@ def test_eng_onv_d(self):
self.assertEqual(
data[1]["categories"], ["Zelfstandig naamwoord in het Engels"]
)

def test_no_gloss_but_has_tag_example(self):
    """A sense with only a tag template and an example gets "no-gloss"."""
    # Canned expansion of the "naam-m" template used by the entry below.
    naam_m_expansion = """<span>([[mannelijk]]e [[naam]])</span>[[Categorie:Mannelijke naam_in_het_Engels]]"""
    self.wxr.wtp.add_page("Sjabloon:naam-m", 10, naam_m_expansion)

    # The list item holds only the tag template; the example follows it.
    wikitext = """==Engels==
====Eigennaam====
'''Clark'''
#{{naam-m|eng}}
{{bijv-2|'''Clark''' Gable was a popular movie star|'''Clark''' Gable was een bekende filmster.}}"""
    entries = parse_page(self.wxr, "Clark", wikitext)

    expected_example = {
        "text": "Clark Gable was a popular movie star",
        "translation": "Clark Gable was een bekende filmster.",
    }
    expected_sense = {
        "categories": ["Mannelijke naam_in_het_Engels"],
        "tags": ["masculine", "name", "no-gloss"],
        "examples": [expected_example],
    }
    self.assertEqual(entries[0]["senses"], [expected_sense])

def test_double_colons_list(self):
    """A "::" list item after a form-of template extends the same sense."""
    # Canned template expansions, registered in the original order.
    template_bodies = {
        "Sjabloon:oudeschrijfwijze": """'''Ehstland'''
# verouderde spelling of vorm van [[Estland#Duits|Estland]][[Categorie:Oude spelling van het Duits]]""",
        "Sjabloon:verouderd": "<span>([[verouderd]])</span>[[Categorie:Verouderd_in_het_Duits]]",
    }
    for template_title, template_body in template_bodies.items():
        self.wxr.wtp.add_page(template_title, 10, template_body)

    wikitext = """==Duits==
====Eigennaam====
{{oudeschrijfwijze|Estland||deu}}
::{{verouderd|deu}} nominatief enkelvoud van [[Ehstland#Duits|Ehstland]]"""
    entries = parse_page(self.wxr, "Ehstland", wikitext)

    # Both glosses and both categories are expected on one single sense.
    expected_sense = {
        "categories": [
            "Oude spelling van het Duits",
            "Verouderd_in_het_Duits",
        ],
        "glosses": [
            "verouderde spelling of vorm van Estland",
            "nominatief enkelvoud van Ehstland",
        ],
        "tags": ["form-of", "obsolete"],
        "form_of": [{"word": "Estland"}],
    }
    self.assertEqual(entries[0]["senses"], [expected_sense])

def test_tag_template_after_form_of_template(self):
    """A tag template after a form-of template is applied to that sense."""
    # Canned expansion of the "geologie" tag template.
    geologie_expansion = (
        "<span>([[geologie]])</span>[[Categorie:Geologie_in_het_Nederlands]]"
    )
    self.wxr.wtp.add_page("Sjabloon:geologie", 10, geologie_expansion)

    wikitext = """==Nederlands==
====Zelfstandig naamwoord====
{{oudeschrijfwijze|fanerozoïcum|2006|nld|g=n}} {{geologie|nld}}"""
    entries = parse_page(self.wxr, "Fanerozoïcum", wikitext)

    # Check the topic and the category on the first sense of the entry.
    first_sense = entries[0]["senses"][0]
    self.assertEqual(first_sense["topics"], ["geology"])
    self.assertEqual(
        first_sense["categories"], ["Geologie_in_het_Nederlands"]
    )

def test_double_colons_list_in_parentheses(self):
    """A parenthesised "::" item becomes a gloss without the parentheses."""
    # Canned template expansions, registered in the original order.
    old_spelling_expansion = """'''Haafer'''
# verouderde spelling of vorm van [[Hafer#Duits|Hafer]]&#32;tot 1876[[Categorie:Oude spelling van het Duits van voor 1876]]"""
    self.wxr.wtp.add_page("Sjabloon:oudeschrijfwijze", 10, old_spelling_expansion)
    self.wxr.wtp.add_page("Sjabloon:Q", 10, "[[Haafer#Duits|Haafer]]")

    wikitext = """==Duits==
====Zelfstandig naamwoord====
{{oudeschrijfwijze|Hafer|1876|deu}}
::(nominatief mannelijk enkelvoud van {{Q|Haafer|deu}})"""
    entries = parse_page(self.wxr, "Haafer", wikitext)

    # The parenthesised text is expected as the second gloss of one sense.
    expected_sense = {
        "glosses": [
            "verouderde spelling of vorm van Hafer tot 1876",
            "nominatief mannelijk enkelvoud van Haafer",
        ],
        "categories": ["Oude spelling van het Duits van voor 1876"],
        "tags": ["form-of"],
        "form_of": [{"word": "Hafer"}],
    }
    self.assertEqual(entries[0]["senses"], [expected_sense])

0 comments on commit b666add

Please sign in to comment.