Add pydantic and json_schema to German extractor #418

Merged (1 commit) on Dec 6, 2023
Add pydantic and json_schema to German extractor
This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France.
empiriker committed Dec 5, 2023
commit 4c5503fd9aa907427b1d41e22be6e897a968fddf
932 changes: 932 additions & 0 deletions json_schema/de.json

Large diffs are not rendered by default.
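
The schema file itself presumably holds the JSON Schema exported from the new pydantic models for the German extractor. As a rough, hypothetical sketch only (the actual export step is not part of this excerpt), pydantic v2 can emit such a schema directly from the top-level model:

# Hypothetical sketch of how a file like json_schema/de.json could be
# generated; the real generation step is not shown in this diff.
import json

from wiktextract.extractor.de.models import WordEntry

# model_json_schema() is pydantic v2's built-in JSON Schema export.
schema = WordEntry.model_json_schema()

with open("json_schema/de.json", "w", encoding="utf-8") as f:
    json.dump(schema, f, indent=2, ensure_ascii=False, sort_keys=True)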

62 changes: 49 additions & 13 deletions src/wiktextract/extractor/de/example.py
@@ -1,21 +1,48 @@
-from collections import defaultdict
-from typing import Dict, List
+import copy

 from wikitextprocessor import NodeKind, WikiNode
 from wikitextprocessor.parser import LevelNode

+from wiktextract.extractor.de.models import Example, Reference, WordEntry
 from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid
 from wiktextract.page import clean_node
 from wiktextract.wxr_context import WiktextractContext

+REF_KEY_MAP = {
+    "autor": "author",
+    "a": "author",
+    "titel": "title",
+    "titelerg": "title_complement",
+    "auflage": "edition",
+    "verlag": "publisher",
+    "ort": "place",
+    "jahr": "year",
+    "seiten": "pages",
+    "isbn": "isbn",
+    "übersetzer": "translator",
+    "herausgeber": "editor",
+    "sammelwerk": "collection",
+    "werk": "collection",
+    "band": "volume",
+    "kommentar": "comment",
+    "online": "url",
+    "tag": "day",
+    "monat": "month",
+    "zugriff": "accessdate",
+    "nummer": "number",
+    "datum": "date",
+    "hrsg": "editor",
+}


 def extract_examples(
     wxr: WiktextractContext,
-    page_data: List[Dict],
+    word_entry: WordEntry,
     level_node: LevelNode,
 ) -> None:
     for list_node in level_node.find_child(NodeKind.LIST):
         for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
-            example_data = defaultdict(str)
+            example_data = Example()

             ref_nodes = find_and_remove_child(
                 list_item_node,
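
The Example, Reference, and WordEntry classes imported above come from wiktextract.extractor.de.models, whose diff is not included in this excerpt. Judging only from the attributes this file accesses (text, ref, raw_ref, and the English values in REF_KEY_MAP), they are presumably small pydantic models roughly like the sketch below; the field list is inferred and the real definitions may differ.

# Assumed shape of the models used in example.py; inferred from this diff,
# not copied from the actual models.py.
from typing import Optional

from pydantic import BaseModel


class Reference(BaseModel):
    raw_ref: str = ""
    author: str = ""
    title: str = ""
    year: str = ""
    url: str = ""
    # ...plus the remaining English field names that REF_KEY_MAP maps onto


class Example(BaseModel):
    text: str = ""
    ref: Optional[Reference] = None
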
@@ -30,12 +57,12 @@ def extract_examples(
             senseid, example_text = match_senseid(example_text)

             if example_text:
-                example_data["text"] = example_text
+                example_data.text = example_text

             if senseid:
-                for sense in page_data[-1]["senses"]:
-                    if sense["senseid"] == senseid:
-                        sense["examples"].append(example_data)
+                for sense in word_entry.senses:
+                    if sense.senseid == senseid:
+                        sense.examples.append(copy.deepcopy(example_data))

             else:
                 if example_data:
@@ -51,11 +78,11 @@ def extract_examples(


 def extract_reference(
-    wxr: WiktextractContext, example_data: Dict[str, str], ref_node: WikiNode
+    wxr: WiktextractContext, example_data: Example, ref_node: WikiNode
 ):
-    reference_data = defaultdict()
+    reference_data = Reference()

-    reference_data["raw_ref"] = clean_node(wxr, {}, ref_node.children)
+    reference_data.raw_ref = clean_node(wxr, {}, ref_node.children)

     template_nodes = list(ref_node.find_child(NodeKind.TEMPLATE))

@@ -72,9 +99,18 @@ def extract_reference(
         # https://de.wiktionary.org/wiki/Vorlage:Literatur
         for key, value in template_node.template_parameters.items():
             if isinstance(key, str):
-                reference_data[key.lower()] = clean_node(wxr, {}, value)
+                key_english = REF_KEY_MAP.get(key.lower(), key.lower())
+                if key_english in reference_data.model_fields:
+                    setattr(
+                        reference_data, key_english, clean_node(wxr, {}, value)
+                    )
+                else:
+                    wxr.wtp.debug(
+                        f"Unexpected key in reference: {key_english}",
+                        sortid="extractor/de/examples/extract_examples/77",
+                    )

     # XXX: Treat other templates as well.
     # E.g. https://de.wiktionary.org/wiki/Vorlage:Ref-OWID

-    example_data["ref"] = reference_data
+    example_data.ref = reference_data
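
Instead of blindly writing every template parameter into a dict, the new code first translates the German parameter name from the Literatur template to an English field name via REF_KEY_MAP, sets it only if the Reference model actually declares such a field, and routes anything unexpected to wxr.wtp.debug. A minimal illustration of that pattern, reusing REF_KEY_MAP from the first hunk and the Reference sketch above (the template data here is invented):

# Illustration of the key-mapping guard above; input data is made up.
template_parameters = {"Autor": "Goethe", "Jahr": "1808", "Unbekannt": "?"}

reference_data = Reference()
for key, value in template_parameters.items():
    key_english = REF_KEY_MAP.get(key.lower(), key.lower())
    if key_english in reference_data.model_fields:
        # Only fields declared on the pydantic model are populated.
        setattr(reference_data, key_english, value)
    else:
        # Unknown parameters are reported instead of silently stored.
        print(f"Unexpected key in reference: {key_english}")

# reference_data.author == "Goethe", reference_data.year == "1808",
# and "unbekannt" is flagged as unexpected.
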
59 changes: 30 additions & 29 deletions src/wiktextract/extractor/de/gloss.py
@@ -1,35 +1,36 @@
+import copy
 import re
-from collections import defaultdict
-from typing import Dict, List

 from wikitextprocessor import NodeKind, WikiNode
 from wikitextprocessor.parser import LevelNode

+from wiktextract.extractor.de.models import Sense, WordEntry
 from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid
 from wiktextract.page import clean_node
 from wiktextract.wxr_context import WiktextractContext


 def extract_glosses(
     wxr: WiktextractContext,
-    page_data: List[Dict],
+    word_entry: WordEntry,
     level_node: LevelNode,
 ) -> None:
     for list_node in level_node.find_child(NodeKind.LIST):
-        process_gloss_list_item(wxr, page_data, list_node)
+        process_gloss_list_item(wxr, word_entry, list_node)

     for non_list_node in level_node.invert_find_child(NodeKind.LIST):
         wxr.wtp.debug(
-            f"Found unexpected non-list node in pronunciation section: {non_list_node}",
-            sortid="extractor/de/pronunciation/extract_pronunciation/64",
+            f"Found unexpected non-list node in gloss section: {non_list_node}",
+            sortid="extractor/de/gloss/extract_gloss/24",
         )


 def process_gloss_list_item(
     wxr: WiktextractContext,
-    page_data: List[Dict],
+    word_entry: WordEntry,
     list_node: WikiNode,
     parent_senseid: str = "",
-    parent_gloss_data: defaultdict(list) = None,
+    parent_gloss_data: Sense = None,
 ) -> None:
     for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
         item_type = list_item_node.sarg
@@ -48,10 +49,10 @@ def process_gloss_list_item(
             ):
                 continue

-            gloss_data = (
-                defaultdict(list)
+            sense_data = (
+                Sense()
                 if parent_gloss_data is None
-                else parent_gloss_data.copy()
+                else copy.deepcopy(parent_gloss_data)
             )

             # Extract sub-glosses for later processing
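
Note the switch from the old parent_gloss_data.copy() to copy.deepcopy(): pydantic models carry mutable list fields, and a shallow copy would leave a sub-sense sharing those lists with its parent, so tags or glosses appended to one would silently show up on the other. A small sketch of the pitfall, using a stand-in model rather than the real Sense class:

# Stand-in model to show shallow vs. deep copies of pydantic objects;
# the real Sense model lives in wiktextract.extractor.de.models.
import copy

from pydantic import BaseModel, Field


class DemoSense(BaseModel):
    tags: list[str] = Field(default_factory=list)


parent = DemoSense(tags=["parent-tag"])

shallow = parent.model_copy()  # shallow: shares the same tags list object
shallow.tags.append("child-tag")
print(parent.tags)  # ['parent-tag', 'child-tag'] leaked back into the parent

deep = copy.deepcopy(parent)  # independent copy, as the PR uses
deep.tags.append("other-tag")
print(parent.tags)  # unchanged by the deep copy's edits
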
@@ -60,11 +61,11 @@ def process_gloss_list_item(
             )

             raw_gloss = clean_node(wxr, {}, list_item_node.children)
-            gloss_data["raw_glosses"] = [raw_gloss]
+            sense_data.raw_glosses = [raw_gloss]

-            process_K_template(wxr, gloss_data, list_item_node)
+            process_K_template(wxr, sense_data, list_item_node)

-            gloss_text = clean_node(wxr, gloss_data, list_item_node.children)
+            gloss_text = clean_node(wxr, sense_data, list_item_node.children)

             senseid, gloss_text = match_senseid(gloss_text)

@@ -74,27 +75,27 @@ def process_gloss_list_item(
                     if senseid[0].isnumeric()
                     else parent_senseid + senseid
                 )
-                gloss_data["senseid"] = senseid
+                sense_data.senseid = senseid
             else:
                 wxr.wtp.debug(
                     f"Failed to extract sense number from gloss node: {list_item_node}",
                     sortid="extractor/de/glosses/extract_glosses/28",
                 )

             # XXX: Extract tags from nodes instead using Italic and Template
-            gloss_text = extract_tags_from_gloss_text(gloss_data, gloss_text)
+            gloss_text = extract_tags_from_gloss_text(sense_data, gloss_text)

             if gloss_text or not sub_glosses_list_nodes:
-                gloss_data["glosses"] = [gloss_text]
-                page_data[-1]["senses"].append(gloss_data)
+                sense_data.glosses = [gloss_text]
+                word_entry.senses.append(sense_data)

             for sub_list_node in sub_glosses_list_nodes:
                 process_gloss_list_item(
                     wxr,
-                    page_data,
+                    word_entry,
                     sub_list_node,
                     senseid,
-                    gloss_data if not gloss_text else None,
+                    sense_data if not gloss_text else None,
                 )

         else:
@@ -105,7 +106,7 @@ def process_gloss_list_item(
             continue


-def handle_sense_modifier(wxr, list_item_node):
+def handle_sense_modifier(wxr: WiktextractContext, list_item_node: WikiNode):
     wxr.wtp.debug(
         f"Skipped a sense modifier in gloss list: {list_item_node}",
         sortid="extractor/de/glosses/extract_glosses/19",
@@ -117,14 +118,16 @@ def handle_sense_modifier(wxr, list_item_node):

 def process_K_template(
     wxr: WiktextractContext,
-    gloss_data: defaultdict(list),
+    sense_data: Sense,
     list_item_node: NodeKind.LIST_ITEM,
 ) -> None:
     for template_node in list_item_node.find_child(NodeKind.TEMPLATE):
         if template_node.template_name == "K":
-            text = clean_node(wxr, gloss_data, template_node).removesuffix(":")
+            categories = {"categories": []}
+            text = clean_node(wxr, categories, template_node).removesuffix(":")
+            sense_data.categories.extend(categories["categories"])
             tags = re.split(r";|,", text)
-            gloss_data["tags"] = [t.strip() for t in tags]
+            sense_data.tags = [t.strip() for t in tags]

             # Prepositional and case information is sometimes only expanded to
             # category links and not present in cleaned node. We still want it
@@ -133,7 +136,7 @@ def process_K_template(
             case = template_node.template_parameters.get("Kas")
             category = (prep if prep else "") + (" + " + case if case else "")
             if category:
-                gloss_data["tags"].append(category)
+                sense_data.tags.append(category)

             # XXX: Investigate better ways to handle free text in K template
             ft = template_node.template_parameters.get("ft")
@@ -149,16 +152,14 @@ def process_K_template(
             ]


-def extract_tags_from_gloss_text(
-    gloss_data: defaultdict(list), gloss_text: str
-) -> None:
+def extract_tags_from_gloss_text(sense_data: Sense, gloss_text: str) -> None:
     parts = gloss_text.split(":", 1)
     if len(parts) > 1:
         tags_part = parts[0].strip()

         categories = [c.strip() for c in re.split(",", tags_part)]
         if all(c.isalnum() for c in categories):
-            gloss_data["tags"].extend(categories)
+            sense_data.tags.extend(categories)
             return parts[1].strip()

     return gloss_text
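
The behaviour of extract_tags_from_gloss_text is easiest to see on a concrete gloss: if the text starts with a comma-separated, purely alphanumeric label before a colon, the label is moved into the sense's tags and the remainder is returned; otherwise the text comes back untouched. A hypothetical usage sketch (input strings invented, Sense as imported above):

sense = Sense()
rest = extract_tags_from_gloss_text(sense, "Linguistik: kleinste bedeutungstragende Einheit")
# sense.tags == ["Linguistik"], rest == "kleinste bedeutungstragende Einheit"

rest = extract_tags_from_gloss_text(Sense(), "eine Einheit ohne vorangestelltes Label")
# no leading "Label:" part, so the gloss text is returned unchanged
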
39 changes: 29 additions & 10 deletions src/wiktextract/extractor/de/linkage.py
@@ -1,15 +1,16 @@
 import re
-from typing import Dict, List

 from wikitextprocessor import NodeKind, WikiNode
 from wikitextprocessor.parser import LevelNode

+from wiktextract.extractor.de.models import WordEntry
 from wiktextract.extractor.de.utils import split_senseids
 from wiktextract.page import clean_node
 from wiktextract.wxr_context import WiktextractContext


 def extract_linkages(
-    wxr: WiktextractContext, page_data: List[Dict], level_node: LevelNode
+    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
 ):
     linkage_type = wxr.config.LINKAGE_SUBTITLES.get(level_node.largs[0][0])
     for list_node in level_node.find_child(NodeKind.LIST):
@@ -25,7 +26,7 @@ def extract_linkages(
             )

             # Extract links
-            linkages = []
+            linkages: list[str] = []
             if linkage_type == "expressions":
                 for child in list_item.children:
                     if isinstance(child, str) and contains_dash(child):
@@ -43,15 +44,33 @@ def extract_linkages(
                     process_link(wxr, linkages, link)

             # Add links to the page data
-            if len(page_data[-1]["senses"]) == 1:
-                page_data[-1]["senses"][0][linkage_type].extend(linkages)
+            if len(word_entry.senses) == 1:
+                if linkage_type in word_entry.senses[0].model_fields:
+                    getattr(word_entry.senses[0], linkage_type).extend(linkages)
+                else:
+                    wxr.wtp.debug(
+                        f"Linkage type {linkage_type} not in sense model fields",
+                        sortid="extractor/de/linkages/extract_linkages/54}",
+                    )
             elif len(senseids) > 0:
                 for senseid in senseids:
-                    for sense in page_data[-1]["senses"]:
-                        if sense["senseid"] == senseid:
-                            sense[linkage_type].extend(linkages)
+                    for sense in word_entry.senses:
+                        if sense.senseid == senseid:
+                            if linkage_type in sense.model_fields:
+                                getattr(sense, linkage_type).extend(linkages)
+                            else:
+                                wxr.wtp.debug(
+                                    f"Linkage type {linkage_type} not in sense model fields",
+                                    sortid="extractor/de/linkages/extract_linkages/54}",
+                                )
             else:
-                page_data[-1][linkage_type].extend(linkages)
+                if linkage_type in word_entry.model_fields:
+                    getattr(word_entry, linkage_type).extend(linkages)
+                else:
+                    wxr.wtp.debug(
+                        f"Linkage type {linkage_type} not in entry model fields",
+                        sortid="extractor/de/linkages/extract_linkages/54}",
+                    )

             # Check for potentially missed data
             for non_link in list_item.invert_find_child(NodeKind.LINK):
@@ -72,7 +91,7 @@ def extract_linkages(


 def process_link(
-    wxr: WiktextractContext, semantic_links: List[str], link: WikiNode
+    wxr: WiktextractContext, semantic_links: list[str], link: WikiNode
 ):
     clean_link = clean_node(wxr, {}, link)
     if clean_link.startswith("Verzeichnis:"):
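
The same guard pattern appears three times in extract_linkages: the section heading is mapped to a linkage_type string, and because the target field is addressed by name at runtime, the code checks model_fields before calling getattr(...).extend(...) and logs anything unexpected instead of raising an AttributeError. A compact, hypothetical illustration of that pattern (model and data invented, not taken from models.py):

# Hypothetical illustration of the dynamic-field guard used in linkage.py.
from pydantic import BaseModel, Field


class DemoLinkageSense(BaseModel):
    synonyms: list[str] = Field(default_factory=list)
    antonyms: list[str] = Field(default_factory=list)


sense = DemoLinkageSense()
for linkage_type, linkages in [("synonyms", ["Haus"]), ("hyponyms", ["Villa"])]:
    if linkage_type in sense.model_fields:
        # The field exists on the model: extend the list in place.
        getattr(sense, linkage_type).extend(linkages)
    else:
        # Unknown section heading: report it instead of crashing.
        print(f"Linkage type {linkage_type} not in sense model fields")

# sense.synonyms == ["Haus"]; "hyponyms" is reported as unexpected.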