Add pydantic and json_schema to German extractor #418

Merged 1 commit on Dec 6, 2023
932 changes: 932 additions & 0 deletions json_schema/de.json

Large diffs are not rendered by default.
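
The schema in json_schema/de.json is generated from the new pydantic models. The models module itself (src/wiktextract/extractor/de/models.py) is not rendered in this view, so the following is only a rough sketch of what the classes used by the extractors below might look like; every field name here is inferred from the extractor code in this diff, not copied from the real models.py.

import json
from typing import Optional

from pydantic import BaseModel


class Reference(BaseModel):
    raw_ref: str = ""
    author: str = ""
    title: str = ""
    year: str = ""
    url: str = ""
    # ...plus one field per English value in REF_KEY_MAP (assumption)


class Example(BaseModel):
    text: str = ""
    ref: Optional[Reference] = None


class Sense(BaseModel):
    senseid: str = ""
    glosses: list[str] = []
    raw_glosses: list[str] = []
    tags: list[str] = []
    categories: list[str] = []
    examples: list[Example] = []
    synonyms: list[str] = []  # assumed: one list per linkage type


class WordEntry(BaseModel):
    senses: list[Sense] = []
    synonyms: list[str] = []  # assumed: one list per linkage type


# pydantic v2 can then emit the JSON schema that is checked in as
# json_schema/de.json:
print(json.dumps(WordEntry.model_json_schema(), indent=2))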

62 changes: 49 additions & 13 deletions src/wiktextract/extractor/de/example.py
@@ -1,21 +1,48 @@
-from collections import defaultdict
-from typing import Dict, List
+import copy
 
 from wikitextprocessor import NodeKind, WikiNode
 from wikitextprocessor.parser import LevelNode
 
+from wiktextract.extractor.de.models import Example, Reference, WordEntry
 from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid
 from wiktextract.page import clean_node
 from wiktextract.wxr_context import WiktextractContext
 
+REF_KEY_MAP = {
+    "autor": "author",
+    "a": "author",
+    "titel": "title",
+    "titelerg": "title_complement",
+    "auflage": "edition",
+    "verlag": "publisher",
+    "ort": "place",
+    "jahr": "year",
+    "seiten": "pages",
+    "isbn": "isbn",
+    "übersetzer": "translator",
+    "herausgeber": "editor",
+    "sammelwerk": "collection",
+    "werk": "collection",
+    "band": "volume",
+    "kommentar": "comment",
+    "online": "url",
+    "tag": "day",
+    "monat": "month",
+    "zugriff": "accessdate",
+    "nummer": "number",
+    "datum": "date",
+    "hrsg": "editor",
+}
+
 
 def extract_examples(
     wxr: WiktextractContext,
-    page_data: List[Dict],
+    word_entry: WordEntry,
     level_node: LevelNode,
 ) -> None:
     for list_node in level_node.find_child(NodeKind.LIST):
         for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
-            example_data = defaultdict(str)
+            example_data = Example()
 
             ref_nodes = find_and_remove_child(
                 list_item_node,
@@ -30,12 +57,12 @@ def extract_examples(
             senseid, example_text = match_senseid(example_text)
 
             if example_text:
-                example_data["text"] = example_text
+                example_data.text = example_text
 
             if senseid:
-                for sense in page_data[-1]["senses"]:
-                    if sense["senseid"] == senseid:
-                        sense["examples"].append(example_data)
+                for sense in word_entry.senses:
+                    if sense.senseid == senseid:
+                        sense.examples.append(copy.deepcopy(example_data))
 
             else:
                 if example_data:
@@ -51,11 +78,11 @@ def extract_examples(


 def extract_reference(
-    wxr: WiktextractContext, example_data: Dict[str, str], ref_node: WikiNode
+    wxr: WiktextractContext, example_data: Example, ref_node: WikiNode
 ):
-    reference_data = defaultdict()
+    reference_data = Reference()
 
-    reference_data["raw_ref"] = clean_node(wxr, {}, ref_node.children)
+    reference_data.raw_ref = clean_node(wxr, {}, ref_node.children)
 
     template_nodes = list(ref_node.find_child(NodeKind.TEMPLATE))

@@ -72,9 +99,18 @@ def extract_reference(
         # https://de.wiktionary.org/wiki/Vorlage:Literatur
         for key, value in template_node.template_parameters.items():
             if isinstance(key, str):
-                reference_data[key.lower()] = clean_node(wxr, {}, value)
+                key_english = REF_KEY_MAP.get(key.lower(), key.lower())
+                if key_english in reference_data.model_fields:
+                    setattr(
+                        reference_data, key_english, clean_node(wxr, {}, value)
+                    )
+                else:
+                    wxr.wtp.debug(
+                        f"Unexpected key in reference: {key_english}",
+                        sortid="extractor/de/examples/extract_examples/77",
+                    )
 
     # XXX: Treat other templates as well.
    # E.g. https://de.wiktionary.org/wiki/Vorlage:Ref-OWID
 
-    example_data["ref"] = reference_data
+    example_data.ref = reference_data
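
In the new extract_reference, German parameters of Vorlage:Literatur are first normalized to English field names via REF_KEY_MAP and only assigned when the target field actually exists on the Reference model. A toy run of that lookup pattern, reusing the Reference sketch near the top of this page (the parameter values are invented):

params = {"Autor": "Goethe", "Jahr": "1808", "Unbekannt": "?"}  # made-up input
ref = Reference()
for key, value in params.items():
    key_english = REF_KEY_MAP.get(key.lower(), key.lower())
    if key_english in ref.model_fields:  # pydantic v2 field registry
        setattr(ref, key_english, value)  # "Autor" -> ref.author, "Jahr" -> ref.year
    else:
        print(f"Unexpected key in reference: {key_english}")  # prints "unbekannt"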
59 changes: 30 additions & 29 deletions src/wiktextract/extractor/de/gloss.py
@@ -1,35 +1,36 @@
+import copy
 import re
-from collections import defaultdict
-from typing import Dict, List
 
 from wikitextprocessor import NodeKind, WikiNode
 from wikitextprocessor.parser import LevelNode
 
+from wiktextract.extractor.de.models import Sense, WordEntry
 from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid
 from wiktextract.page import clean_node
 from wiktextract.wxr_context import WiktextractContext
 
 
 def extract_glosses(
     wxr: WiktextractContext,
-    page_data: List[Dict],
+    word_entry: WordEntry,
     level_node: LevelNode,
 ) -> None:
     for list_node in level_node.find_child(NodeKind.LIST):
-        process_gloss_list_item(wxr, page_data, list_node)
+        process_gloss_list_item(wxr, word_entry, list_node)
 
     for non_list_node in level_node.invert_find_child(NodeKind.LIST):
         wxr.wtp.debug(
-            f"Found unexpected non-list node in pronunciation section: {non_list_node}",
-            sortid="extractor/de/pronunciation/extract_pronunciation/64",
+            f"Found unexpected non-list node in gloss section: {non_list_node}",
+            sortid="extractor/de/gloss/extract_gloss/24",
         )
 
 
 def process_gloss_list_item(
     wxr: WiktextractContext,
-    page_data: List[Dict],
+    word_entry: WordEntry,
     list_node: WikiNode,
     parent_senseid: str = "",
-    parent_gloss_data: defaultdict(list) = None,
+    parent_gloss_data: Sense = None,
 ) -> None:
     for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
         item_type = list_item_node.sarg
@@ -48,10 +49,10 @@ def process_gloss_list_item(
             ):
                 continue
 
-            gloss_data = (
-                defaultdict(list)
+            sense_data = (
+                Sense()
                 if parent_gloss_data is None
-                else parent_gloss_data.copy()
+                else copy.deepcopy(parent_gloss_data)
             )
 
             # Extract sub-glosses for later processing
@@ -60,11 +61,11 @@ def process_gloss_list_item(
             )
 
             raw_gloss = clean_node(wxr, {}, list_item_node.children)
-            gloss_data["raw_glosses"] = [raw_gloss]
+            sense_data.raw_glosses = [raw_gloss]
 
-            process_K_template(wxr, gloss_data, list_item_node)
+            process_K_template(wxr, sense_data, list_item_node)
 
-            gloss_text = clean_node(wxr, gloss_data, list_item_node.children)
+            gloss_text = clean_node(wxr, sense_data, list_item_node.children)
 
             senseid, gloss_text = match_senseid(gloss_text)

@@ -74,27 +75,27 @@ def process_gloss_list_item(
                     if senseid[0].isnumeric()
                     else parent_senseid + senseid
                 )
-                gloss_data["senseid"] = senseid
+                sense_data.senseid = senseid
             else:
                 wxr.wtp.debug(
                     f"Failed to extract sense number from gloss node: {list_item_node}",
                     sortid="extractor/de/glosses/extract_glosses/28",
                 )
 
             # XXX: Extract tags from nodes instead using Italic and Template
-            gloss_text = extract_tags_from_gloss_text(gloss_data, gloss_text)
+            gloss_text = extract_tags_from_gloss_text(sense_data, gloss_text)
 
             if gloss_text or not sub_glosses_list_nodes:
-                gloss_data["glosses"] = [gloss_text]
-                page_data[-1]["senses"].append(gloss_data)
+                sense_data.glosses = [gloss_text]
+                word_entry.senses.append(sense_data)
 
             for sub_list_node in sub_glosses_list_nodes:
                 process_gloss_list_item(
                     wxr,
-                    page_data,
+                    word_entry,
                     sub_list_node,
                     senseid,
-                    gloss_data if not gloss_text else None,
+                    sense_data if not gloss_text else None,
                 )
 
         else:
@@ -105,7 +106,7 @@ def process_gloss_list_item(
             continue
 
 
-def handle_sense_modifier(wxr, list_item_node):
+def handle_sense_modifier(wxr: WiktextractContext, list_item_node: WikiNode):
     wxr.wtp.debug(
         f"Skipped a sense modifier in gloss list: {list_item_node}",
         sortid="extractor/de/glosses/extract_glosses/19",
@@ -117,14 +118,16 @@ def handle_sense_modifier(wxr, list_item_node):

 def process_K_template(
     wxr: WiktextractContext,
-    gloss_data: defaultdict(list),
+    sense_data: Sense,
     list_item_node: NodeKind.LIST_ITEM,
 ) -> None:
     for template_node in list_item_node.find_child(NodeKind.TEMPLATE):
         if template_node.template_name == "K":
-            text = clean_node(wxr, gloss_data, template_node).removesuffix(":")
+            categories = {"categories": []}
+            text = clean_node(wxr, categories, template_node).removesuffix(":")
+            sense_data.categories.extend(categories["categories"])
             tags = re.split(r";|,", text)
-            gloss_data["tags"] = [t.strip() for t in tags]
+            sense_data.tags = [t.strip() for t in tags]
 
             # Prepositional and case information is sometimes only expanded to
             # category links and not present in cleaned node. We still want it
@@ -133,7 +136,7 @@ def process_K_template(
             case = template_node.template_parameters.get("Kas")
             category = (prep if prep else "") + (" + " + case if case else "")
             if category:
-                gloss_data["tags"].append(category)
+                sense_data.tags.append(category)
 
             # XXX: Investigate better ways to handle free text in K template
             ft = template_node.template_parameters.get("ft")
@@ -149,16 +152,14 @@ def process_K_template(
             ]
 
 
-def extract_tags_from_gloss_text(
-    gloss_data: defaultdict(list), gloss_text: str
-) -> None:
+def extract_tags_from_gloss_text(sense_data: Sense, gloss_text: str) -> None:
     parts = gloss_text.split(":", 1)
     if len(parts) > 1:
         tags_part = parts[0].strip()
 
         categories = [c.strip() for c in re.split(",", tags_part)]
         if all(c.isalnum() for c in categories):
-            gloss_data["tags"].extend(categories)
+            sense_data.tags.extend(categories)
             return parts[1].strip()
 
     return gloss_text
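
One detail worth noting in this file: parent_gloss_data.copy() was replaced by copy.deepcopy(parent_gloss_data). A shallow copy of a Sense shares its mutable list fields (tags, glosses, and so on) with the parent, so appending to a sub-sense would silently mutate the parent sense. A minimal demonstration, assuming the Sense sketch near the top of this page:

import copy

parent = Sense(tags=["transitive"])
shallow = parent.model_copy()  # pydantic's copy is shallow by default
deep = copy.deepcopy(parent)

shallow.tags.append("colloquial")
print(parent.tags)  # ['transitive', 'colloquial'] -- parent was mutated
print(deep.tags)    # ['transitive'] -- independent copy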
39 changes: 29 additions & 10 deletions src/wiktextract/extractor/de/linkage.py
@@ -1,15 +1,16 @@
 import re
-from typing import Dict, List
 
 from wikitextprocessor import NodeKind, WikiNode
 from wikitextprocessor.parser import LevelNode
 
+from wiktextract.extractor.de.models import WordEntry
 from wiktextract.extractor.de.utils import split_senseids
 from wiktextract.page import clean_node
 from wiktextract.wxr_context import WiktextractContext
 
 
 def extract_linkages(
-    wxr: WiktextractContext, page_data: List[Dict], level_node: LevelNode
+    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
 ):
     linkage_type = wxr.config.LINKAGE_SUBTITLES.get(level_node.largs[0][0])
     for list_node in level_node.find_child(NodeKind.LIST):
@@ -25,7 +26,7 @@ def extract_linkages(
             )
 
             # Extract links
-            linkages = []
+            linkages: list[str] = []
             if linkage_type == "expressions":
                 for child in list_item.children:
                     if isinstance(child, str) and contains_dash(child):
@@ -43,15 +44,33 @@ def extract_linkages(
                     process_link(wxr, linkages, link)
 
             # Add links to the page data
-            if len(page_data[-1]["senses"]) == 1:
-                page_data[-1]["senses"][0][linkage_type].extend(linkages)
+            if len(word_entry.senses) == 1:
+                if linkage_type in word_entry.senses[0].model_fields:
+                    getattr(word_entry.senses[0], linkage_type).extend(linkages)
+                else:
+                    wxr.wtp.debug(
+                        f"Linkage type {linkage_type} not in sense model fields",
+                        sortid="extractor/de/linkages/extract_linkages/54",
+                    )
             elif len(senseids) > 0:
                 for senseid in senseids:
-                    for sense in page_data[-1]["senses"]:
-                        if sense["senseid"] == senseid:
-                            sense[linkage_type].extend(linkages)
+                    for sense in word_entry.senses:
+                        if sense.senseid == senseid:
+                            if linkage_type in sense.model_fields:
+                                getattr(sense, linkage_type).extend(linkages)
+                            else:
+                                wxr.wtp.debug(
+                                    f"Linkage type {linkage_type} not in sense model fields",
+                                    sortid="extractor/de/linkages/extract_linkages/54",
+                                )
             else:
-                page_data[-1][linkage_type].extend(linkages)
+                if linkage_type in word_entry.model_fields:
+                    getattr(word_entry, linkage_type).extend(linkages)
+                else:
+                    wxr.wtp.debug(
+                        f"Linkage type {linkage_type} not in entry model fields",
+                        sortid="extractor/de/linkages/extract_linkages/54",
+                    )
 
             # Check for potentially missed data
             for non_link in list_item.invert_find_child(NodeKind.LINK):
@@ -72,7 +91,7 @@ def extract_linkages(


 def process_link(
-    wxr: WiktextractContext, semantic_links: List[str], link: WikiNode
+    wxr: WiktextractContext, semantic_links: list[str], link: WikiNode
 ):
     clean_link = clean_node(wxr, {}, link)
     if clean_link.startswith("Verzeichnis:"):
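
The linkage_type string comes from wxr.config.LINKAGE_SUBTITLES, so extract_linkages dispatches on it dynamically and now guards every getattr with a model_fields check instead of indexing into a dict. A toy run of that guard, reusing the WordEntry sketch near the top of this page ("synonyms" is an assumed field name; the linkage strings are invented):

word_entry = WordEntry()
linkage_type = "synonyms"  # in the extractor this comes from LINKAGE_SUBTITLES
linkages = ["Beispiel", "Muster"]

if linkage_type in word_entry.model_fields:
    getattr(word_entry, linkage_type).extend(linkages)
else:
    print(f"Linkage type {linkage_type} not in entry model fields")

print(word_entry.synonyms)  # ['Beispiel', 'Muster']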