-
Notifications
You must be signed in to change notification settings - Fork 88
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Extract semantic relations from German Wiktionary
This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France. Fix types for python3.9. Import SEMANTIC_RELATIONS into pages.py.
- Loading branch information
Showing
6 changed files
with
293 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
import re | ||
from typing import Dict, List | ||
|
||
from wikitextprocessor import NodeKind, WikiNode | ||
from wikitextprocessor.parser import LevelNode | ||
|
||
from wiktextract.extractor.de.utils import split_senseids | ||
from wiktextract.page import clean_node | ||
from wiktextract.wxr_context import WiktextractContext | ||
|
||
# Maps the German section headings that introduce a semantic relation to the
# key under which the extracted terms are stored in the page data.
# extract_semantic_relations() looks up the heading text of a level node here.
SEMANTIC_RELATIONS = {
    "Gegenwörter": "antonyms",
    "Holonyme": "holonyms",
    "Oberbegriffe": "hypernyms",
    "Redewendungen": "expressions",
    "Sinnverwandte Wörter": "coordinate_terms",
    "Sprichwörter": "proverbs",
    "Synonyme": "synonyms",
    "Unterbegriffe": "hyponyms",
    "Wortbildungen": "derived",
}
|
||
|
||
def extract_semantic_relations(
    wxr: WiktextractContext, page_data: List[Dict], level_node: LevelNode
) -> None:
    """Extract the linked terms of one semantic-relation section.

    The heading text of ``level_node`` is looked up in ``SEMANTIC_RELATIONS``
    to find the key under which the terms are stored. Every list item of the
    section is then scanned for links, and the cleaned link texts are appended
    to the last entry of ``page_data``:

    * to its only sense, if that entry has exactly one sense;
    * otherwise to every sense whose ``"senseid"`` matches one of the sense
      ids parsed from the leading text of the list item;
    * otherwise (no sense ids on the item) to the entry itself.

    Args:
        wxr: Context used for cleaning nodes and emitting debug messages.
        page_data: List of entry dicts; only the last one is modified.
        level_node: The heading node of the semantic-relation section.
    """
    relation_key = SEMANTIC_RELATIONS.get(level_node.largs[0][0])
    if relation_key is None:
        # Without this guard the links would be stored under a ``None`` key.
        wxr.wtp.debug(
            f"Unknown semantic relation heading: {level_node.largs[0]}",
            sortid="extractor/de/semantic_relations/extract_semantic_relations/unknown_heading",
        )
        return

    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            # The sense ids, if any, are in the leading text of the item.
            children = list_item.children
            senseids = (
                split_senseids(children[0])
                if children and isinstance(children[0], str)
                else []
            )

            semantic_links = _collect_semantic_links(
                wxr, relation_key, list_item
            )
            _store_semantic_links(
                wxr, page_data, relation_key, senseids, semantic_links
            )
            _report_unexpected_nodes(wxr, relation_key, list_item)


def _collect_semantic_links(
    wxr: WiktextractContext, relation_key: str, list_item: WikiNode
) -> List[str]:
    """Return the cleaned link texts found directly in ``list_item``."""
    semantic_links: List[str] = []
    if relation_key == "expressions":
        for child in list_item.children:
            if isinstance(child, str) and contains_dash(child):
                # TODO: Capture the part after the dash as an explanatory
                # note to the expression, e.g.:
                # https://de.wiktionary.org/wiki/Beispiel
                # ":[[ein gutes Beispiel geben]] – als [[Vorbild]] zur
                # [[Nachahmung]] [[dienen]]/[[herausfordern]]"
                break
            if isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
                process_link(wxr, semantic_links, child)
    else:
        for link in list_item.find_child(NodeKind.LINK):
            process_link(wxr, semantic_links, link)
    return semantic_links


def _store_semantic_links(
    wxr: WiktextractContext,
    page_data: List[Dict],
    relation_key: str,
    senseids: List[str],
    semantic_links: List[str],
) -> None:
    """Append ``semantic_links`` to the matching sense(s) or to the entry."""
    entry = page_data[-1]
    senses = entry["senses"]
    if len(senses) == 1:
        senses[0][relation_key].extend(semantic_links)
    elif senseids:
        for senseid in senseids:
            matched = False
            for sense in senses:
                # ``.get`` avoids inserting an empty "senseid" into a
                # defaultdict sense and a KeyError on a plain dict.
                if sense.get("senseid") == senseid:
                    sense[relation_key].extend(semantic_links)
                    matched = True
            if not matched:
                # The links for this sense id are not stored anywhere;
                # report it instead of dropping them silently.
                wxr.wtp.debug(
                    f"No sense with senseid '{senseid}' found for "
                    f"{relation_key}: {semantic_links}",
                    sortid="extractor/de/semantic_relations/extract_semantic_relations/unmatched_senseid",
                )
    else:
        entry[relation_key].extend(semantic_links)


def _report_unexpected_nodes(
    wxr: WiktextractContext, relation_key: str, list_item: WikiNode
) -> None:
    """Emit a debug message for non-link children that may hold missed data."""
    for non_link in list_item.invert_find_child(NodeKind.LINK):
        if (
            relation_key == "expressions"
            and isinstance(non_link, str)
            and contains_dash(non_link)
        ):
            # Everything after the dash is explanatory text.
            break
        if isinstance(non_link, str) and (
            non_link.startswith("[") or len(non_link.strip()) <= 3
        ):
            # Sense-id markers and very short strings (three characters or
            # fewer after stripping) are expected.
            continue
        wxr.wtp.debug(
            f"Found unexpected non-link node '{non_link}' in: {list_item}",
            sortid="extractor/de/semantic_relations/extract_semantic_relations/84",
        )
|
||
|
||
def process_link(
    wxr: WiktextractContext, semantic_links: List[str], link: WikiNode
):
    """Append the cleaned text of ``link`` to ``semantic_links``.

    Links whose cleaned text starts with ``"Verzeichnis:"`` are skipped.
    """
    link_text = clean_node(wxr, {}, link)
    if not link_text.startswith("Verzeichnis:"):
        semantic_links.append(link_text)
|
||
|
||
def contains_dash(text: str):
    """Return the match for the first dash-like character in ``text``.

    The result is a ``re.Match`` object (truthy) when ``text`` contains a
    hyphen-minus or one of the listed Unicode dashes, and ``None`` otherwise.
    """
    dash_pattern = re.compile(r"[–—―‒-]")
    return dash_pattern.search(text)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
import unittest | ||
from collections import defaultdict | ||
|
||
from wikitextprocessor import Wtp | ||
|
||
from wiktextract.config import WiktionaryConfig | ||
from wiktextract.extractor.de.semantic_relations import ( | ||
extract_semantic_relations, | ||
) | ||
from wiktextract.wxr_context import WiktextractContext | ||
|
||
|
||
class TestDETranslation(unittest.TestCase):
    """Tests for ``extract_semantic_relations`` on German Wiktionary markup."""

    maxDiff = None

    def setUp(self) -> None:
        # Build a fresh German-language context for every test.
        wtp = Wtp(lang_code="de")
        config = WiktionaryConfig(dump_file_lang_code="de")
        self.wxr = WiktextractContext(wtp, config)

    def tearDown(self) -> None:
        self.wxr.wtp.close_db_conn()

    def test_de_extract_semantic_relations(self):
        """Run each wikitext snippet and compare the resulting page data."""

        def sense(senseid):
            # A sense as the extractor sees it: a defaultdict with a senseid.
            return defaultdict(list, {"senseid": senseid})

        def entry_with(*senses):
            # A page-data entry that already holds the given senses.
            return defaultdict(list, {"senses": list(senses)})

        test_cases = [
            # https://de.wiktionary.org/wiki/Beispiel
            # Extracts linkages and places them in the correct sense.
            {
                "input": "==== Sinnverwandte Wörter ====\n:[1] [[Beleg]], [[Exempel]]\n:[2] [[Muster]], [[Vorbild]]",
                "page_data": [entry_with(sense("1"), sense("2"))],
                "expected": [
                    {
                        "senses": [
                            {
                                "senseid": "1",
                                "coordinate_terms": ["Beleg", "Exempel"],
                            },
                            {
                                "senseid": "2",
                                "coordinate_terms": ["Muster", "Vorbild"],
                            },
                        ]
                    }
                ],
            },
            # https://de.wiktionary.org/wiki/Beispiel
            # Cleans explanatory text from expressions.
            {
                "input": "====Redewendungen====\n:[[ein gutes Beispiel geben|ein gutes ''Beispiel'' geben]] – als [[Vorbild]] zur [[Nachahmung]] [[dienen]]/[[herausfordern]]",
                "page_data": [defaultdict(list)],
                "expected": [
                    {
                        "expressions": ["ein gutes Beispiel geben"],
                        "senses": [],
                    },
                ],
            },
            # Always places relations in first sense if just one sense.
            {
                "input": "====Synonyme====\n:[[Synonym1]]",
                "page_data": [entry_with(sense("1"))],
                "expected": [
                    {
                        "senses": [{"senseid": "1", "synonyms": ["Synonym1"]}],
                    },
                ],
            },
            # https://de.wiktionary.org/wiki/Kokospalme
            # Ignores modifiers of relations and all other text.
            {
                "input": "====Synonyme====\n:[1] [[Kokosnusspalme]], ''wissenschaftlich:'' [[Cocos nucifera]]",
                "page_data": [entry_with(sense("1"))],
                "expected": [
                    {
                        "senses": [
                            {
                                "senseid": "1",
                                "synonyms": [
                                    "Kokosnusspalme",
                                    "Cocos nucifera",
                                ],
                            }
                        ],
                    },
                ],
            },
        ]

        for case in test_cases:
            with self.subTest(case=case):
                self.wxr.wtp.start_page("")
                root = self.wxr.wtp.parse(case["input"])
                page_data = case["page_data"]
                level_node = root.children[0]

                extract_semantic_relations(self.wxr, page_data, level_node)

                self.assertEqual(page_data, case["expected"])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
import unittest | ||
|
||
from wiktextract.extractor.de.utils import split_senseids | ||
|
||
|
||
class TestDEUtils(unittest.TestCase):
    """Tests for the helpers in ``wiktextract.extractor.de.utils``."""

    maxDiff = None

    def test_split_senseids(self):
        """Check ``split_senseids`` on single ids, lists and ranges."""
        test_cases = [
            ("[1]", ["1"]),
            ("[1,2]", ["1", "2"]),
            ("[1, 2]", ["1", "2"]),
            ("[1, 2 ]", ["1", "2"]),
            ("[1-3]", ["1", "2", "3"]),
            ("[1, 3-5]", ["1", "3", "4", "5"]),
            ("[1, 3-4, 6]", ["1", "3", "4", "6"]),
            ("[1a]", ["1a"]),
            ("[1, 2a]", ["1", "2a"]),
            ("[1, 2a-3]", ["1", "2", "3"]),
        ]

        for text, expected in test_cases:
            # subTest keeps the remaining cases running after a failure and
            # names the failing input in the report.
            with self.subTest(text=text):
                self.assertEqual(split_senseids(text), expected)