diff --git a/src/wiktextract/data/de/linkage_subtitles.json b/src/wiktextract/data/de/linkage_subtitles.json new file mode 100644 index 000000000..7e7748830 --- /dev/null +++ b/src/wiktextract/data/de/linkage_subtitles.json @@ -0,0 +1,11 @@ +{ + "Gegenwörter": "antonyms", + "Holonyme": "holonyms", + "Oberbegriffe": "hypernyms", + "Redewendungen": "expressions", + "Sinnverwandte Wörter": "coordinate_terms", + "Sprichwörter": "proverbs", + "Synonyme": "synonyms", + "Unterbegriffe": "hyponyms", + "Wortbildungen": "derived" +} diff --git a/src/wiktextract/data/fr/linkage_subtitles.json b/src/wiktextract/data/fr/linkage_subtitles.json index e7d865974..91ac0b5b8 100644 --- a/src/wiktextract/data/fr/linkage_subtitles.json +++ b/src/wiktextract/data/fr/linkage_subtitles.json @@ -1,47 +1,46 @@ { - "synonymes": "synonyms", - "syn": "synonyms", - "hyponymes": "hyponyms", - "hypo": "hyponyms", - "hyperonymes": "hypernyms", - "hyper": "hypernyms", - "holonymes": "holonyms", - "holo": "holonyms", - "méronymes": "meronyms", - "méro": "meronyms", - "dérivés": "derived", - "drv": "derived", - "troponymes": "troponyms", - "tropo": "troponyms", - "paronymes": "paronyms", - "paro": "paronyms", - "apparentés": "related", - "apr": "related", + "abrév": "abbreviation", + "abréviations": "abbreviation", "app": "related", "apparentés": "related", - "étymologiques": "related", - "quasi-synonymes": "synonyms", - "quasi-syn": "synonyms", - "q-syn": "synonyms", + "apr": "related", "dérivés autres langues": "derived", "dérivés int": "derived", + "dérivés": "derived", + "dial": "related", + "dialectes": "related", "drv-int": "derived", - "variantes": "related", - "var": "related", - "variantes orthographiques": "related", - "variantes ortho": "related", + "drv": "derived", + "étymologiques": "related", + "holo": "holonyms", + "holonymes": "holonyms", + "hyper": "hypernyms", + "hyperonymes": "hypernyms", + "hypo": "hyponyms", + "hyponymes": "hyponyms", + "méro": "meronyms", + "méronymes": "meronyms", + "paro": "paronyms", + "paronymes": "paronyms", + "phrases": "proverbs", + "q-syn": "synonyms", + "quasi-syn": "synonyms", + "quasi-synonymes": "synonyms", + "syn": "synonyms", + "synonymes": "synonyms", + "tropo": "troponyms", + "troponymes": "troponyms", + "var-dial": "related", "var-ortho": "related", - "variantes dialectales": "related", + "var": "related", "variantes dial": "related", - "var-dial": "related", - "dial": "related", + "variantes dialectales": "related", "variantes dialectes": "related", - "dialectes": "related", - "abréviations": "abbreviation", - "abrév": "abbreviation", - "phrases": "proverbs", - "vocabulaire": "related", - "vocabulaire apparenté": "related", + "variantes ortho": "related", + "variantes orthographiques": "related", + "variantes": "related", "voc": "related", - "vocabulaire proche": "related" + "vocabulaire apparenté": "related", + "vocabulaire proche": "related", + "vocabulaire": "related" } diff --git a/src/wiktextract/extractor/de/semantic_relations.py b/src/wiktextract/extractor/de/linkage.py similarity index 73% rename from src/wiktextract/extractor/de/semantic_relations.py rename to src/wiktextract/extractor/de/linkage.py index 5ea0b6899..eb4425e74 100644 --- a/src/wiktextract/extractor/de/semantic_relations.py +++ b/src/wiktextract/extractor/de/linkage.py @@ -8,23 +8,11 @@ from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext -SEMANTIC_RELATIONS = { - "Gegenwörter": "antonyms", - "Holonyme": "holonyms", - "Oberbegriffe": "hypernyms", - "Redewendungen": "expressions", - "Sinnverwandte Wörter": "coordinate_terms", - "Sprichwörter": "proverbs", - "Synonyme": "synonyms", - "Unterbegriffe": "hyponyms", - "Wortbildungen": "derived", -} - -def extract_semantic_relations( +def extract_linkages( wxr: WiktextractContext, page_data: List[Dict], level_node: LevelNode ): - relation_key = SEMANTIC_RELATIONS.get(level_node.largs[0][0]) + linkage_type = wxr.config.LINKAGE_SUBTITLES.get(level_node.largs[0][0]) for list_node in level_node.find_child(NodeKind.LIST): for list_item in list_node.find_child(NodeKind.LIST_ITEM): # Get the senseids @@ -38,8 +26,8 @@ def extract_semantic_relations( ) # Extract links - semantic_links = [] - if relation_key == "expressions": + linkages = [] + if linkage_type == "expressions": for child in list_item.children: if isinstance(child, str) and contains_dash(child): # XXX Capture the part after the dash as an explanatory note to the expression, e.g.: @@ -50,26 +38,26 @@ def extract_semantic_relations( isinstance(child, WikiNode) and child.kind == NodeKind.LINK ): - process_link(wxr, semantic_links, child) + process_link(wxr, linkages, child) else: for link in list_item.find_child(NodeKind.LINK): - process_link(wxr, semantic_links, link) + process_link(wxr, linkages, link) # Add links to the page data if len(page_data[-1]["senses"]) == 1: - page_data[-1]["senses"][0][relation_key].extend(semantic_links) + page_data[-1]["senses"][0][linkage_type].extend(linkages) elif len(senseids) > 0: for senseid in senseids: for sense in page_data[-1]["senses"]: if sense["senseid"] == senseid: - sense[relation_key].extend(semantic_links) + sense[linkage_type].extend(linkages) else: - page_data[-1][relation_key].extend(semantic_links) + page_data[-1][linkage_type].extend(linkages) # Check for potentially missed data for non_link in list_item.invert_find_child(NodeKind.LINK): if ( - relation_key == "expressions" + linkage_type == "expressions" and isinstance(non_link, str) and contains_dash(non_link) ): @@ -80,7 +68,7 @@ def extract_semantic_relations( continue wxr.wtp.debug( f"Found unexpected non-link node '{non_link}' in: {list_item}", - sortid="extractor/de/semantic_relations/extract_semantic_relations/84", + sortid="extractor/de/linkages/extract_linkages/84", ) diff --git a/src/wiktextract/extractor/de/page.py b/src/wiktextract/extractor/de/page.py index 231fd071c..d4353e483 100644 --- a/src/wiktextract/extractor/de/page.py +++ b/src/wiktextract/extractor/de/page.py @@ -11,8 +11,8 @@ from .example import extract_examples from .gloss import extract_glosses +from .linkage import extract_linkages from .pronunciation import extract_pronunciation -from .semantic_relations import SEMANTIC_RELATIONS, extract_semantic_relations from .translation import extract_translation # Templates that are used to form panels on pages and that should be ignored in @@ -67,14 +67,19 @@ def parse_section( wxr.wtp.start_subsection(section_name) if section_name == "Bedeutungen": extract_glosses(wxr, page_data, level_node) - elif section_name == "Aussprache": + elif wxr.config.capture_pronunciation and section_name == "Aussprache": extract_pronunciation(wxr, page_data, level_node) - elif section_name == "Beispiele": + elif wxr.config.capture_examples and section_name == "Beispiele": extract_examples(wxr, page_data, level_node) - elif section_name == "Übersetzungen": + elif ( + wxr.config.capture_translations and section_name == "Übersetzungen" + ): extract_translation(wxr, page_data, level_node) - elif section_name in SEMANTIC_RELATIONS: - extract_semantic_relations(wxr, page_data, level_node) + elif ( + wxr.config.capture_linkages + and section_name in wxr.config.LINKAGE_SUBTITLES + ): + extract_linkages(wxr, page_data, level_node) FORM_POS = { diff --git a/tests/test_de_semantic_relations.py b/tests/test_de_linkages.py similarity index 92% rename from tests/test_de_semantic_relations.py rename to tests/test_de_linkages.py index 78ad86c27..8f73cc6c3 100644 --- a/tests/test_de_semantic_relations.py +++ b/tests/test_de_linkages.py @@ -4,13 +4,11 @@ from wikitextprocessor import Wtp from wiktextract.config import WiktionaryConfig -from wiktextract.extractor.de.semantic_relations import ( - extract_semantic_relations, -) +from wiktextract.extractor.de.linkage import extract_linkages from wiktextract.wxr_context import WiktextractContext -class TestDETranslation(unittest.TestCase): +class TestDELinkages(unittest.TestCase): maxDiff = None def setUp(self) -> None: @@ -21,7 +19,7 @@ def setUp(self) -> None: def tearDown(self) -> None: self.wxr.wtp.close_db_conn() - def test_de_extract_semantic_relations(self): + def test_de_extract_linkages(self): test_cases = [ # https://de.wiktionary.org/wiki/Beispiel # Extracts linkages and places them in the correct sense. @@ -109,8 +107,6 @@ def test_de_extract_semantic_relations(self): self.wxr.wtp.start_page("") root = self.wxr.wtp.parse(case["input"]) - extract_semantic_relations( - self.wxr, case["page_data"], root.children[0] - ) + extract_linkages(self.wxr, case["page_data"], root.children[0]) self.assertEqual(case["page_data"], case["expected"])