From 9235e394416b25c905e50dacbac1096ec57ef74d Mon Sep 17 00:00:00 2001
From: xxyzz
Date: Mon, 29 Jul 2024 17:58:43 +0800
Subject: [PATCH 1/2] [pl] extract linkage sections

---
 src/wiktextract/data/overrides/pl.json  |  4 +-
 src/wiktextract/extractor/pl/linkage.py | 83 +++++++++++++++++++++++++
 src/wiktextract/extractor/pl/models.py  | 15 +++++
 src/wiktextract/extractor/pl/page.py    |  9 +++
 tests/test_pl_linkage.py                | 60 ++++++++++++++++++
 5 files changed, 169 insertions(+), 2 deletions(-)
 create mode 100644 src/wiktextract/extractor/pl/linkage.py
 create mode 100644 tests/test_pl_linkage.py

diff --git a/src/wiktextract/data/overrides/pl.json b/src/wiktextract/data/overrides/pl.json
index 29c9de695..711387e1a 100644
--- a/src/wiktextract/data/overrides/pl.json
+++ b/src/wiktextract/data/overrides/pl.json
@@ -20,7 +20,7 @@
         "need_pre_expand": true
     },
     "Szablon:frazeologia": {
-        "body": "===frazeologia===\n",
+        "body": "===związki frazeologiczne===\n",
         "namespace_id": 10,
         "need_pre_expand": true
     },
@@ -95,7 +95,7 @@
         "need_pre_expand": true
     },
     "Szablon:pokrewne": {
-        "body": "===pokrewne===\n",
+        "body": "===wyrazy pokrewne===\n",
         "namespace_id": 10,
         "need_pre_expand": true
     },
diff --git a/src/wiktextract/extractor/pl/linkage.py b/src/wiktextract/extractor/pl/linkage.py
new file mode 100644
index 000000000..6851463f0
--- /dev/null
+++ b/src/wiktextract/extractor/pl/linkage.py
@@ -0,0 +1,83 @@
+import re
+from collections import defaultdict
+
+from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .models import Linkage, WordEntry
+from .tags import translate_raw_tags
+
+LINKAGE_TYPES = {
+    "antonimy": "antonyms",
+    "hiperonimy": "hypernyms",
+    "hiponimy": "hyponyms",
+    "holonimy": "holonyms",
+    "meronimy": "meronyms",
+    "synonimy": "synonyms",
+    "wyrazy pokrewne": "related",
+    "związki frazeologiczne": "proverbs",
+}
+
+
+def extract_linkage_section(
+    wxr: WiktextractContext,
+    page_data: list[WordEntry],
+    level_node: WikiNode,
+    linkage_type: str,
+    lang_code: str,
+) -> None:
+    linkages = defaultdict(list)
+    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
+        process_linkage_list_item(wxr, list_item, linkages)
+    for data in page_data:
+        if data.lang_code == lang_code:
+            for sense in data.senses:
+                if sense.sense_index in linkages:
+                    getattr(data, linkage_type).extend(
+                        linkages[sense.sense_index]
+                    )
+                    del linkages[sense.sense_index]
+            getattr(data, linkage_type).extend(linkages.get("", []))
+    if "" in linkages:
+        del linkages[""]
+    for data in page_data:
+        if data.lang_code == lang_code:
+            for linkage_list in linkages.values():
+                getattr(data, linkage_type).extend(linkage_list)
+            break
+
+
+def process_linkage_list_item(
+    wxr: WiktextractContext,
+    list_item: WikiNode,
+    linkages: dict[str, list[Linkage]],
+) -> None:
+    raw_tags = []
+    sense_index = ""
+    last_linkage = None
+    for node in list_item.children:
+        if isinstance(node, str):
+            m = re.search(r"\(\d+\.\d+\)", node)
+            if m is not None:
+                sense_index = m.group(0).strip("()")
+            if ";" in node or "•" in node:
+                raw_tags.clear()
+                last_linkage = None
+        elif isinstance(node, TemplateNode):
+            raw_tag = clean_node(wxr, None, node)
+            if raw_tag.endswith("."):
+                if last_linkage is None:
+                    raw_tags.append(raw_tag)
+                else:
+                    last_linkage.raw_tags.append(raw_tag)
+                    translate_raw_tags(last_linkage)
+        elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
+            linkage = Linkage(
+                word=clean_node(wxr, None, node),
+                sense_index=sense_index,
+                raw_tags=raw_tags,
+            )
+            translate_raw_tags(linkage)
+            linkages[sense_index].append(linkage)
+            last_linkage = linkage
diff --git a/src/wiktextract/extractor/pl/models.py b/src/wiktextract/extractor/pl/models.py
index 3c6641e18..28f75b067 100644
--- a/src/wiktextract/extractor/pl/models.py
+++ b/src/wiktextract/extractor/pl/models.py
@@ -56,6 +56,13 @@ class Sound(PolishBaseModel):
     raw_tags: list[str] = []
 
 
+class Linkage(PolishBaseModel):
+    word: str
+    tags: list[str] = []
+    raw_tags: list[str] = []
+    sense_index: str = ""
+
+
 class WordEntry(PolishBaseModel):
     model_config = ConfigDict(title="Polish Wiktionary")
 
@@ -73,3 +80,11 @@ class WordEntry(PolishBaseModel):
     etymology_texts: list[str] = []
     translations: list[Translation] = []
     sounds: list[Sound] = []
+    antonyms: list[Linkage] = []
+    hypernyms: list[Linkage] = []
+    hyponyms: list[Linkage] = []
+    holonyms: list[Linkage] = []
+    meronyms: list[Linkage] = []
+    related: list[Linkage] = []
+    proverbs: list[Linkage] = []
+    synonyms: list[Linkage] = []
diff --git a/src/wiktextract/extractor/pl/page.py b/src/wiktextract/extractor/pl/page.py
index 215b5398a..3c74e93ce 100644
--- a/src/wiktextract/extractor/pl/page.py
+++ b/src/wiktextract/extractor/pl/page.py
@@ -11,6 +11,7 @@
 from ...wxr_context import WiktextractContext
 from .etymology import extract_etymology_section
 from .example import extract_example_section
+from .linkage import LINKAGE_TYPES, extract_linkage_section
 from .models import Sense, WordEntry
 from .pos import extract_pos_section
 from .sound import extract_sound_section
@@ -41,6 +42,14 @@ def parse_section(
         extract_etymology_section(wxr, page_data, base_data, level_node)
     elif title_text == "tłumaczenia" and wxr.config.capture_translations:
         extract_translation_section(wxr, page_data, base_data, level_node)
+    elif title_text in LINKAGE_TYPES and wxr.config.capture_linkages:
+        extract_linkage_section(
+            wxr,
+            page_data,
+            level_node,
+            LINKAGE_TYPES[title_text],
+            base_data.lang_code,
+        )
 
 
 def parse_page(
diff --git a/tests/test_pl_linkage.py b/tests/test_pl_linkage.py
new file mode 100644
index 000000000..a00000da0
--- /dev/null
+++ b/tests/test_pl_linkage.py
@@ -0,0 +1,60 @@
+from unittest import TestCase
+
+from wikitextprocessor import Wtp
+
+from wiktextract.config import WiktionaryConfig
+from wiktextract.extractor.pl.linkage import extract_linkage_section
+from wiktextract.extractor.pl.models import Linkage, Sense, WordEntry
+from wiktextract.wxr_context import WiktextractContext
+
+
+class TestPlLinkage(TestCase):
+    maxDiff = None
+
+    def setUp(self) -> None:
+        self.wxr = WiktextractContext(
+            Wtp(lang_code="pl"),
+            WiktionaryConfig(
+                dump_file_lang_code="pl",
+                capture_language_codes=None,
+            ),
+        )
+
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()
+
+    def test_pies(self):
+        self.wxr.wtp.start_page("pies")
+        self.wxr.wtp.add_page("Szablon:neutr", 10, "neutr.")
+        root = self.wxr.wtp.parse(""": (1.1) [[czworonożny przyjaciel]]
+: (2.1) [[pała]]; {{neutr}} [[policjant]]""")
+        page_data = [
+            WordEntry(
+                word="pies",
+                lang="język polski",
+                lang_code="pl",
+                pos="noun",
+                senses=[Sense(sense_index="1.1")],
+            ),
+            WordEntry(
+                word="pies",
+                lang="język polski",
+                lang_code="pl",
+                pos="noun",
+                senses=[Sense(sense_index="2.1")],
+            ),
+        ]
+        extract_linkage_section(self.wxr, page_data, root, "synonyms", "pl")
+        self.assertEqual(
+            page_data[0].synonyms,
+            [Linkage(word="czworonożny przyjaciel", sense_index="1.1")],
+        )
+        self.assertEqual(
+            page_data[1].synonyms,
+            [
+                Linkage(word="pała", sense_index="2.1"),
+                Linkage(
+                    word="policjant", raw_tags=["neutr."], sense_index="2.1"
+                ),
+            ],
+        )

From 4bf303ceaa2107ea7a4ac79f025100ee3884e44a Mon Sep 17 00:00:00 2001
From: xxyzz
Date: Mon, 29 Jul 2024 18:18:23 +0800
Subject: [PATCH 2/2] [pl] add translation data to the `WordEntry` that has
 the same `sense_index`

---
 src/wiktextract/extractor/pl/linkage.py     |  2 +
 src/wiktextract/extractor/pl/page.py        |  4 +-
 src/wiktextract/extractor/pl/translation.py | 98 +++++++++++++--------
 3 files changed, 67 insertions(+), 37 deletions(-)

diff --git a/src/wiktextract/extractor/pl/linkage.py b/src/wiktextract/extractor/pl/linkage.py
index 6851463f0..9c552c7f5 100644
--- a/src/wiktextract/extractor/pl/linkage.py
+++ b/src/wiktextract/extractor/pl/linkage.py
@@ -30,6 +30,7 @@ def extract_linkage_section(
     linkages = defaultdict(list)
     for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
         process_linkage_list_item(wxr, list_item, linkages)
+
     for data in page_data:
         if data.lang_code == lang_code:
             for sense in data.senses:
@@ -39,6 +40,7 @@ def extract_linkage_section(
                     )
                     del linkages[sense.sense_index]
             getattr(data, linkage_type).extend(linkages.get("", []))
+
     if "" in linkages:
         del linkages[""]
     for data in page_data:
diff --git a/src/wiktextract/extractor/pl/page.py b/src/wiktextract/extractor/pl/page.py
index 3c74e93ce..f7b4486c4 100644
--- a/src/wiktextract/extractor/pl/page.py
+++ b/src/wiktextract/extractor/pl/page.py
@@ -41,7 +41,9 @@ def parse_section(
     elif title_text == "etymologia" and wxr.config.capture_etymologies:
         extract_etymology_section(wxr, page_data, base_data, level_node)
     elif title_text == "tłumaczenia" and wxr.config.capture_translations:
-        extract_translation_section(wxr, page_data, base_data, level_node)
+        extract_translation_section(
+            wxr, page_data, level_node, base_data.lang_code
+        )
     elif title_text in LINKAGE_TYPES and wxr.config.capture_linkages:
         extract_linkage_section(
             wxr,
diff --git a/src/wiktextract/extractor/pl/translation.py b/src/wiktextract/extractor/pl/translation.py
index aaf479f5b..daf7a761c 100644
--- a/src/wiktextract/extractor/pl/translation.py
+++ b/src/wiktextract/extractor/pl/translation.py
@@ -1,4 +1,5 @@
 import re
+from collections import defaultdict
 
 from mediawiki_langcodes import name_to_code
 from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode
@@ -12,44 +13,69 @@
 def extract_translation_section(
     wxr: WiktextractContext,
     page_data: list[WordEntry],
-    base_data: WordEntry,
     level_node: WikiNode,
+    lang_code: str,
 ) -> None:
-    translations = []
+    translations = defaultdict(list)
     for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
-        base_tr_data = Translation()
-        for index, node in enumerate(list_item.children):
-            if isinstance(node, str):
-                if index == 0 and ":" in node:
-                    lang_name = node[: node.index(":")].strip()
-                    base_tr_data.lang = lang_name
-                    lang_code = name_to_code(lang_name, "pl")
-                    if lang_code == "":
-                        lang_code = "unknown"
-                    base_tr_data.lang_code = lang_code
-                m_index = re.search(r"\(\d+\.\d+\)", node)
-                if m_index is not None:
-                    base_tr_data.sense_index = m_index.group(0).strip("()")
-                m_roman = re.search(r"\([^()]+\)", node)
-                if (
-                    m_roman is not None
-                    and len(translations) > 0
-                    and (m_index is None or m_index.start() != m_roman.start())
-                ):
-                    translations[-1].roman = m_roman.group(0).strip("()")
-            elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
-                word = clean_node(wxr, None, node)
-                if len(word) > 0:
-                    new_tr_data = base_tr_data.model_copy(deep=True)
-                    new_tr_data.word = word
-                    translations.append(new_tr_data)
-            elif isinstance(node, TemplateNode) and len(translations) > 0:
-                raw_tag = clean_node(wxr, None, node)
-                if len(raw_tag) > 0:
-                    translations[-1].raw_tags.append(raw_tag)
-                    translate_raw_tags(translations[-1])
+        process_translation_list_item(wxr, list_item, translations)
     for data in page_data:
-        if data.lang_code == base_data.lang_code:
-            data.translations = translations
-    base_data.translations = translations
+        if data.lang_code == lang_code:
+            for sense in data.senses:
+                if sense.sense_index in translations:
+                    data.translations.extend(translations[sense.sense_index])
+                    del translations[sense.sense_index]
+            data.translations.extend(translations.get("", []))
+
+    if "" in translations:
+        del translations[""]
+    for data in page_data:
+        if data.lang_code == lang_code:
+            for translation_list in translations.values():
+                data.translations.extend(translation_list)
+            break
+
+
+def process_translation_list_item(
+    wxr: WiktextractContext,
+    list_item: WikiNode,
+    translations: dict[str, list[Translation]],
+) -> None:
+    lang_name = ""
+    lang_code = ""
+    sense_index = ""
+    last_tr_data = None
+    for index, node in enumerate(list_item.children):
+        if isinstance(node, str):
+            if index == 0 and ":" in node:
+                lang_name = node[: node.index(":")].strip()
+                lang_code = name_to_code(lang_name, "pl")
+                if lang_code == "":
+                    lang_code = "unknown"
+            m_index = re.search(r"\(\d+\.\d+\)", node)
+            if m_index is not None:
+                sense_index = m_index.group(0).strip("()")
+            m_roman = re.search(r"\([^()]+\)", node)
+            if (
+                m_roman is not None
+                and last_tr_data is not None
+                and (m_index is None or m_index.start() != m_roman.start())
+            ):
+                last_tr_data.roman = m_roman.group(0).strip("()")
+        elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
+            word = clean_node(wxr, None, node)
+            if len(word) > 0:
+                new_tr_data = Translation(
+                    word=word,
+                    sense_index=sense_index,
+                    lang=lang_name,
+                    lang_code=lang_code,
+                )
+                translations[sense_index].append(new_tr_data)
+                last_tr_data = new_tr_data
+        elif isinstance(node, TemplateNode) and last_tr_data is not None:
+            raw_tag = clean_node(wxr, None, node)
+            if len(raw_tag) > 0:
+                last_tr_data.raw_tags.append(raw_tag)
+                translate_raw_tags(last_tr_data)
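
Reviewer note: both patches rely on the same matching strategy. Extracted items are bucketed by the Polish Wiktionary sense index (the "(1.1)"-style markers), each bucket is attached to the WordEntry whose sense declares that index, unindexed items go to every entry of the language, and leftover buckets fall back to the first entry of the language. Below is a minimal, self-contained Python sketch of that strategy in isolation; Entry, Item, and distribute are hypothetical stand-ins invented for illustration (they are not names from the patches), and wiktextract's real models and parser are deliberately left out.

    from collections import defaultdict
    from dataclasses import dataclass, field


    @dataclass
    class Item:  # hypothetical stand-in for Linkage/Translation
        word: str
        sense_index: str = ""


    @dataclass
    class Entry:  # hypothetical stand-in for WordEntry
        lang_code: str
        sense_indices: list[str]
        items: list[Item] = field(default_factory=list)


    def distribute(entries: list[Entry], extracted: list[Item], lang_code: str) -> None:
        # Bucket every extracted item under its "(n.m)" sense index;
        # items without an index land in the "" bucket.
        buckets: dict[str, list[Item]] = defaultdict(list)
        for item in extracted:
            buckets[item.sense_index].append(item)
        # Pass 1: move each bucket onto the entry that declares that sense,
        # and give every matching-language entry the unindexed items.
        for entry in entries:
            if entry.lang_code == lang_code:
                for idx in entry.sense_indices:
                    if idx in buckets:
                        entry.items.extend(buckets.pop(idx))
                entry.items.extend(buckets.get("", []))
        buckets.pop("", None)
        # Pass 2: leftover buckets (indices no sense declared) go to the
        # first entry of the language, then stop.
        for entry in entries:
            if entry.lang_code == lang_code:
                for leftover in buckets.values():
                    entry.items.extend(leftover)
                break


    entries = [Entry("pl", ["1.1"]), Entry("pl", ["2.1"])]
    distribute(entries, [Item("pała", "2.1"), Item("czworonożny przyjaciel", "1.1")], "pl")
    assert [i.word for i in entries[0].items] == ["czworonożny przyjaciel"]
    assert [i.word for i in entries[1].items] == ["pała"]

The trailing break in the second pass mirrors the patches' decision to attach unmatched indices to only the first matching entry rather than duplicating them across every entry of that language.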