Merge pull request #744 from xxyzz/pl
[pl] extract linkage sections and fix translation code
xxyzz authored Jul 29, 2024
2 parents 5029a44 + 4bf303c commit cd79404
Showing 6 changed files with 236 additions and 39 deletions.
4 changes: 2 additions & 2 deletions src/wiktextract/data/overrides/pl.json
@@ -20,7 +20,7 @@
         "need_pre_expand": true
     },
     "Szablon:frazeologia": {
-        "body": "===frazeologia===\n",
+        "body": "===związki frazeologiczne===\n",
         "namespace_id": 10,
         "need_pre_expand": true
     },
@@ -95,7 +95,7 @@
         "need_pre_expand": true
     },
     "Szablon:pokrewne": {
-        "body": "===pokrewne===\n",
+        "body": "===wyrazy pokrewne===\n",
         "namespace_id": 10,
         "need_pre_expand": true
     },
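These overrides change the bodies of the Polish heading templates so that the pre-expanded section titles match the headings the new linkage code looks for. A minimal sketch of that correspondence, assuming the override bodies above (the strip call is only illustrative; real expansion is done by wikitextprocessor):

# Hypothetical check: the override body yields a linkage section title.
body = "===związki frazeologiczne===\n"
title = body.strip("=\n ")
assert title == "związki frazeologiczne"  # a key in linkage.LINKAGE_TYPES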
85 changes: 85 additions & 0 deletions src/wiktextract/extractor/pl/linkage.py
@@ -0,0 +1,85 @@
import re
from collections import defaultdict

from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Linkage, WordEntry
from .tags import translate_raw_tags

LINKAGE_TYPES = {
    "antonimy": "antonyms",
    "hiperonimy": "hypernyms",
    "hiponimy": "hyponyms",
    "holonimy": "holonyms",
    "meronimy": "meronyms",
    "synonimy": "synonyms",
    "wyrazy pokrewne": "related",
    "związki frazeologiczne": "proverbs",
}


def extract_linkage_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
    linkage_type: str,
    lang_code: str,
) -> None:
    linkages = defaultdict(list)
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        process_linkage_list_item(wxr, list_item, linkages)

    for data in page_data:
        if data.lang_code == lang_code:
            for sense in data.senses:
                if sense.sense_index in linkages:
                    getattr(data, linkage_type).extend(
                        linkages[sense.sense_index]
                    )
                    del linkages[sense.sense_index]
            getattr(data, linkage_type).extend(linkages.get("", []))

    if "" in linkages:
        del linkages[""]
    for data in page_data:
        if data.lang_code == lang_code:
            for linkage_list in linkages.values():
                getattr(data, linkage_type).extend(linkage_list)
            break


def process_linkage_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    linkages: dict[str, list[Linkage]],
) -> None:
    raw_tags = []
    sense_index = ""
    last_linkage = None
    for node in list_item.children:
        if isinstance(node, str):
            m = re.search(r"\(\d+\.\d+\)", node)
            if m is not None:
                sense_index = m.group(0).strip("()")
            if ";" in node or "•" in node:
                raw_tags.clear()
                last_linkage = None
        elif isinstance(node, TemplateNode):
            raw_tag = clean_node(wxr, None, node)
            if raw_tag.endswith("."):
                if last_linkage is None:
                    raw_tags.append(raw_tag)
                else:
                    last_linkage.raw_tags.append(raw_tag)
                    translate_raw_tags(last_linkage)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            linkage = Linkage(
                word=clean_node(wxr, None, node),
                sense_index=sense_index,
                raw_tags=raw_tags,
            )
            translate_raw_tags(linkage)
            linkages[sense_index].append(linkage)
            last_linkage = linkage
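In Polish Wiktionary a linkage list item carries the sense index it belongs to, e.g. `: (1.1) [[czworonożny przyjaciel]]`, and extract_linkage_section buckets entries by that index before matching them against each sense's sense_index. A self-contained sketch of the bucketing convention (sample strings invented for illustration):

import re
from collections import defaultdict

samples = ["(1.1) czworonożny przyjaciel", "(2.1) pała", "bez indeksu"]
buckets = defaultdict(list)
for text in samples:
    m = re.search(r"\(\d+\.\d+\)", text)
    # Items without an index land in the "" bucket; extract_linkage_section
    # above appends that bucket to every entry of the matching language.
    index = m.group(0).strip("()") if m is not None else ""
    buckets[index].append(text)

assert sorted(buckets) == ["", "1.1", "2.1"]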
15 changes: 15 additions & 0 deletions src/wiktextract/extractor/pl/models.py
@@ -56,6 +56,13 @@ class Sound(PolishBaseModel):
     raw_tags: list[str] = []
 
 
+class Linkage(PolishBaseModel):
+    word: str
+    tags: list[str] = []
+    raw_tags: list[str] = []
+    sense_index: str = ""
+
+
 class WordEntry(PolishBaseModel):
     model_config = ConfigDict(title="Polish Wiktionary")
 
@@ -73,3 +80,11 @@ class WordEntry(PolishBaseModel):
     etymology_texts: list[str] = []
     translations: list[Translation] = []
     sounds: list[Sound] = []
+    antonyms: list[Linkage] = []
+    hypernyms: list[Linkage] = []
+    hyponyms: list[Linkage] = []
+    holonyms: list[Linkage] = []
+    meronyms: list[Linkage] = []
+    related: list[Linkage] = []
+    proverbs: list[Linkage] = []
+    synonyms: list[Linkage] = []
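Because PolishBaseModel is a pydantic model (note the ConfigDict above), every new linkage field defaults to an empty list and drops out of a default-excluding dump. A hedged usage sketch (assuming the fields shown above; model_dump is the pydantic v2 API):

from wiktextract.extractor.pl.models import Linkage, WordEntry

entry = WordEntry(word="pies", lang="język polski", lang_code="pl", pos="noun")
entry.synonyms.append(Linkage(word="pała", sense_index="2.1"))
# Only populated fields survive, so the seven untouched linkage lists vanish.
print(entry.model_dump(exclude_defaults=True))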
13 changes: 12 additions & 1 deletion src/wiktextract/extractor/pl/page.py
@@ -11,6 +11,7 @@
 from ...wxr_context import WiktextractContext
 from .etymology import extract_etymology_section
 from .example import extract_example_section
+from .linkage import LINKAGE_TYPES, extract_linkage_section
 from .models import Sense, WordEntry
 from .pos import extract_pos_section
 from .sound import extract_sound_section
@@ -40,7 +41,17 @@ def parse_section(
     elif title_text == "etymologia" and wxr.config.capture_etymologies:
         extract_etymology_section(wxr, page_data, base_data, level_node)
     elif title_text == "tłumaczenia" and wxr.config.capture_translations:
-        extract_translation_section(wxr, page_data, base_data, level_node)
+        extract_translation_section(
+            wxr, page_data, level_node, base_data.lang_code
+        )
+    elif title_text in LINKAGE_TYPES and wxr.config.capture_inflections:
+        extract_linkage_section(
+            wxr,
+            page_data,
+            level_node,
+            LINKAGE_TYPES[title_text],
+            base_data.lang_code,
+        )
 
 
 def parse_page(
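parse_section now routes any heading listed in LINKAGE_TYPES to the linkage extractor; the dict value names the WordEntry list to extend. A small illustration of the lookup (keys and values exactly as defined in linkage.py above):

from wiktextract.extractor.pl.linkage import LINKAGE_TYPES

# Polish section title -> English WordEntry field name.
assert LINKAGE_TYPES["synonimy"] == "synonyms"
assert LINKAGE_TYPES["związki frazeologiczne"] == "proverbs"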
98 changes: 62 additions & 36 deletions src/wiktextract/extractor/pl/translation.py
@@ -1,4 +1,5 @@
 import re
+from collections import defaultdict
 
 from mediawiki_langcodes import name_to_code
 from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode
@@ -12,44 +13,69 @@
 def extract_translation_section(
     wxr: WiktextractContext,
     page_data: list[WordEntry],
-    base_data: WordEntry,
     level_node: WikiNode,
+    lang_code: str,
 ) -> None:
-    translations = []
+    translations = defaultdict(list)
     for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
-        base_tr_data = Translation()
-        for index, node in enumerate(list_item.children):
-            if isinstance(node, str):
-                if index == 0 and ":" in node:
-                    lang_name = node[: node.index(":")].strip()
-                    base_tr_data.lang = lang_name
-                    lang_code = name_to_code(lang_name, "pl")
-                    if lang_code == "":
-                        lang_code = "unknown"
-                    base_tr_data.lang_code = lang_code
-                m_index = re.search(r"\(\d+\.\d+\)", node)
-                if m_index is not None:
-                    base_tr_data.sense_index = m_index.group(0).strip("()")
-                m_roman = re.search(r"\([^()]+\)", node)
-                if (
-                    m_roman is not None
-                    and len(translations) > 0
-                    and (m_index is None or m_index.start() != m_roman.start())
-                ):
-                    translations[-1].roman = m_roman.group(0).strip("()")
-            elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
-                word = clean_node(wxr, None, node)
-                if len(word) > 0:
-                    new_tr_data = base_tr_data.model_copy(deep=True)
-                    new_tr_data.word = word
-                    translations.append(new_tr_data)
-            elif isinstance(node, TemplateNode) and len(translations) > 0:
-                raw_tag = clean_node(wxr, None, node)
-                if len(raw_tag) > 0:
-                    translations[-1].raw_tags.append(raw_tag)
-                    translate_raw_tags(translations[-1])
+        process_translation_list_item(wxr, list_item, translations)
 
     for data in page_data:
-        if data.lang_code == base_data.lang_code:
-            data.translations = translations
-    base_data.translations = translations
+        if data.lang_code == lang_code:
+            for sense in data.senses:
+                if sense.sense_index in translations:
+                    data.translations.extend(translations[sense.sense_index])
+                    del translations[sense.sense_index]
+            data.translations.extend(translations.get("", []))
+
+    if "" in translations:
+        del translations[""]
+    for data in page_data:
+        if data.lang_code == lang_code:
+            for translation_list in translations.values():
+                data.translations.extend(translation_list)
+            break
+
+
+def process_translation_list_item(
+    wxr: WiktextractContext,
+    list_item: WikiNode,
+    translations: dict[str, list[Translation]],
+) -> None:
+    lang_name = ""
+    lang_code = ""
+    sense_index = ""
+    last_tr_data = None
+    for index, node in enumerate(list_item.children):
+        if isinstance(node, str):
+            if index == 0 and ":" in node:
+                lang_name = node[: node.index(":")].strip()
+                lang_code = name_to_code(lang_name, "pl")
+                if lang_code == "":
+                    lang_code = "unknown"
+            m_index = re.search(r"\(\d+\.\d+\)", node)
+            if m_index is not None:
+                sense_index = m_index.group(0).strip("()")
+            m_roman = re.search(r"\([^()]+\)", node)
+            if (
+                m_roman is not None
+                and last_tr_data is not None
+                and (m_index is None or m_index.start() != m_roman.start())
+            ):
+                last_tr_data.roman = m_roman.group(0).strip("()")
+        elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
+            word = clean_node(wxr, None, node)
+            if len(word) > 0:
+                new_tr_data = Translation(
+                    word=word,
+                    sense_index=sense_index,
+                    lang=lang_name,
+                    lang_code=lang_code,
+                )
+                translations[sense_index].append(new_tr_data)
+                last_tr_data = new_tr_data
+        elif isinstance(node, TemplateNode) and last_tr_data is not None:
+            raw_tag = clean_node(wxr, None, node)
+            if len(raw_tag) > 0:
+                last_tr_data.raw_tags.append(raw_tag)
+                translate_raw_tags(last_tr_data)
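A translation list item begins with the language name and a colon, e.g. `angielski: (1.1) [[dog]]`, and name_to_code resolves that Polish-language name to a MediaWiki code, falling back to "unknown". A sketch of just the resolution step (the sample line is invented; mediawiki_langcodes is expected to map "angielski" to "en"):

from mediawiki_langcodes import name_to_code

node = "angielski: (1.1) "
lang_name = node[: node.index(":")].strip()
lang_code = name_to_code(lang_name, "pl")
if lang_code == "":
    lang_code = "unknown"
# Expected output: angielski en
print(lang_name, lang_code)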
60 changes: 60 additions & 0 deletions tests/test_pl_linkage.py
@@ -0,0 +1,60 @@
from unittest import TestCase

from wikitextprocessor import Wtp
from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.pl.linkage import extract_linkage_section
from wiktextract.extractor.pl.models import Linkage, Sense, WordEntry
from wiktextract.wxr_context import WiktextractContext


class TestPlLinkage(TestCase):
    maxDiff = None

    def setUp(self) -> None:
        self.wxr = WiktextractContext(
            Wtp(lang_code="pl"),
            WiktionaryConfig(
                dump_file_lang_code="pl",
                capture_language_codes=None,
            ),
        )

    def tearDown(self) -> None:
        self.wxr.wtp.close_db_conn()

    def test_pies(self):
        self.wxr.wtp.start_page("pies")
        self.wxr.wtp.add_page("Szablon:neutr", 10, "neutr.")

        root = self.wxr.wtp.parse(""": (1.1) [[czworonożny przyjaciel]]
: (2.1) [[pała]]; {{neutr}} [[policjant]]""")
        page_data = [
            WordEntry(
                word="pies",
                lang="język polski",
                lang_code="pl",
                pos="noun",
                senses=[Sense(sense_index="1.1")],
            ),
            WordEntry(
                word="pies",
                lang="język polski",
                lang_code="pl",
                pos="noun",
                senses=[Sense(sense_index="2.1")],
            ),
        ]
        extract_linkage_section(self.wxr, page_data, root, "synonyms", "pl")
        self.assertEqual(
            page_data[0].synonyms,
            [Linkage(word="czworonożny przyjaciel", sense_index="1.1")],
        )
        self.assertEqual(
            page_data[1].synonyms,
            [
                Linkage(word="pała", sense_index="2.1"),
                Linkage(
                    word="policjant", raw_tags=["neutr."], sense_index="2.1"
                ),
            ],
        )
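A hedged note on running the new test in isolation: assuming the repository's usual unittest layout, `python -m unittest tests.test_pl_linkage` should exercise just this case.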
