Skip to content

Commit

Permalink
Merge pull request #1012 from xxyzz/ku
Browse files Browse the repository at this point in the history
[ku] extract sound section
  • Loading branch information
xxyzz authored Jan 29, 2025
2 parents 70a4daf + 240b11a commit 7c73d1b
Show file tree
Hide file tree
Showing 6 changed files with 240 additions and 6 deletions.
15 changes: 15 additions & 0 deletions src/wiktextract/extractor/ku/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,19 @@ class Linkage(KurdishBaseModel):
sense: str = ""


class Sound(KurdishBaseModel):
    # One pronunciation record attached to a word entry.
    # IPA transcription text; empty when the record only carries audio.
    ipa: str = ""
    audio: str = Field(default="", description="Audio file name")
    # Per-format audio URLs. Presumably populated by
    # `set_sound_file_url_fields` in the sound extractor -- TODO confirm
    # which of these that helper actually fills.
    wav_url: str = ""
    oga_url: str = ""
    ogg_url: str = ""
    mp3_url: str = ""
    opus_url: str = ""
    flac_url: str = ""
    # `raw_tags` receives label text taken verbatim from the wiki template;
    # `tags` is presumably reserved for normalised labels -- TODO confirm.
    tags: list[str] = []
    raw_tags: list[str] = []


class WordEntry(KurdishBaseModel):
model_config = ConfigDict(title="Kurdish Wiktionary")
word: str = Field(description="Word string")
Expand All @@ -88,3 +101,5 @@ class WordEntry(KurdishBaseModel):
hyponyms: list[Linkage] = []
anagrams: list[Linkage] = []
rhymes: list[Linkage] = []
sounds: list[Sound] = []
hyphenation: str = ""
10 changes: 8 additions & 2 deletions src/wiktextract/extractor/ku/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from .models import Sense, WordEntry
from .pos import extract_pos_section
from .section_titles import LINKAGE_SECTIONS, POS_DATA
from .sound import extract_sound_section
from .translation import extract_translation_section, is_translation_page


Expand All @@ -28,9 +29,12 @@ def parse_section(
extract_etymology_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
)
elif title_text == "Werger":
elif title_text in ["Werger", "Bi zaravayên din"]:
extract_translation_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
wxr,
page_data[-1] if len(page_data) > 0 else base_data,
level_node,
tags=["dialectal"] if title_text == "Bi zaravayên din" else [],
)
elif title_text in ["Bi alfabeyên din", "Herwiha", "Bide ber"]:
extract_linkage_section(
Expand All @@ -46,6 +50,8 @@ def parse_section(
level_node,
LINKAGE_SECTIONS[title_text],
)
elif title_text == "Bilêvkirin":
extract_sound_section(wxr, base_data, level_node)

for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
parse_section(wxr, page_data, base_data, next_level)
Expand Down
76 changes: 76 additions & 0 deletions src/wiktextract/extractor/ku/sound.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
from wikitextprocessor import LevelNode, NodeKind, TemplateNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ..share import set_sound_file_url_fields
from .models import Sound, WordEntry


def extract_sound_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Extract pronunciation data from a sound section into `word_entry`.

    Visits every template that is a direct child of a list item under
    `level_node` and forwards the recognised ones (`ku-IPA`, `deng`,
    `ku-kîte`) to their extractor.  Templates with any other name are
    ignored.
    """
    # Template name -> extractor.  Built inside the function so the
    # extractors, which are defined further down, are looked up at call time.
    handlers = {
        "ku-IPA": extract_ku_ipa_template,
        "deng": extract_deng_template,
        "ku-kîte": extract_ku_kîte,
    }
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            for t_node in list_item.find_child(NodeKind.TEMPLATE):
                handler = handlers.get(t_node.template_name)
                if handler is not None:
                    handler(wxr, word_entry, t_node)


def extract_ku_ipa_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Add one `Sound` per non-empty IPA span produced by a `ku-IPA` template.

    The template is expanded and re-parsed, every `<span class="IPA">` in
    the result is cleaned to plain text, and each non-empty transcription
    is appended to `word_entry.sounds`.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(template_wikitext, expand_all=True)
    ipa_spans = expanded.find_html(
        "span", attr_name="class", attr_value="IPA"
    )
    for ipa_span in ipa_spans:
        sound = Sound(ipa=clean_node(wxr, None, ipa_span))
        if sound.ipa == "":
            continue
        word_entry.sounds.append(sound)
    # Second pass over the whole expansion with the entry as target; the
    # returned text is discarded.  Presumably this is what records the
    # template's category links on the entry -- confirm against clean_node.
    clean_node(wxr, word_entry, expanded)


def extract_deng_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Build a `Sound` from a `deng` (audio) template.

    Reads the `ipa` argument, a label from positional argument 4 (falling
    back to the named `dever` argument), and the file name from positional
    argument 2.  The sound is appended to `word_entry.sounds` only when a
    file name is present.
    """
    params = t_node.template_parameters
    sound = Sound(ipa=clean_node(wxr, None, params.get("ipa", "")))

    # Positional argument 4 wins over `dever` when both are given.
    label_arg = params.get(4, params.get("dever", ""))
    raw_tag = clean_node(wxr, None, label_arg)
    if raw_tag != "":
        sound.raw_tags.append(raw_tag)

    filename = clean_node(wxr, None, params.get(2, ""))
    if filename != "":
        set_sound_file_url_fields(wxr, filename, sound)
        word_entry.sounds.append(sound)

    # Clean the template itself with the entry as target; the returned text
    # is discarded.  Presumably this records the template's category links
    # on the entry -- confirm against clean_node.
    clean_node(wxr, word_entry, t_node)


def extract_ku_kîte(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Store the hyphenation produced by a `ku-kîte` template.

    The template is expanded and re-parsed.  In the first top-level string
    child that contains a colon, everything after that colon -- together
    with all following sibling nodes -- is cleaned to plain text and, if
    non-empty, written to `word_entry.hyphenation`.  Only that first
    colon-bearing string is considered.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    children = expanded.children
    for position, child in enumerate(children):
        if not isinstance(child, str) or ":" not in child:
            continue
        # Text after the first colon, plus the nodes that follow it.
        after_colon = child.partition(":")[2]
        remainder = [after_colon] + children[position + 1 :]
        hyphenation = clean_node(wxr, None, remainder).strip()
        if hyphenation != "":
            word_entry.hyphenation = hyphenation
        break
35 changes: 31 additions & 4 deletions src/wiktextract/extractor/ku/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from mediawiki_langcodes import name_to_code
from wikitextprocessor.parser import (
LEVEL_KIND_FLAGS,
LevelNode,
NodeKind,
TemplateNode,
WikiNode,
Expand All @@ -20,8 +21,9 @@ def is_translation_page(title: str) -> bool:
def extract_translation_section(
wxr: WiktextractContext,
word_entry: WordEntry,
level_node: WikiNode,
level_node: LevelNode,
source: str = "",
tags: list[str] = [],
) -> None:
sense = ""
sense_index = 0
Expand All @@ -41,7 +43,13 @@ def extract_translation_section(
elif node.kind == NodeKind.LIST:
for list_item in node.find_child(NodeKind.LIST_ITEM):
extract_translation_list_item(
wxr, word_entry, list_item, sense, sense_index, source
wxr,
word_entry,
list_item,
sense,
sense_index,
source,
tags=tags,
)
elif node.kind in (NodeKind.ITALIC | NodeKind.BOLD):
for link_node in node.find_child(NodeKind.LINK):
Expand All @@ -66,8 +74,10 @@ def extract_translation_list_item(
sense: str,
sense_index: int,
source: str,
tags: list[str] = [],
) -> None:
lang_name = "unknown"
lang_code = "unknown"
before_colon = True
for index, node in enumerate(list_item.children):
if isinstance(node, str) and ":" in node and lang_name == "unknown":
Expand All @@ -77,6 +87,10 @@ def extract_translation_list_item(
list_item.children[:index] + [node[: node.index(":")]],
)
before_colon = False
elif isinstance(node, TemplateNode) and node.template_name == "Z":
lang_code = clean_node(
wxr, None, node.template_parameters.get(1, "")
)
elif isinstance(node, TemplateNode) and node.template_name in [
"W",
"W+",
Expand All @@ -88,20 +102,31 @@ def extract_translation_list_item(
elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
for child_list_item in node.find_child(NodeKind.LIST_ITEM):
extract_translation_list_item(
wxr, word_entry, child_list_item, sense, sense_index, source
wxr,
word_entry,
child_list_item,
sense,
sense_index,
source,
tags=tags,
)
elif (
isinstance(node, WikiNode)
and node.kind == NodeKind.LINK
and not before_colon
):
if lang_code in ["", "unknown"]:
new_code = name_to_code(lang_name, "ku")
if new_code != "":
lang_code = new_code
tr_data = Translation(
word=clean_node(wxr, None, node),
lang=lang_name,
lang_code=name_to_code(lang_name, "ku") or "unknown",
lang_code=lang_code,
sense=sense,
sense_index=sense_index,
source=source,
tags=tags,
)
if tr_data.word != "":
word_entry.translations.append(tr_data)
Expand All @@ -115,6 +140,7 @@ def extract_w_template(
sense_index: int,
lang_name: str,
source: str,
tags: list[str] = [],
) -> None:
# https://ku.wiktionary.org/wiki/Şablon:W
tr_data = Translation(
Expand All @@ -130,6 +156,7 @@ def extract_w_template(
),
),
source=source,
tags=tags,
)
tag_args = {
"n": "masculine",
Expand Down
86 changes: 86 additions & 0 deletions tests/test_ku_sound.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
from unittest import TestCase

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.ku.page import parse_page
from wiktextract.wxr_context import WiktextractContext


class TestKuSound(TestCase):
    # End-to-end checks for the Kurdish sound-section extractor: each test
    # registers stub templates, parses a small page and inspects the result.
    maxDiff = None

    def setUp(self) -> None:
        wtp = Wtp(lang_code="ku")
        config = WiktionaryConfig(
            dump_file_lang_code="ku", capture_language_codes=None
        )
        self.wxr = WiktextractContext(wtp, config)

    def tearDown(self):
        self.wxr.wtp.close_db_conn()

    def test_ku_ipa(self):
        # `ku-IPA` expands to an IPA span plus a category link.
        self.wxr.wtp.add_page("Şablon:ziman", 10, "Kurmancî")
        ipa_template = """[[IPA]]<sup>([[Wîkîferheng:IPA kurdî|kilîd]])</sup>: <span class="IPA">/ɑːv/</span>[[Kategorî:Bilêvkirina IPAyê bi kurmancî]]"""
        self.wxr.wtp.add_page("Şablon:ku-IPA", 10, ipa_template)
        wikitext = """== {{ziman|ku}} ==
=== Bilêvkirin ===
* {{ku-IPA}}
=== Navdêr ===
# [[vexwarin|Vexwarin]]a bê[[reng]]"""
        page_data = parse_page(self.wxr, "av", wikitext)
        entry = page_data[0]
        self.assertEqual(
            entry["categories"], ["Bilêvkirina IPAyê bi kurmancî"]
        )
        self.assertEqual(entry["sounds"], [{"ipa": "/ɑːv/"}])

    def test_deng(self):
        # `deng` carries the audio file name as its second positional
        # argument.
        self.wxr.wtp.add_page("Şablon:ziman", 10, "Kurmancî")
        deng_template = """<phonos lang="ku" text="" wikibase="" file="LL-Q36368 (kur)-Dildadil-av.wav">Deng&nbsp;(Amed)</phonos></span>[[Kategorî:Dengên &nbsp;kurmancî ji Amedê]][[Kategorî:Deng&nbsp;bi kurmancî]]"""
        self.wxr.wtp.add_page("Şablon:deng", 10, deng_template)
        wikitext = """== {{ziman|ku}} ==
=== Bilêvkirin ===
* {{deng|ku|LL-Q36368 (kur)-Dildadil-av.wav|Deng|Amed}}
=== Navdêr ===
# [[vexwarin|Vexwarin]]a bê[[reng]]"""
        page_data = parse_page(self.wxr, "av", wikitext)
        entry = page_data[0]
        self.assertEqual(
            entry["categories"],
            ["Dengên kurmancî ji Amedê", "Deng bi kurmancî"],
        )
        self.assertEqual(
            entry["sounds"][0]["audio"],
            "LL-Q36368 (kur)-Dildadil-av.wav",
        )

    def test_ku_kîte(self):
        # The hyphenation is the text that follows the colon in the
        # expanded template.
        self.wxr.wtp.add_page("Şablon:ziman", 10, "Kurmancî")
        kite_template = """[[kîte#ku|Kîtekirin]]: lê·ker"""
        self.wxr.wtp.add_page("Şablon:ku-kîte", 10, kite_template)
        wikitext = """== {{ziman|ku}} ==
=== Bilêvkirin ===
* {{ku-kîte}}
=== Navdêr 1 ===
# [[peyv|Peyvên]]"""
        page_data = parse_page(self.wxr, "lêker", wikitext)
        self.assertEqual(page_data[0]["hyphenation"], "lê·ker")
24 changes: 24 additions & 0 deletions tests/test_ku_translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,27 @@ def test_link(self):
page_data[0]["translations"],
[{"word": "âcua", "lang": "bolognezî", "lang_code": "unknown"}],
)

def test_dialects(self):
    # A "Bi zaravayên din" section is parsed like a translation section;
    # the entry is expected to carry the "dialectal" tag and the language
    # code given to the `Z` template.
    self.wxr.wtp.add_page("Şablon:ziman", 10, "Kurmancî")
    self.wxr.wtp.add_page("Şablon:Z", 10, "Hewramî")
    wikitext = """== {{ziman|ku}} ==
=== Navdêr ===
# [[vexwarin|Vexwarin]]a bê[[reng]]
==== Bi zaravayên din ====
* {{Z|hac}}: [[awî]]"""
    page_data = parse_page(self.wxr, "av", wikitext)
    expected_translation = {
        "word": "awî",
        "lang": "Hewramî",
        "lang_code": "hac",
        "tags": ["dialectal"],
    }
    self.assertEqual(page_data[0]["translations"], [expected_translation])

0 comments on commit 7c73d1b

Please sign in to comment.