diff --git a/src/wiktextract/extractor/ku/models.py b/src/wiktextract/extractor/ku/models.py
index 07f1d0fc0..f6316b194 100644
--- a/src/wiktextract/extractor/ku/models.py
+++ b/src/wiktextract/extractor/ku/models.py
@@ -66,6 +66,19 @@ class Linkage(KurdishBaseModel):
     sense: str = ""
 
 
+class Sound(KurdishBaseModel):
+    ipa: str = ""
+    audio: str = Field(default="", description="Audio file name")
+    wav_url: str = ""
+    oga_url: str = ""
+    ogg_url: str = ""
+    mp3_url: str = ""
+    opus_url: str = ""
+    flac_url: str = ""
+    tags: list[str] = []
+    raw_tags: list[str] = []
+
+
 class WordEntry(KurdishBaseModel):
     model_config = ConfigDict(title="Kurdish Wiktionary")
     word: str = Field(description="Word string")
@@ -88,3 +101,5 @@ class WordEntry(KurdishBaseModel):
     hyponyms: list[Linkage] = []
     anagrams: list[Linkage] = []
     rhymes: list[Linkage] = []
+    sounds: list[Sound] = []
+    hyphenation: str = ""
diff --git a/src/wiktextract/extractor/ku/page.py b/src/wiktextract/extractor/ku/page.py
index cb92bee32..0964e6aaf 100644
--- a/src/wiktextract/extractor/ku/page.py
+++ b/src/wiktextract/extractor/ku/page.py
@@ -10,6 +10,7 @@
 from .models import Sense, WordEntry
 from .pos import extract_pos_section
 from .section_titles import LINKAGE_SECTIONS, POS_DATA
+from .sound import extract_sound_section
 from .translation import extract_translation_section, is_translation_page
 
 
@@ -28,9 +29,12 @@ def parse_section(
         extract_etymology_section(
             wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
         )
-    elif title_text == "Werger":
+    elif title_text in ["Werger", "Bi zaravayên din"]:
         extract_translation_section(
-            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
+            wxr,
+            page_data[-1] if len(page_data) > 0 else base_data,
+            level_node,
+            tags=["dialectal"] if title_text == "Bi zaravayên din" else [],
         )
     elif title_text in ["Bi alfabeyên din", "Herwiha", "Bide ber"]:
         extract_linkage_section(
@@ -46,6 +50,8 @@
             level_node,
             LINKAGE_SECTIONS[title_text],
         )
+    elif title_text == "Bilêvkirin":
+        extract_sound_section(wxr, base_data, level_node)
 
     for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
         parse_section(wxr, page_data, base_data, next_level)
diff --git a/src/wiktextract/extractor/ku/sound.py b/src/wiktextract/extractor/ku/sound.py
new file mode 100644
index 000000000..ac00f6d82
--- /dev/null
+++ b/src/wiktextract/extractor/ku/sound.py
@@ -0,0 +1,76 @@
+from wikitextprocessor import LevelNode, NodeKind, TemplateNode
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from ..share import set_sound_file_url_fields
+from .models import Sound, WordEntry
+
+
+def extract_sound_section(
+    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
+) -> None:
+    for list_node in level_node.find_child(NodeKind.LIST):
+        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+            for t_node in list_item.find_child(NodeKind.TEMPLATE):
+                if t_node.template_name == "ku-IPA":
+                    extract_ku_ipa_template(wxr, word_entry, t_node)
+                elif t_node.template_name == "deng":
+                    extract_deng_template(wxr, word_entry, t_node)
+                elif t_node.template_name == "ku-kîte":
+                    extract_ku_kîte(wxr, word_entry, t_node)
+
+
+def extract_ku_ipa_template(
+    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
+) -> None:
+    expanded_node = wxr.wtp.parse(
+        wxr.wtp.node_to_wikitext(t_node), expand_all=True
+    )
+    for span_tag in expanded_node.find_html(
+        "span", attr_name="class", attr_value="IPA"
+    ):
+        sound = Sound(ipa=clean_node(wxr, None, span_tag))
+        if sound.ipa != "":
+            word_entry.sounds.append(sound)
+    clean_node(wxr, word_entry, expanded_node)
+
+
+def extract_deng_template(
+    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
+) -> None:
+    sound = Sound(
+        ipa=clean_node(wxr, None, t_node.template_parameters.get("ipa", ""))
+    )
+    raw_tag = clean_node(
+        wxr,
+        None,
+        t_node.template_parameters.get(
+            4, t_node.template_parameters.get("dever", "")
+        ),
+    )
+    if raw_tag != "":
+        sound.raw_tags.append(raw_tag)
+    filename = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
+    if filename != "":
+        set_sound_file_url_fields(wxr, filename, sound)
+    word_entry.sounds.append(sound)
+    clean_node(wxr, word_entry, t_node)
+
+
+def extract_ku_kîte(
+    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
+) -> None:
+    expanded_node = wxr.wtp.parse(
+        wxr.wtp.node_to_wikitext(t_node), expand_all=True
+    )
+    for index, node in enumerate(expanded_node.children):
+        if isinstance(node, str) and ":" in node:
+            hyphenation = clean_node(
+                wxr,
+                None,
+                [node[node.index(":") + 1 :]]
+                + expanded_node.children[index + 1 :],
+            ).strip()
+            if hyphenation != "":
+                word_entry.hyphenation = hyphenation
+            break
diff --git a/src/wiktextract/extractor/ku/translation.py b/src/wiktextract/extractor/ku/translation.py
index b03324d4d..2f4fdf287 100644
--- a/src/wiktextract/extractor/ku/translation.py
+++ b/src/wiktextract/extractor/ku/translation.py
@@ -3,6 +3,7 @@ from mediawiki_langcodes import name_to_code
 from wikitextprocessor.parser import (
     LEVEL_KIND_FLAGS,
+    LevelNode,
     NodeKind,
     TemplateNode,
     WikiNode,
 )
@@ -20,8 +21,9 @@ def is_translation_page(title: str) -> bool:
 def extract_translation_section(
     wxr: WiktextractContext,
     word_entry: WordEntry,
-    level_node: WikiNode,
+    level_node: LevelNode,
     source: str = "",
+    tags: list[str] = [],
 ) -> None:
     sense = ""
     sense_index = 0
@@ -41,7 +43,13 @@
         elif node.kind == NodeKind.LIST:
             for list_item in node.find_child(NodeKind.LIST_ITEM):
                 extract_translation_list_item(
-                    wxr, word_entry, list_item, sense, sense_index, source
+                    wxr,
+                    word_entry,
+                    list_item,
+                    sense,
+                    sense_index,
+                    source,
+                    tags=tags,
                 )
         elif node.kind in (NodeKind.ITALIC | NodeKind.BOLD):
             for link_node in node.find_child(NodeKind.LINK):
@@ -66,8 +74,10 @@ def extract_translation_list_item(
     sense: str,
     sense_index: int,
     source: str,
+    tags: list[str] = [],
 ) -> None:
     lang_name = "unknown"
+    lang_code = "unknown"
     before_colon = True
     for index, node in enumerate(list_item.children):
         if isinstance(node, str) and ":" in node and lang_name == "unknown":
@@ -77,6 +87,10 @@
                 list_item.children[:index] + [node[: node.index(":")]],
             )
             before_colon = False
+        elif isinstance(node, TemplateNode) and node.template_name == "Z":
+            lang_code = clean_node(
+                wxr, None, node.template_parameters.get(1, "")
+            )
         elif isinstance(node, TemplateNode) and node.template_name in [
             "W",
             "W+",
@@ -88,20 +102,31 @@
         elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
             for child_list_item in node.find_child(NodeKind.LIST_ITEM):
                 extract_translation_list_item(
-                    wxr, word_entry, child_list_item, sense, sense_index, source
+                    wxr,
+                    word_entry,
+                    child_list_item,
+                    sense,
+                    sense_index,
+                    source,
+                    tags=tags,
                 )
         elif (
             isinstance(node, WikiNode)
             and node.kind == NodeKind.LINK
             and not before_colon
         ):
+            if lang_code in ["", "unknown"]:
+                new_code = name_to_code(lang_name, "ku")
+                if new_code != "":
+                    lang_code = new_code
             tr_data = Translation(
                 word=clean_node(wxr, None, node),
                 lang=lang_name,
-                lang_code=name_to_code(lang_name, "ku") or "unknown",
+                lang_code=lang_code,
                 sense=sense,
                 sense_index=sense_index,
                 source=source,
+                tags=tags,
             )
             if tr_data.word != "":
                 word_entry.translations.append(tr_data)
@@ -115,6 +140,7 @@ def extract_w_template(
     sense_index: int,
     lang_name: str,
     source: str,
+    tags: list[str] = [],
 ) -> None:
     # https://ku.wiktionary.org/wiki/Şablon:W
     tr_data = Translation(
@@ -130,6 +156,7 @@
             ),
         ),
         source=source,
+        tags=tags,
     )
     tag_args = {
         "n": "masculine",
diff --git a/tests/test_ku_sound.py b/tests/test_ku_sound.py
new file mode 100644
index 000000000..d0d37243c
--- /dev/null
+++ b/tests/test_ku_sound.py
@@ -0,0 +1,86 @@
+from unittest import TestCase
+
+from wikitextprocessor import Wtp
+
+from wiktextract.config import WiktionaryConfig
+from wiktextract.extractor.ku.page import parse_page
+from wiktextract.wxr_context import WiktextractContext
+
+
+class TestKuSound(TestCase):
+    maxDiff = None
+
+    def setUp(self) -> None:
+        self.wxr = WiktextractContext(
+            Wtp(lang_code="ku"),
+            WiktionaryConfig(
+                dump_file_lang_code="ku", capture_language_codes=None
+            ),
+        )
+
+    def tearDown(self):
+        self.wxr.wtp.close_db_conn()
+
+    def test_ku_ipa(self):
+        self.wxr.wtp.add_page("Şablon:ziman", 10, "Kurmancî")
+        self.wxr.wtp.add_page(
+            "Şablon:ku-IPA",
+            10,
+            """[[IPA]]<sup>([[Wîkîferheng:IPA kurdî|kilîd]])</sup>: <span class="IPA">/ɑːv/</span>[[Kategorî:Bilêvkirina IPAyê bi kurmancî]]""",
+        )
+        page_data = parse_page(
+            self.wxr,
+            "av",
+            """== {{ziman|ku}} ==
+=== Bilêvkirin ===
+* {{ku-IPA}}
+=== Navdêr ===
+# [[vexwarin|Vexwarin]]a bê[[reng]]""",
+        )
+        self.assertEqual(
+            page_data[0]["categories"], ["Bilêvkirina IPAyê bi kurmancî"]
+        )
+        self.assertEqual(page_data[0]["sounds"], [{"ipa": "/ɑːv/"}])
+
+    def test_deng(self):
+        self.wxr.wtp.add_page("Şablon:ziman", 10, "Kurmancî")
+        self.wxr.wtp.add_page(
+            "Şablon:deng",
+            10,
+            """<phonos lang="ku" text="" wikibase="" file="LL-Q36368 (kur)-Dildadil-av.wav">Deng (Amed)</phonos></span>[[Kategorî:Dengên kurmancî ji Amedê]][[Kategorî:Deng bi kurmancî]]""",
+        )
+        page_data = parse_page(
+            self.wxr,
+            "av",
+            """== {{ziman|ku}} ==
+=== Bilêvkirin ===
+* {{deng|ku|LL-Q36368 (kur)-Dildadil-av.wav|Deng|Amed}}
+=== Navdêr ===
+# [[vexwarin|Vexwarin]]a bê[[reng]]""",
+        )
+        self.assertEqual(
+            page_data[0]["categories"],
+            ["Dengên kurmancî ji Amedê", "Deng bi kurmancî"],
+        )
+        self.assertEqual(
+            page_data[0]["sounds"][0]["audio"],
+            "LL-Q36368 (kur)-Dildadil-av.wav",
+        )
+
+    def test_ku_kîte(self):
+        self.wxr.wtp.add_page("Şablon:ziman", 10, "Kurmancî")
+        self.wxr.wtp.add_page(
+            "Şablon:ku-kîte",
+            10,
+            """[[kîte#ku|Kîtekirin]]: lê·ker""",
+        )
+        page_data = parse_page(
+            self.wxr,
+            "lêker",
+            """== {{ziman|ku}} ==
+=== Bilêvkirin ===
+* {{ku-kîte}}
+=== Navdêr 1 ===
+# [[peyv|Peyvên]]""",
+        )
+        self.assertEqual(page_data[0]["hyphenation"], "lê·ker")
diff --git a/tests/test_ku_translation.py b/tests/test_ku_translation.py
index 161628006..3a73e8dee 100644
--- a/tests/test_ku_translation.py
+++ b/tests/test_ku_translation.py
@@ -68,3 +68,27 @@ def test_link(self):
             page_data[0]["translations"],
             [{"word": "âcua", "lang": "bolognezî", "lang_code": "unknown"}],
         )
+
+    def test_dialects(self):
+        self.wxr.wtp.add_page("Şablon:ziman", 10, "Kurmancî")
+        self.wxr.wtp.add_page("Şablon:Z", 10, "Hewramî")
+        page_data = parse_page(
+            self.wxr,
+            "av",
+            """== {{ziman|ku}} ==
+=== Navdêr ===
+# [[vexwarin|Vexwarin]]a bê[[reng]]
+==== Bi zaravayên din ====
+* {{Z|hac}}: [[awî]]""",
+        )
+        self.assertEqual(
+            page_data[0]["translations"],
+            [
+                {
+                    "word": "awî",
+                    "lang": "Hewramî",
+                    "lang_code": "hac",
+                    "tags": ["dialectal"],
+                }
+            ],
+        )
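
For reviewers, a minimal sketch of the shape the new fields take in the extractor output, using only values exercised by the tests above (test_ku_ipa and test_deng run on the page "av", test_ku_kîte on "lêker"). The variable names below are illustrative and do not appear in the patch.

# Illustrative sketch only; values mirror the fixtures in tests/test_ku_sound.py.
sounds_for_av = [
    # {{ku-IPA}} handled by extract_ku_ipa_template()
    {"ipa": "/ɑːv/"},
    # {{deng|ku|LL-Q36368 (kur)-Dildadil-av.wav|Deng|Amed}} handled by
    # extract_deng_template(): param 2 is the audio file, param 4 ("Amed")
    # lands in raw_tags, and set_sound_file_url_fields() fills the *_url
    # fields for the file.
    {
        "audio": "LL-Q36368 (kur)-Dildadil-av.wav",
        "raw_tags": ["Amed"],
    },
]
# {{ku-kîte}} handled by extract_ku_kîte() on the page "lêker"
hyphenation_for_leker = "lê·ker"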