From ffe2a3df9d6c8bb60af6cbdfcfae2b8e5d521c6a Mon Sep 17 00:00:00 2001 From: xxyzz Date: Wed, 10 Jan 2024 11:30:24 +0800 Subject: [PATCH 1/2] Extract Conjugaison pages through "conj" form line template MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some Japanese pages use a different word for Conjugaison link, for example: https://fr.wiktionary.org/wiki/居る uses https://fr.wiktionary.org/wiki/Conjugaison:japonais/居る/いる and https://fr.wiktionary.org/wiki/格好 uses https://fr.wiktionary.org/wiki/Conjugaison:japonais/格好だ --- src/wiktextract/extractor/fr/conjugation.py | 80 +++++++++++---------- src/wiktextract/extractor/fr/form_line.py | 42 +++++++++++ src/wiktextract/extractor/fr/page.py | 10 --- tests/test_fr_conj.py | 16 ++--- 4 files changed, 93 insertions(+), 55 deletions(-) diff --git a/src/wiktextract/extractor/fr/conjugation.py b/src/wiktextract/extractor/fr/conjugation.py index ec51664b0..1b37a9c55 100644 --- a/src/wiktextract/extractor/fr/conjugation.py +++ b/src/wiktextract/extractor/fr/conjugation.py @@ -1,5 +1,3 @@ -from typing import Optional - from wikitextprocessor import NodeKind, WikiNode from wikitextprocessor.parser import HTMLNode, TemplateNode from wiktextract.page import clean_node @@ -11,7 +9,7 @@ def extract_conjugation( wxr: WiktextractContext, entry: WordEntry, - word: str = "", + conj_page_title: str, select_template: str = "1", ) -> None: """ @@ -21,26 +19,36 @@ def extract_conjugation( https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Français/Conjugaison https://fr.wiktionary.org/wiki/Aide:Conjugaisons """ - conj_ns = wxr.wtp.NAMESPACE_DATA["Conjugaison"] - if len(word) == 0: - word = entry.word - conj_page_title = f"{conj_ns['name']}:{entry.lang.lower()}/{word}" - conj_page = wxr.wtp.get_page_body(conj_page_title, conj_ns["id"]) + conj_page = wxr.wtp.get_page_body( + conj_page_title, wxr.wtp.NAMESPACE_DATA["Conjugaison"]["id"] + ) if conj_page is None: return conj_root = wxr.wtp.parse(conj_page) for conj_template in conj_root.find_child(NodeKind.TEMPLATE): if conj_template.template_name.startswith("fr-conj-"): - process_fr_conj_template(wxr, entry, conj_template) + process_fr_conj_template(wxr, entry, conj_template, conj_page_title) elif conj_template.template_name == "Onglets conjugaison": - process_onglets_template(wxr, entry, conj_template, select_template) + # https://fr.wiktionary.org/wiki/Modèle:Onglets_conjugaison + # this template expands to two tabs of tables + selected_template = conj_template.template_parameters.get( + f"contenu{select_template}" + ) + if selected_template is not None: + process_fr_conj_template( + wxr, entry, selected_template, conj_page_title + ) elif conj_template.template_name.startswith(":Conjugaison:"): - word = conj_template.template_name.rsplit("/", 1)[-1] - extract_conjugation(wxr, entry, word, "2") + extract_conjugation( + wxr, entry, conj_template.template_name[1:], "2" + ) def process_fr_conj_template( - wxr: WiktextractContext, entry: WordEntry, template_node: TemplateNode + wxr: WiktextractContext, + entry: WordEntry, + template_node: TemplateNode, + conj_page_title: str, ) -> None: # https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_conjugaison_en_français # https://fr.wiktionary.org/wiki/Modèle:fr-conj-1-ger @@ -54,13 +62,20 @@ def process_fr_conj_template( h3_text = clean_node(wxr, None, node) elif node.tag == "div": if h3_text == "Modes impersonnels": - process_fr_conj_modes_table(wxr, entry, node) + process_fr_conj_modes_table( + wxr, entry, node, conj_page_title + ) else: - process_fr_conj_table(wxr, entry, node, h3_text) + process_fr_conj_table( + wxr, entry, node, h3_text, conj_page_title + ) def process_fr_conj_modes_table( - wxr: WiktextractContext, entry: WordEntry, div_node: HTMLNode + wxr: WiktextractContext, + entry: WordEntry, + div_node: HTMLNode, + conj_page_title: str, ) -> None: # the first "Modes impersonnels" table for table_node in div_node.find_child(NodeKind.TABLE): @@ -81,7 +96,7 @@ def process_fr_conj_modes_table( form=form_text, tags=tags.copy(), ipas=[clean_node(wxr, None, cell)], - source="Conjugaison page", + source=conj_page_title, ) form.tags.append("Présent" if cell_index == 3 else "Passé") entry.forms.append(form) @@ -93,7 +108,11 @@ def process_fr_conj_modes_table( def process_fr_conj_table( - wxr: WiktextractContext, entry: WordEntry, div_node: HTMLNode, h3_text: str + wxr: WiktextractContext, + entry: WordEntry, + div_node: HTMLNode, + h3_text: str, + conj_page_title: str, ) -> None: for table_node in div_node.find_child(NodeKind.TABLE): for row_index, row in enumerate( @@ -109,11 +128,11 @@ def process_fr_conj_table( and cell_child.tag == "table" ): process_fr_conj_html_table( - wxr, entry, cell_child, h3_text + wxr, entry, cell_child, h3_text, conj_page_title ) elif cell_child.kind == NodeKind.TABLE: process_fr_conj_wiki_table( - wxr, entry, cell_child, h3_text + wxr, entry, cell_child, h3_text, conj_page_title ) @@ -122,13 +141,14 @@ def process_fr_conj_html_table( entry: WordEntry, table_node: HTMLNode, h3_text: str, + conj_page_title: str, ): tags = [h3_text] for tr_index, tr_node in enumerate(table_node.find_html_recursively("tr")): if tr_index == 0: tags.append(clean_node(wxr, None, tr_node.children)) else: - form = Form(tags=tags, source="Conjugaison page") + form = Form(tags=tags, source=conj_page_title) for td_index, td_node in enumerate( tr_node.find_html_recursively("td") ): @@ -153,13 +173,14 @@ def process_fr_conj_wiki_table( entry: WordEntry, table_node: WikiNode, h3_text: str, + conj_page_title: str, ): tags = [h3_text] for row_index, row in enumerate(table_node.find_child(NodeKind.TABLE_ROW)): if row_index == 0: tags.append(clean_node(wxr, None, row.children)) else: - form = Form(tags=tags, source="Conjugaison page") + form = Form(tags=tags, source=conj_page_title) for cell_index, cell in enumerate( row.find_child(NodeKind.TABLE_CELL) ): @@ -173,18 +194,3 @@ def process_fr_conj_wiki_table( if len(form.form) > 0 and form.form != "—": entry.forms.append(form) - - -def process_onglets_template( - wxr: WiktextractContext, - entry: WordEntry, - template_node: TemplateNode, - select: str, -) -> None: - # https://fr.wiktionary.org/wiki/Modèle:Onglets_conjugaison - # this template expands to two tabs of tables - selected_template = template_node.template_parameters.get( - f"contenu{select}" - ) - if selected_template is not None: - process_fr_conj_template(wxr, entry, selected_template) diff --git a/src/wiktextract/extractor/fr/form_line.py b/src/wiktextract/extractor/fr/form_line.py index e315e0007..fc847945d 100644 --- a/src/wiktextract/extractor/fr/form_line.py +++ b/src/wiktextract/extractor/fr/form_line.py @@ -5,6 +5,7 @@ from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext +from .conjugation import extract_conjugation from .models import Form, Sound, WordEntry from .pronunciation import PRON_TEMPLATES, process_pron_template @@ -34,6 +35,8 @@ def extract_form_line( process_zh_mot_template(wxr, node, page_data) elif node.template_name == "ja-mot": process_ja_mot_template(wxr, node, page_data) + elif node.template_name in ("conj", "conjugaison"): + process_conj_template(wxr, node, page_data) else: tag = clean_node(wxr, page_data[-1], node) if ( @@ -121,3 +124,42 @@ def process_ja_mot_template( Form(form=form_text, tags=["romanization"]) ) break + + +def process_conj_template( + wxr: WiktextractContext, + template_node: TemplateNode, + page_data: list[WordEntry], +) -> None: + # https://fr.wiktionary.org/wiki/Modèle:conjugaison + expanded_node = wxr.wtp.parse( + wxr.wtp.node_to_wikitext(template_node), expand_all=True + ) + for link in expanded_node.find_child(NodeKind.LINK): + if len(link.largs) == 0: + continue + conj_title = link.largs[0][0] + if not conj_title.startswith("Conjugaison:"): + continue + conj_word = conj_title.split("/", 1)[-1] + if conj_word in ( + "Premier groupe", + "Deuxième groupe", + "Troisième groupe", + ): + continue + if ( + len(page_data) > 1 + and page_data[-2].lang_code == page_data[-1].lang_code + and page_data[-2].pos == page_data[-1].pos + and len(page_data[-2].forms) > 0 + and page_data[-2].forms[-1].source == conj_title + ): + page_data[-1].forms = page_data[-2].forms + else: + extract_conjugation(wxr, page_data[-1], conj_title) + + tag = clean_node(wxr, page_data[-1], expanded_node) + if template_node.template_name in ("conj", "conjugaison"): + tag = tag.removesuffix("(voir la conjugaison)").strip() + page_data[-1].tags.append(tag) diff --git a/src/wiktextract/extractor/fr/page.py b/src/wiktextract/extractor/fr/page.py index 03ee06445..1c5200a20 100644 --- a/src/wiktextract/extractor/fr/page.py +++ b/src/wiktextract/extractor/fr/page.py @@ -6,7 +6,6 @@ from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext -from .conjugation import extract_conjugation from .etymology import EtymologyData, extract_etymology, insert_etymology_data from .form_line import extract_form_line from .gloss import extract_gloss, process_exemple_template @@ -152,15 +151,6 @@ def process_pos_block( form_line_nodes = child_nodes[form_line_start:gloss_start] extract_form_line(wxr, page_data, form_line_nodes) - if pos_type == "verb": - if ( - len(page_data) > 1 - and page_data[-2].pos == pos_type - and page_data[-2].lang_code == page_data[-1].lang_code - ): - page_data[-1].forms = page_data[-2].forms - else: - extract_conjugation(wxr, page_data[-1]) def parse_page( diff --git a/tests/test_fr_conj.py b/tests/test_fr_conj.py index 82dbc5e63..5b009c0b0 100644 --- a/tests/test_fr_conj.py +++ b/tests/test_fr_conj.py @@ -68,32 +68,32 @@ def test_fr_conj_1(self): """, ) entry = WordEntry(lang_code="fr", lang="Français", word="lancer") - extract_conjugation(self.wxr, entry) + extract_conjugation(self.wxr, entry, "Conjugaison:français/lancer") self.assertEqual( [f.model_dump(exclude_defaults=True) for f in entry.forms], [ { "form": "lancer", "ipas": ["\\lɑ̃.se\\"], - "source": "Conjugaison page", + "source": "Conjugaison:français/lancer", "tags": ["Modes impersonnels", "Infinitif", "Présent"], }, { "form": "avoir lancé", "ipas": ["\\a.vwaʁ lɑ̃.se\\"], - "source": "Conjugaison page", + "source": "Conjugaison:français/lancer", "tags": ["Modes impersonnels", "Infinitif", "Passé"], }, { "form": "je lance", "ipas": ["\\ʒə lɑ̃s\\"], - "source": "Conjugaison page", + "source": "Conjugaison:français/lancer", "tags": ["Indicatif", "Présent"], }, { "form": "j’ai lancé", "ipas": ["\\ʒ‿e lɑ̃.se\\"], - "source": "Conjugaison page", + "source": "Conjugaison:français/lancer", "tags": ["Indicatif", "Passé composé"], }, ], @@ -139,20 +139,20 @@ def test_onglets_conjugaison(self): """, ) entry = WordEntry(lang_code="fr", lang="Français", word="s’abattre") - extract_conjugation(self.wxr, entry) + extract_conjugation(self.wxr, entry, "Conjugaison:français/s’abattre") self.assertEqual( [f.model_dump(exclude_defaults=True) for f in entry.forms], [ { "form": "s’abattre", "ipas": ["\\s‿a.batʁ\\"], - "source": "Conjugaison page", + "source": "Conjugaison:français/abattre", "tags": ["Modes impersonnels", "Infinitif", "Présent"], }, { "form": "s’être abattu", "ipas": ["\\s‿ɛtʁ‿a.ba.ty\\"], - "source": "Conjugaison page", + "source": "Conjugaison:français/abattre", "tags": ["Modes impersonnels", "Infinitif", "Passé"], }, ], From 2655a6a4e72feafe5f5aae57f458830042940cf8 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Wed, 10 Jan 2024 15:14:29 +0800 Subject: [PATCH 2/2] Extract "ja-flx-adj*" templates in Conjugaison pages --- src/wiktextract/extractor/fr/conjugation.py | 47 +++++++++++++++++++ src/wiktextract/extractor/fr/form_line.py | 9 +++- src/wiktextract/extractor/fr/models.py | 6 ++- tests/test_fr_conj.py | 52 +++++++++++++++++++++ 4 files changed, 112 insertions(+), 2 deletions(-) diff --git a/src/wiktextract/extractor/fr/conjugation.py b/src/wiktextract/extractor/fr/conjugation.py index 1b37a9c55..3168f02f0 100644 --- a/src/wiktextract/extractor/fr/conjugation.py +++ b/src/wiktextract/extractor/fr/conjugation.py @@ -42,6 +42,10 @@ def extract_conjugation( extract_conjugation( wxr, entry, conj_template.template_name[1:], "2" ) + elif conj_template.template_name.startswith("ja-flx-adj"): + proces_ja_flx_adj_template( + wxr, entry, conj_template, conj_page_title + ) def process_fr_conj_template( @@ -194,3 +198,46 @@ def process_fr_conj_wiki_table( if len(form.form) > 0 and form.form != "—": entry.forms.append(form) + + +def proces_ja_flx_adj_template( + wxr: WiktextractContext, + entry: WordEntry, + template_node: TemplateNode, + conj_page_title: str, +) -> None: + # https://fr.wiktionary.org/wiki/Modèle:ja-adj + # https://fr.wiktionary.org/wiki/Modèle:ja-flx-adj-な + expanded_template = wxr.wtp.parse( + wxr.wtp.node_to_wikitext(template_node), expand_all=True + ) + for table_node in expanded_template.find_child(NodeKind.TABLE): + first_tag = "" + for row in table_node.find_child(NodeKind.TABLE_ROW): + forms = [] + tags = [first_tag] + for cell_index, row_child in enumerate( + row.find_child(NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL) + ): + row_child_text = clean_node(wxr, None, row_child) + if row_child.kind == NodeKind.TABLE_HEADER_CELL: + first_tag = row_child_text + else: + for line_index, line in enumerate( + row_child_text.splitlines() + ): + if cell_index == 0: + tags.append(line) + continue + if line_index + 1 > len(forms): + forms.append( + Form(tags=tags, source=conj_page_title) + ) + if cell_index == 1: + forms[line_index].form = line + elif cell_index == 2: + forms[line_index].hiragana = line + elif cell_index == 3: + forms[line_index].roman = line + + entry.forms.extend(forms) diff --git a/src/wiktextract/extractor/fr/form_line.py b/src/wiktextract/extractor/fr/form_line.py index fc847945d..e1e800999 100644 --- a/src/wiktextract/extractor/fr/form_line.py +++ b/src/wiktextract/extractor/fr/form_line.py @@ -35,7 +35,10 @@ def extract_form_line( process_zh_mot_template(wxr, node, page_data) elif node.template_name == "ja-mot": process_ja_mot_template(wxr, node, page_data) - elif node.template_name in ("conj", "conjugaison"): + elif node.template_name in ( + "conj", + "conjugaison", + ) or node.template_name.startswith(("ja-adj-", "ja-verb-")): process_conj_template(wxr, node, page_data) else: tag = clean_node(wxr, page_data[-1], node) @@ -162,4 +165,8 @@ def process_conj_template( tag = clean_node(wxr, page_data[-1], expanded_node) if template_node.template_name in ("conj", "conjugaison"): tag = tag.removesuffix("(voir la conjugaison)").strip() + elif template_node.template_name.startswith("ja-"): + tag = ( + tag.removesuffix("(conjugaison)").removesuffix("(flexions)").strip() + ) page_data[-1].tags.append(tag) diff --git a/src/wiktextract/extractor/fr/models.py b/src/wiktextract/extractor/fr/models.py index 6984c498e..461a810ec 100644 --- a/src/wiktextract/extractor/fr/models.py +++ b/src/wiktextract/extractor/fr/models.py @@ -26,7 +26,11 @@ class Form(FrenchBaseModel): form: str = "" tags: list[str] = [] ipas: list[str] = [] - source: str = Field("", description="Form line template name") + source: str = Field( + "", description="Form line template name or Conjugaison page title" + ) + hiragana: str = "" + roman: str = "" class Sound(FrenchBaseModel): diff --git a/tests/test_fr_conj.py b/tests/test_fr_conj.py index 5b009c0b0..ac9f2acc1 100644 --- a/tests/test_fr_conj.py +++ b/tests/test_fr_conj.py @@ -157,3 +157,55 @@ def test_onglets_conjugaison(self): }, ], ) + + def test_ja_flx_adj(self): + # https://fr.wiktionary.org/wiki/Conjugaison:japonais/格好だ + self.wxr.wtp.start_page("格好") + self.wxr.wtp.add_page( + "Conjugaison:japonais/格好だ", + 116, + "{{ja-flx-adj-な|格好|かっこう|kakkou}}", + ) + self.wxr.wtp.add_page( + "Modèle:ja-flx-adj-な", + 10, + """

Flexions

+{| +|- +! colspan=\"4\" | '''Formes de base''' +|- +| '''Imperfectif''' (未然形) || [[格好だろ]] || [[かっこうだろ]] || ''kakkou daro'' +|- +! colspan=\"4\" | '''Clefs de constructions''' +|- +| '''Neutre négatif''' || [[格好ではない]]
[[格好じゃない]]
|| [[かっこうではない]]
[[かっこうじゃない]]
|| ''kakkou dewa nai
kakkou ja nai'' +|}""", + ) + entry = WordEntry(lang_code="ja", lang="Japonais", word="格好") + extract_conjugation(self.wxr, entry, "Conjugaison:japonais/格好だ") + self.assertEqual( + [f.model_dump(exclude_defaults=True) for f in entry.forms], + [ + { + "form": "格好だろ", + "hiragana": "かっこうだろ", + "roman": "kakkou daro", + "source": "Conjugaison:japonais/格好だ", + "tags": ["Formes de base", "Imperfectif (未然形)"], + }, + { + "form": "格好ではない", + "hiragana": "かっこうではない", + "roman": "kakkou dewa nai", + "source": "Conjugaison:japonais/格好だ", + "tags": ["Clefs de constructions", "Neutre négatif"], + }, + { + "form": "格好じゃない", + "hiragana": "かっこうじゃない", + "roman": "kakkou ja nai", + "source": "Conjugaison:japonais/格好だ", + "tags": ["Clefs de constructions", "Neutre négatif"], + }, + ], + )