Extract Conjugaison pages through the "conj" form-line template

Some Japanese entries link to a Conjugaison page whose title differs from the entry's own title, so the page title cannot be derived from the word alone. For example, https://fr.wiktionary.org/wiki/居る links to https://fr.wiktionary.org/wiki/Conjugaison:japonais/居る/いる, and https://fr.wiktionary.org/wiki/格好 links to https://fr.wiktionary.org/wiki/Conjugaison:japonais/格好だ
tatuylonen · Jan 10, 2024 · ffe2a3d · ffe2a3d
1 parent 9ca5eff
commit ffe2a3d
Show file tree

Hide file tree

Showing 4 changed files with 93 additions and 55 deletions.
diff --git a/src/wiktextract/extractor/fr/conjugation.py b/src/wiktextract/extractor/fr/conjugation.py
@@ -1,5 +1,3 @@
-from typing import Optional
-
 from wikitextprocessor import NodeKind, WikiNode
 from wikitextprocessor.parser import HTMLNode, TemplateNode
 from wiktextract.page import clean_node
@@ -11,7 +9,7 @@
 def extract_conjugation(
     wxr: WiktextractContext,
     entry: WordEntry,
-    word: str = "",
+    conj_page_title: str,
     select_template: str = "1",
 ) -> None:
     """
@@ -21,26 +19,36 @@ def extract_conjugation(
     https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Français/Conjugaison
     https://fr.wiktionary.org/wiki/Aide:Conjugaisons
     """
-    conj_ns = wxr.wtp.NAMESPACE_DATA["Conjugaison"]
-    if len(word) == 0:
-        word = entry.word
-    conj_page_title = f"{conj_ns['name']}:{entry.lang.lower()}/{word}"
-    conj_page = wxr.wtp.get_page_body(conj_page_title, conj_ns["id"])
+    conj_page = wxr.wtp.get_page_body(
+        conj_page_title, wxr.wtp.NAMESPACE_DATA["Conjugaison"]["id"]
+    )
     if conj_page is None:
         return
     conj_root = wxr.wtp.parse(conj_page)
     for conj_template in conj_root.find_child(NodeKind.TEMPLATE):
         if conj_template.template_name.startswith("fr-conj-"):
-            process_fr_conj_template(wxr, entry, conj_template)
+            process_fr_conj_template(wxr, entry, conj_template, conj_page_title)
         elif conj_template.template_name == "Onglets conjugaison":
-            process_onglets_template(wxr, entry, conj_template, select_template)
+            # https://fr.wiktionary.org/wiki/Modèle:Onglets_conjugaison
+            # this template expands to two tabs of tables
+            selected_template = conj_template.template_parameters.get(
+                f"contenu{select_template}"
+            )
+            if selected_template is not None:
+                process_fr_conj_template(
+                    wxr, entry, selected_template, conj_page_title
+                )
         elif conj_template.template_name.startswith(":Conjugaison:"):
-            word = conj_template.template_name.rsplit("/", 1)[-1]
-            extract_conjugation(wxr, entry, word, "2")
+            extract_conjugation(
+                wxr, entry, conj_template.template_name[1:], "2"
+            )
 
 
 def process_fr_conj_template(
-    wxr: WiktextractContext, entry: WordEntry, template_node: TemplateNode
+    wxr: WiktextractContext,
+    entry: WordEntry,
+    template_node: TemplateNode,
+    conj_page_title: str,
 ) -> None:
     # https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_conjugaison_en_français
     # https://fr.wiktionary.org/wiki/Modèle:fr-conj-1-ger
@@ -54,13 +62,20 @@ def process_fr_conj_template(
                 h3_text = clean_node(wxr, None, node)
             elif node.tag == "div":
                 if h3_text == "Modes impersonnels":
-                    process_fr_conj_modes_table(wxr, entry, node)
+                    process_fr_conj_modes_table(
+                        wxr, entry, node, conj_page_title
+                    )
                 else:
-                    process_fr_conj_table(wxr, entry, node, h3_text)
+                    process_fr_conj_table(
+                        wxr, entry, node, h3_text, conj_page_title
+                    )
 
 
 def process_fr_conj_modes_table(
-    wxr: WiktextractContext, entry: WordEntry, div_node: HTMLNode
+    wxr: WiktextractContext,
+    entry: WordEntry,
+    div_node: HTMLNode,
+    conj_page_title: str,
 ) -> None:
     # the first "Modes impersonnels" table
     for table_node in div_node.find_child(NodeKind.TABLE):
@@ -81,7 +96,7 @@ def process_fr_conj_modes_table(
                         form=form_text,
                         tags=tags.copy(),
                         ipas=[clean_node(wxr, None, cell)],
-                        source="Conjugaison page",
+                        source=conj_page_title,
                     )
                     form.tags.append("Présent" if cell_index == 3 else "Passé")
                     entry.forms.append(form)
@@ -93,7 +108,11 @@ def process_fr_conj_modes_table(
 
 
 def process_fr_conj_table(
-    wxr: WiktextractContext, entry: WordEntry, div_node: HTMLNode, h3_text: str
+    wxr: WiktextractContext,
+    entry: WordEntry,
+    div_node: HTMLNode,
+    h3_text: str,
+    conj_page_title: str,
 ) -> None:
     for table_node in div_node.find_child(NodeKind.TABLE):
         for row_index, row in enumerate(
@@ -109,11 +128,11 @@ def process_fr_conj_table(
                             and cell_child.tag == "table"
                         ):
                             process_fr_conj_html_table(
-                                wxr, entry, cell_child, h3_text
+                                wxr, entry, cell_child, h3_text, conj_page_title
                             )
                         elif cell_child.kind == NodeKind.TABLE:
                             process_fr_conj_wiki_table(
-                                wxr, entry, cell_child, h3_text
+                                wxr, entry, cell_child, h3_text, conj_page_title
                             )
 
 
@@ -122,13 +141,14 @@ def process_fr_conj_html_table(
     entry: WordEntry,
     table_node: HTMLNode,
     h3_text: str,
+    conj_page_title: str,
 ):
     tags = [h3_text]
     for tr_index, tr_node in enumerate(table_node.find_html_recursively("tr")):
         if tr_index == 0:
             tags.append(clean_node(wxr, None, tr_node.children))
         else:
-            form = Form(tags=tags, source="Conjugaison page")
+            form = Form(tags=tags, source=conj_page_title)
             for td_index, td_node in enumerate(
                 tr_node.find_html_recursively("td")
             ):
@@ -153,13 +173,14 @@ def process_fr_conj_wiki_table(
     entry: WordEntry,
     table_node: WikiNode,
     h3_text: str,
+    conj_page_title: str,
 ):
     tags = [h3_text]
     for row_index, row in enumerate(table_node.find_child(NodeKind.TABLE_ROW)):
         if row_index == 0:
             tags.append(clean_node(wxr, None, row.children))
         else:
-            form = Form(tags=tags, source="Conjugaison page")
+            form = Form(tags=tags, source=conj_page_title)
             for cell_index, cell in enumerate(
                 row.find_child(NodeKind.TABLE_CELL)
             ):
@@ -173,18 +194,3 @@ def process_fr_conj_wiki_table(
 
             if len(form.form) > 0 and form.form != "—":
                 entry.forms.append(form)
-
-
-def process_onglets_template(
-    wxr: WiktextractContext,
-    entry: WordEntry,
-    template_node: TemplateNode,
-    select: str,
-) -> None:
-    # https://fr.wiktionary.org/wiki/Modèle:Onglets_conjugaison
-    # this template expands to two tabs of tables
-    selected_template = template_node.template_parameters.get(
-        f"contenu{select}"
-    )
-    if selected_template is not None:
-        process_fr_conj_template(wxr, entry, selected_template)
diff --git a/src/wiktextract/extractor/fr/form_line.py b/src/wiktextract/extractor/fr/form_line.py
@@ -5,6 +5,7 @@
 from wiktextract.page import clean_node
 from wiktextract.wxr_context import WiktextractContext
 
+from .conjugation import extract_conjugation
 from .models import Form, Sound, WordEntry
 from .pronunciation import PRON_TEMPLATES, process_pron_template
 
@@ -34,6 +35,8 @@ def extract_form_line(
                 process_zh_mot_template(wxr, node, page_data)
             elif node.template_name == "ja-mot":
                 process_ja_mot_template(wxr, node, page_data)
+            elif node.template_name in ("conj", "conjugaison"):
+                process_conj_template(wxr, node, page_data)
             else:
                 tag = clean_node(wxr, page_data[-1], node)
                 if (
@@ -121,3 +124,42 @@ def process_ja_mot_template(
                     Form(form=form_text, tags=["romanization"])
                 )
             break
+
+
+def process_conj_template(
+    wxr: WiktextractContext,
+    template_node: TemplateNode,
+    page_data: list[WordEntry],
+) -> None:
+    # https://fr.wiktionary.org/wiki/Modèle:conjugaison
+    expanded_node = wxr.wtp.parse(
+        wxr.wtp.node_to_wikitext(template_node), expand_all=True
+    )
+    for link in expanded_node.find_child(NodeKind.LINK):
+        if len(link.largs) == 0:
+            continue
+        conj_title = link.largs[0][0]
+        if not conj_title.startswith("Conjugaison:"):
+            continue
+        conj_word = conj_title.split("/", 1)[-1]
+        if conj_word in (
+            "Premier groupe",
+            "Deuxième groupe",
+            "Troisième groupe",
+        ):
+            continue
+        if (
+            len(page_data) > 1
+            and page_data[-2].lang_code == page_data[-1].lang_code
+            and page_data[-2].pos == page_data[-1].pos
+            and len(page_data[-2].forms) > 0
+            and page_data[-2].forms[-1].source == conj_title
+        ):
+            page_data[-1].forms = page_data[-2].forms
+        else:
+            extract_conjugation(wxr, page_data[-1], conj_title)
+
+    tag = clean_node(wxr, page_data[-1], expanded_node)
+    if template_node.template_name in ("conj", "conjugaison"):
+        tag = tag.removesuffix("(voir la conjugaison)").strip()
+    page_data[-1].tags.append(tag)
diff --git a/src/wiktextract/extractor/fr/page.py b/src/wiktextract/extractor/fr/page.py
@@ -6,7 +6,6 @@
 from wiktextract.page import clean_node
 from wiktextract.wxr_context import WiktextractContext
 
-from .conjugation import extract_conjugation
 from .etymology import EtymologyData, extract_etymology, insert_etymology_data
 from .form_line import extract_form_line
 from .gloss import extract_gloss, process_exemple_template
@@ -152,15 +151,6 @@ def process_pos_block(
 
     form_line_nodes = child_nodes[form_line_start:gloss_start]
     extract_form_line(wxr, page_data, form_line_nodes)
-    if pos_type == "verb":
-        if (
-            len(page_data) > 1
-            and page_data[-2].pos == pos_type
-            and page_data[-2].lang_code == page_data[-1].lang_code
-        ):
-            page_data[-1].forms = page_data[-2].forms
-        else:
-            extract_conjugation(wxr, page_data[-1])
 
 
 def parse_page(

diff --git a/tests/test_fr_conj.py b/tests/test_fr_conj.py
@@ -68,32 +68,32 @@ def test_fr_conj_1(self):
 </div>""",
         )
         entry = WordEntry(lang_code="fr", lang="Français", word="lancer")
-        extract_conjugation(self.wxr, entry)
+        extract_conjugation(self.wxr, entry, "Conjugaison:français/lancer")
         self.assertEqual(
             [f.model_dump(exclude_defaults=True) for f in entry.forms],
             [
                 {
                     "form": "lancer",
                     "ipas": ["\\lɑ̃.se\\"],
-                    "source": "Conjugaison page",
+                    "source": "Conjugaison:français/lancer",
                     "tags": ["Modes impersonnels", "Infinitif", "Présent"],
                 },
                 {
                     "form": "avoir lancé",
                     "ipas": ["\\a.vwaʁ lɑ̃.se\\"],
-                    "source": "Conjugaison page",
+                    "source": "Conjugaison:français/lancer",
                     "tags": ["Modes impersonnels", "Infinitif", "Passé"],
                 },
                 {
                     "form": "je lance",
                     "ipas": ["\\ʒə lɑ̃s\\"],
-                    "source": "Conjugaison page",
+                    "source": "Conjugaison:français/lancer",
                     "tags": ["Indicatif", "Présent"],
                 },
                 {
                     "form": "j’ai lancé",
                     "ipas": ["\\ʒ‿e lɑ̃.se\\"],
-                    "source": "Conjugaison page",
+                    "source": "Conjugaison:français/lancer",
                     "tags": ["Indicatif", "Passé composé"],
                 },
             ],
@@ -139,20 +139,20 @@ def test_onglets_conjugaison(self):
 </div>""",
         )
         entry = WordEntry(lang_code="fr", lang="Français", word="s’abattre")
-        extract_conjugation(self.wxr, entry)
+        extract_conjugation(self.wxr, entry, "Conjugaison:français/s’abattre")
         self.assertEqual(
             [f.model_dump(exclude_defaults=True) for f in entry.forms],
             [
                 {
                     "form": "s’abattre",
                     "ipas": ["\\s‿a.batʁ\\"],
-                    "source": "Conjugaison page",
+                    "source": "Conjugaison:français/abattre",
                     "tags": ["Modes impersonnels", "Infinitif", "Présent"],
                 },
                 {
                     "form": "s’être abattu",
                     "ipas": ["\\s‿ɛtʁ‿a.ba.ty\\"],
-                    "source": "Conjugaison page",
+                    "source": "Conjugaison:français/abattre",
                     "tags": ["Modes impersonnels", "Infinitif", "Passé"],
                 },
             ],