From ffe2a3df9d6c8bb60af6cbdfcfae2b8e5d521c6a Mon Sep 17 00:00:00 2001
From: xxyzz <gitpull@protonmail.com>
Date: Wed, 10 Jan 2024 11:30:24 +0800
Subject: [PATCH 1/2] Extract Conjugaison pages through "conj" form line
 template
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

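Some Japanese entries link to a Conjugaison page whose title is not
simply built from the entry title. For example,
https://fr.wiktionary.org/wiki/居る links to
https://fr.wiktionary.org/wiki/Conjugaison:japonais/居る/いる
and https://fr.wiktionary.org/wiki/格好 links to
https://fr.wiktionary.org/wiki/Conjugaison:japonais/格好だ

Take the Conjugaison page title from the link in the expanded "conj"
form line template instead of constructing it from the entry word.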
---
 src/wiktextract/extractor/fr/conjugation.py | 80 +++++++++++----------
 src/wiktextract/extractor/fr/form_line.py   | 42 +++++++++++
 src/wiktextract/extractor/fr/page.py        | 10 ---
 tests/test_fr_conj.py                       | 16 ++---
 4 files changed, 93 insertions(+), 55 deletions(-)

diff --git a/src/wiktextract/extractor/fr/conjugation.py b/src/wiktextract/extractor/fr/conjugation.py
index ec51664b0..1b37a9c55 100644
--- a/src/wiktextract/extractor/fr/conjugation.py
+++ b/src/wiktextract/extractor/fr/conjugation.py
@@ -1,5 +1,3 @@
-from typing import Optional
-
 from wikitextprocessor import NodeKind, WikiNode
 from wikitextprocessor.parser import HTMLNode, TemplateNode
 from wiktextract.page import clean_node
@@ -11,7 +9,7 @@
 def extract_conjugation(
     wxr: WiktextractContext,
     entry: WordEntry,
-    word: str = "",
+    conj_page_title: str,
     select_template: str = "1",
 ) -> None:
     """
@@ -21,26 +19,36 @@ def extract_conjugation(
     https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Français/Conjugaison
     https://fr.wiktionary.org/wiki/Aide:Conjugaisons
     """
-    conj_ns = wxr.wtp.NAMESPACE_DATA["Conjugaison"]
-    if len(word) == 0:
-        word = entry.word
-    conj_page_title = f"{conj_ns['name']}:{entry.lang.lower()}/{word}"
-    conj_page = wxr.wtp.get_page_body(conj_page_title, conj_ns["id"])
+    conj_page = wxr.wtp.get_page_body(
+        conj_page_title, wxr.wtp.NAMESPACE_DATA["Conjugaison"]["id"]
+    )
     if conj_page is None:
         return
     conj_root = wxr.wtp.parse(conj_page)
     for conj_template in conj_root.find_child(NodeKind.TEMPLATE):
         if conj_template.template_name.startswith("fr-conj-"):
-            process_fr_conj_template(wxr, entry, conj_template)
+            process_fr_conj_template(wxr, entry, conj_template, conj_page_title)
         elif conj_template.template_name == "Onglets conjugaison":
-            process_onglets_template(wxr, entry, conj_template, select_template)
+            # https://fr.wiktionary.org/wiki/Modèle:Onglets_conjugaison
+            # this template expands to two tabs of tables
+            selected_template = conj_template.template_parameters.get(
+                f"contenu{select_template}"
+            )
+            if selected_template is not None:
+                process_fr_conj_template(
+                    wxr, entry, selected_template, conj_page_title
+                )
         elif conj_template.template_name.startswith(":Conjugaison:"):
-            word = conj_template.template_name.rsplit("/", 1)[-1]
-            extract_conjugation(wxr, entry, word, "2")
+            extract_conjugation(
+                wxr, entry, conj_template.template_name[1:], "2"
+            )
 
 
 def process_fr_conj_template(
-    wxr: WiktextractContext, entry: WordEntry, template_node: TemplateNode
+    wxr: WiktextractContext,
+    entry: WordEntry,
+    template_node: TemplateNode,
+    conj_page_title: str,
 ) -> None:
     # https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_conjugaison_en_français
     # https://fr.wiktionary.org/wiki/Modèle:fr-conj-1-ger
@@ -54,13 +62,20 @@ def process_fr_conj_template(
                 h3_text = clean_node(wxr, None, node)
             elif node.tag == "div":
                 if h3_text == "Modes impersonnels":
-                    process_fr_conj_modes_table(wxr, entry, node)
+                    process_fr_conj_modes_table(
+                        wxr, entry, node, conj_page_title
+                    )
                 else:
-                    process_fr_conj_table(wxr, entry, node, h3_text)
+                    process_fr_conj_table(
+                        wxr, entry, node, h3_text, conj_page_title
+                    )
 
 
 def process_fr_conj_modes_table(
-    wxr: WiktextractContext, entry: WordEntry, div_node: HTMLNode
+    wxr: WiktextractContext,
+    entry: WordEntry,
+    div_node: HTMLNode,
+    conj_page_title: str,
 ) -> None:
     # the first "Modes impersonnels" table
     for table_node in div_node.find_child(NodeKind.TABLE):
@@ -81,7 +96,7 @@ def process_fr_conj_modes_table(
                         form=form_text,
                         tags=tags.copy(),
                         ipas=[clean_node(wxr, None, cell)],
-                        source="Conjugaison page",
+                        source=conj_page_title,
                     )
                     form.tags.append("Présent" if cell_index == 3 else "Passé")
                     entry.forms.append(form)
@@ -93,7 +108,11 @@ def process_fr_conj_modes_table(
 
 
 def process_fr_conj_table(
-    wxr: WiktextractContext, entry: WordEntry, div_node: HTMLNode, h3_text: str
+    wxr: WiktextractContext,
+    entry: WordEntry,
+    div_node: HTMLNode,
+    h3_text: str,
+    conj_page_title: str,
 ) -> None:
     for table_node in div_node.find_child(NodeKind.TABLE):
         for row_index, row in enumerate(
@@ -109,11 +128,11 @@ def process_fr_conj_table(
                             and cell_child.tag == "table"
                         ):
                             process_fr_conj_html_table(
-                                wxr, entry, cell_child, h3_text
+                                wxr, entry, cell_child, h3_text, conj_page_title
                             )
                         elif cell_child.kind == NodeKind.TABLE:
                             process_fr_conj_wiki_table(
-                                wxr, entry, cell_child, h3_text
+                                wxr, entry, cell_child, h3_text, conj_page_title
                             )
 
 
@@ -122,13 +141,14 @@ def process_fr_conj_html_table(
     entry: WordEntry,
     table_node: HTMLNode,
     h3_text: str,
+    conj_page_title: str,
 ):
     tags = [h3_text]
     for tr_index, tr_node in enumerate(table_node.find_html_recursively("tr")):
         if tr_index == 0:
             tags.append(clean_node(wxr, None, tr_node.children))
         else:
-            form = Form(tags=tags, source="Conjugaison page")
+            form = Form(tags=tags, source=conj_page_title)
             for td_index, td_node in enumerate(
                 tr_node.find_html_recursively("td")
             ):
@@ -153,13 +173,14 @@ def process_fr_conj_wiki_table(
     entry: WordEntry,
     table_node: WikiNode,
     h3_text: str,
+    conj_page_title: str,
 ):
     tags = [h3_text]
     for row_index, row in enumerate(table_node.find_child(NodeKind.TABLE_ROW)):
         if row_index == 0:
             tags.append(clean_node(wxr, None, row.children))
         else:
-            form = Form(tags=tags, source="Conjugaison page")
+            form = Form(tags=tags, source=conj_page_title)
             for cell_index, cell in enumerate(
                 row.find_child(NodeKind.TABLE_CELL)
             ):
@@ -173,18 +194,3 @@ def process_fr_conj_wiki_table(
 
             if len(form.form) > 0 and form.form != "—":
                 entry.forms.append(form)
-
-
-def process_onglets_template(
-    wxr: WiktextractContext,
-    entry: WordEntry,
-    template_node: TemplateNode,
-    select: str,
-) -> None:
-    # https://fr.wiktionary.org/wiki/Modèle:Onglets_conjugaison
-    # this template expands to two tabs of tables
-    selected_template = template_node.template_parameters.get(
-        f"contenu{select}"
-    )
-    if selected_template is not None:
-        process_fr_conj_template(wxr, entry, selected_template)
diff --git a/src/wiktextract/extractor/fr/form_line.py b/src/wiktextract/extractor/fr/form_line.py
index e315e0007..fc847945d 100644
--- a/src/wiktextract/extractor/fr/form_line.py
+++ b/src/wiktextract/extractor/fr/form_line.py
@@ -5,6 +5,7 @@
 from wiktextract.page import clean_node
 from wiktextract.wxr_context import WiktextractContext
 
+from .conjugation import extract_conjugation
 from .models import Form, Sound, WordEntry
 from .pronunciation import PRON_TEMPLATES, process_pron_template
 
@@ -34,6 +35,8 @@ def extract_form_line(
                 process_zh_mot_template(wxr, node, page_data)
             elif node.template_name == "ja-mot":
                 process_ja_mot_template(wxr, node, page_data)
+            elif node.template_name in ("conj", "conjugaison"):
+                process_conj_template(wxr, node, page_data)
             else:
                 tag = clean_node(wxr, page_data[-1], node)
                 if (
@@ -121,3 +124,42 @@ def process_ja_mot_template(
                     Form(form=form_text, tags=["romanization"])
                 )
             break
+
+
+def process_conj_template(
+    wxr: WiktextractContext,
+    template_node: TemplateNode,
+    page_data: list[WordEntry],
+) -> None:
+    # https://fr.wiktionary.org/wiki/Modèle:conjugaison
+    expanded_node = wxr.wtp.parse(
+        wxr.wtp.node_to_wikitext(template_node), expand_all=True
+    )
+    for link in expanded_node.find_child(NodeKind.LINK):
+        if len(link.largs) == 0:
+            continue
+        conj_title = link.largs[0][0]  # used below as the page title
+        if not conj_title.startswith("Conjugaison:"):
+            continue
+        conj_word = conj_title.split("/", 1)[-1]  # text after the first "/"
+        if conj_word in (
+            "Premier groupe",
+            "Deuxième groupe",
+            "Troisième groupe",
+        ):  # presumably group overview pages, not a verb's own table
+            continue
+        if (
+            len(page_data) > 1
+            and page_data[-2].lang_code == page_data[-1].lang_code
+            and page_data[-2].pos == page_data[-1].pos
+            and len(page_data[-2].forms) > 0
+            and page_data[-2].forms[-1].source == conj_title
+        ):  # previous entry ends with forms from this page: reuse its list
+            page_data[-1].forms = page_data[-2].forms
+        else:
+            extract_conjugation(wxr, page_data[-1], conj_title)
+    # the expanded text, minus the link label suffix, is stored as a tag
+    tag = clean_node(wxr, page_data[-1], expanded_node)
+    if template_node.template_name in ("conj", "conjugaison"):
+        tag = tag.removesuffix("(voir la conjugaison)").strip()
+    page_data[-1].tags.append(tag)
diff --git a/src/wiktextract/extractor/fr/page.py b/src/wiktextract/extractor/fr/page.py
index 03ee06445..1c5200a20 100644
--- a/src/wiktextract/extractor/fr/page.py
+++ b/src/wiktextract/extractor/fr/page.py
@@ -6,7 +6,6 @@
 from wiktextract.page import clean_node
 from wiktextract.wxr_context import WiktextractContext
 
-from .conjugation import extract_conjugation
 from .etymology import EtymologyData, extract_etymology, insert_etymology_data
 from .form_line import extract_form_line
 from .gloss import extract_gloss, process_exemple_template
@@ -152,15 +151,6 @@ def process_pos_block(
 
     form_line_nodes = child_nodes[form_line_start:gloss_start]
     extract_form_line(wxr, page_data, form_line_nodes)
-    if pos_type == "verb":
-        if (
-            len(page_data) > 1
-            and page_data[-2].pos == pos_type
-            and page_data[-2].lang_code == page_data[-1].lang_code
-        ):
-            page_data[-1].forms = page_data[-2].forms
-        else:
-            extract_conjugation(wxr, page_data[-1])
 
 
 def parse_page(
diff --git a/tests/test_fr_conj.py b/tests/test_fr_conj.py
index 82dbc5e63..5b009c0b0 100644
--- a/tests/test_fr_conj.py
+++ b/tests/test_fr_conj.py
@@ -68,32 +68,32 @@ def test_fr_conj_1(self):
 </div>""",
         )
         entry = WordEntry(lang_code="fr", lang="Français", word="lancer")
-        extract_conjugation(self.wxr, entry)
+        extract_conjugation(self.wxr, entry, "Conjugaison:français/lancer")
         self.assertEqual(
             [f.model_dump(exclude_defaults=True) for f in entry.forms],
             [
                 {
                     "form": "lancer",
                     "ipas": ["\\lɑ̃.se\\"],
-                    "source": "Conjugaison page",
+                    "source": "Conjugaison:français/lancer",
                     "tags": ["Modes impersonnels", "Infinitif", "Présent"],
                 },
                 {
                     "form": "avoir lancé",
                     "ipas": ["\\a.vwaʁ lɑ̃.se\\"],
-                    "source": "Conjugaison page",
+                    "source": "Conjugaison:français/lancer",
                     "tags": ["Modes impersonnels", "Infinitif", "Passé"],
                 },
                 {
                     "form": "je lance",
                     "ipas": ["\\ʒə lɑ̃s\\"],
-                    "source": "Conjugaison page",
+                    "source": "Conjugaison:français/lancer",
                     "tags": ["Indicatif", "Présent"],
                 },
                 {
                     "form": "j’ai lancé",
                     "ipas": ["\\ʒ‿e lɑ̃.se\\"],
-                    "source": "Conjugaison page",
+                    "source": "Conjugaison:français/lancer",
                     "tags": ["Indicatif", "Passé composé"],
                 },
             ],
@@ -139,20 +139,20 @@ def test_onglets_conjugaison(self):
 </div>""",
         )
         entry = WordEntry(lang_code="fr", lang="Français", word="s’abattre")
-        extract_conjugation(self.wxr, entry)
+        extract_conjugation(self.wxr, entry, "Conjugaison:français/s’abattre")
         self.assertEqual(
             [f.model_dump(exclude_defaults=True) for f in entry.forms],
             [
                 {
                     "form": "s’abattre",
                     "ipas": ["\\s‿a.batʁ\\"],
-                    "source": "Conjugaison page",
+                    "source": "Conjugaison:français/abattre",
                     "tags": ["Modes impersonnels", "Infinitif", "Présent"],
                 },
                 {
                     "form": "s’être abattu",
                     "ipas": ["\\s‿ɛtʁ‿a.ba.ty\\"],
-                    "source": "Conjugaison page",
+                    "source": "Conjugaison:français/abattre",
                     "tags": ["Modes impersonnels", "Infinitif", "Passé"],
                 },
             ],

From 2655a6a4e72feafe5f5aae57f458830042940cf8 Mon Sep 17 00:00:00 2001
From: xxyzz <gitpull@protonmail.com>
Date: Wed, 10 Jan 2024 15:14:29 +0800
Subject: [PATCH 2/2] Extract "ja-flx-adj*" templates in Conjugaison pages

---
 src/wiktextract/extractor/fr/conjugation.py | 47 +++++++++++++++++++
 src/wiktextract/extractor/fr/form_line.py   |  9 +++-
 src/wiktextract/extractor/fr/models.py      |  6 ++-
 tests/test_fr_conj.py                       | 52 +++++++++++++++++++++
 4 files changed, 112 insertions(+), 2 deletions(-)

diff --git a/src/wiktextract/extractor/fr/conjugation.py b/src/wiktextract/extractor/fr/conjugation.py
index 1b37a9c55..3168f02f0 100644
--- a/src/wiktextract/extractor/fr/conjugation.py
+++ b/src/wiktextract/extractor/fr/conjugation.py
@@ -42,6 +42,10 @@ def extract_conjugation(
             extract_conjugation(
                 wxr, entry, conj_template.template_name[1:], "2"
             )
+        elif conj_template.template_name.startswith("ja-flx-adj"):
+            proces_ja_flx_adj_template(
+                wxr, entry, conj_template, conj_page_title
+            )
 
 
 def process_fr_conj_template(
@@ -194,3 +198,46 @@ def process_fr_conj_wiki_table(
 
             if len(form.form) > 0 and form.form != "—":
                 entry.forms.append(form)
+
+
+def proces_ja_flx_adj_template(
+    wxr: WiktextractContext,
+    entry: WordEntry,
+    template_node: TemplateNode,
+    conj_page_title: str,
+) -> None:
+    # https://fr.wiktionary.org/wiki/Modèle:ja-adj
+    # https://fr.wiktionary.org/wiki/Modèle:ja-flx-adj-な
+    expanded_template = wxr.wtp.parse(
+        wxr.wtp.node_to_wikitext(template_node), expand_all=True
+    )
+    for table_node in expanded_template.find_child(NodeKind.TABLE):
+        first_tag = ""  # text of the most recent header cell
+        for row in table_node.find_child(NodeKind.TABLE_ROW):
+            forms = []
+            tags = [first_tag]
+            for cell_index, row_child in enumerate(
+                row.find_child(NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL)
+            ):
+                row_child_text = clean_node(wxr, None, row_child)
+                if row_child.kind == NodeKind.TABLE_HEADER_CELL:
+                    first_tag = row_child_text
+                else:
+                    for line_index, line in enumerate(
+                        row_child_text.splitlines()
+                    ):
+                        if cell_index == 0:  # first column: tag text
+                            tags.append(line)
+                            continue
+                        if line_index + 1 > len(forms):  # one Form per line
+                            forms.append(
+                                Form(tags=tags, source=conj_page_title)
+                            )
+                        if cell_index == 1:
+                            forms[line_index].form = line
+                        elif cell_index == 2:
+                            forms[line_index].hiragana = line
+                        elif cell_index == 3:
+                            forms[line_index].roman = line
+            # header-only rows leave "forms" empty, so they add nothing
+            entry.forms.extend(forms)
diff --git a/src/wiktextract/extractor/fr/form_line.py b/src/wiktextract/extractor/fr/form_line.py
index fc847945d..e1e800999 100644
--- a/src/wiktextract/extractor/fr/form_line.py
+++ b/src/wiktextract/extractor/fr/form_line.py
@@ -35,7 +35,10 @@ def extract_form_line(
                 process_zh_mot_template(wxr, node, page_data)
             elif node.template_name == "ja-mot":
                 process_ja_mot_template(wxr, node, page_data)
-            elif node.template_name in ("conj", "conjugaison"):
+            elif node.template_name in (
+                "conj",
+                "conjugaison",
+            ) or node.template_name.startswith(("ja-adj-", "ja-verb-")):
                 process_conj_template(wxr, node, page_data)
             else:
                 tag = clean_node(wxr, page_data[-1], node)
@@ -162,4 +165,8 @@ def process_conj_template(
     tag = clean_node(wxr, page_data[-1], expanded_node)
     if template_node.template_name in ("conj", "conjugaison"):
         tag = tag.removesuffix("(voir la conjugaison)").strip()
+    elif template_node.template_name.startswith("ja-"):
+        tag = (
+            tag.removesuffix("(conjugaison)").removesuffix("(flexions)").strip()
+        )
     page_data[-1].tags.append(tag)
diff --git a/src/wiktextract/extractor/fr/models.py b/src/wiktextract/extractor/fr/models.py
index 6984c498e..461a810ec 100644
--- a/src/wiktextract/extractor/fr/models.py
+++ b/src/wiktextract/extractor/fr/models.py
@@ -26,7 +26,11 @@ class Form(FrenchBaseModel):
     form: str = ""
     tags: list[str] = []
     ipas: list[str] = []
-    source: str = Field("", description="Form line template name")
+    source: str = Field(
+        "", description="Form line template name or Conjugaison page title"
+    )
+    hiragana: str = ""
+    roman: str = ""
 
 
 class Sound(FrenchBaseModel):
diff --git a/tests/test_fr_conj.py b/tests/test_fr_conj.py
index 5b009c0b0..ac9f2acc1 100644
--- a/tests/test_fr_conj.py
+++ b/tests/test_fr_conj.py
@@ -157,3 +157,55 @@ def test_onglets_conjugaison(self):
                 },
             ],
         )
+
+    def test_ja_flx_adj(self):
+        # https://fr.wiktionary.org/wiki/Conjugaison:japonais/格好だ
+        self.wxr.wtp.start_page("格好")
+        self.wxr.wtp.add_page(
+            "Conjugaison:japonais/格好だ",
+            116,  # presumably the Conjugaison namespace id
+            "{{ja-flx-adj-な|格好|かっこう|kakkou}}",
+        )
+        self.wxr.wtp.add_page(
+            "Modèle:ja-flx-adj-な",
+            10,  # presumably the template (Modèle) namespace id
+            """<h4>Flexions</h4>
+{|
+|-
+! colspan=\"4\" | '''Formes de base'''
+|-
+| '''Imperfectif''' (<span>未然形</span>) || <span>[[格好だろ]]</span>  || <span>[[かっこうだろ]]</span>  || ''kakkou daro''
+|-
+! colspan=\"4\" | '''Clefs de constructions'''
+|-
+| '''Neutre négatif''' || <span>[[格好ではない]]<br>[[格好じゃない]]</span> || <span>[[かっこうではない]]<br>[[かっこうじゃない]]</span> || ''kakkou dewa nai<br>kakkou ja nai''
+|}""",
+        )
+        entry = WordEntry(lang_code="ja", lang="Japonais", word="格好")
+        extract_conjugation(self.wxr, entry, "Conjugaison:japonais/格好だ")
+        self.assertEqual(
+            [f.model_dump(exclude_defaults=True) for f in entry.forms],
+            [  # the two-line "Neutre négatif" row is expected as two forms
+                {
+                    "form": "格好だろ",
+                    "hiragana": "かっこうだろ",
+                    "roman": "kakkou daro",
+                    "source": "Conjugaison:japonais/格好だ",
+                    "tags": ["Formes de base", "Imperfectif (未然形)"],
+                },
+                {
+                    "form": "格好ではない",
+                    "hiragana": "かっこうではない",
+                    "roman": "kakkou dewa nai",
+                    "source": "Conjugaison:japonais/格好だ",
+                    "tags": ["Clefs de constructions", "Neutre négatif"],
+                },
+                {
+                    "form": "格好じゃない",
+                    "hiragana": "かっこうじゃない",
+                    "roman": "kakkou ja nai",
+                    "source": "Conjugaison:japonais/格好だ",
+                    "tags": ["Clefs de constructions", "Neutre négatif"],
+                },
+            ],
+        )