Extract "ja-flx-adj*" templates in Conjugaison pages

tatuylonen · Jan 10, 2024 · 2655a6a
1 parent ffe2a3d
commit 2655a6a
Show file tree

Hide file tree

Showing 4 changed files with 112 additions and 2 deletions.
diff --git a/src/wiktextract/extractor/fr/conjugation.py b/src/wiktextract/extractor/fr/conjugation.py
@@ -42,6 +42,10 @@ def extract_conjugation(
             extract_conjugation(
                 wxr, entry, conj_template.template_name[1:], "2"
             )
+        elif conj_template.template_name.startswith("ja-flx-adj"):
+            proces_ja_flx_adj_template(
+                wxr, entry, conj_template, conj_page_title
+            )
 
 
 def process_fr_conj_template(
@@ -194,3 +198,46 @@ def process_fr_conj_wiki_table(
 
             if len(form.form) > 0 and form.form != "—":
                 entry.forms.append(form)
+
+
+def proces_ja_flx_adj_template(
+    wxr: WiktextractContext,
+    entry: WordEntry,
+    template_node: TemplateNode,
+    conj_page_title: str,
+) -> None:
+    """Append the forms of an expanded "ja-flx-adj*" table to `entry.forms`.
+
+    A header cell sets the group tag used by the rows after it. In other
+    rows, cell 0 supplies extra tags and cells 1-3 supply the form, its
+    hiragana and its romanization; each text line of a cell is one form.
+    """
+    # https://fr.wiktionary.org/wiki/Modèle:ja-adj
+    # https://fr.wiktionary.org/wiki/Modèle:ja-flx-adj-な
+    expanded_template = wxr.wtp.parse(
+        wxr.wtp.node_to_wikitext(template_node), expand_all=True
+    )
+    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
+    cell_fields = {1: "form", 2: "hiragana", 3: "roman"}
+    for table_node in expanded_template.find_child(NodeKind.TABLE):
+        first_tag = ""
+        for row in table_node.find_child(NodeKind.TABLE_ROW):
+            forms = []
+            # No header row seen yet: do not emit an empty-string tag.
+            tags = [first_tag] if first_tag else []
+            for cell_index, cell in enumerate(row.find_child(cell_kinds)):
+                cell_text = clean_node(wxr, None, cell)
+                if cell.kind == NodeKind.TABLE_HEADER_CELL:
+                    first_tag = cell_text
+                    continue
+                for line_index, line in enumerate(cell_text.splitlines()):
+                    if cell_index == 0:
+                        tags.append(line)
+                        continue
+                    if line_index >= len(forms):
+                        forms.append(Form(tags=tags, source=conj_page_title))
+                    if cell_index in cell_fields:
+                        field_name = cell_fields[cell_index]
+                        setattr(forms[line_index], field_name, line)
+
+            entry.forms.extend(forms)
diff --git a/src/wiktextract/extractor/fr/form_line.py b/src/wiktextract/extractor/fr/form_line.py
@@ -35,7 +35,10 @@ def extract_form_line(
                 process_zh_mot_template(wxr, node, page_data)
             elif node.template_name == "ja-mot":
                 process_ja_mot_template(wxr, node, page_data)
-            elif node.template_name in ("conj", "conjugaison"):
+            elif node.template_name in (
+                "conj",
+                "conjugaison",
+            ) or node.template_name.startswith(("ja-adj-", "ja-verb-")):
                 process_conj_template(wxr, node, page_data)
             else:
                 tag = clean_node(wxr, page_data[-1], node)
@@ -162,4 +165,8 @@ def process_conj_template(
     tag = clean_node(wxr, page_data[-1], expanded_node)
     if template_node.template_name in ("conj", "conjugaison"):
         tag = tag.removesuffix("(voir la conjugaison)").strip()
+    elif template_node.template_name.startswith("ja-"):
+        tag = (
+            tag.removesuffix("(conjugaison)").removesuffix("(flexions)").strip()
+        )
     page_data[-1].tags.append(tag)
diff --git a/src/wiktextract/extractor/fr/models.py b/src/wiktextract/extractor/fr/models.py
@@ -26,7 +26,11 @@ class Form(FrenchBaseModel):
     form: str = ""
     tags: list[str] = []
     ipas: list[str] = []
-    source: str = Field("", description="Form line template name")
+    source: str = Field(
+        "", description="Form line template name or Conjugaison page title"
+    )
+    hiragana: str = ""
+    roman: str = ""
 
 
 class Sound(FrenchBaseModel):

diff --git a/tests/test_fr_conj.py b/tests/test_fr_conj.py
@@ -157,3 +157,55 @@ def test_onglets_conjugaison(self):
                 },
             ],
         )
+
+    def test_ja_flx_adj(self):
+        # The "ja-flx-adj-な" table on the Conjugaison page is turned into
+        # forms; a cell with two <br>-separated lines yields two forms.
+        # https://fr.wiktionary.org/wiki/Conjugaison:japonais/格好だ
+        conj_title = "Conjugaison:japonais/格好だ"
+        wtp = self.wxr.wtp
+        wtp.start_page("格好")
+        wtp.add_page(conj_title, 116, "{{ja-flx-adj-な|格好|かっこう|kakkou}}")
+        wtp.add_page(
+            "Modèle:ja-flx-adj-な",
+            10,
+            """<h4>Flexions</h4>
+{|
+|-
+! colspan=\"4\" | '''Formes de base'''
+|-
+| '''Imperfectif''' (<span>未然形</span>) || <span>[[格好だろ]]</span>  || <span>[[かっこうだろ]]</span>  || ''kakkou daro''
+|-
+! colspan=\"4\" | '''Clefs de constructions'''
+|-
+| '''Neutre négatif''' || <span>[[格好ではない]]<br>[[格好じゃない]]</span> || <span>[[かっこうではない]]<br>[[かっこうじゃない]]</span> || ''kakkou dewa nai<br>kakkou ja nai''
+|}""",
+        )
+        entry = WordEntry(lang_code="ja", lang="Japonais", word="格好")
+        extract_conjugation(self.wxr, entry, conj_title)
+        # Expected dumps list only the fields the extractor filled in.
+        expected = [
+            {
+                "form": "格好だろ",
+                "hiragana": "かっこうだろ",
+                "roman": "kakkou daro",
+                "source": "Conjugaison:japonais/格好だ",
+                "tags": ["Formes de base", "Imperfectif (未然形)"],
+            },
+            {
+                "form": "格好ではない",
+                "hiragana": "かっこうではない",
+                "roman": "kakkou dewa nai",
+                "source": "Conjugaison:japonais/格好だ",
+                "tags": ["Clefs de constructions", "Neutre négatif"],
+            },
+            {
+                "form": "格好じゃない",
+                "hiragana": "かっこうじゃない",
+                "roman": "kakkou ja nai",
+                "source": "Conjugaison:japonais/格好だ",
+                "tags": ["Clefs de constructions", "Neutre négatif"],
+            },
+        ]
+        dumped = [f.model_dump(exclude_defaults=True) for f in entry.forms]
+        self.assertEqual(dumped, expected)