Merge pull request #457 from xxyzz/fr

Extract "ja-*" Conjugaison template tables
tatuylonen · Jan 10, 2024 · 7c67235 · 7c67235
2 parents 7ba706a + 19dc0a1
commit 7c67235
Show file tree

Hide file tree

Showing 4 changed files with 164 additions and 2 deletions.
diff --git a/src/wiktextract/extractor/fr/conjugation.py b/src/wiktextract/extractor/fr/conjugation.py
@@ -46,6 +46,8 @@ def extract_conjugation(
             proces_ja_flx_adj_template(
                 wxr, entry, conj_template, conj_page_title
             )
+        elif conj_template.template_name.startswith("ja-"):
+            proces_ja_conj_template(wxr, entry, conj_template, conj_page_title)
 
 
 def process_fr_conj_template(
@@ -241,3 +243,60 @@ def proces_ja_flx_adj_template(
                             forms[line_index].roman = line
 
             entry.forms.extend(forms)
+
+
+def proces_ja_conj_template(
+    wxr: WiktextractContext,
+    entry: WordEntry,
+    template_node: TemplateNode,
+    conj_page_title: str,
+) -> None:
+    # Extract forms from the tables of a "ja-*" conjugation template, see https://fr.wiktionary.org/wiki/Modèle:ja-verbe-conj
+    # and Modèle:ja-在る; every row that yields a non-empty form text is appended to `entry.forms`.
+    expanded_template = wxr.wtp.parse(
+        wxr.wtp.node_to_wikitext(template_node), expand_all=True
+    )
+    for table_node in expanded_template.find_child(NodeKind.TABLE):
+        first_tag = ""  # text of the latest header cell that was the only child of its row
+        row_headers = {}  # header text -> number of rows (this one included) it still applies to
+        for row in table_node.find_child(NodeKind.TABLE_ROW):
+            if (
+                all(
+                    isinstance(c, WikiNode)
+                    and c.kind == NodeKind.TABLE_HEADER_CELL
+                    for c in row.children
+                )
+                and len(row.children) > 1
+            ):
+                # skip rows made only of several header cells, e.g. the column-title row of the "Clefs de constructions" table
+                continue
+
+            for header in row.find_child(NodeKind.TABLE_HEADER_CELL):
+                header_text = clean_node(wxr, None, header)
+                if len(row.children) == 1:
+                    first_tag = header_text
+                else:
+                    row_headers[header_text] = int(
+                        header.attrs.get("rowspan", "1")
+                    )
+
+            tags = [first_tag]
+            for tag, rowspan in row_headers.copy().items():  # iterate a copy: entries are deleted in the loop
+                tags.append(tag)
+                if rowspan == 1:
+                    del row_headers[tag]  # this row was the last one covered by the header
+                else:
+                    row_headers[tag] = rowspan - 1
+            form = Form(tags=tags, source=conj_page_title)
+            for cell_index, cell in enumerate(
+                row.find_child(NodeKind.TABLE_CELL)
+            ):
+                cell_text = clean_node(wxr, None, cell)  # data cells map by position: 0 form, 1 hiragana, 2 roman
+                if cell_index == 0:
+                    form.form = cell_text
+                elif cell_index == 1:
+                    form.hiragana = cell_text
+                elif cell_index == 2:
+                    form.roman = cell_text
+            if len(form.form) > 0:  # rows that produced no form text are not recorded
+                entry.forms.append(form)
diff --git a/src/wiktextract/extractor/fr/form_line.py b/src/wiktextract/extractor/fr/form_line.py
@@ -38,7 +38,7 @@ def extract_form_line(
             elif node.template_name in (
                 "conj",
                 "conjugaison",
-            ) or node.template_name.startswith(("ja-adj-", "ja-verb-")):
+            ) or node.template_name.startswith(("ja-adj-", "ja-verbe")):
                 process_conj_template(wxr, node, page_data)
             else:
                 tag = clean_node(wxr, page_data[-1], node)
@@ -54,6 +54,10 @@ def extract_form_line(
                     page_data[-1].tags.append(tag.strip("()"))
 
             pre_template_name = node.template_name
+        elif isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
+            tag = clean_node(wxr, None, node)
+            if tag != "ou":
+                page_data[-1].tags.append(tag)
 
 
 def process_equiv_pour_template(
@@ -169,4 +173,5 @@ def process_conj_template(
         tag = (
             tag.removesuffix("(conjugaison)").removesuffix("(flexions)").strip()
         )
-    page_data[-1].tags.append(tag)
+    if len(tag) > 0:
+        page_data[-1].tags.append(tag)
diff --git a/tests/test_fr_conj.py b/tests/test_fr_conj.py
@@ -209,3 +209,94 @@ def test_ja_flx_adj(self):
                 },
             ],
         )
+
+    def test_ja_conj(self):
+        # See https://fr.wiktionary.org/wiki/Conjugaison:japonais/在る; the conjugation page added below only transcludes {{ja-在る}}
+        self.wxr.wtp.start_page("在る")
+        self.wxr.wtp.add_page("Conjugaison:japonais/在る", 116, "{{ja-在る}}")
+        self.wxr.wtp.add_page(  # template body: one wikitext table with two titled sections
+            "Modèle:ja-在る",
+            10,
+            """{|
+! colspan=\"7\" | '''Formes de base'''
+|-
+! colspan=\"4\" | '''L'inaccompli'''
+| <bdi>[[在る#ja|在る]]</bdi>
+| <bdi>[[ある#ja|ある]]</bdi>
+| ''aru\n''
+|-
+! colspan=\"4\" | '''Imperfectif''' (<bdi>[[未然形#ja-nom|未然形]]</bdi>, <bdi>''mizen-kei''</bdi>)
+| <bdi>[[無い#ja|無い]]</bdi>
+| <bdi>[[ない#ja|ない]]</bdi>
+| ''nai\n''
+|-
+! colspan=\"7\" | '''Clefs de constructions'''
+|-
+! colspan=\"2\" | Temps
+! Forme
+! Terme
+! [[kanji|Kanji]]
+! [[hiragana|Hiragana]]
+! [[romaji|Rōmaji]]
+|-
+! rowspan=\"4\" colspan=\"2\" | Présent / Futur
+! rowspan=\"2\" | poli
+! affirmatif
+| <bdi>[[在ります#ja|在ります]]</bdi>
+| <bdi>[[あります#ja|あります]]</bdi>
+| ''arimasu\n''
+|-
+! négatif
+| <bdi>[[在りません#ja|在りません]]</bdi>
+| <bdi>[[ありません#ja|ありません]]</bdi>
+| ''arimasen\n''
+|}""",
+        )
+        entry = WordEntry(lang_code="ja", lang="Japonais", word="在る")
+        extract_conjugation(self.wxr, entry, "Conjugaison:japonais/在る")  # fills entry.forms
+        self.assertEqual(  # one dict per data row; the section-title and column-title rows yield no form
+            [f.model_dump(exclude_defaults=True) for f in entry.forms],
+            [
+                {
+                    "form": "在る",
+                    "hiragana": "ある",
+                    "roman": "aru",
+                    "source": "Conjugaison:japonais/在る",
+                    "tags": ["Formes de base", "L'inaccompli"],
+                },
+                {
+                    "form": "無い",
+                    "hiragana": "ない",
+                    "roman": "nai",
+                    "source": "Conjugaison:japonais/在る",
+                    "tags": [
+                        "Formes de base",
+                        "Imperfectif (未然形, mizen-kei)",
+                    ],
+                },
+                {
+                    "form": "在ります",
+                    "hiragana": "あります",
+                    "roman": "arimasu",
+                    "source": "Conjugaison:japonais/在る",
+                    "tags": [
+                        "Clefs de constructions",
+                        "Présent / Futur",
+                        "poli",
+                        "affirmatif",
+                    ],
+                },
+                {
+                    "form": "在りません",
+                    "hiragana": "ありません",
+                    "roman": "arimasen",
+                    "source": "Conjugaison:japonais/在る",
+                    "tags": [  # "Présent / Futur" and "poli" carry over from the previous row through rowspan
+                        "Clefs de constructions",
+                        "Présent / Futur",
+                        "poli",
+                        "négatif",
+                    ],
+                },
+            ],
+        )
diff --git a/tests/test_fr_form_line.py b/tests/test_fr_form_line.py
@@ -141,3 +141,10 @@ def test_equiv_pour_template(self, mock_node_to_wikitext):
                 ],
             },
         )
+
+    def test_italic_tag(self):  # italic text on the form line is expected to end up in the entry's tags
+        self.wxr.wtp.start_page("飢える")
+        page_data = [WordEntry(word="飢える", lang_code="ja", lang="Japonais")]
+        root = self.wxr.wtp.parse("'''飢える''' ''ichidan''")  # bold headword followed by italic text
+        extract_form_line(self.wxr, page_data, root.children)
+        self.assertEqual(page_data[-1].tags, ["ichidan"])  # only the italic text is expected as a tag