Skip to content

Commit

Permalink
Merge pull request #457 from xxyzz/fr
Browse files Browse the repository at this point in the history
Extract "ja-*" Conjugaison template tables
  • Loading branch information
xxyzz authored Jan 10, 2024
2 parents 7ba706a + 19dc0a1 commit 7c67235
Show file tree
Hide file tree
Showing 4 changed files with 164 additions and 2 deletions.
59 changes: 59 additions & 0 deletions src/wiktextract/extractor/fr/conjugation.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ def extract_conjugation(
proces_ja_flx_adj_template(
wxr, entry, conj_template, conj_page_title
)
elif conj_template.template_name.startswith("ja-"):
proces_ja_conj_template(wxr, entry, conj_template, conj_page_title)


def process_fr_conj_template(
Expand Down Expand Up @@ -241,3 +243,60 @@ def proces_ja_flx_adj_template(
forms[line_index].roman = line

entry.forms.extend(forms)


def proces_ja_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """Extract forms from the tables of an expanded "ja-*" template.

    The template is expanded and re-parsed, then every table row is turned
    into one ``Form`` appended to ``entry.forms``: the first data cell is
    stored as ``form``, the second as ``hiragana`` and the third as
    ``roman``.  Tags come from the header cells: a row made of a single
    header cell sets the section tag used by the rows after it, and other
    header cells apply to as many rows as their ``rowspan`` says.

    Args:
        wxr: Context used to expand/parse wikitext and clean nodes.
        entry: Word entry whose ``forms`` list receives the results.
        template_node: The "ja-*" conjugation template to expand.
        conj_page_title: Conjugation page title, stored as ``Form.source``.
    """
    # https://fr.wiktionary.org/wiki/Modèle:ja-verbe-conj
    # Modèle:ja-在る
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for table_node in expanded_template.find_child(NodeKind.TABLE):
        first_tag = ""
        # header text -> number of rows (current one included) that the
        # header still applies to
        row_headers = {}
        for row in table_node.find_child(NodeKind.TABLE_ROW):
            if len(row.children) > 1 and all(
                isinstance(c, WikiNode)
                and c.kind == NodeKind.TABLE_HEADER_CELL
                for c in row.children
            ):
                # skip header row of the "Clefs de constructions" table
                continue

            for header in row.find_child(NodeKind.TABLE_HEADER_CELL):
                header_text = clean_node(wxr, None, header)
                if len(row.children) == 1:
                    first_tag = header_text
                else:
                    try:
                        rowspan = int(header.attrs.get("rowspan", "1"))
                    except ValueError:
                        # malformed attribute value: treat the header as
                        # covering only this row instead of aborting
                        rowspan = 1
                    row_headers[header_text] = rowspan

            # never emit empty-string tags
            tags = [first_tag] if first_tag else []
            # iterate over a snapshot because expired headers are deleted
            for tag, rowspan in list(row_headers.items()):
                if tag:
                    tags.append(tag)
                if rowspan == 1:
                    del row_headers[tag]
                else:
                    row_headers[tag] = rowspan - 1

            form = Form(tags=tags, source=conj_page_title)
            for cell_index, cell in enumerate(
                row.find_child(NodeKind.TABLE_CELL)
            ):
                cell_text = clean_node(wxr, None, cell)
                if cell_index == 0:
                    form.form = cell_text
                elif cell_index == 1:
                    form.hiragana = cell_text
                elif cell_index == 2:
                    form.roman = cell_text
            # rows without a data cell (e.g. section headers) yield no form
            if form.form:
                entry.forms.append(form)
9 changes: 7 additions & 2 deletions src/wiktextract/extractor/fr/form_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def extract_form_line(
elif node.template_name in (
"conj",
"conjugaison",
) or node.template_name.startswith(("ja-adj-", "ja-verb-")):
) or node.template_name.startswith(("ja-adj-", "ja-verbe")):
process_conj_template(wxr, node, page_data)
else:
tag = clean_node(wxr, page_data[-1], node)
Expand All @@ -54,6 +54,10 @@ def extract_form_line(
page_data[-1].tags.append(tag.strip("()"))

pre_template_name = node.template_name
elif isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
tag = clean_node(wxr, None, node)
if tag != "ou":
page_data[-1].tags.append(tag)


def process_equiv_pour_template(
Expand Down Expand Up @@ -169,4 +173,5 @@ def process_conj_template(
tag = (
tag.removesuffix("(conjugaison)").removesuffix("(flexions)").strip()
)
page_data[-1].tags.append(tag)
if len(tag) > 0:
page_data[-1].tags.append(tag)
91 changes: 91 additions & 0 deletions tests/test_fr_conj.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,3 +209,94 @@ def test_ja_flx_adj(self):
},
],
)

def test_ja_conj(self):
    """Forms and tags are extracted from a "ja-*" conjugation table."""
    # https://fr.wiktionary.org/wiki/Conjugaison:japonais/在る
    conj_title = "Conjugaison:japonais/在る"
    self.wxr.wtp.start_page("在る")
    self.wxr.wtp.add_page(conj_title, 116, "{{ja-在る}}")
    template_body = """{|
! colspan=\"7\" | '''Formes de base'''
|-
! colspan=\"4\" | '''L'inaccompli'''
| <bdi>[[在る#ja|在る]]</bdi>
| <bdi>[[ある#ja|ある]]</bdi>
| ''aru\n''
|-
! colspan=\"4\" | '''Imperfectif''' (<bdi>[[未然形#ja-nom|未然形]]</bdi>, <bdi>''mizen-kei''</bdi>)
| <bdi>[[無い#ja|無い]]</bdi>
| <bdi>[[ない#ja|ない]]</bdi>
| ''nai\n''
|-
! colspan=\"7\" | '''Clefs de constructions'''
|-
! colspan=\"2\" | Temps
! Forme
! Terme
! [[kanji|Kanji]]
! [[hiragana|Hiragana]]
! [[romaji|Rōmaji]]
|-
! rowspan=\"4\" colspan=\"2\" | Présent / Futur
! rowspan=\"2\" | poli
! affirmatif
| <bdi>[[在ります#ja|在ります]]</bdi>
| <bdi>[[あります#ja|あります]]</bdi>
| ''arimasu\n''
|-
! négatif
| <bdi>[[在りません#ja|在りません]]</bdi>
| <bdi>[[ありません#ja|ありません]]</bdi>
| ''arimasen\n''
|}"""
    self.wxr.wtp.add_page("Modèle:ja-在る", 10, template_body)

    word_entry = WordEntry(lang_code="ja", lang="Japonais", word="在る")
    extract_conjugation(self.wxr, word_entry, conj_title)
    dumped_forms = [
        form.model_dump(exclude_defaults=True) for form in word_entry.forms
    ]

    expected_forms = [
        {
            "form": "在る",
            "hiragana": "ある",
            "roman": "aru",
            "source": "Conjugaison:japonais/在る",
            "tags": ["Formes de base", "L'inaccompli"],
        },
        {
            "form": "無い",
            "hiragana": "ない",
            "roman": "nai",
            "source": "Conjugaison:japonais/在る",
            "tags": [
                "Formes de base",
                "Imperfectif (未然形, mizen-kei)",
            ],
        },
        {
            "form": "在ります",
            "hiragana": "あります",
            "roman": "arimasu",
            "source": "Conjugaison:japonais/在る",
            "tags": [
                "Clefs de constructions",
                "Présent / Futur",
                "poli",
                "affirmatif",
            ],
        },
        {
            "form": "在りません",
            "hiragana": "ありません",
            "roman": "arimasen",
            "source": "Conjugaison:japonais/在る",
            "tags": [
                "Clefs de constructions",
                "Présent / Futur",
                "poli",
                "négatif",
            ],
        },
    ]
    self.assertEqual(dumped_forms, expected_forms)
7 changes: 7 additions & 0 deletions tests/test_fr_form_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,3 +141,10 @@ def test_equiv_pour_template(self, mock_node_to_wikitext):
],
},
)

def test_italic_tag(self):
    """Italic text on the form line ends up in the entry's tags."""
    word = "飢える"
    self.wxr.wtp.start_page(word)
    entries = [WordEntry(word=word, lang_code="ja", lang="Japonais")]
    parsed = self.wxr.wtp.parse("'''飢える''' ''ichidan''")
    extract_form_line(self.wxr, entries, parsed.children)
    last_entry = entries[-1]
    self.assertEqual(last_entry.tags, ["ichidan"])

0 comments on commit 7c67235

Please sign in to comment.