Skip to content

Commit

Permalink
Extract "ja-flx-adj*" templates in Conjugaison pages
Browse files Browse the repository at this point in the history
  • Loading branch information
xxyzz committed Jan 10, 2024
1 parent ffe2a3d commit 2655a6a
Show file tree
Hide file tree
Showing 4 changed files with 112 additions and 2 deletions.
47 changes: 47 additions & 0 deletions src/wiktextract/extractor/fr/conjugation.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ def extract_conjugation(
extract_conjugation(
wxr, entry, conj_template.template_name[1:], "2"
)
elif conj_template.template_name.startswith("ja-flx-adj"):
proces_ja_flx_adj_template(
wxr, entry, conj_template, conj_page_title
)


def process_fr_conj_template(
Expand Down Expand Up @@ -194,3 +198,46 @@ def process_fr_conj_wiki_table(

if len(form.form) > 0 and form.form != "—":
entry.forms.append(form)


def proces_ja_flx_adj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """Extract forms from the table produced by a "ja-flx-adj*" template.

    The template is expanded and every table in the result is walked row
    by row. A header cell sets the section tag that is applied to the rows
    following it. In a data row, the first cell supplies additional tags
    (one per text line) and cells 1, 2 and 3 supply the written form, the
    hiragana reading and the romanization. A cell holding several lines
    yields one form per line.

    Extracted forms are appended to ``entry.forms`` with ``conj_page_title``
    as their ``source``; nothing is returned.
    """
    # https://fr.wiktionary.org/wiki/Modèle:ja-adj
    # https://fr.wiktionary.org/wiki/Modèle:ja-flx-adj-な
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for table_node in expanded_template.find_child(NodeKind.TABLE):
        first_tag = ""
        for row in table_node.find_child(NodeKind.TABLE_ROW):
            forms = []
            # Rows that come before any header cell have no section tag;
            # do not emit an empty-string tag for them.
            tags = [first_tag] if first_tag else []
            for cell_index, row_child in enumerate(
                row.find_child(NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL)
            ):
                row_child_text = clean_node(wxr, None, row_child)
                if row_child.kind == NodeKind.TABLE_HEADER_CELL:
                    # Only affects the rows after this one: ``tags`` for
                    # the current row was already built above.
                    first_tag = row_child_text
                    continue
                for line_index, line in enumerate(row_child_text.splitlines()):
                    if cell_index == 0:
                        # Row label cell: every line is a tag.
                        tags.append(line)
                        continue
                    # Lines are visited in order, so at most one new form
                    # is needed to make ``forms[line_index]`` valid.
                    if line_index + 1 > len(forms):
                        forms.append(Form(tags=tags, source=conj_page_title))
                    if cell_index == 1:
                        forms[line_index].form = line
                    elif cell_index == 2:
                        forms[line_index].hiragana = line
                    elif cell_index == 3:
                        forms[line_index].roman = line

            # A row whose form column is empty would otherwise produce
            # entries that carry only a reading; skip those.
            entry.forms.extend(form for form in forms if form.form)
9 changes: 8 additions & 1 deletion src/wiktextract/extractor/fr/form_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,10 @@ def extract_form_line(
process_zh_mot_template(wxr, node, page_data)
elif node.template_name == "ja-mot":
process_ja_mot_template(wxr, node, page_data)
elif node.template_name in ("conj", "conjugaison"):
elif node.template_name in (
"conj",
"conjugaison",
) or node.template_name.startswith(("ja-adj-", "ja-verb-")):
process_conj_template(wxr, node, page_data)
else:
tag = clean_node(wxr, page_data[-1], node)
Expand Down Expand Up @@ -162,4 +165,8 @@ def process_conj_template(
tag = clean_node(wxr, page_data[-1], expanded_node)
if template_node.template_name in ("conj", "conjugaison"):
tag = tag.removesuffix("(voir la conjugaison)").strip()
elif template_node.template_name.startswith("ja-"):
tag = (
tag.removesuffix("(conjugaison)").removesuffix("(flexions)").strip()
)
page_data[-1].tags.append(tag)
6 changes: 5 additions & 1 deletion src/wiktextract/extractor/fr/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,11 @@ class Form(FrenchBaseModel):
form: str = ""
tags: list[str] = []
ipas: list[str] = []
source: str = Field("", description="Form line template name")
source: str = Field(
"", description="Form line template name or Conjugaison page title"
)
hiragana: str = ""
roman: str = ""


class Sound(FrenchBaseModel):
Expand Down
52 changes: 52 additions & 0 deletions tests/test_fr_conj.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,3 +157,55 @@ def test_onglets_conjugaison(self):
},
],
)

def test_ja_flx_adj(self):
    """Forms are extracted from a "ja-flx-adj-な" conjugation table.

    The fixture has one single-line row and one row whose cells each hold
    two lines; the latter is expected to yield two separate forms that
    share the same tags.
    """
    # https://fr.wiktionary.org/wiki/Conjugaison:japonais/格好だ
    conj_title = "Conjugaison:japonais/格好だ"
    template_body = """<h4>Flexions</h4>
{|
|-
! colspan=\"4\" | '''Formes de base'''
|-
| '''Imperfectif''' (<span>未然形</span>) || <span>[[格好だろ]]</span> || <span>[[かっこうだろ]]</span> || ''kakkou daro''
|-
! colspan=\"4\" | '''Clefs de constructions'''
|-
| '''Neutre négatif''' || <span>[[格好ではない]]<br>[[格好じゃない]]</span> || <span>[[かっこうではない]]<br>[[かっこうじゃない]]</span> || ''kakkou dewa nai<br>kakkou ja nai''
|}"""

    self.wxr.wtp.start_page("格好")
    self.wxr.wtp.add_page(
        conj_title,
        116,
        "{{ja-flx-adj-な|格好|かっこう|kakkou}}",
    )
    self.wxr.wtp.add_page("Modèle:ja-flx-adj-な", 10, template_body)

    word_entry = WordEntry(lang_code="ja", lang="Japonais", word="格好")
    extract_conjugation(self.wxr, word_entry, conj_title)
    dumped_forms = [
        form.model_dump(exclude_defaults=True) for form in word_entry.forms
    ]

    expected_forms = [
        {
            "form": "格好だろ",
            "hiragana": "かっこうだろ",
            "roman": "kakkou daro",
            "source": conj_title,
            "tags": ["Formes de base", "Imperfectif (未然形)"],
        },
        {
            "form": "格好ではない",
            "hiragana": "かっこうではない",
            "roman": "kakkou dewa nai",
            "source": conj_title,
            "tags": ["Clefs de constructions", "Neutre négatif"],
        },
        {
            "form": "格好じゃない",
            "hiragana": "かっこうじゃない",
            "roman": "kakkou ja nai",
            "source": conj_title,
            "tags": ["Clefs de constructions", "Neutre négatif"],
        },
    ]
    self.assertEqual(dumped_forms, expected_forms)

0 comments on commit 2655a6a

Please sign in to comment.