Skip to content

Commit

Permalink
Extract Conjugaison pages through "conj" form line template
Browse files Browse the repository at this point in the history
  • Loading branch information
xxyzz committed Jan 10, 2024
1 parent 9ca5eff commit ffe2a3d
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 55 deletions.
80 changes: 43 additions & 37 deletions src/wiktextract/extractor/fr/conjugation.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
from typing import Optional

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import HTMLNode, TemplateNode
from wiktextract.page import clean_node
Expand All @@ -11,7 +9,7 @@
def extract_conjugation(
wxr: WiktextractContext,
entry: WordEntry,
word: str = "",
conj_page_title: str,
select_template: str = "1",
) -> None:
"""
Expand All @@ -21,26 +19,36 @@ def extract_conjugation(
https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Français/Conjugaison
https://fr.wiktionary.org/wiki/Aide:Conjugaisons
"""
conj_ns = wxr.wtp.NAMESPACE_DATA["Conjugaison"]
if len(word) == 0:
word = entry.word
conj_page_title = f"{conj_ns['name']}:{entry.lang.lower()}/{word}"
conj_page = wxr.wtp.get_page_body(conj_page_title, conj_ns["id"])
conj_page = wxr.wtp.get_page_body(
conj_page_title, wxr.wtp.NAMESPACE_DATA["Conjugaison"]["id"]
)
if conj_page is None:
return
conj_root = wxr.wtp.parse(conj_page)
for conj_template in conj_root.find_child(NodeKind.TEMPLATE):
if conj_template.template_name.startswith("fr-conj-"):
process_fr_conj_template(wxr, entry, conj_template)
process_fr_conj_template(wxr, entry, conj_template, conj_page_title)
elif conj_template.template_name == "Onglets conjugaison":
process_onglets_template(wxr, entry, conj_template, select_template)
# https://fr.wiktionary.org/wiki/Modèle:Onglets_conjugaison
# this template expands to two tabs of tables
selected_template = conj_template.template_parameters.get(
f"contenu{select_template}"
)
if selected_template is not None:
process_fr_conj_template(
wxr, entry, selected_template, conj_page_title
)
elif conj_template.template_name.startswith(":Conjugaison:"):
word = conj_template.template_name.rsplit("/", 1)[-1]
extract_conjugation(wxr, entry, word, "2")
extract_conjugation(
wxr, entry, conj_template.template_name[1:], "2"
)


def process_fr_conj_template(
wxr: WiktextractContext, entry: WordEntry, template_node: TemplateNode
wxr: WiktextractContext,
entry: WordEntry,
template_node: TemplateNode,
conj_page_title: str,
) -> None:
# https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_conjugaison_en_français
# https://fr.wiktionary.org/wiki/Modèle:fr-conj-1-ger
Expand All @@ -54,13 +62,20 @@ def process_fr_conj_template(
h3_text = clean_node(wxr, None, node)
elif node.tag == "div":
if h3_text == "Modes impersonnels":
process_fr_conj_modes_table(wxr, entry, node)
process_fr_conj_modes_table(
wxr, entry, node, conj_page_title
)
else:
process_fr_conj_table(wxr, entry, node, h3_text)
process_fr_conj_table(
wxr, entry, node, h3_text, conj_page_title
)


def process_fr_conj_modes_table(
wxr: WiktextractContext, entry: WordEntry, div_node: HTMLNode
wxr: WiktextractContext,
entry: WordEntry,
div_node: HTMLNode,
conj_page_title: str,
) -> None:
# the first "Modes impersonnels" table
for table_node in div_node.find_child(NodeKind.TABLE):
Expand All @@ -81,7 +96,7 @@ def process_fr_conj_modes_table(
form=form_text,
tags=tags.copy(),
ipas=[clean_node(wxr, None, cell)],
source="Conjugaison page",
source=conj_page_title,
)
form.tags.append("Présent" if cell_index == 3 else "Passé")
entry.forms.append(form)
Expand All @@ -93,7 +108,11 @@ def process_fr_conj_modes_table(


def process_fr_conj_table(
wxr: WiktextractContext, entry: WordEntry, div_node: HTMLNode, h3_text: str
wxr: WiktextractContext,
entry: WordEntry,
div_node: HTMLNode,
h3_text: str,
conj_page_title: str,
) -> None:
for table_node in div_node.find_child(NodeKind.TABLE):
for row_index, row in enumerate(
Expand All @@ -109,11 +128,11 @@ def process_fr_conj_table(
and cell_child.tag == "table"
):
process_fr_conj_html_table(
wxr, entry, cell_child, h3_text
wxr, entry, cell_child, h3_text, conj_page_title
)
elif cell_child.kind == NodeKind.TABLE:
process_fr_conj_wiki_table(
wxr, entry, cell_child, h3_text
wxr, entry, cell_child, h3_text, conj_page_title
)


Expand All @@ -122,13 +141,14 @@ def process_fr_conj_html_table(
entry: WordEntry,
table_node: HTMLNode,
h3_text: str,
conj_page_title: str,
):
tags = [h3_text]
for tr_index, tr_node in enumerate(table_node.find_html_recursively("tr")):
if tr_index == 0:
tags.append(clean_node(wxr, None, tr_node.children))
else:
form = Form(tags=tags, source="Conjugaison page")
form = Form(tags=tags, source=conj_page_title)
for td_index, td_node in enumerate(
tr_node.find_html_recursively("td")
):
Expand All @@ -153,13 +173,14 @@ def process_fr_conj_wiki_table(
entry: WordEntry,
table_node: WikiNode,
h3_text: str,
conj_page_title: str,
):
tags = [h3_text]
for row_index, row in enumerate(table_node.find_child(NodeKind.TABLE_ROW)):
if row_index == 0:
tags.append(clean_node(wxr, None, row.children))
else:
form = Form(tags=tags, source="Conjugaison page")
form = Form(tags=tags, source=conj_page_title)
for cell_index, cell in enumerate(
row.find_child(NodeKind.TABLE_CELL)
):
Expand All @@ -173,18 +194,3 @@ def process_fr_conj_wiki_table(

if len(form.form) > 0 and form.form != "—":
entry.forms.append(form)


def process_onglets_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    select: str,
) -> None:
    """Process one tab of an "Onglets conjugaison" template.

    Looks up the ``contenu<select>`` parameter of ``template_node`` and, when
    it is present, hands its value to ``process_fr_conj_template``. Nothing
    happens when the parameter is missing.
    """
    # https://fr.wiktionary.org/wiki/Modèle:Onglets_conjugaison
    # this template expands to two tabs of tables
    tab_key = f"contenu{select}"
    tab_content = template_node.template_parameters.get(tab_key)
    if tab_content is None:
        return
    process_fr_conj_template(wxr, entry, tab_content)
42 changes: 42 additions & 0 deletions src/wiktextract/extractor/fr/form_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

from .conjugation import extract_conjugation
from .models import Form, Sound, WordEntry
from .pronunciation import PRON_TEMPLATES, process_pron_template

Expand Down Expand Up @@ -34,6 +35,8 @@ def extract_form_line(
process_zh_mot_template(wxr, node, page_data)
elif node.template_name == "ja-mot":
process_ja_mot_template(wxr, node, page_data)
elif node.template_name in ("conj", "conjugaison"):
process_conj_template(wxr, node, page_data)
else:
tag = clean_node(wxr, page_data[-1], node)
if (
Expand Down Expand Up @@ -121,3 +124,42 @@ def process_ja_mot_template(
Form(form=form_text, tags=["romanization"])
)
break


def process_conj_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Handle a "conj"/"conjugaison" template found on a form line.

    The template is expanded, and each link in the expansion whose target
    starts with "Conjugaison:" (other than the three verb group pages) is
    used to fill the forms of the last entry in ``page_data``: the forms of
    the previous entry are reused when that entry has the same language code
    and part of speech and its last form came from the same conjugation
    page; otherwise ``extract_conjugation`` is called with the link target.
    Finally the cleaned text of the expansion is appended to the last
    entry's tags, without a trailing "(voir la conjugaison)".
    """
    # https://fr.wiktionary.org/wiki/Modèle:conjugaison
    group_pages = ("Premier groupe", "Deuxième groupe", "Troisième groupe")
    template_wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_node = wxr.wtp.parse(template_wikitext, expand_all=True)

    for link_node in expanded_node.find_child(NodeKind.LINK):
        if len(link_node.largs) == 0:
            continue
        conj_title = link_node.largs[0][0]
        if not conj_title.startswith("Conjugaison:"):
            continue
        # links to the verb group pages are not conjugation tables
        if conj_title.split("/", 1)[-1] in group_pages:
            continue

        # Reuse the forms already extracted for the previous entry when
        # they came from this same conjugation page.
        reuse_previous = False
        if len(page_data) > 1:
            previous, current = page_data[-2], page_data[-1]
            reuse_previous = (
                previous.lang_code == current.lang_code
                and previous.pos == current.pos
                and len(previous.forms) > 0
                and previous.forms[-1].source == conj_title
            )

        if reuse_previous:
            page_data[-1].forms = page_data[-2].forms
        else:
            extract_conjugation(wxr, page_data[-1], conj_title)

    tag = clean_node(wxr, page_data[-1], expanded_node)
    if template_node.template_name in ("conj", "conjugaison"):
        without_link_text = tag.removesuffix("(voir la conjugaison)")
        tag = without_link_text.strip()
    page_data[-1].tags.append(tag)
10 changes: 0 additions & 10 deletions src/wiktextract/extractor/fr/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

from .conjugation import extract_conjugation
from .etymology import EtymologyData, extract_etymology, insert_etymology_data
from .form_line import extract_form_line
from .gloss import extract_gloss, process_exemple_template
Expand Down Expand Up @@ -152,15 +151,6 @@ def process_pos_block(

form_line_nodes = child_nodes[form_line_start:gloss_start]
extract_form_line(wxr, page_data, form_line_nodes)
if pos_type == "verb":
if (
len(page_data) > 1
and page_data[-2].pos == pos_type
and page_data[-2].lang_code == page_data[-1].lang_code
):
page_data[-1].forms = page_data[-2].forms
else:
extract_conjugation(wxr, page_data[-1])


def parse_page(
Expand Down
16 changes: 8 additions & 8 deletions tests/test_fr_conj.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,32 +68,32 @@ def test_fr_conj_1(self):
</div>""",
)
entry = WordEntry(lang_code="fr", lang="Français", word="lancer")
extract_conjugation(self.wxr, entry)
extract_conjugation(self.wxr, entry, "Conjugaison:français/lancer")
self.assertEqual(
[f.model_dump(exclude_defaults=True) for f in entry.forms],
[
{
"form": "lancer",
"ipas": ["\\lɑ̃.se\\"],
"source": "Conjugaison page",
"source": "Conjugaison:français/lancer",
"tags": ["Modes impersonnels", "Infinitif", "Présent"],
},
{
"form": "avoir lancé",
"ipas": ["\\a.vwaʁ lɑ̃.se\\"],
"source": "Conjugaison page",
"source": "Conjugaison:français/lancer",
"tags": ["Modes impersonnels", "Infinitif", "Passé"],
},
{
"form": "je lance",
"ipas": ["\\ʒə lɑ̃s\\"],
"source": "Conjugaison page",
"source": "Conjugaison:français/lancer",
"tags": ["Indicatif", "Présent"],
},
{
"form": "j’ai lancé",
"ipas": ["\\ʒ‿e lɑ̃.se\\"],
"source": "Conjugaison page",
"source": "Conjugaison:français/lancer",
"tags": ["Indicatif", "Passé composé"],
},
],
Expand Down Expand Up @@ -139,20 +139,20 @@ def test_onglets_conjugaison(self):
</div>""",
)
entry = WordEntry(lang_code="fr", lang="Français", word="s’abattre")
extract_conjugation(self.wxr, entry)
extract_conjugation(self.wxr, entry, "Conjugaison:français/s’abattre")
self.assertEqual(
[f.model_dump(exclude_defaults=True) for f in entry.forms],
[
{
"form": "s’abattre",
"ipas": ["\\s‿a.batʁ\\"],
"source": "Conjugaison page",
"source": "Conjugaison:français/abattre",
"tags": ["Modes impersonnels", "Infinitif", "Présent"],
},
{
"form": "s’être abattu",
"ipas": ["\\s‿ɛtʁ‿a.ba.ty\\"],
"source": "Conjugaison page",
"source": "Conjugaison:français/abattre",
"tags": ["Modes impersonnels", "Infinitif", "Passé"],
},
],
Expand Down

0 comments on commit ffe2a3d

Please sign in to comment.