Skip to content

Commit

Permalink
Merge pull request #473 from xxyzz/fr
Browse files Browse the repository at this point in the history
Find alternative form of the "typographic variant" POS word
  • Loading branch information
xxyzz authored Jan 26, 2024
2 parents 02717b5 + 3fa7188 commit a1e399e
Show file tree
Hide file tree
Showing 5 changed files with 124 additions and 10 deletions.
37 changes: 29 additions & 8 deletions src/wiktextract/data/fr/pos_subtitles.json
Original file line number Diff line number Diff line change
Expand Up @@ -444,13 +444,22 @@
"pos": "num"
},
"onom": {
"pos": "onomatopoeia"
"pos": "onomatopoeia",
"tags": [
"onomatopoeic"
]
},
"onoma": {
"pos": "onomatopoeia"
"pos": "onomatopoeia",
"tags": [
"onomatopoeic"
]
},
"onomatopée": {
"pos": "onomatopoeia"
"pos": "onomatopoeia",
"tags": [
"onomatopoeic"
]
},
"part": {
"pos": "particle"
Expand Down Expand Up @@ -744,16 +753,28 @@
"pos": "symbol"
},
"var-typo": {
"pos": "typographic variant"
"pos": "typographic variant",
"tags": [
"alt-of"
]
},
"variante par contrainte typographique": {
"pos": "typographic variant"
"pos": "typographic variant",
"tags": [
"alt-of"
]
},
"variante typo": {
"pos": "typographic variant"
"pos": "typographic variant",
"tags": [
"alt-of"
]
},
"variante typographique": {
"pos": "typographic variant"
"pos": "typographic variant",
"tags": [
"alt-of"
]
},
"verb pr": {
"pos": "verb",
Expand All @@ -776,4 +797,4 @@
"pronominal"
]
}
}
}
31 changes: 30 additions & 1 deletion src/wiktextract/extractor/fr/gloss.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from collections import defaultdict
from typing import Union

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import TemplateNode
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

from .models import Example, Sense, WordEntry
from .models import AltForm, Example, Sense, WordEntry


def extract_gloss(
Expand Down Expand Up @@ -65,6 +66,9 @@ def extract_gloss(
and gloss_only_nodes[index].template_name == "note"
):
note_index = index
find_alt_of_form(
wxr, gloss_only_nodes[:note_index], page_data[-1].pos, gloss_data
)
gloss_text = clean_node(
wxr, gloss_data, gloss_only_nodes[:note_index]
).strip(" ()")
Expand Down Expand Up @@ -146,3 +150,28 @@ def process_exemple_template(
)
if len(example_data.text) > 0:
gloss_data.examples.append(example_data)


def find_alt_of_form(
    wxr: WiktextractContext,
    gloss_nodes: list[Union[str, WikiNode]],
    pos_type: str,
    gloss_data: Sense,
):
    """Record the word that a "typographic variant" gloss points to.

    Nothing happens unless ``pos_type`` is ``"typographic variant"``.
    Otherwise the top-level items of ``gloss_nodes`` are scanned in order:
    plain strings are skipped, a link node contributes its cleaned text,
    and a template node is expanded and searched recursively for links.
    The text of the last link seen wins.  When that text is non-empty it is
    appended to ``gloss_data.alt_of`` as an ``AltForm`` and the ``"alt-of"``
    tag is appended to ``gloss_data.tags``.
    """
    if pos_type != "typographic variant":
        return

    target_word = ""
    for node in gloss_nodes:
        if not isinstance(node, WikiNode):
            # Bare text carries no link information.
            continue
        if node.kind == NodeKind.LINK:
            target_word = clean_node(wxr, None, node)
        if isinstance(node, TemplateNode):
            # Links produced by a template only exist after expansion, so
            # re-parse the template's wikitext with everything expanded.
            expanded = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(node), expand_all=True
            )
            for link in expanded.find_child_recursively(NodeKind.LINK):
                target_word = clean_node(wxr, None, link)

    if target_word:
        gloss_data.alt_of.append(AltForm(word=target_word))
        gloss_data.tags.append("alt-of")
5 changes: 5 additions & 0 deletions src/wiktextract/extractor/fr/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,12 +76,17 @@ class Linkage(FrenchBaseModel):
lang_code: str = Field("", description="Wiktionary language code")


class AltForm(FrenchBaseModel):
    # Element type of `Sense.alt_of`: names one word that a sense is an
    # alternative form of.
    # Required field (no default): the referenced word.
    word: str


class Sense(FrenchBaseModel):
    # Data collected for a single gloss list item of a word entry.
    # Every field has a default, so an empty Sense can be built and filled in
    # step by step.

    # Cleaned gloss text.
    glosses: list[str] = []
    # Tags attached to this sense, e.g. "alt-of".
    tags: list[str] = []
    # NOTE(review): presumably category names gathered while cleaning the
    # gloss nodes -- confirm against clean_node.
    categories: list[str] = []
    # Usage examples attached to this gloss.
    examples: list[Example] = []
    # NOTE(review): looks like the text taken from a trailing "note" template
    # in the gloss -- confirm against extract_gloss.
    note: str = ""
    # Words this sense is an alternative form of.
    alt_of: list[AltForm] = []


class WordEntry(FrenchBaseModel):
Expand Down
4 changes: 3 additions & 1 deletion src/wiktextract/extractor/fr/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,11 +113,13 @@ def process_pos_block(
pos_argument: str,
pos_title: str,
):
pos_type = wxr.config.POS_SUBTITLES[pos_argument]["pos"]
pos_data = wxr.config.POS_SUBTITLES[pos_argument]
pos_type = pos_data["pos"]
if len(page_data) == 0 or "pos" in page_data[-1].model_fields_set:
page_data.append(base_data.model_copy(deep=True))
page_data[-1].pos = pos_type
page_data[-1].pos_title = pos_title
page_data[-1].tags.extend(pos_data.get("tags", []))
child_nodes = list(pos_title_node.filter_empty_str_child())
form_line_start = 0 # Ligne de forme
gloss_start = len(child_nodes)
Expand Down
57 changes: 57 additions & 0 deletions tests/test_fr_gloss.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,3 +308,60 @@ def test_gloss_note_template(self):
}
],
)

def test_typographic_variant_alt_of_template(self):
    # The gloss consists of a template; after expansion its last link
    # ("abaĵuro") must be reported as the alt-of target.
    wtp = self.wxr.wtp
    wtp.start_page("abajhuro")
    template_body = """''Orthographe par contrainte typographique par [[Annexe:Systèmes h et x en espéranto#Système h|système h]] de'' <bdi>[[abaĵuro#eo|abaĵuro]]</bdi>"""
    wtp.add_page("Modèle:eo-sys-h", 10, template_body)
    parsed = wtp.parse("# {{eo-sys-h|abaĵuro}}.")
    entries = [
        WordEntry(
            word="abajhuro",
            lang_code="fr",
            lang="Français",
            pos="typographic variant",
        )
    ]
    extract_gloss(self.wxr, entries, parsed.children[0])
    actual_senses = [
        sense.model_dump(exclude_defaults=True)
        for sense in entries[-1].senses
    ]
    expected_senses = [
        {
            "glosses": [
                "Orthographe par contrainte typographique par système h de abaĵuro."
            ],
            "alt_of": [{"word": "abaĵuro"}],
            "tags": ["alt-of"],
        }
    ]
    self.assertEqual(actual_senses, expected_senses)

def test_typographic_variant_alt_of_text(self):
    # The gloss is plain wikitext; its only link ("alphœnix") must be
    # reported as the alt-of target.
    wtp = self.wxr.wtp
    wtp.start_page("alphoenix")
    parsed = wtp.parse(
        "# ''Variante par contrainte typographique de'' [[alphœnix]]."
    )
    entries = [
        WordEntry(
            word="alphoenix",
            lang_code="fr",
            lang="Français",
            pos="typographic variant",
        )
    ]
    extract_gloss(self.wxr, entries, parsed.children[0])
    actual_senses = [
        sense.model_dump(exclude_defaults=True)
        for sense in entries[-1].senses
    ]
    expected_senses = [
        {
            "glosses": [
                "Variante par contrainte typographique de alphœnix."
            ],
            "alt_of": [{"word": "alphœnix"}],
            "tags": ["alt-of"],
        }
    ]
    self.assertEqual(actual_senses, expected_senses)

0 comments on commit a1e399e

Please sign in to comment.