Skip to content

Commit

Permalink
Merge pull request #941 from xxyzz/pt
Browse files Browse the repository at this point in the history
[pt] extract "flex.*" form table templates
  • Loading branch information
xxyzz authored Dec 9, 2024
2 parents 12220a6 + bee33ed commit f943188
Show file tree
Hide file tree
Showing 6 changed files with 175 additions and 2 deletions.
2 changes: 1 addition & 1 deletion src/wiktextract/extractor/ja/sound.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def process_sound_template(
audio_file = clean_node(
wxr, None, template_node.template_parameters.get(2, "")
)
if len(audio_file) > 0:
if audio_file not in ["", "-"]:
sound = Sound()
raw_tag = clean_node(
wxr, None, template_node.template_parameters.get(3, "")
Expand Down
73 changes: 73 additions & 0 deletions src/wiktextract/extractor/pt/inflection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import re
from dataclasses import dataclass

from wikitextprocessor import NodeKind, TemplateNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Form, WordEntry
from .tags import translate_raw_tags


@dataclass
class TableHeader:
    """A header cell of a form table together with the grid area it covers."""

    # Cleaned header text; appended to a form's raw tags when the header
    # covers the form's column.
    text: str
    # Index of the first column covered by this header.
    col_index: int
    # Number of columns covered, starting at `col_index`.
    colspan: int
    # Row placement of the header. `extract_flex_template` passes 0 for both
    # and never reads them, hence the defaults.
    row_index: int = 0
    rowspan: int = 0


def extract_flex_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract inflected forms from a "flex.*" form table template.

    The template is expanded and re-parsed, then every table in the result
    is walked row by row:

    * header cells in a row without data cells are recorded as column
      headers, each with the column range it spans;
    * in a row with data cells, a header cell, or a data cell styled
      ``background:#f4f4f4;``, supplies the row header;
    * every other non-empty data cell becomes a ``Form`` appended to
      ``word_entry.forms``, unless its text is "–" or the page title.

    Each form gets the row header (if any) followed by the text of every
    column header covering its column as raw tags, which are then passed to
    ``translate_raw_tags``.

    Args:
        wxr: Extraction context; provides the parser and the page title.
        word_entry: Entry whose ``forms`` list receives the extracted forms.
        t_node: The "flex.*" template node to expand.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:flex.pt
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table_node in expanded_node.find_child(NodeKind.TABLE):
        # Column headers accumulate over all header rows of the table.
        col_headers: list[TableHeader] = []
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            # Constant for the whole row, so look it up once instead of
            # walking the row again for every header cell.
            has_data_cell = row_node.contain_node(NodeKind.TABLE_CELL)
            row_header = ""
            col_cell_index = 0
            col_header_index = 0
            for cell_node in row_node.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                col_span = 1
                col_span_str = cell_node.attrs.get("colspan", "1")
                if re.fullmatch(r"\d+", col_span_str):
                    # A zero colspan would make the next cell reuse this
                    # column index; treat it as a single column.
                    col_span = max(1, int(col_span_str))
                cell_text = clean_node(wxr, None, cell_node)
                if cell_text == "":
                    # Empty cells are skipped without advancing either
                    # column counter.
                    continue

                if cell_node.kind == NodeKind.TABLE_HEADER_CELL:
                    if has_data_cell:
                        row_header = cell_text
                    else:
                        col_headers.append(
                            TableHeader(
                                cell_text, col_header_index, col_span, 0, 0
                            )
                        )
                        col_header_index += col_span
                elif cell_node.attrs.get("style") == "background:#f4f4f4;":
                    # Row header written as a styled data cell.
                    row_header = cell_text
                elif cell_text in ["–", wxr.wtp.title]:
                    # Placeholder or the page's own title: no form is
                    # emitted, but the cell still occupies its columns.
                    col_cell_index += col_span
                else:
                    form = Form(form=cell_text)
                    if row_header != "":
                        form.raw_tags.append(row_header)
                    form.raw_tags.extend(
                        header.text
                        for header in col_headers
                        if header.col_index
                        <= col_cell_index
                        < header.col_index + header.colspan
                    )
                    translate_raw_tags(form)
                    word_entry.forms.append(form)
                    col_cell_index += col_span
7 changes: 7 additions & 0 deletions src/wiktextract/extractor/pt/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,12 @@ class Sound(PortugueseBaseModel):
raw_tags: list[str] = []


class Form(PortugueseBaseModel):
    # One inflected form of a word, as collected into `WordEntry.forms`.

    # Text of the inflected form.
    form: str = ""
    # Translated tags; presumably filled from `raw_tags` by
    # `translate_raw_tags` -- confirm in tags.py.
    tags: list[str] = []
    # Untranslated labels taken from the source page, e.g. table row and
    # column header texts.
    raw_tags: list[str] = []


class WordEntry(PortugueseBaseModel):
model_config = ConfigDict(title="Portuguese Wiktionary")
word: str = Field(description="Word string", min_length=1)
Expand All @@ -83,3 +89,4 @@ class WordEntry(PortugueseBaseModel):
derived: list[Linkage] = []
etymology_texts: list[str] = []
sounds: list[Sound] = []
forms: list[Form] = []
5 changes: 5 additions & 0 deletions src/wiktextract/extractor/pt/pos.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from ...page import clean_node
from ...wxr_context import WiktextractContext
from .head_line import extract_head_line_nodes
from .inflection import extract_flex_template
from .models import Example, Linkage, Sense, WordEntry
from .section_titles import POS_DATA

Expand Down Expand Up @@ -38,6 +39,10 @@ def extract_pos_section(
extract_head_line_nodes(
wxr, page_data[-1], level_node.children[:first_gloss_index]
)
# the forms table template may not be in the head line
for t_node in level_node.find_child(NodeKind.TEMPLATE):
if t_node.template_name.startswith("flex."):
extract_flex_template(wxr, page_data[-1], t_node)


def extract_gloss_list_item(
Expand Down
15 changes: 14 additions & 1 deletion src/wiktextract/extractor/pt/tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,20 @@
"minúscula": "lowercase",
}

TAGS = {**HEAD_LINE_TAGS}
# Labels found in the "flex.*" form tables, mapped to their English tags.
TABLE_TAGS = {
    # gender and number labels -- template page Predefinição:flex.pt
    "Masculino": "masculine",
    "Feminino": "feminine",
    "Plural": "plural",
    "Singular": "singular",
    # additional labels -- template page Predefinição:flex.pt.subst.completa
    "Coletivo": "collective",
    "Normal": "standard",
    "Aumentativo": "augmentative",
    "Diminutivo": "diminutive",
}

TAGS = {**HEAD_LINE_TAGS, **TABLE_TAGS}


def translate_raw_tags(data: WordEntry) -> None:
Expand Down
75 changes: 75 additions & 0 deletions tests/test_pt_form.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
from unittest import TestCase

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.pt.page import parse_page
from wiktextract.wxr_context import WiktextractContext


class TestPtForm(TestCase):
    """Form table extraction tests for the Portuguese Wiktionary extractor."""

    maxDiff = None

    def setUp(self) -> None:
        # Every test gets a fresh extraction context for the "pt" edition.
        config = WiktionaryConfig(
            dump_file_lang_code="pt", capture_language_codes=None
        )
        wtp = Wtp(
            lang_code="pt",
            parser_function_aliases=config.parser_function_aliases,
        )
        self.wxr = WiktextractContext(wtp, config)

    def test_flex_pt_subst_completa(self):
        # The stub template returns a fixed table regardless of its
        # arguments: two header rows followed by the "Normal" data row.
        wtp = self.wxr.wtp
        wtp.add_page("Predefinição:-pt-", 10, "Português")
        wtp.add_page(
            "Predefinição:flex.pt.subst.completa",
            10,
            """{|
|-
! style="background:#f4f4f4;" rowspan="2" |
! style="background:#ffffe0;" colspan="2" | [[masculino|Masculino]]
! style="background:#ffffe0;" colspan="2" | [[feminino|Feminino]]
! style="background:#ffffe0;" rowspan="2" | [[coletivo|Coletivo]]
|-
! style="background:#ffffe0;" | [[singular|Singular]]
! style="background:#ffffe0;" | [[plural|Plural]]
! style="background:#ffffe0;" | [[singular|Singular]]
! style="background:#ffffe0;" | [[plural|Plural]]
|-
! style="background:#f4f4f4;" | [[normal|Normal]]
| [[cão]]
| [[cães#Português|cães]]
| [[cadela#Português|cadela]]
| [[cadelas#Português|<span style="color:black">cadelas</span>]]
| rowspan="3" | [[matilha#Português|matilha]]
|}""",
        )
        wtp.add_page("Predefinição:AFI", 10, "{{{1}}}")
        page_data = parse_page(
            self.wxr,
            "cão",
            """={{-pt-}}=
==Substantivo==
# animal
{{flex.pt.subst.completa
|ms=cão|mp=cães|fs=cadela|fp=cadelas
|msa=canzarrão|mpa=canzarrões|fsa=cadelona|fpa=cadelonas
|msd=cãozinho|mpd=cãezinhos|fsd=cadelinha|fpd=cadelinhas
|col=matilha}}""",
        )
        # The cell holding the page title itself ("cão") yields no form.
        expected_forms = [
            {"form": "cães", "tags": ["standard", "masculine", "plural"]},
            {
                "form": "cadela",
                "tags": ["standard", "feminine", "singular"],
            },
            {"form": "cadelas", "tags": ["standard", "feminine", "plural"]},
            {"form": "matilha", "tags": ["standard", "collective"]},
        ]
        self.assertEqual(page_data[0]["forms"], expected_forms)

0 comments on commit f943188

Please sign in to comment.