diff --git a/src/wiktextract/extractor/ja/sound.py b/src/wiktextract/extractor/ja/sound.py
index bc2aa301..5706214e 100644
--- a/src/wiktextract/extractor/ja/sound.py
+++ b/src/wiktextract/extractor/ja/sound.py
@@ -45,7 +45,7 @@ def process_sound_template(
         audio_file = clean_node(
             wxr, None, template_node.template_parameters.get(2, "")
         )
-        if len(audio_file) > 0:
+        if audio_file not in ["", "-"]:
             sound = Sound()
             raw_tag = clean_node(
                 wxr, None, template_node.template_parameters.get(3, "")
diff --git a/src/wiktextract/extractor/pt/inflection.py b/src/wiktextract/extractor/pt/inflection.py
new file mode 100644
index 00000000..17d56d2f
--- /dev/null
+++ b/src/wiktextract/extractor/pt/inflection.py
@@ -0,0 +1,94 @@
+import re
+from dataclasses import dataclass
+
+from wikitextprocessor import NodeKind, TemplateNode
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .models import Form, WordEntry
+from .tags import translate_raw_tags
+
+
+@dataclass
+class TableHeader:
+    """Column header cell collected from an inflection table."""
+
+    text: str  # cleaned header text, later added to `Form.raw_tags`
+    col_index: int  # column offset in its row, empty cells not counted
+    colspan: int  # number of columns the header covers
+    row_index: int  # `extract_flex_template` always passes 0
+    rowspan: int  # `extract_flex_template` always passes 0
+
+
+def extract_flex_template(
+    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
+) -> None:
+    """Add the forms of an expanded inflection table to `word_entry`.
+
+    Header cells in rows without data cells are column headers.  In a row
+    with data cells, a header cell, or a data cell styled with
+    "background:#f4f4f4;", is the row header.  Every other non-empty cell
+    becomes a `Form` whose raw tags are the row header followed by the
+    column headers covering its column; `translate_raw_tags` is then
+    called on the form.  Cells holding only "–" or the page title take up
+    their columns but don't produce a form.
+    """
+    # https://pt.wiktionary.org/wiki/Predefinição:flex.pt
+    expanded_node = wxr.wtp.parse(
+        wxr.wtp.node_to_wikitext(t_node), expand_all=True
+    )
+    for table_node in expanded_node.find_child(NodeKind.TABLE):
+        col_headers = []
+        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
+            row_header = ""
+            col_cell_index = 0
+            col_header_index = 0
+            for cell_node in row_node.find_child(
+                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
+            ):
+                col_span = 1
+                col_span_str = cell_node.attrs.get("colspan", "1")
+                if re.fullmatch(r"\d+", col_span_str):
+                    col_span = int(col_span_str)
+                is_row_header_cell = (
+                    cell_node.attrs.get("style") == "background:#f4f4f4;"
+                )
+                cell_text = clean_node(wxr, None, cell_node)
+                if cell_text == "":
+                    if (
+                        cell_node.kind == NodeKind.TABLE_CELL
+                        and not is_row_header_cell
+                    ):
+                        # an empty data cell still takes up its columns,
+                        # count it to keep later cells under their headers
+                        col_cell_index += col_span
+                    continue
+                if cell_node.kind == NodeKind.TABLE_HEADER_CELL:
+                    if row_node.contain_node(NodeKind.TABLE_CELL):
+                        row_header = cell_text
+                    else:
+                        col_headers.append(
+                            TableHeader(
+                                cell_text, col_header_index, col_span, 0, 0
+                            )
+                        )
+                        col_header_index += col_span
+                elif is_row_header_cell:
+                    row_header = cell_text
+                    col_header_index += col_span
+                elif cell_text in ["–", wxr.wtp.title]:
+                    # placeholder dash or the page title: not a new form
+                    col_cell_index += col_span
+                else:
+                    form = Form(form=cell_text)
+                    if row_header != "":
+                        form.raw_tags.append(row_header)
+                    for col_header in col_headers:
+                        if (
+                            col_cell_index >= col_header.col_index
+                            and col_cell_index
+                            < col_header.col_index + col_header.colspan
+                        ):
+                            form.raw_tags.append(col_header.text)
+                    translate_raw_tags(form)
+                    word_entry.forms.append(form)
+                    col_cell_index += col_span
diff --git a/src/wiktextract/extractor/pt/models.py b/src/wiktextract/extractor/pt/models.py
index e51e3755..7fb777d0 100644
--- a/src/wiktextract/extractor/pt/models.py
+++ b/src/wiktextract/extractor/pt/models.py
@@ -65,6 +65,12 @@ class Sound(PortugueseBaseModel):
     raw_tags: list[str] = []
 
 
+class Form(PortugueseBaseModel):
+    form: str = ""
+    tags: list[str] = []
+    raw_tags: list[str] = []
+
+
 class WordEntry(PortugueseBaseModel):
     model_config = ConfigDict(title="Portuguese Wiktionary")
     word: str = Field(description="Word string", min_length=1)
@@ -83,3 +89,4 @@ class WordEntry(PortugueseBaseModel):
     derived: list[Linkage] = []
     etymology_texts: list[str] = []
     sounds: list[Sound] = []
+    forms: list[Form] = []
diff --git a/src/wiktextract/extractor/pt/pos.py b/src/wiktextract/extractor/pt/pos.py
index de75f36e..05292ef4 100644
--- a/src/wiktextract/extractor/pt/pos.py
+++ b/src/wiktextract/extractor/pt/pos.py
@@ -9,6 +9,7 @@
 from ...page import clean_node
 from ...wxr_context import WiktextractContext
 from .head_line import extract_head_line_nodes
+from .inflection import extract_flex_template
 from .models import Example, Linkage, Sense, WordEntry
 from .section_titles import POS_DATA
 
@@ -38,6 +39,10 @@ def extract_pos_section(
     extract_head_line_nodes(
         wxr, page_data[-1], level_node.children[:first_gloss_index]
     )
+    # the forms table template may be outside of the head line nodes
+    for t_node in level_node.find_child(NodeKind.TEMPLATE):
+        if t_node.template_name.startswith("flex."):
+            extract_flex_template(wxr, page_data[-1], t_node)
 
 
 def extract_gloss_list_item(
diff --git a/src/wiktextract/extractor/pt/tags.py b/src/wiktextract/extractor/pt/tags.py
index 09fc6edf..e48ae907 100644
--- a/src/wiktextract/extractor/pt/tags.py
+++ b/src/wiktextract/extractor/pt/tags.py
@@ -100,7 +100,20 @@
     "minúscula": "lowercase",
 }
 
-TAGS = {**HEAD_LINE_TAGS}
+TABLE_TAGS = {
+    # Predefinição:flex.pt
+    "Masculino": "masculine",
+    "Feminino": "feminine",
+    "Plural": "plural",
+    "Singular": "singular",
+    # Predefinição:flex.pt.subst.completa
+    "Coletivo": "collective",
+    "Normal": "standard",
+    "Aumentativo": "augmentative",
+    "Diminutivo": "diminutive",
+}
+
+TAGS = {**HEAD_LINE_TAGS, **TABLE_TAGS}
 
 
 def translate_raw_tags(data: WordEntry) -> None:
diff --git a/tests/test_pt_form.py b/tests/test_pt_form.py
new file mode 100644
index 00000000..e6716386
--- /dev/null
+++ b/tests/test_pt_form.py
@@ -0,0 +1,75 @@
+from unittest import TestCase
+
+from wikitextprocessor import Wtp
+
+from wiktextract.config import WiktionaryConfig
+from wiktextract.extractor.pt.page import parse_page
+from wiktextract.wxr_context import WiktextractContext
+
+
+class TestPtForm(TestCase):
+    maxDiff = None
+
+    def setUp(self) -> None:
+        conf = WiktionaryConfig(
+            dump_file_lang_code="pt",
+            capture_language_codes=None,
+        )
+        self.wxr = WiktextractContext(
+            Wtp(
+                lang_code="pt",
+                parser_function_aliases=conf.parser_function_aliases,
+            ),
+            conf,
+        )
+
+    def test_flex_pt_subst_completa(self):
+        self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
+        self.wxr.wtp.add_page(
+            "Predefinição:flex.pt.subst.completa",
+            10,
+            """{|
+|-
+! style="background:#f4f4f4;" rowspan="2" |
+! style="background:#ffffe0;" colspan="2" | [[masculino|Masculino]]
+! style="background:#ffffe0;" colspan="2" | [[feminino|Feminino]]
+! style="background:#ffffe0;" rowspan="2" | [[coletivo|Coletivo]]
+|-
+! style="background:#ffffe0;" | [[singular|Singular]]
+! style="background:#ffffe0;" | [[plural|Plural]]
+! style="background:#ffffe0;" | [[singular|Singular]]
+! style="background:#ffffe0;" | [[plural|Plural]]
+|-
+! style="background:#f4f4f4;" | [[normal|Normal]]
+| [[cão]]
+| [[cães#Português|cães]]
+| [[cadela#Português|cadela]]
+| [[cadelas#Português|cadelas]]
+| rowspan="3" | [[matilha#Português|matilha]]
+|}""",
+        )
+        self.wxr.wtp.add_page("Predefinição:AFI", 10, "{{{1}}}")
+        data = parse_page(
+            self.wxr,
+            "cão",
+            """={{-pt-}}=
+==Substantivo==
+# animal
+{{flex.pt.subst.completa
+|ms=cão|mp=cães|fs=cadela|fp=cadelas
+|msa=canzarrão|mpa=canzarrões|fsa=cadelona|fpa=cadelonas
+|msd=cãozinho|mpd=cãezinhos|fsd=cadelinha|fpd=cadelinhas
+|col=matilha}}""",
+        )
+        self.assertEqual(
+            data[0]["forms"],
+            [
+                {"form": "cães", "tags": ["standard", "masculine", "plural"]},
+                {
+                    "form": "cadela",
+                    "tags": ["standard", "feminine", "singular"],
+                },
+                {"form": "cadelas", "tags": ["standard", "feminine", "plural"]},
+                {"form": "matilha", "tags": ["standard", "collective"]},
+            ],
+        )