[pt] extract phraseology and note sections, nested gloss lists #955

Merged · 3 commits · Dec 24, 2024
63 changes: 62 additions & 1 deletion src/wiktextract/extractor/pt/linkage.py
@@ -97,10 +97,13 @@ def extract_fraseini_template(
sense = ""
sense_index = 0
first_arg = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
m = re.search(r"(\d+)$", first_arg)
m = re.search(r"\((\d+)\)$", first_arg)
if m is not None:
sense_index = int(m.group(1))
sense = first_arg[: m.start()].strip()
elif (m := re.match(r"De (\d+)", first_arg)) is not None:
sense_index = int(m.group(1))
sense = first_arg[m.end() :].strip("() \n")
else:
sense = first_arg
return sense, sense_index
@@ -230,3 +233,61 @@ def extract_wikisaurus_page(
page_title,
tags,
)


def extract_phraseology_section(
wxr: WiktextractContext,
word_entry: WordEntry,
level_node: LevelNode,
) -> None:
sense = ""
sense_index = 0
for node in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE):
if isinstance(node, TemplateNode) and node.template_name == "fraseini":
sense, sense_index = extract_fraseini_template(wxr, node)
elif node.kind == NodeKind.LIST:
for list_item in node.find_child(NodeKind.LIST_ITEM):
extract_phraseology_list_item(
wxr, word_entry, list_item, sense, sense_index
)


def extract_phraseology_list_item(
wxr: WiktextractContext,
word_entry: WordEntry,
list_item: WikiNode,
sense: str,
sense_index: int,
) -> None:
l_data = Linkage(word="", sense=sense, sense_index=sense_index)
for index, node in enumerate(list_item.children):
if (
isinstance(node, WikiNode)
and node.kind in NodeKind.BOLD | NodeKind.LINK
and l_data.word == ""
):
l_data.word = clean_node(wxr, None, node)
elif isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
l_data.roman = clean_node(wxr, None, node)
elif isinstance(node, str) and ("=" in node or ":" in node):
sense_start = node.index("=" if "=" in node else ":") + 1
l_data.sense = clean_node(
wxr,
None,
[node[sense_start:]]
+ [
n
for n in list_item.children[index + 1 :]
if not (isinstance(n, WikiNode) and n.kind == NodeKind.LIST)
],
)
break

if l_data.word != "":
word_entry.phraseology.append(l_data)

for child_list in list_item.find_child(NodeKind.LIST):
for next_list_item in child_list.find_child(NodeKind.LIST_ITEM):
extract_phraseology_list_item(
wxr, word_entry, next_list_item, sense, sense_index
)
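
For reference, the two regex branches in `extract_fraseini_template` above cover both index placements seen in `fraseini` arguments: a trailing "(N)" and a leading "De N (...)". A minimal standalone sketch, with made-up sample strings (not taken from this PR):

```python
import re

# Hypothetical sample arguments; real values come from the "fraseini" template.
for first_arg in ["expressão figurada (2)", "De 3 (uso coloquial)", "sem índice"]:
    sense, sense_index = "", 0
    m = re.search(r"\((\d+)\)$", first_arg)
    if m is not None:
        sense_index = int(m.group(1))
        sense = first_arg[: m.start()].strip()
    elif (m := re.match(r"De (\d+)", first_arg)) is not None:
        sense_index = int(m.group(1))
        sense = first_arg[m.end():].strip("() \n")
    else:
        sense = first_arg
    print(f"{first_arg!r} -> sense={sense!r}, sense_index={sense_index}")
# 'expressão figurada (2)' -> sense='expressão figurada', sense_index=2
# 'De 3 (uso coloquial)'   -> sense='uso coloquial', sense_index=3
# 'sem índice'             -> sense='sem índice', sense_index=0
```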
6 changes: 6 additions & 0 deletions src/wiktextract/extractor/pt/models.py
@@ -51,6 +51,7 @@ class Linkage(PortugueseBaseModel):
default=0, ge=0, description="Number of the definition, start from 1"
)
source: str = ""
roman: str = ""


class Sound(PortugueseBaseModel):
@@ -92,6 +93,11 @@ class WordEntry(PortugueseBaseModel):
hypernyms: list[Linkage] = []
related: list[Linkage] = []
hyponyms: list[Linkage] = []
homophones: list[Linkage] = []
homonyms: list[Linkage] = []
paronyms: list[Linkage] = []
phraseology: list[Linkage] = []
etymology_texts: list[str] = []
sounds: list[Sound] = []
forms: list[Form] = []
notes: list[str] = []
39 changes: 34 additions & 5 deletions src/wiktextract/extractor/pt/page.py
@@ -9,7 +9,11 @@
from ...page import clean_node
from ...wxr_context import WiktextractContext
from .etymology import extract_etymology_section
from .linkage import extract_expression_section, extract_linkage_section
from .linkage import (
extract_expression_section,
extract_linkage_section,
extract_phraseology_section,
)
from .models import Sense, WordEntry
from .pos import extract_pos_section
from .pronunciation import extract_pronunciation_section
@@ -25,7 +29,7 @@ def parse_section(
) -> None:
cats = {}
title_text = clean_node(wxr, cats, level_node.largs).strip(
"⁰¹²³⁴⁵⁶⁷⁸⁹0123456789"
"⁰¹²³⁴⁵⁶⁷⁸⁹0123456789:"
)
if title_text.lower() in POS_DATA:
extract_pos_section(
@@ -59,11 +63,17 @@ def parse_section(
extract_etymology_section(wxr, page_data, level_node)
elif title_text == "Pronúncia":
extract_pronunciation_section(wxr, page_data, level_node)
elif title_text in ["Nota", "Notas", "Nota de uso"]:
pass
elif title_text == "Fraseologia":
extract_phraseology_section(
wxr, page_data[-1] if len(page_data) else base_data, level_node
)
elif title_text.startswith("Nota"):
extract_note_section(wxr, page_data, level_node)
elif title_text.lower() not in [
"ver também",
"ligação externa",
"ligações externas",
"ligação extena",
"referências",
"referência",
"no wikcionário",
@@ -73,7 +83,9 @@
"no wikisaurus",
"no commons",
"no wikimedia commons",
"na internet",
"galeria",
"galeria de imagens",
]:
wxr.wtp.debug(f"unknown section: {title_text}")

@@ -86,7 +98,7 @@
clean_node(wxr, cats, link_node)
save_section_cats(cats.get("categories", []), page_data, level_node, False)

if title_text != "Pronúncia":
if title_text.lower() not in ["pronúncia", "ver também"]:
for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
parse_section(wxr, page_data, base_data, next_level)

@@ -147,3 +159,20 @@ def parse_page(
if len(data.senses) == 0:
data.senses.append(Sense(tags=["no-gloss"]))
return [m.model_dump(exclude_defaults=True) for m in page_data]


def extract_note_section(
wxr: WiktextractContext,
page_data: list[WordEntry],
level_node: LevelNode,
) -> None:
notes = []
for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
note = clean_node(
wxr, None, list(list_item.invert_find_child(NodeKind.LIST))
)
if note != "":
notes.append(note)
for data in page_data:
if data.lang_code == page_data[-1].lang_code:
data.notes.extend(notes)
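
Not part of the diff: a rough test-style sketch of what the new `extract_note_section` is expected to produce, assuming the same unittest fixtures as tests/test_pt_gloss.py (`self.wxr`, `parse_page`). The page title, template page, and note text below are invented for illustration:

```python
def test_note_section(self):
    # Hypothetical example mirroring the structure of the existing pt tests.
    self.wxr.wtp.add_page("Predefinição:-pt-", 10, "Português")
    data = parse_page(
        self.wxr,
        "exemplo",
        """={{-pt-}}=
==Substantivo==
# [[exemplo]]
==Notas==
* nota de uso sobre este verbete""",
    )
    # Each top-level list item in the "Notas" section becomes one entry in
    # the new "notes" field of the matching-language word entries.
    self.assertEqual(data[0]["notes"], ["nota de uso sobre este verbete"])
```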
15 changes: 10 additions & 5 deletions src/wiktextract/extractor/pt/pos.py
@@ -49,11 +49,11 @@ def extract_gloss_list_item(
wxr: WiktextractContext,
word_entry: WordEntry | Linkage,
list_item: WikiNode,
parent_gloss: list[str] = [],
) -> None:
gloss_nodes = []
sense = Sense()
first_gloss_index = len(list_item.children)
for index, node in enumerate(list_item.children):
sense = Sense(glosses=parent_gloss)
for node in list_item.children:
if isinstance(node, TemplateNode):
if node.template_name == "escopo":
extract_escopo_template(wxr, sense, node)
@@ -65,8 +65,6 @@
if node.sarg.endswith(("*", ":")):
for next_list_item in node.find_child(NodeKind.LIST_ITEM):
extract_example_list_item(wxr, sense, next_list_item)
if index < first_gloss_index:
first_gloss_index = index
else:
gloss_nodes.append(node)

@@ -75,6 +73,13 @@
sense.glosses.append(gloss_str)
word_entry.senses.append(sense)

for child_list in list_item.find_child(NodeKind.LIST):
if child_list.sarg.startswith("#") and child_list.sarg.endswith("#"):
for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
extract_gloss_list_item(
wxr, word_entry, child_list_item, sense.glosses
)


def extract_escopo_template(
wxr: WiktextractContext,
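Side note on the recursion added in `extract_gloss_list_item`: it only descends into child lists whose `sarg` both starts and ends with "#" (nested gloss lists such as "##"), while example lists ("#*", "#:") are handled earlier in the loop. A tiny illustration of that check, with assumed `sarg` values:

```python
for sarg in ("##", "###", "#*", "#:", "##*"):
    is_nested_gloss = sarg.startswith("#") and sarg.endswith("#")
    print(f"{sarg!r}: {'recurse as sub-gloss' if is_nested_gloss else 'skip'}")
# '##' and '###' recurse (parent glosses are passed down via parent_gloss);
# '#*', '#:' and '##*' are skipped here.
```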
27 changes: 26 additions & 1 deletion src/wiktextract/extractor/pt/section_titles.py
@@ -10,7 +10,7 @@
"posposição": {"pos": "postp"},
"pronome": {"pos": "pron"},
"substantivo": {"pos": "noun"},
"berbo": {"pos": "verb"},
"verbo": {"pos": "verb"},
"forma de substantivo": {"pos": "noun", "tags": ["form-of"]},
"forma verbal": {"pos": "verb", "tags": ["form-of"]},
"locução substantiva": {"pos": "phrase", "tags": ["substantive"]},
@@ -19,6 +19,7 @@
"locução prepositiva": {"pos": "phrase", "tags": ["prepositional"]},
"expressão": {"pos": "phrase"},
"abreviatura": {"pos": "abbrev", "tags": ["abbreviation"]},
"abreviação": {"pos": "abbrev", "tags": ["abbreviation"]},
"contração": {"pos": "contraction", "tags": ["contraction"]},
"prefixo": {"pos": "prefix", "tags": ["morpheme"]},
"sufixo": {"pos": "suffix", "tags": ["morpheme"]},
@@ -39,19 +40,27 @@
},
"forma de pronome": {"pos": "pron", "tags": ["form-of"]},
"advérbio numeral": {"pos": "adv", "tags": ["numeral"]},
"verbo preposicionado": {"pos": "verb", "tags": ["prepositional"]},
"caractere han": {"pos": "character", "tags": ["han"]},
"hanja": {"pos": "character", "tags": ["Hanja"]},
"kanji": {"pos": "character", "tags": ["kanji"]},
"pronome pessoal": {"pos": "pron", "tags": ["person"]},
"pronome possessivo": {"pos": "det", "tags": ["possessive"]},
}


LINKAGE_SECTIONS = {
"antônimos": "antonyms",
"antônimo": "antonyms",
"antónimo": "antonyms",
"antónimos": "antonyms",
"antónimos/antônimos": "antonyms",
"sinônimos": "synonyms",
"sinônimo": "synonyms",
"sinónimos/sinônimos": "synonyms",
"sinónimos": "synonyms",
"sinónimo": "synonyms",
"sinônimos e variantes": "synonyms",
"verbetes derivados": "derived",
"verbete derivado": "derived",
"formas alternativas": "synonyms",
@@ -61,6 +70,7 @@
"hiperônimos": "hypernyms",
"hiperónimos": "hypernyms",
"termos derivados": "derived",
"termos derivadoss": "derived",
"grafia antiga": "synonyms",
"diminutivo": "synonyms",
"diminutivos": "synonyms",
@@ -70,11 +80,25 @@
"entradas relacionadas": "related",
"hipônimos": "hyponyms",
"hiponímias": "hyponyms",
"hipónimos": "hyponyms",
"ortografias obsoletas": "synonyms",
"superlativo": "synonyms",
"outros verbetes": "related",
"cardinal equivalente": "synonyms",
"cardinais equivalentes": "synonyms",
"aumentativo": "synonyms",
"advérbios derivados": "derived",
"derivações": "derived",
"homófonos": "homophones",
"homófono": "homophones",
"homónimos/homônimos": "homonyms",
"homônimos": "homonyms",
"parônimos": "paronyms",
"caracteres derivados": "derived",
"caracteres relacionados": "related",
"palavras com o kanji": "related",
"compostos": "derived",
"vermos derivados": "derived",
}

LINKAGE_TAGS = {
@@ -84,4 +108,5 @@
"ortografias obsoletas": ["obsolete"],
"superlativo": ["superlative"],
"aumentativo": ["augmentative"],
"advérbios derivados": ["adverb"],
}
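
For context (not in the diff): the new entries above are consumed by plain dictionary lookups, where the section title selects the linkage field and `LINKAGE_TAGS` optionally adds tags. A minimal sketch, assuming both dicts are imported from `wiktextract.extractor.pt.section_titles`:

```python
from wiktextract.extractor.pt.section_titles import LINKAGE_SECTIONS, LINKAGE_TAGS

title = "advérbios derivados"
field = LINKAGE_SECTIONS[title]      # -> "derived"
tags = LINKAGE_TAGS.get(title, [])   # -> ["adverb"]
print(field, tags)
```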
18 changes: 18 additions & 0 deletions tests/test_pt_gloss.py
@@ -74,3 +74,21 @@ def test_escopo(self):
}
],
)

def test_nested_list(self):
self.wxr.wtp.add_page("Predefinição:-en-", 10, "Inglês")
data = parse_page(
self.wxr,
"average",
"""={{-en-}}=
==Adjetivo==
# [[médio]]
## [[relativo à]] [[média]];''""",
)
self.assertEqual(
data[0]["senses"],
[
{"glosses": ["médio"]},
{"glosses": ["médio", "relativo à média;"]},
],
)