Skip to content

Commit

Permalink
Merge pull request #943 from xxyzz/pt
Browse files Browse the repository at this point in the history
[pt] find linkage data in "Wikisaurus:" pages
  • Loading branch information
xxyzz authored Dec 10, 2024
2 parents ea2043b + f24db97 commit e314ba5
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 5 deletions.
61 changes: 57 additions & 4 deletions src/wiktextract/extractor/pt/linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Linkage, WordEntry
from .section_titles import LINKAGE_SECTIONS
from .tags import translate_raw_tags


Expand Down Expand Up @@ -68,9 +69,9 @@ def extract_linkage_section(
word_entry: WordEntry,
level_node: LevelNode,
linkage_type: str,
sense: str,
sense_index: int,
) -> None:
sense = ""
sense_index = 0
for node in level_node.children:
if isinstance(node, TemplateNode) and node.template_name == "fraseini":
sense, sense_index = extract_fraseini_template(wxr, node)
Expand Down Expand Up @@ -123,15 +124,33 @@ def extract_linkage_list_item(
match node.kind:
case NodeKind.LINK:
word = clean_node(wxr, None, node)
if word != "" and not word.startswith("Wikisaurus:"):
if word.startswith("Wikisaurus:"):
extract_wikisaurus_page(
wxr,
word_entry,
word,
linkage_type,
sense,
sense_index,
)
elif word != "":
linkage_words.append(word)
case NodeKind.BOLD:
bold_str = clean_node(wxr, None, node)
if re.fullmatch(r"\d+", bold_str):
sense_index = int(bold_str)
case NodeKind.ITALIC:
raw_tag = clean_node(wxr, None, node)
if raw_tag != "":
if raw_tag.startswith("Wikisaurus:"):
extract_wikisaurus_page(
wxr,
word_entry,
raw_tag,
linkage_type,
sense,
sense_index,
)
elif raw_tag != "":
raw_tags.append(raw_tag)
case NodeKind.LIST:
for child_list_item in node.find_child(NodeKind.LIST_ITEM):
Expand All @@ -154,3 +173,37 @@ def extract_linkage_list_item(
)
translate_raw_tags(linkage)
getattr(word_entry, linkage_type).append(linkage)


def extract_wikisaurus_page(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    page_title: str,
    linkage_type: str,
    sense: str,
    sense_index: int,
) -> None:
    """Pull linkage data for ``word_entry`` out of a "Wikisaurus:" page.

    The page named ``page_title`` is fetched and parsed, then walked as
    level-1 -> level-2 -> level-3 headings.  Only the branch whose
    level-1 heading equals ``word_entry.lang`` and whose level-2 heading
    equals ``word_entry.pos_title`` is followed; within it, each level-3
    section whose heading maps (via ``LINKAGE_SECTIONS``) to
    ``linkage_type`` is handed to ``extract_linkage_section`` together
    with the caller's ``sense`` and ``sense_index``.

    Returns nothing; does nothing when the page or its body is missing.
    """
    thesaurus_page = wxr.wtp.get_page(page_title, 0)
    if thesaurus_page is None or thesaurus_page.body is None:
        return

    def titled_children(parent, kind):
        # Lazily pair each child section of `kind` with its cleaned
        # heading text.
        for child in parent.find_child(kind):
            yield child, clean_node(wxr, None, child.largs)

    # NOTE(review): the matched sections go back through
    # extract_linkage_section, which presumably follows further
    # "Wikisaurus:" links -- confirm a page cannot end up including itself.
    tree = wxr.wtp.parse(thesaurus_page.body)
    for lang_node, lang_heading in titled_children(tree, NodeKind.LEVEL1):
        if lang_heading == word_entry.lang:
            for pos_node, pos_heading in titled_children(
                lang_node, NodeKind.LEVEL2
            ):
                if pos_heading == word_entry.pos_title:
                    for section_node, section_heading in titled_children(
                        pos_node, NodeKind.LEVEL3
                    ):
                        if (
                            LINKAGE_SECTIONS.get(section_heading)
                            == linkage_type
                        ):
                            extract_linkage_section(
                                wxr,
                                word_entry,
                                section_node,
                                linkage_type,
                                sense,
                                sense_index,
                            )
5 changes: 4 additions & 1 deletion src/wiktextract/extractor/pt/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ def parse_section(
page_data[-1] if len(page_data) > 0 else base_data,
level_node,
LINKAGE_SECTIONS[title_text],
"",
0,
)
elif title_text == "Etimologia":
extract_etymology_section(wxr, page_data, level_node)
Expand Down Expand Up @@ -87,7 +89,8 @@ def parse_page(
) -> list[dict[str, Any]]:
# page layout
# https://pt.wiktionary.org/wiki/Wikcionário:Livro_de_estilo
if "/traduções" in page_title: # skip translation page
if "/traduções" in page_title or page_title.startswith("Wikisaurus:"):
# skip translation and thesaurus pages
return []
wxr.wtp.start_page(page_title)
tree = wxr.wtp.parse(page_text)
Expand Down

0 comments on commit e314ba5

Please sign in to comment.