From c2669b893c1c92df8c606410c2ec8e847b3bcf50 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 20 Dec 2024 17:18:10 +0800 Subject: [PATCH 1/2] [pt] add "wikisaurus" page title to `Linkage`'s `source` field --- src/wiktextract/extractor/pt/linkage.py | 18 ++++++++++++++++-- src/wiktextract/extractor/pt/models.py | 1 + src/wiktextract/extractor/pt/page.py | 1 + 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/src/wiktextract/extractor/pt/linkage.py b/src/wiktextract/extractor/pt/linkage.py index fa8da6ab..cebd5a84 100644 --- a/src/wiktextract/extractor/pt/linkage.py +++ b/src/wiktextract/extractor/pt/linkage.py @@ -71,6 +71,7 @@ def extract_linkage_section( linkage_type: str, sense: str, sense_index: int, + source: str, ) -> None: for node in level_node.children: if isinstance(node, TemplateNode) and node.template_name == "fraseini": @@ -78,7 +79,13 @@ def extract_linkage_section( elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST: for list_item in node.find_child(NodeKind.LIST_ITEM): extract_linkage_list_item( - wxr, word_entry, list_item, linkage_type, sense, sense_index + wxr, + word_entry, + list_item, + linkage_type, + sense, + sense_index, + source, ) @@ -104,6 +111,7 @@ def extract_linkage_list_item( linkage_type: str, sense: str, sense_index: int, + source: str, ) -> None: linkage_words = [] raw_tags = [] @@ -161,6 +169,7 @@ def extract_linkage_list_item( linkage_type, sense, sense_index, + source, ) elif isinstance(node, str): m = re.search(r"\((.+)\)", node) @@ -169,7 +178,11 @@ def extract_linkage_list_item( for word in linkage_words: linkage = Linkage( - word=word, sense=sense, sense_index=sense_index, raw_tags=raw_tags + word=word, + sense=sense, + sense_index=sense_index, + raw_tags=raw_tags, + source=source, ) translate_raw_tags(linkage) getattr(word_entry, linkage_type).append(linkage) @@ -206,4 +219,5 @@ def extract_wikisaurus_page( linkage_type, sense, sense_index, + page_title, ) diff --git a/src/wiktextract/extractor/pt/models.py b/src/wiktextract/extractor/pt/models.py index 7fb777d0..bb98a924 100644 --- a/src/wiktextract/extractor/pt/models.py +++ b/src/wiktextract/extractor/pt/models.py @@ -50,6 +50,7 @@ class Linkage(PortugueseBaseModel): sense_index: int = Field( default=0, ge=0, description="Number of the definition, start from 1" ) + source: str = "" class Sound(PortugueseBaseModel): diff --git a/src/wiktextract/extractor/pt/page.py b/src/wiktextract/extractor/pt/page.py index eca169ec..b6432e04 100644 --- a/src/wiktextract/extractor/pt/page.py +++ b/src/wiktextract/extractor/pt/page.py @@ -50,6 +50,7 @@ def parse_section( LINKAGE_SECTIONS[title_text], "", 0, + "", ) elif title_text == "Etimologia": extract_etymology_section(wxr, page_data, level_node) From 355d41566ab6e7fd37ab5def48d278d1364bd7af Mon Sep 17 00:00:00 2001 From: xxyzz Date: Fri, 20 Dec 2024 17:48:27 +0800 Subject: [PATCH 2/2] [it] translate some tag templates in gloss and linkage lists --- src/wiktextract/extractor/it/tags.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/src/wiktextract/extractor/it/tags.py b/src/wiktextract/extractor/it/tags.py index 4e15840e..87e7296e 100644 --- a/src/wiktextract/extractor/it/tags.py +++ b/src/wiktextract/extractor/it/tags.py @@ -75,7 +75,7 @@ "chimica industriale": "chemistry", "chirurgia": "surgery", "cinematografia": "cinematography", - "colori": "color", + "colore": "color", "commercio": "commerce", # "composti organici": "", # "composti inorganici": "", @@ -191,8 +191,30 @@ "volgare": "vulgar", } +# https://it.wiktionary.org/wiki/Categoria:Template_ambito +GLOSS_LIST_TEMPATE_TAGS = { + "accrescitivo": "augmentative", # Template:Accr + "colloquiale": "colloquial", # Template:Coll + "diminutivo": "diminutive", # Template:Dim + "per estensione": "broadly", # Template:Est + "senso figurato": "figuratively", # Template:Fig + "letteralmente": "literally", # Template:Lett + "peggiorativo": "pejorative", # Template:Pegg + "riferito solo a persone": "person", # Template:Pers + "per sineddoche": "synecdoche", # Template:Sndc + "specialmente al plurale": ["especially", "in-plural"], # Template:Spec pl + "spregiativo": "pejorative", # Template:Spreg + "vezzeggiativo": "endearing", # Template:Vezz + "volgare": "vulgar", # Template:Vulg +} + -TAGS = {**TABLE_TAGS, **FORM_LINE_TEMPLATE_TAGS, **TERM_TEMPLATE_TAGS} +TAGS = { + **TABLE_TAGS, + **FORM_LINE_TEMPLATE_TAGS, + **TERM_TEMPLATE_TAGS, + **GLOSS_LIST_TEMPATE_TAGS, +} def translate_raw_tags(data: WordEntry) -> None: