From a13ef5559f73ad99c9362345b31bf24d517f8c8b Mon Sep 17 00:00:00 2001
From: thallysonjsa
Date: Mon, 20 Jan 2025 15:51:13 -0300
Subject: [PATCH] adding the imdb_ptbr scenario

---
 .../run_specs/imdb_ptbr_run_specs.py          | 34 ++++++++++
 .../benchmark/scenarios/imdb_ptbr_scenario.py | 81 +++++++++++++++++++
 .../scenarios/test_imdb_ptbr_scenario.py      | 27 ++++++++
 3 files changed, 142 insertions(+)
 create mode 100644 src/helm/benchmark/run_specs/imdb_ptbr_run_specs.py
 create mode 100644 src/helm/benchmark/scenarios/imdb_ptbr_scenario.py
 create mode 100644 src/helm/benchmark/scenarios/test_imdb_ptbr_scenario.py

diff --git a/src/helm/benchmark/run_specs/imdb_ptbr_run_specs.py b/src/helm/benchmark/run_specs/imdb_ptbr_run_specs.py
new file mode 100644
index 0000000000..1bd205caee
--- /dev/null
+++ b/src/helm/benchmark/run_specs/imdb_ptbr_run_specs.py
@@ -0,0 +1,34 @@
+from helm.benchmark.adaptation.common_adapter_specs import get_generation_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs, get_classification_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("imdb_ptbr")
+def get_tweetsentbr_spec() -> RunSpec:
+    """Build the RunSpec for the IMDB PT-BR movie-review sentiment benchmark."""
+    scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.imdb_ptbr_scenario.IMDB_PTBRScenario", args={})
+
+    # Built line by line so source indentation never leaks into the prompt text.
+    instructions = (
+        'Classifique a resenha do usuário sobre o filme como "positivo" ou "negativo".\n'
+        "\n"
+        "Resenha: Tudo sobre o filme é maravilhoso. Atuações, trilha sonora, fotografia. Amei tudo!\n"
+        "Classe: positivo\n"
+        "\n"
+        "Resenha: Achei um filme bem fraco, não gostei da história.\n"
+        "Classe: negativo\n"
+    )
+    adapter_spec = get_generation_adapter_spec(
+        instructions=instructions,
+        input_noun="Resenha",
+        output_noun="Classe",
+    )
+
+    return RunSpec(
+        name="imdb_ptbr",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
+        groups=["imdb_ptbr"],
+    )
diff --git a/src/helm/benchmark/scenarios/imdb_ptbr_scenario.py b/src/helm/benchmark/scenarios/imdb_ptbr_scenario.py
new file mode 100644
index 0000000000..c0536e26ee
--- /dev/null
+++ b/src/helm/benchmark/scenarios/imdb_ptbr_scenario.py
@@ -0,0 +1,81 @@
+from typing import Any, Dict, List, Optional
+from pathlib import Path
+from datasets import load_dataset
+from helm.common.hierarchical_logger import hlog
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class IMDB_PTBRScenario(Scenario):
+    """
+    The IMDB dataset is a widely-used benchmark dataset for natural language processing (NLP)
+    particularly for text classification and sentiment analysis.
+    This is a translated version that is meant to evaluate PT-BR models.
+    It consists of movie reviews from the Internet Movie Database (IMDB) and
+    includes both positive and negative sentiments labeled for supervised learning.
+    """
+
+    name = "imdb_ptbr"
+    description = "Classify movie reviews between positive or negative."
+    tags = ["classification"]
+
+    def process_dataset(self, dataset: Any, split: str, helm_split: Optional[str] = None) -> List[Instance]:
+        """Convert one split of the dataset into HELM instances.
+
+        Args:
+            dataset: Mapping from split name to rows; each row has a "text" field and a 0/1 "label" field.
+            split: Key used to select the rows from `dataset`.
+            helm_split: Split recorded on each Instance; defaults to `split`.
+
+        Returns:
+            One Instance per row, in dataset order.
+
+        Raises:
+            KeyError: If `split` is missing from `dataset` or a row has a label other than 0 or 1.
+        """
+        if helm_split is None:
+            helm_split = split
+        label_names = {0: "negativo", 1: "positivo"}
+        # Only the gold label is attached as a reference, tagged CORRECT_TAG.
+        return [
+            Instance(
+                input=Input(text=example["text"]),
+                references=[Reference(Output(text=label_names[example["label"]]), tags=[CORRECT_TAG])],
+                split=helm_split,
+            )
+            for example in dataset[split]
+        ]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """Download maritaca-ai/imdb_pt and return its train and test instances.
+
+        Args:
+            output_path: Directory under which the dataset cache ("data") is stored.
+
+        Returns:
+            Train instances followed by test instances; splits absent from the dataset are skipped.
+        """
+        instances: List[Instance] = []
+        cache_dir = str(Path(output_path) / "data")
+        dataset = load_dataset("maritaca-ai/imdb_pt", cache_dir=cache_dir)
+        # Dataset split name -> HELM split constant.
+        splits: Dict[str, str] = {
+            "train": TRAIN_SPLIT,
+            "test": TEST_SPLIT,
+        }
+        for dataset_split, helm_split in splits.items():
+            # Check against the loaded dataset; checking against `splits` itself could never fail.
+            if dataset_split not in dataset:
+                hlog(f"{dataset_split} split doesn't exist, skipping")
+                continue
+            instances.extend(self.process_dataset(dataset, dataset_split, helm_split))
+
+        return instances
diff --git a/src/helm/benchmark/scenarios/test_imdb_ptbr_scenario.py b/src/helm/benchmark/scenarios/test_imdb_ptbr_scenario.py
new file mode 100644
index 0000000000..f36b4733e4
--- /dev/null
+++ b/src/helm/benchmark/scenarios/test_imdb_ptbr_scenario.py
@@ -0,0 +1,27 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.imdb_ptbr_scenario import IMDB_PTBRScenario
+from helm.benchmark.scenarios.scenario import TRAIN_SPLIT, CORRECT_TAG, Output, Reference
+
+
+@pytest.mark.scenarios
+def test_imdb_ptbr_scenario():
+    """Smoke-test dataset loading; needs network access to fetch maritaca-ai/imdb_pt."""
+    imdb_ptbr = IMDB_PTBRScenario()
+    with TemporaryDirectory() as tmpdir:
+        instances = imdb_ptbr.get_instances(tmpdir)
+    assert len(instances) == 30000
+    assert instances[0].split == TRAIN_SPLIT
+
+    assert instances[10].input.text.startswith(
+        "Foi ótimo ver algumas das minhas estrelas favoritas de 30 anos atrás, incluindo John Ritter, Ben Gazarra e Audrey Hepburn."
+    )
+    assert len(instances[10].input.text) == 1549
+
+    assert instances[10].references == [
+        Reference(
+            output=Output(text="negativo"),
+            tags=[CORRECT_TAG],
+        )
+    ]