stanford-crfm · yifanmai · Feb 5, 2025 · Jan 20, 2025 · Jan 22, 2025 · Jan 22, 2025
diff --git a/src/helm/benchmark/run_specs/imdb_ptbr_run_specs.py b/src/helm/benchmark/run_specs/imdb_ptbr_run_specs.py
@@ -0,0 +1,30 @@
+from helm.benchmark.adaptation.common_adapter_specs import get_generation_adapter_spec
+from helm.benchmark.metrics.common_metric_specs import get_exact_match_metric_specs, get_classification_metric_specs
+from helm.benchmark.run_spec import RunSpec, run_spec_function
+from helm.benchmark.scenarios.scenario import ScenarioSpec
+
+
+@run_spec_function("imdb_ptbr")
+def get_tweetsentbr_spec() -> RunSpec:
+    """Build the RunSpec registered as "imdb_ptbr": a two-shot PT-BR movie review sentiment prompt."""
+    # NOTE(review): the function name says "tweetsentbr" although it is registered as "imdb_ptbr";
+    # looks like a copy-paste leftover. It is kept as-is so existing imports by name keep working.
+    instructions = """Classifique a resenha do usuário sobre o filme como "positivo" ou "negativo".
+
+        Resenha: Tudo sobre o filme é maravilhoso. Atuações, trilha sonora, fotografia. Amei tudo!
+        Classe: positivo
+
+        Resenha: Achei um filme bem fraco, não gostei da história.
+        Classe: negativo
+        """
+    return RunSpec(
+        name="imdb_ptbr",
+        scenario_spec=ScenarioSpec(
+            class_name="helm.benchmark.scenarios.imdb_ptbr_scenario.IMDB_PTBRScenario", args={}
+        ),
+        adapter_spec=get_generation_adapter_spec(
+            instructions=instructions, input_noun="Resenha", output_noun="Classe"
+        ),
+        metric_specs=get_exact_match_metric_specs() + get_classification_metric_specs(),
+        groups=["imdb_ptbr"],
+    )
diff --git a/src/helm/benchmark/scenarios/imdb_ptbr_scenario.py b/src/helm/benchmark/scenarios/imdb_ptbr_scenario.py
@@ -0,0 +1,60 @@
+from typing import Any, List, Dict
+from pathlib import Path
+from datasets import load_dataset
+from helm.common.hierarchical_logger import hlog
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class IMDB_PTBRScenario(Scenario):
+    """
+    The IMDB dataset is a widely-used benchmark dataset for natural language processing (NLP),
+    particularly for text classification and sentiment analysis.
+    This is a translated version that is meant to evaluate PT-BR models.
+    It consists of movie reviews from the Internet Movie Database (IMDB) and
+    includes both positive and negative sentiments labeled for supervised learning.
+    """
+
+    name = "imdb_ptbr"
+    description = "Classify movie reviews between positive or negative."
+    tags = ["classification"]
+
+    def process_dataset(self, dataset: Any, split: str) -> List[Instance]:
+        """Convert every example of `dataset[split]` into an Instance tagged with `split`."""
+        # Maps the dataset's integer labels to the output class names.
+        label_names = {0: "negativo", 1: "positivo"}
+        # NOTE(review): only the gold class is emitted as a reference (tagged CORRECT_TAG);
+        # confirm whether the classification metrics also need the incorrect class listed.
+        return [
+            Instance(
+                input=Input(text=example["text"]),
+                references=[Reference(Output(text=label_names[example["label"]]), tags=[CORRECT_TAG])],
+                split=split,
+            )
+            for example in dataset[split]
+        ]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """Load maritaca-ai/imdb_pt (cached under `output_path`/data) and return its instances."""
+        cache_dir = str(Path(output_path) / "data")
+        dataset = load_dataset("maritaca-ai/imdb_pt", cache_dir=cache_dir)
+        # Dataset split name -> HELM split constant.
+        splits: Dict[str, str] = {"train": TRAIN_SPLIT, "test": TEST_SPLIT}
+        instances: List[Instance] = []
+        for dataset_split, helm_split in splits.items():
+            # Check the loaded dataset (not the mapping itself) so a missing split is really skipped.
+            if dataset_split not in dataset:
+                hlog(f"{dataset_split} split doesn't exist, skipping")
+                continue
+            # Re-key the split so the instances carry the HELM split name, whatever its value is.
+            instances.extend(self.process_dataset({helm_split: dataset[dataset_split]}, helm_split))
+
+        return instances
diff --git a/src/helm/benchmark/scenarios/test_imdb_ptbr_scenario.py b/src/helm/benchmark/scenarios/test_imdb_ptbr_scenario.py
@@ -0,0 +1,27 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.imdb_ptbr_scenario import IMDB_PTBRScenario
+from helm.benchmark.scenarios.scenario import TRAIN_SPLIT, CORRECT_TAG, Output, Reference
+
+
+@pytest.mark.scenarios
+def test_imdb_ptbr_scenario():
+    """Load the scenario's instances and spot-check the total count, first split and one instance."""
+    scenario = IMDB_PTBRScenario()
+    with TemporaryDirectory() as output_dir:
+        instances = scenario.get_instances(output_dir)
+
+    # The expected values below are hard-coded for the dataset the scenario loads.
+    assert len(instances) == 30000
+    assert instances[0].split == TRAIN_SPLIT
+
+    sample = instances[10]
+    expected_prefix = (
+        "Foi ótimo ver algumas das minhas estrelas favoritas de 30 anos atrás, "
+        "incluindo John Ritter, Ben Gazarra e Audrey Hepburn."
+    )
+    assert sample.input.text.startswith(expected_prefix)
+    assert len(sample.input.text) == 1549
+    expected_reference = Reference(output=Output(text="negativo"), tags=[CORRECT_TAG])
+    assert sample.references == [expected_reference]