From 6d70e983fda1043eced0e702076ed749a8ebe03a Mon Sep 17 00:00:00 2001 From: Yifan Mai Date: Tue, 14 Jan 2025 22:16:12 -0800 Subject: [PATCH] Misc cleanup for HELM Capabilities (#3274) --- .../annotation/wildbench_annotator.py | 2 +- .../benchmark/metrics/wildbench_metrics.py | 2 + .../run_entries_capabilities.conf | 6 +- .../run_specs/capabilities_run_specs.py | 74 +++++++++++++++++-- .../benchmark/run_specs/lite_run_specs.py | 58 --------------- .../scenarios/bigcodebench_scenario.py | 4 +- .../benchmark/scenarios/ifeval_scenario.py | 8 +- .../benchmark/scenarios/mmlu_pro_scenario.py | 2 +- .../benchmark/scenarios/wildbench_scenario.py | 2 + src/helm/benchmark/static/schema_lite_v2.yaml | 16 +++- 10 files changed, 100 insertions(+), 74 deletions(-) diff --git a/src/helm/benchmark/annotation/wildbench_annotator.py b/src/helm/benchmark/annotation/wildbench_annotator.py index 94c9f4cd5d5..f74d873eb57 100644 --- a/src/helm/benchmark/annotation/wildbench_annotator.py +++ b/src/helm/benchmark/annotation/wildbench_annotator.py @@ -62,4 +62,4 @@ def annotate(self, request_state: RequestState) -> Any: except ValueError: raise ValueError(f"Malformed score '{score_text}' in annotator response: {annotator_response_text}") - return {"prompt_text": annotator_prompt, "strengths": strengths, "weaknesses": weaknesses, "score": score} + return {"strengths": strengths, "weaknesses": weaknesses, "score": score} diff --git a/src/helm/benchmark/metrics/wildbench_metrics.py b/src/helm/benchmark/metrics/wildbench_metrics.py index 01b7a2abd00..b3deb766b15 100644 --- a/src/helm/benchmark/metrics/wildbench_metrics.py +++ b/src/helm/benchmark/metrics/wildbench_metrics.py @@ -20,6 +20,8 @@ def evaluate_generation( ) -> List[Stat]: assert request_state.annotations score = request_state.annotations["wildbench"]["score"] + score_rescaled = (score - 1) / 9 return [ Stat(MetricName("wildbench_score")).add(score), + Stat(MetricName("wildbench_score_rescaled")).add(score_rescaled), ] diff --git a/src/helm/benchmark/presentation/run_entries_capabilities.conf b/src/helm/benchmark/presentation/run_entries_capabilities.conf index 724eef32aba..f3051eea751 100644 --- a/src/helm/benchmark/presentation/run_entries_capabilities.conf +++ b/src/helm/benchmark/presentation/run_entries_capabilities.conf @@ -1,10 +1,10 @@ # Scenarios for HELM Capabilities. 
entries: [ - {description: "mmlu_pro:subject=all,use_chain_of_thought=true,model=text", priority: 1} - {description: "gpqa:subset=main,use_chain_of_thought=true,model=text", priority: 1} + {description: "mmlu_pro:subject=all,model=text", priority: 1} + {description: "gpqa:subset=main,model=text", priority: 1} {description: "ifeval:model=text", priority: 1} {description: "wildbench:subset=v2,model=text", priority: 1} - {description: "bigcodebench:version=v0.1.2,model=text", priority: 1} + {description: "bigcodebench:version=v0.1.3,model=text", priority: 1} {description: "omni_math:model=text", priority: 1} ] diff --git a/src/helm/benchmark/run_specs/capabilities_run_specs.py b/src/helm/benchmark/run_specs/capabilities_run_specs.py index 36b46999332..782c9296d49 100644 --- a/src/helm/benchmark/run_specs/capabilities_run_specs.py +++ b/src/helm/benchmark/run_specs/capabilities_run_specs.py @@ -20,8 +20,69 @@ from helm.benchmark.scenarios.scenario import ScenarioSpec +def _bool_to_str(value: bool): + return str(value).lower() + + +@run_spec_function("mmlu_pro") +def get_mmlu_pro_spec(subject: str, use_chain_of_thought: str = "true", use_few_shot: str = "false") -> RunSpec: + # Convert to bools and remove the str versions + use_chain_of_thought_bool: bool = use_chain_of_thought.lower() == "true" + use_few_shot_bool: bool = use_few_shot.lower() == "true" + del use_chain_of_thought + del use_few_shot + + run_spec_name = f"mmlu_pro:subset={subject},use_chain_of_thought={_bool_to_str(use_chain_of_thought_bool)},use_few_shot={_bool_to_str(use_few_shot_bool)}" # noqa: E501 + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.mmlu_pro_scenario.MMLUProScenario", args={"subject": subject} + ) + max_train_instance_num = 5 if use_few_shot_bool else 0 + + if use_chain_of_thought_bool: + adapter_spec = AdapterSpec( + method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT, + max_train_instances=max_train_instance_num, + max_tokens=2000, + input_prefix="What is the correct answer to this question: ", + input_suffix="\nChoices:\n", + output_prefix="", + global_suffix=( + "Let’s think step by step. Based on your reasoning, what is the single, " + "most likely answer choice? Format your response as follows: " + '"The correct answer is (insert answer here)".' 
+ ), + ) + return RunSpec( + name=run_spec_name, + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_basic_metric_specs([]) + + [ + MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}), + ], + groups=["mmlu_pro"], + ) + else: + adapter_spec = AdapterSpec( + method=ADAPT_MULTIPLE_CHOICE_JOINT, + max_train_instances=max_train_instance_num, + max_tokens=2000, + input_prefix="What is the correct answer to this question: ", + input_suffix="\nChoices:\n", + output_prefix="", + global_suffix=("Format your response as follows: " '"The correct answer is (insert answer here)".'), + ) + return RunSpec( + name=run_spec_name, + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_exact_match_metric_specs(), + groups=["mmlu_pro"], + ) + + @run_spec_function("gpqa") -def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec: +def get_gpqa_spec(subset: str, use_chain_of_thought: str = "true", use_few_shot: str = "false") -> RunSpec: # Convert to bools and remove the str versions use_chain_of_thought_bool: bool = use_chain_of_thought.lower() == "true" use_few_shot_bool: bool = use_few_shot.lower() == "true" @@ -114,7 +175,7 @@ def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot ) return RunSpec( - name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}", + name=f"gpqa:subset={subset},use_chain_of_thought={_bool_to_str(use_chain_of_thought_bool)},use_few_shot={_bool_to_str(use_few_shot_bool)}", # noqa: E501 scenario_spec=scenario_spec, adapter_spec=adapter_spec, metric_specs=metric_specs, @@ -145,13 +206,12 @@ def get_ifeval_spec() -> RunSpec: @run_spec_function("wildbench") -def get_wildbench_spec(subset: str, use_model_outputs: str = "False") -> RunSpec: - +def get_wildbench_spec(subset: str) -> RunSpec: scenario_spec = ScenarioSpec( class_name="helm.benchmark.scenarios.wildbench_scenario.WildBenchScenario", args={ "subset": subset, - "use_model_outputs": use_model_outputs.lower() == "true", + "use_model_outputs": False, }, ) @@ -164,7 +224,7 @@ def get_wildbench_spec(subset: str, use_model_outputs: str = "False") -> RunSpec ] return RunSpec( - name="wildbench", + name=f"wildbench:subset={subset}", scenario_spec=scenario_spec, adapter_spec=adapter_spec, annotators=annotator_specs, @@ -199,7 +259,7 @@ def get_bigcodebench_spec(version: str) -> RunSpec: ] return RunSpec( - name="bigcodebench", + name=f"bigcodebench:version={version}", scenario_spec=scenario_spec, adapter_spec=adapter_spec, annotators=annotator_specs, diff --git a/src/helm/benchmark/run_specs/lite_run_specs.py b/src/helm/benchmark/run_specs/lite_run_specs.py index b034ff48ec5..17c7f323906 100644 --- a/src/helm/benchmark/run_specs/lite_run_specs.py +++ b/src/helm/benchmark/run_specs/lite_run_specs.py @@ -5,7 +5,6 @@ from helm.benchmark.adaptation.adapter_spec import ( ADAPT_GENERATION, ADAPT_MULTIPLE_CHOICE_JOINT, - ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT, AdapterSpec, ) from helm.benchmark.adaptation.common_adapter_specs import ( @@ -25,7 +24,6 @@ from helm.benchmark.run_spec import RunSpec, run_spec_function from helm.benchmark.runner import get_benchmark_output_path from helm.benchmark.scenarios.scenario import ScenarioSpec, get_scenario_cache_path -from helm.benchmark.metrics.metric import MetricSpec @run_spec_function("narrative_qa") @@ -136,62 +134,6 @@ def get_mmlu_spec(subject: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> Ru ) 
-@run_spec_function("mmlu_pro") -def get_mmlu_pro_spec(subject: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec: - # Convert to bools and remove the str versions - use_chain_of_thought_bool: bool = use_chain_of_thought.lower() == "true" - use_few_shot_bool: bool = use_few_shot.lower() == "true" - del use_chain_of_thought - del use_few_shot - - scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.mmlu_pro_scenario.MMLUProScenario", args={"subject": subject} - ) - max_train_instance_num = 5 if use_few_shot_bool else 0 - - if use_chain_of_thought_bool: - adapter_spec = AdapterSpec( - method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT, - max_train_instances=max_train_instance_num, - max_tokens=2000, - input_prefix="What is the correct answer to this question: ", - input_suffix="\nChoices:\n", - output_prefix="", - global_suffix=( - "Let’s think step by step. Based on your reasoning, what is the single, " - "most likely answer choice? Format your response as follows: " - '"The correct answer is (insert answer here)".' - ), - ) - return RunSpec( - name=f"mmlu_pro:subset={subject},use_chain_of_thought={use_chain_of_thought_bool}", - scenario_spec=scenario_spec, - adapter_spec=adapter_spec, - metric_specs=get_basic_metric_specs([]) - + [ - MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}), - ], - groups=["mmlu_pro"], - ) - else: - adapter_spec = AdapterSpec( - method=ADAPT_MULTIPLE_CHOICE_JOINT, - max_train_instances=max_train_instance_num, - max_tokens=2000, - input_prefix="What is the correct answer to this question: ", - input_suffix="\nChoices:\n", - output_prefix="", - global_suffix=("Format your response as follows: " '"The correct answer is (insert answer here)".'), - ) - return RunSpec( - name=f"mmlu_pro:subset={subject},use_chain_of_thought={use_chain_of_thought_bool}", - scenario_spec=scenario_spec, - adapter_spec=adapter_spec, - metric_specs=get_exact_match_metric_specs(), - groups=["mmlu_pro"], - ) - - @run_spec_function("gsm") def get_gsm_spec() -> RunSpec: scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.gsm_scenario.GSM8KScenario", args={}) diff --git a/src/helm/benchmark/scenarios/bigcodebench_scenario.py b/src/helm/benchmark/scenarios/bigcodebench_scenario.py index 19b8ceb699a..e5d3d1b4cef 100644 --- a/src/helm/benchmark/scenarios/bigcodebench_scenario.py +++ b/src/helm/benchmark/scenarios/bigcodebench_scenario.py @@ -10,7 +10,7 @@ from helm.common.general import ensure_directory_exists -VERSIONS = ["v0.1.2"] +VERSIONS = ["v0.1.0_hf", "v0.1.1", "v0.1.2", "v0.1.3"] class BigCodeBenchScenario(Scenario): @@ -36,7 +36,7 @@ def get_instances(self, output_path: str) -> List[Instance]: ensure_directory_exists(cache_dir) dataset = datasets.load_dataset( "bigcode/bigcodebench", - revision="35a015f216382cb88997b91b9400357a79e55141", # for v0.1.2 + revision="057dd1a30dd73d4ed59cfbaaae049870491fa4d6", cache_dir=cache_dir, split=self.version, ) diff --git a/src/helm/benchmark/scenarios/ifeval_scenario.py b/src/helm/benchmark/scenarios/ifeval_scenario.py index 3912d92da60..54e23978237 100644 --- a/src/helm/benchmark/scenarios/ifeval_scenario.py +++ b/src/helm/benchmark/scenarios/ifeval_scenario.py @@ -27,7 +27,13 @@ def get_instances(self, output_path: str) -> List[Instance]: # Get IFEval from HuggingFace cache_dir = os.path.join(output_path, "data") ensure_directory_exists(cache_dir) - dataset = datasets.load_dataset("google/IFEval", trust_remote_code=True, 
cache_dir=cache_dir, split="train") + dataset = datasets.load_dataset( + "google/IFEval", + trust_remote_code=True, + cache_dir=cache_dir, + split="train", + revision="966cd89545d6b6acfd7638bc708b98261ca58e84", + ) assert isinstance(dataset, datasets.Dataset) # Read all instances diff --git a/src/helm/benchmark/scenarios/mmlu_pro_scenario.py b/src/helm/benchmark/scenarios/mmlu_pro_scenario.py index d42c09274e8..718d5ad3848 100644 --- a/src/helm/benchmark/scenarios/mmlu_pro_scenario.py +++ b/src/helm/benchmark/scenarios/mmlu_pro_scenario.py @@ -53,7 +53,7 @@ def process_dataset(self, data: Dataset, split: str) -> List[Instance]: instances: List[Instance] = [] hlog(f"Processing data for {split} split") for row in data: - id = row["id"] + id = row["question_id"] question = row["question"] answers = row["options"] correct_choice = row["answer"] diff --git a/src/helm/benchmark/scenarios/wildbench_scenario.py b/src/helm/benchmark/scenarios/wildbench_scenario.py index 100efa96617..b79103f8042 100644 --- a/src/helm/benchmark/scenarios/wildbench_scenario.py +++ b/src/helm/benchmark/scenarios/wildbench_scenario.py @@ -41,6 +41,7 @@ def get_instances(self, output_path: str) -> List[Instance]: self.subset, cache_dir=cache_dir, split="test", + revision="7c05c1b4550282b2ed6a2e6ac5db069f1e07df5c", ) assert isinstance(dataset, datasets.Dataset) if self.use_model_outputs: @@ -50,6 +51,7 @@ def get_instances(self, output_path: str) -> List[Instance]: model, cache_dir=cache_dir, split="train", + revision="d6755bc68220df853c0825a733430f73f5af2501", ) for model in REFERENCE_MODELS } diff --git a/src/helm/benchmark/static/schema_lite_v2.yaml b/src/helm/benchmark/static/schema_lite_v2.yaml index 35b30c5d214..1c3c5599e32 100644 --- a/src/helm/benchmark/static/schema_lite_v2.yaml +++ b/src/helm/benchmark/static/schema_lite_v2.yaml @@ -65,6 +65,11 @@ metrics: - name: predicted_index display_name: Predicted index description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice). + - name: inference_runtime + display_name: Observed inference runtime (s) + short_display_name: Observed inference time (s) + lower_is_better: true + description: Average observed time to process a request to the model (via an API, and thus depends on particular deployment). # Accuracy metrics: - name: exact_match @@ -103,6 +108,11 @@ metrics: short_display_name: WB Score description: Score of the AI output judged by GPT-4o. lower_is_better: false + - name: wildbench_score_rescaled + display_name: WildBench Score + short_display_name: WB Score + description: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1. + lower_is_better: false - name: bigcodebench_p@1 display_name: BigCodeBench Pass@1 short_display_name: Pass@1 @@ -121,12 +131,16 @@ perturbations: [] metric_groups: - name: accuracy display_name: Accuracy + aggregation_strategies: + - mean metrics: - name: ${main_name} split: ${main_split} - name: efficiency display_name: Efficiency + aggregation_strategies: + - mean metrics: - name: inference_runtime split: ${main_split} @@ -219,7 +233,7 @@ run_groups: - efficiency - general_information environment: - main_name: wildbench_score + main_name: wildbench_score_rescaled main_split: test taxonomy: task: "instruction following"
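
Note on the new wildbench_score_rescaled metric: the patch computes it as (score - 1) / 9 in wildbench_metrics.py, and the schema describes it as the GPT-4o judge score rescaled to lie between 0 and 1. A minimal standalone sketch of that mapping follows, assuming the raw judge score lies in [1, 10] (the range the formula implies); this is an illustration, not code from HELM itself.

def rescale_wildbench_score(score: float) -> float:
    """Map a raw WildBench judge score in [1, 10] onto [0, 1], mirroring the formula in wildbench_metrics.py."""
    return (score - 1) / 9


if __name__ == "__main__":
    # Sanity-check the endpoints and the midpoint of the linear map.
    assert rescale_wildbench_score(1.0) == 0.0
    assert rescale_wildbench_score(10.0) == 1.0
    assert abs(rescale_wildbench_score(5.5) - 0.5) < 1e-9
    print("wildbench_score_rescaled is consistent with a 1-10 judge scale")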
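
The capabilities run spec functions accept their flags as strings (as they arrive from run entry descriptions), coerce them to booleans, and then rebuild a canonical run spec name via _bool_to_str so that, for example, chain-of-thought is now on by default for mmlu_pro and gpqa. A simplified sketch of that pattern is below; parse_flag and bool_to_str are hypothetical stand-ins for the inline coercion and _bool_to_str in capabilities_run_specs.py, and the example name simply mirrors the f-string in the patch.

def parse_flag(value: str) -> bool:
    # Illustrative stand-in for the inline coercion in the run spec functions:
    # any casing of "true" becomes True, everything else False.
    return value.lower() == "true"


def bool_to_str(value: bool) -> str:
    # Illustrative stand-in for _bool_to_str: render the bool in the lowercase
    # form used inside canonical run spec names.
    return str(value).lower()


# With the new defaults, a bare "mmlu_pro:subject=all" entry resolves to the same
# canonical name as one that spells out use_chain_of_thought=true explicitly.
use_chain_of_thought = parse_flag("true")
use_few_shot = parse_flag("false")
canonical_name = (
    "mmlu_pro:subset=all"
    f",use_chain_of_thought={bool_to_str(use_chain_of_thought)}"
    f",use_few_shot={bool_to_str(use_few_shot)}"
)
print(canonical_name)  # mmlu_pro:subset=all,use_chain_of_thought=true,use_few_shot=false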