Misc cleanup for HELM Capabilities (#3274)
yifanmai authored Jan 15, 2025
1 parent 716e523 commit 6d70e98
Showing 10 changed files with 100 additions and 74 deletions.
2 changes: 1 addition & 1 deletion src/helm/benchmark/annotation/wildbench_annotator.py
@@ -62,4 +62,4 @@ def annotate(self, request_state: RequestState) -> Any:
except ValueError:
raise ValueError(f"Malformed score '{score_text}' in annotator response: {annotator_response_text}")

return {"prompt_text": annotator_prompt, "strengths": strengths, "weaknesses": weaknesses, "score": score}
return {"strengths": strengths, "weaknesses": weaknesses, "score": score}
2 changes: 2 additions & 0 deletions src/helm/benchmark/metrics/wildbench_metrics.py
@@ -20,6 +20,8 @@ def evaluate_generation(
) -> List[Stat]:
assert request_state.annotations
score = request_state.annotations["wildbench"]["score"]
score_rescaled = (score - 1) / 9
return [
Stat(MetricName("wildbench_score")).add(score),
Stat(MetricName("wildbench_score_rescaled")).add(score_rescaled),
]
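Note on the new statistic: wildbench_score_rescaled linearly maps the judge's score onto the range 0-1, and the (score - 1) / 9 formula implies the raw score sits on a 1-10 scale. A minimal arithmetic sketch in Python (the scores below are illustrative, not taken from this diff):

    # Hypothetical raw judge scores, shown only to illustrate (score - 1) / 9.
    for score in (1.0, 5.5, 10.0):
        print(score, (score - 1) / 9)  # 1.0 -> 0.0, 5.5 -> 0.5, 10.0 -> 1.0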
6 changes: 3 additions & 3 deletions src/helm/benchmark/presentation/run_entries_capabilities.conf
@@ -1,10 +1,10 @@
# Scenarios for HELM Capabilities.

entries: [
{description: "mmlu_pro:subject=all,use_chain_of_thought=true,model=text", priority: 1}
{description: "gpqa:subset=main,use_chain_of_thought=true,model=text", priority: 1}
{description: "mmlu_pro:subject=all,model=text", priority: 1}
{description: "gpqa:subset=main,model=text", priority: 1}
{description: "ifeval:model=text", priority: 1}
{description: "wildbench:subset=v2,model=text", priority: 1}
{description: "bigcodebench:version=v0.1.2,model=text", priority: 1}
{description: "bigcodebench:version=v0.1.3,model=text", priority: 1}
{description: "omni_math:model=text", priority: 1}
]
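Each description above names a registered run spec function and passes its comma-separated key=value pairs as string keyword arguments; the model=text part is a model-selection expression that HELM's run entry expansion handles separately rather than forwarding to the function. A rough sketch of the name/argument split (split_description is a hypothetical helper for illustration only; HELM's actual resolver also handles quoting and the model keys):

    def split_description(description: str):
        # Hypothetical helper: split "name:key=val,key=val" into (name, kwargs).
        name, _, arg_string = description.partition(":")
        kwargs = dict(pair.split("=", 1) for pair in arg_string.split(",")) if arg_string else {}
        return name, kwargs

    split_description("gpqa:subset=main,model=text")
    # -> ("gpqa", {"subset": "main", "model": "text"})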
74 changes: 67 additions & 7 deletions src/helm/benchmark/run_specs/capabilities_run_specs.py
@@ -20,8 +20,69 @@
from helm.benchmark.scenarios.scenario import ScenarioSpec


def _bool_to_str(value: bool):
return str(value).lower()


@run_spec_function("mmlu_pro")
def get_mmlu_pro_spec(subject: str, use_chain_of_thought: str = "true", use_few_shot: str = "false") -> RunSpec:
# Convert to bools and remove the str versions
use_chain_of_thought_bool: bool = use_chain_of_thought.lower() == "true"
use_few_shot_bool: bool = use_few_shot.lower() == "true"
del use_chain_of_thought
del use_few_shot

run_spec_name = f"mmlu_pro:subset={subject},use_chain_of_thought={_bool_to_str(use_chain_of_thought_bool)},use_few_shot={_bool_to_str(use_few_shot_bool)}" # noqa: E501
scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.mmlu_pro_scenario.MMLUProScenario", args={"subject": subject}
)
max_train_instance_num = 5 if use_few_shot_bool else 0

if use_chain_of_thought_bool:
adapter_spec = AdapterSpec(
method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
max_train_instances=max_train_instance_num,
max_tokens=2000,
input_prefix="What is the correct answer to this question: ",
input_suffix="\nChoices:\n",
output_prefix="",
global_suffix=(
"Let’s think step by step. Based on your reasoning, what is the single, "
"most likely answer choice? Format your response as follows: "
'"The correct answer is (insert answer here)".'
),
)
return RunSpec(
name=run_spec_name,
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=get_basic_metric_specs([])
+ [
MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}),
],
groups=["mmlu_pro"],
)
else:
adapter_spec = AdapterSpec(
method=ADAPT_MULTIPLE_CHOICE_JOINT,
max_train_instances=max_train_instance_num,
max_tokens=2000,
input_prefix="What is the correct answer to this question: ",
input_suffix="\nChoices:\n",
output_prefix="",
global_suffix=("Format your response as follows: " '"The correct answer is (insert answer here)".'),
)
return RunSpec(
name=run_spec_name,
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=get_exact_match_metric_specs(),
groups=["mmlu_pro"],
)


@run_spec_function("gpqa")
def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec:
def get_gpqa_spec(subset: str, use_chain_of_thought: str = "true", use_few_shot: str = "false") -> RunSpec:
# Convert to bools and remove the str versions
use_chain_of_thought_bool: bool = use_chain_of_thought.lower() == "true"
use_few_shot_bool: bool = use_few_shot.lower() == "true"
@@ -114,7 +175,7 @@ def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec:
)

return RunSpec(
name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}",
name=f"gpqa:subset={subset},use_chain_of_thought={_bool_to_str(use_chain_of_thought_bool)},use_few_shot={_bool_to_str(use_few_shot_bool)}", # noqa: E501
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=metric_specs,
@@ -145,13 +206,12 @@ def get_ifeval_spec() -> RunSpec:


@run_spec_function("wildbench")
def get_wildbench_spec(subset: str, use_model_outputs: str = "False") -> RunSpec:

def get_wildbench_spec(subset: str) -> RunSpec:
scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.wildbench_scenario.WildBenchScenario",
args={
"subset": subset,
"use_model_outputs": use_model_outputs.lower() == "true",
"use_model_outputs": False,
},
)

@@ -164,7 +224,7 @@ def get_wildbench_spec(subset: str, use_model_outputs: str = "False") -> RunSpec
]

return RunSpec(
name="wildbench",
name=f"wildbench:subset={subset}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
annotators=annotator_specs,
Expand Down Expand Up @@ -199,7 +259,7 @@ def get_bigcodebench_spec(version: str) -> RunSpec:
]

return RunSpec(
name="bigcodebench",
name=f"bigcodebench:version={version}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
annotators=annotator_specs,
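With these changes, the Capabilities run spec functions take string-typed boolean flags (compared case-insensitively against "true") and embed every argument in the run spec name. A small usage sketch against the new signatures (argument values chosen for illustration):

    # mmlu_pro now defaults to chain of thought on and few-shot off.
    print(get_mmlu_pro_spec(subject="all").name)
    # mmlu_pro:subset=all,use_chain_of_thought=true,use_few_shot=false

    # gpqa's chain-of-thought default flips from "False" to "true".
    print(get_gpqa_spec(subset="main").name)
    # gpqa:subset=main,use_chain_of_thought=true,use_few_shot=false

    # wildbench and bigcodebench run spec names now carry their arguments.
    print(get_wildbench_spec(subset="v2").name)          # wildbench:subset=v2
    print(get_bigcodebench_spec(version="v0.1.3").name)  # bigcodebench:version=v0.1.3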
58 changes: 0 additions & 58 deletions src/helm/benchmark/run_specs/lite_run_specs.py
@@ -5,7 +5,6 @@
from helm.benchmark.adaptation.adapter_spec import (
ADAPT_GENERATION,
ADAPT_MULTIPLE_CHOICE_JOINT,
ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
AdapterSpec,
)
from helm.benchmark.adaptation.common_adapter_specs import (
@@ -25,7 +24,6 @@
from helm.benchmark.run_spec import RunSpec, run_spec_function
from helm.benchmark.runner import get_benchmark_output_path
from helm.benchmark.scenarios.scenario import ScenarioSpec, get_scenario_cache_path
from helm.benchmark.metrics.metric import MetricSpec


@run_spec_function("narrative_qa")
@@ -136,62 +134,6 @@ def get_mmlu_spec(subject: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> RunSpec:
)


@run_spec_function("mmlu_pro")
def get_mmlu_pro_spec(subject: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec:
# Convert to bools and remove the str versions
use_chain_of_thought_bool: bool = use_chain_of_thought.lower() == "true"
use_few_shot_bool: bool = use_few_shot.lower() == "true"
del use_chain_of_thought
del use_few_shot

scenario_spec = ScenarioSpec(
class_name="helm.benchmark.scenarios.mmlu_pro_scenario.MMLUProScenario", args={"subject": subject}
)
max_train_instance_num = 5 if use_few_shot_bool else 0

if use_chain_of_thought_bool:
adapter_spec = AdapterSpec(
method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT,
max_train_instances=max_train_instance_num,
max_tokens=2000,
input_prefix="What is the correct answer to this question: ",
input_suffix="\nChoices:\n",
output_prefix="",
global_suffix=(
"Let’s think step by step. Based on your reasoning, what is the single, "
"most likely answer choice? Format your response as follows: "
'"The correct answer is (insert answer here)".'
),
)
return RunSpec(
name=f"mmlu_pro:subset={subject},use_chain_of_thought={use_chain_of_thought_bool}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=get_basic_metric_specs([])
+ [
MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}),
],
groups=["mmlu_pro"],
)
else:
adapter_spec = AdapterSpec(
method=ADAPT_MULTIPLE_CHOICE_JOINT,
max_train_instances=max_train_instance_num,
max_tokens=2000,
input_prefix="What is the correct answer to this question: ",
input_suffix="\nChoices:\n",
output_prefix="",
global_suffix=("Format your response as follows: " '"The correct answer is (insert answer here)".'),
)
return RunSpec(
name=f"mmlu_pro:subset={subject},use_chain_of_thought={use_chain_of_thought_bool}",
scenario_spec=scenario_spec,
adapter_spec=adapter_spec,
metric_specs=get_exact_match_metric_specs(),
groups=["mmlu_pro"],
)


@run_spec_function("gsm")
def get_gsm_spec() -> RunSpec:
scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.gsm_scenario.GSM8KScenario", args={})
4 changes: 2 additions & 2 deletions src/helm/benchmark/scenarios/bigcodebench_scenario.py
@@ -10,7 +10,7 @@
from helm.common.general import ensure_directory_exists


VERSIONS = ["v0.1.2"]
VERSIONS = ["v0.1.0_hf", "v0.1.1", "v0.1.2", "v0.1.3"]


class BigCodeBenchScenario(Scenario):
@@ -36,7 +36,7 @@ def get_instances(self, output_path: str) -> List[Instance]:
ensure_directory_exists(cache_dir)
dataset = datasets.load_dataset(
"bigcode/bigcodebench",
revision="35a015f216382cb88997b91b9400357a79e55141", # for v0.1.2
revision="057dd1a30dd73d4ed59cfbaaae049870491fa4d6",
cache_dir=cache_dir,
split=self.version,
)
8 changes: 7 additions & 1 deletion src/helm/benchmark/scenarios/ifeval_scenario.py
@@ -27,7 +27,13 @@ def get_instances(self, output_path: str) -> List[Instance]:
# Get IFEval from HuggingFace
cache_dir = os.path.join(output_path, "data")
ensure_directory_exists(cache_dir)
dataset = datasets.load_dataset("google/IFEval", trust_remote_code=True, cache_dir=cache_dir, split="train")
dataset = datasets.load_dataset(
"google/IFEval",
trust_remote_code=True,
cache_dir=cache_dir,
split="train",
revision="966cd89545d6b6acfd7638bc708b98261ca58e84",
)
assert isinstance(dataset, datasets.Dataset)

# Read all instances
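The added revision argument pins the Hugging Face dataset to a fixed commit, so the IFEval inputs stay stable even if the upstream dataset changes. A minimal standalone sketch of the same pattern (dataset name and hash copied from the hunk above; cache_dir and trust_remote_code omitted):

    import datasets

    dataset = datasets.load_dataset(
        "google/IFEval",
        split="train",
        revision="966cd89545d6b6acfd7638bc708b98261ca58e84",  # pin to a specific dataset commit
    )
    print(len(dataset))  # number of IFEval prompts at this revision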
2 changes: 1 addition & 1 deletion src/helm/benchmark/scenarios/mmlu_pro_scenario.py
@@ -53,7 +53,7 @@ def process_dataset(self, data: Dataset, split: str) -> List[Instance]:
instances: List[Instance] = []
hlog(f"Processing data for {split} split")
for row in data:
id = row["id"]
id = row["question_id"]
question = row["question"]
answers = row["options"]
correct_choice = row["answer"]
2 changes: 2 additions & 0 deletions src/helm/benchmark/scenarios/wildbench_scenario.py
@@ -41,6 +41,7 @@ def get_instances(self, output_path: str) -> List[Instance]:
self.subset,
cache_dir=cache_dir,
split="test",
revision="7c05c1b4550282b2ed6a2e6ac5db069f1e07df5c",
)
assert isinstance(dataset, datasets.Dataset)
if self.use_model_outputs:
@@ -50,6 +51,7 @@ def get_instances(self, output_path: str) -> List[Instance]:
model,
cache_dir=cache_dir,
split="train",
revision="d6755bc68220df853c0825a733430f73f5af2501",
)
for model in REFERENCE_MODELS
}
16 changes: 15 additions & 1 deletion src/helm/benchmark/static/schema_lite_v2.yaml
@@ -65,6 +65,11 @@ metrics:
- name: predicted_index
display_name: Predicted index
description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
- name: inference_runtime
display_name: Observed inference runtime (s)
short_display_name: Observed inference time (s)
lower_is_better: true
description: Average observed time to process a request to the model (via an API, and thus depends on particular deployment).

# Accuracy metrics:
- name: exact_match
@@ -103,6 +108,11 @@ metrics:
short_display_name: WB Score
description: Score of the AI output judged by GPT-4o.
lower_is_better: false
- name: wildbench_score_rescaled
display_name: WildBench Score
short_display_name: WB Score
description: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1.
lower_is_better: false
- name: bigcodebench_p@1
display_name: BigCodeBench Pass@1
short_display_name: Pass@1
@@ -121,12 +131,16 @@ perturbations: []
metric_groups:
- name: accuracy
display_name: Accuracy
aggregation_strategies:
- mean
metrics:
- name: ${main_name}
split: ${main_split}

- name: efficiency
display_name: Efficiency
aggregation_strategies:
- mean
metrics:
- name: inference_runtime
split: ${main_split}
@@ -219,7 +233,7 @@ run_groups:
- efficiency
- general_information
environment:
main_name: wildbench_score
main_name: wildbench_score_rescaled
main_split: test
taxonomy:
task: "instruction following"
