From 6d70e983fda1043eced0e702076ed749a8ebe03a Mon Sep 17 00:00:00 2001 From: Yifan Mai Date: Tue, 14 Jan 2025 22:16:12 -0800 Subject: [PATCH] Misc cleanup for HELM Capabilities (#3274) --- .../annotation/wildbench_annotator.py | 2 +- .../benchmark/metrics/wildbench_metrics.py | 2 + .../run_entries_capabilities.conf | 6 +- .../run_specs/capabilities_run_specs.py | 74 +++++++++++++++++-- .../benchmark/run_specs/lite_run_specs.py | 58 --------------- .../scenarios/bigcodebench_scenario.py | 4 +- .../benchmark/scenarios/ifeval_scenario.py | 8 +- .../benchmark/scenarios/mmlu_pro_scenario.py | 2 +- .../benchmark/scenarios/wildbench_scenario.py | 2 + src/helm/benchmark/static/schema_lite_v2.yaml | 16 +++- 10 files changed, 100 insertions(+), 74 deletions(-) diff --git a/src/helm/benchmark/annotation/wildbench_annotator.py b/src/helm/benchmark/annotation/wildbench_annotator.py index 94c9f4cd5d5..f74d873eb57 100644 --- a/src/helm/benchmark/annotation/wildbench_annotator.py +++ b/src/helm/benchmark/annotation/wildbench_annotator.py @@ -62,4 +62,4 @@ def annotate(self, request_state: RequestState) -> Any: except ValueError: raise ValueError(f"Malformed score '{score_text}' in annotator response: {annotator_response_text}") - return {"prompt_text": annotator_prompt, "strengths": strengths, "weaknesses": weaknesses, "score": score} + return {"strengths": strengths, "weaknesses": weaknesses, "score": score} diff --git a/src/helm/benchmark/metrics/wildbench_metrics.py b/src/helm/benchmark/metrics/wildbench_metrics.py index 01b7a2abd00..b3deb766b15 100644 --- a/src/helm/benchmark/metrics/wildbench_metrics.py +++ b/src/helm/benchmark/metrics/wildbench_metrics.py @@ -20,6 +20,8 @@ def evaluate_generation( ) -> List[Stat]: assert request_state.annotations score = request_state.annotations["wildbench"]["score"] + score_rescaled = (score - 1) / 9 return [ Stat(MetricName("wildbench_score")).add(score), + Stat(MetricName("wildbench_score_rescaled")).add(score_rescaled), ] diff --git a/src/helm/benchmark/presentation/run_entries_capabilities.conf b/src/helm/benchmark/presentation/run_entries_capabilities.conf index 724eef32aba..f3051eea751 100644 --- a/src/helm/benchmark/presentation/run_entries_capabilities.conf +++ b/src/helm/benchmark/presentation/run_entries_capabilities.conf @@ -1,10 +1,10 @@ # Scenarios for HELM Capabilities. 
entries: [ - {description: "mmlu_pro:subject=all,use_chain_of_thought=true,model=text", priority: 1} - {description: "gpqa:subset=main,use_chain_of_thought=true,model=text", priority: 1} + {description: "mmlu_pro:subject=all,model=text", priority: 1} + {description: "gpqa:subset=main,model=text", priority: 1} {description: "ifeval:model=text", priority: 1} {description: "wildbench:subset=v2,model=text", priority: 1} - {description: "bigcodebench:version=v0.1.2,model=text", priority: 1} + {description: "bigcodebench:version=v0.1.3,model=text", priority: 1} {description: "omni_math:model=text", priority: 1} ] diff --git a/src/helm/benchmark/run_specs/capabilities_run_specs.py b/src/helm/benchmark/run_specs/capabilities_run_specs.py index 36b46999332..782c9296d49 100644 --- a/src/helm/benchmark/run_specs/capabilities_run_specs.py +++ b/src/helm/benchmark/run_specs/capabilities_run_specs.py @@ -20,8 +20,69 @@ from helm.benchmark.scenarios.scenario import ScenarioSpec +def _bool_to_str(value: bool): + return str(value).lower() + + +@run_spec_function("mmlu_pro") +def get_mmlu_pro_spec(subject: str, use_chain_of_thought: str = "true", use_few_shot: str = "false") -> RunSpec: + # Convert to bools and remove the str versions + use_chain_of_thought_bool: bool = use_chain_of_thought.lower() == "true" + use_few_shot_bool: bool = use_few_shot.lower() == "true" + del use_chain_of_thought + del use_few_shot + + run_spec_name = f"mmlu_pro:subset={subject},use_chain_of_thought={_bool_to_str(use_chain_of_thought_bool)},use_few_shot={_bool_to_str(use_few_shot_bool)}" # noqa: E501 + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.mmlu_pro_scenario.MMLUProScenario", args={"subject": subject} + ) + max_train_instance_num = 5 if use_few_shot_bool else 0 + + if use_chain_of_thought_bool: + adapter_spec = AdapterSpec( + method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT, + max_train_instances=max_train_instance_num, + max_tokens=2000, + input_prefix="What is the correct answer to this question: ", + input_suffix="\nChoices:\n", + output_prefix="", + global_suffix=( + "Let’s think step by step. Based on your reasoning, what is the single, " + "most likely answer choice? Format your response as follows: " + '"The correct answer is (insert answer here)".' 
+ ), + ) + return RunSpec( + name=run_spec_name, + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_basic_metric_specs([]) + + [ + MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}), + ], + groups=["mmlu_pro"], + ) + else: + adapter_spec = AdapterSpec( + method=ADAPT_MULTIPLE_CHOICE_JOINT, + max_train_instances=max_train_instance_num, + max_tokens=2000, + input_prefix="What is the correct answer to this question: ", + input_suffix="\nChoices:\n", + output_prefix="", + global_suffix=("Format your response as follows: " '"The correct answer is (insert answer here)".'), + ) + return RunSpec( + name=run_spec_name, + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=get_exact_match_metric_specs(), + groups=["mmlu_pro"], + ) + + @run_spec_function("gpqa") -def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec: +def get_gpqa_spec(subset: str, use_chain_of_thought: str = "true", use_few_shot: str = "false") -> RunSpec: # Convert to bools and remove the str versions use_chain_of_thought_bool: bool = use_chain_of_thought.lower() == "true" use_few_shot_bool: bool = use_few_shot.lower() == "true" @@ -114,7 +175,7 @@ def get_gpqa_spec(subset: str, use_chain_of_thought: str = "False", use_few_shot ) return RunSpec( - name=f"gpqa:subset={subset},use_chain_of_thought={use_chain_of_thought_bool}", + name=f"gpqa:subset={subset},use_chain_of_thought={_bool_to_str(use_chain_of_thought_bool)},use_few_shot={_bool_to_str(use_few_shot_bool)}", # noqa: E501 scenario_spec=scenario_spec, adapter_spec=adapter_spec, metric_specs=metric_specs, @@ -145,13 +206,12 @@ def get_ifeval_spec() -> RunSpec: @run_spec_function("wildbench") -def get_wildbench_spec(subset: str, use_model_outputs: str = "False") -> RunSpec: - +def get_wildbench_spec(subset: str) -> RunSpec: scenario_spec = ScenarioSpec( class_name="helm.benchmark.scenarios.wildbench_scenario.WildBenchScenario", args={ "subset": subset, - "use_model_outputs": use_model_outputs.lower() == "true", + "use_model_outputs": False, }, ) @@ -164,7 +224,7 @@ def get_wildbench_spec(subset: str, use_model_outputs: str = "False") -> RunSpec ] return RunSpec( - name="wildbench", + name=f"wildbench:subset={subset}", scenario_spec=scenario_spec, adapter_spec=adapter_spec, annotators=annotator_specs, @@ -199,7 +259,7 @@ def get_bigcodebench_spec(version: str) -> RunSpec: ] return RunSpec( - name="bigcodebench", + name=f"bigcodebench:version={version}", scenario_spec=scenario_spec, adapter_spec=adapter_spec, annotators=annotator_specs, diff --git a/src/helm/benchmark/run_specs/lite_run_specs.py b/src/helm/benchmark/run_specs/lite_run_specs.py index b034ff48ec5..17c7f323906 100644 --- a/src/helm/benchmark/run_specs/lite_run_specs.py +++ b/src/helm/benchmark/run_specs/lite_run_specs.py @@ -5,7 +5,6 @@ from helm.benchmark.adaptation.adapter_spec import ( ADAPT_GENERATION, ADAPT_MULTIPLE_CHOICE_JOINT, - ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT, AdapterSpec, ) from helm.benchmark.adaptation.common_adapter_specs import ( @@ -25,7 +24,6 @@ from helm.benchmark.run_spec import RunSpec, run_spec_function from helm.benchmark.runner import get_benchmark_output_path from helm.benchmark.scenarios.scenario import ScenarioSpec, get_scenario_cache_path -from helm.benchmark.metrics.metric import MetricSpec @run_spec_function("narrative_qa") @@ -136,62 +134,6 @@ def get_mmlu_spec(subject: str, method: str = ADAPT_MULTIPLE_CHOICE_JOINT) -> Ru ) 
-@run_spec_function("mmlu_pro") -def get_mmlu_pro_spec(subject: str, use_chain_of_thought: str = "False", use_few_shot: str = "False") -> RunSpec: - # Convert to bools and remove the str versions - use_chain_of_thought_bool: bool = use_chain_of_thought.lower() == "true" - use_few_shot_bool: bool = use_few_shot.lower() == "true" - del use_chain_of_thought - del use_few_shot - - scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.mmlu_pro_scenario.MMLUProScenario", args={"subject": subject} - ) - max_train_instance_num = 5 if use_few_shot_bool else 0 - - if use_chain_of_thought_bool: - adapter_spec = AdapterSpec( - method=ADAPT_MULTIPLE_CHOICE_JOINT_CHAIN_OF_THOUGHT, - max_train_instances=max_train_instance_num, - max_tokens=2000, - input_prefix="What is the correct answer to this question: ", - input_suffix="\nChoices:\n", - output_prefix="", - global_suffix=( - "Let’s think step by step. Based on your reasoning, what is the single, " - "most likely answer choice? Format your response as follows: " - '"The correct answer is (insert answer here)".' - ), - ) - return RunSpec( - name=f"mmlu_pro:subset={subject},use_chain_of_thought={use_chain_of_thought_bool}", - scenario_spec=scenario_spec, - adapter_spec=adapter_spec, - metric_specs=get_basic_metric_specs([]) - + [ - MetricSpec(class_name="helm.benchmark.metrics.chain_of_thought_metric.ChainOfThoughtMetric", args={}), - ], - groups=["mmlu_pro"], - ) - else: - adapter_spec = AdapterSpec( - method=ADAPT_MULTIPLE_CHOICE_JOINT, - max_train_instances=max_train_instance_num, - max_tokens=2000, - input_prefix="What is the correct answer to this question: ", - input_suffix="\nChoices:\n", - output_prefix="", - global_suffix=("Format your response as follows: " '"The correct answer is (insert answer here)".'), - ) - return RunSpec( - name=f"mmlu_pro:subset={subject},use_chain_of_thought={use_chain_of_thought_bool}", - scenario_spec=scenario_spec, - adapter_spec=adapter_spec, - metric_specs=get_exact_match_metric_specs(), - groups=["mmlu_pro"], - ) - - @run_spec_function("gsm") def get_gsm_spec() -> RunSpec: scenario_spec = ScenarioSpec(class_name="helm.benchmark.scenarios.gsm_scenario.GSM8KScenario", args={}) diff --git a/src/helm/benchmark/scenarios/bigcodebench_scenario.py b/src/helm/benchmark/scenarios/bigcodebench_scenario.py index 19b8ceb699a..e5d3d1b4cef 100644 --- a/src/helm/benchmark/scenarios/bigcodebench_scenario.py +++ b/src/helm/benchmark/scenarios/bigcodebench_scenario.py @@ -10,7 +10,7 @@ from helm.common.general import ensure_directory_exists -VERSIONS = ["v0.1.2"] +VERSIONS = ["v0.1.0_hf", "v0.1.1", "v0.1.2", "v0.1.3"] class BigCodeBenchScenario(Scenario): @@ -36,7 +36,7 @@ def get_instances(self, output_path: str) -> List[Instance]: ensure_directory_exists(cache_dir) dataset = datasets.load_dataset( "bigcode/bigcodebench", - revision="35a015f216382cb88997b91b9400357a79e55141", # for v0.1.2 + revision="057dd1a30dd73d4ed59cfbaaae049870491fa4d6", cache_dir=cache_dir, split=self.version, ) diff --git a/src/helm/benchmark/scenarios/ifeval_scenario.py b/src/helm/benchmark/scenarios/ifeval_scenario.py index 3912d92da60..54e23978237 100644 --- a/src/helm/benchmark/scenarios/ifeval_scenario.py +++ b/src/helm/benchmark/scenarios/ifeval_scenario.py @@ -27,7 +27,13 @@ def get_instances(self, output_path: str) -> List[Instance]: # Get IFEval from HuggingFace cache_dir = os.path.join(output_path, "data") ensure_directory_exists(cache_dir) - dataset = datasets.load_dataset("google/IFEval", trust_remote_code=True, 
cache_dir=cache_dir, split="train") + dataset = datasets.load_dataset( + "google/IFEval", + trust_remote_code=True, + cache_dir=cache_dir, + split="train", + revision="966cd89545d6b6acfd7638bc708b98261ca58e84", + ) assert isinstance(dataset, datasets.Dataset) # Read all instances diff --git a/src/helm/benchmark/scenarios/mmlu_pro_scenario.py b/src/helm/benchmark/scenarios/mmlu_pro_scenario.py index d42c09274e8..718d5ad3848 100644 --- a/src/helm/benchmark/scenarios/mmlu_pro_scenario.py +++ b/src/helm/benchmark/scenarios/mmlu_pro_scenario.py @@ -53,7 +53,7 @@ def process_dataset(self, data: Dataset, split: str) -> List[Instance]: instances: List[Instance] = [] hlog(f"Processing data for {split} split") for row in data: - id = row["id"] + id = row["question_id"] question = row["question"] answers = row["options"] correct_choice = row["answer"] diff --git a/src/helm/benchmark/scenarios/wildbench_scenario.py b/src/helm/benchmark/scenarios/wildbench_scenario.py index 100efa96617..b79103f8042 100644 --- a/src/helm/benchmark/scenarios/wildbench_scenario.py +++ b/src/helm/benchmark/scenarios/wildbench_scenario.py @@ -41,6 +41,7 @@ def get_instances(self, output_path: str) -> List[Instance]: self.subset, cache_dir=cache_dir, split="test", + revision="7c05c1b4550282b2ed6a2e6ac5db069f1e07df5c", ) assert isinstance(dataset, datasets.Dataset) if self.use_model_outputs: @@ -50,6 +51,7 @@ def get_instances(self, output_path: str) -> List[Instance]: model, cache_dir=cache_dir, split="train", + revision="d6755bc68220df853c0825a733430f73f5af2501", ) for model in REFERENCE_MODELS } diff --git a/src/helm/benchmark/static/schema_lite_v2.yaml b/src/helm/benchmark/static/schema_lite_v2.yaml index 35b30c5d214..1c3c5599e32 100644 --- a/src/helm/benchmark/static/schema_lite_v2.yaml +++ b/src/helm/benchmark/static/schema_lite_v2.yaml @@ -65,6 +65,11 @@ metrics: - name: predicted_index display_name: Predicted index description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice). + - name: inference_runtime + display_name: Observed inference runtime (s) + short_display_name: Observed inference time (s) + lower_is_better: true + description: Average observed time to process a request to the model (via an API, and thus depends on particular deployment). # Accuracy metrics: - name: exact_match @@ -103,6 +108,11 @@ metrics: short_display_name: WB Score description: Score of the AI output judged by GPT-4o. lower_is_better: false + - name: wildbench_score_rescaled + display_name: WildBench Score + short_display_name: WB Score + description: Score of the AI output judged by GPT-4o, rescaled to be between 0 and 1. + lower_is_better: false - name: bigcodebench_p@1 display_name: BigCodeBench Pass@1 short_display_name: Pass@1 @@ -121,12 +131,16 @@ perturbations: [] metric_groups: - name: accuracy display_name: Accuracy + aggregation_strategies: + - mean metrics: - name: ${main_name} split: ${main_split} - name: efficiency display_name: Efficiency + aggregation_strategies: + - mean metrics: - name: inference_runtime split: ${main_split} @@ -219,7 +233,7 @@ run_groups: - efficiency - general_information environment: - main_name: wildbench_score + main_name: wildbench_score_rescaled main_split: test taxonomy: task: "instruction following"
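
Note on the new wildbench_score_rescaled metric: the patch computes it as (score - 1) / 9 in wildbench_metrics.py, and the schema describes it as the GPT-4o judge score rescaled to lie between 0 and 1. A minimal standalone sketch of that mapping follows, assuming the raw judge score lies in [1, 10] (the range the formula implies); this is an illustration, not code from HELM itself.

def rescale_wildbench_score(score: float) -> float:
    """Map a raw WildBench judge score in [1, 10] onto [0, 1], mirroring the formula in wildbench_metrics.py."""
    return (score - 1) / 9


if __name__ == "__main__":
    # Sanity-check the endpoints and the midpoint of the linear map.
    assert rescale_wildbench_score(1.0) == 0.0
    assert rescale_wildbench_score(10.0) == 1.0
    assert abs(rescale_wildbench_score(5.5) - 0.5) < 1e-9
    print("wildbench_score_rescaled is consistent with a 1-10 judge scale")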
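
The capabilities run spec functions accept their flags as strings (as they arrive from run entry descriptions), coerce them to booleans, and then rebuild a canonical run spec name via _bool_to_str so that, for example, chain-of-thought is now on by default for mmlu_pro and gpqa. A simplified sketch of that pattern is below; parse_flag and bool_to_str are hypothetical stand-ins for the inline coercion and _bool_to_str in capabilities_run_specs.py, and the example name simply mirrors the f-string in the patch.

def parse_flag(value: str) -> bool:
    # Illustrative stand-in for the inline coercion in the run spec functions:
    # any casing of "true" becomes True, everything else False.
    return value.lower() == "true"


def bool_to_str(value: bool) -> str:
    # Illustrative stand-in for _bool_to_str: render the bool in the lowercase
    # form used inside canonical run spec names.
    return str(value).lower()


# With the new defaults, a bare "mmlu_pro:subject=all" entry resolves to the same
# canonical name as one that spells out use_chain_of_thought=true explicitly.
use_chain_of_thought = parse_flag("true")
use_few_shot = parse_flag("false")
canonical_name = (
    "mmlu_pro:subset=all"
    f",use_chain_of_thought={bool_to_str(use_chain_of_thought)}"
    f",use_few_shot={bool_to_str(use_few_shot)}"
)
print(canonical_name)  # mmlu_pro:subset=all,use_chain_of_thought=true,use_few_shot=false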