Add an aggregate_metric_list entry to four leaderboard group configs.

Each group gets one aggregate metric with `aggregation: mean` and
`weight_by_size: true`:
  - bbh_mc/_leaderboard_bbh.yaml   -> acc_norm
  - gpqa/_leaderboard_gpqa.yaml    -> acc_norm
  - math/_leaderboard_math.yaml    -> exact_match
  - musr/_musr.yaml                -> acc_norm

NOTE(review): `weight_by_size: true` presumably weights each subtask's
score by its sample count when averaging -- confirm against the harness's
group-config documentation.
NOTE(review): line breaks and YAML indentation below were restored from a
whitespace-collapsed copy of this patch; context lines assume the 2-space
indented list style -- verify with `git apply --check` before use.

diff --git a/lm_eval/tasks/leaderboard/bbh_mc/_leaderboard_bbh.yaml b/lm_eval/tasks/leaderboard/bbh_mc/_leaderboard_bbh.yaml
index 9cc10d3968..f31a2e3c8e 100644
--- a/lm_eval/tasks/leaderboard/bbh_mc/_leaderboard_bbh.yaml
+++ b/lm_eval/tasks/leaderboard/bbh_mc/_leaderboard_bbh.yaml
@@ -24,3 +24,7 @@ task:
   - leaderboard_bbh_tracking_shuffled_objects_seven_objects
   - leaderboard_bbh_tracking_shuffled_objects_three_objects
   - leaderboard_bbh_web_of_lies
+aggregate_metric_list:
+  - metric: acc_norm
+    aggregation: mean
+    weight_by_size: true
diff --git a/lm_eval/tasks/leaderboard/gpqa/_leaderboard_gpqa.yaml b/lm_eval/tasks/leaderboard/gpqa/_leaderboard_gpqa.yaml
index 448c14ac48..2708f750c0 100644
--- a/lm_eval/tasks/leaderboard/gpqa/_leaderboard_gpqa.yaml
+++ b/lm_eval/tasks/leaderboard/gpqa/_leaderboard_gpqa.yaml
@@ -3,3 +3,7 @@ task:
   - leaderboard_gpqa_diamond
   - leaderboard_gpqa_extended
   - leaderboard_gpqa_main
+aggregate_metric_list:
+  - metric: acc_norm
+    aggregation: mean
+    weight_by_size: true
diff --git a/lm_eval/tasks/leaderboard/math/_leaderboard_math.yaml b/lm_eval/tasks/leaderboard/math/_leaderboard_math.yaml
index 45cf8f1340..7e7a6cfcf2 100644
--- a/lm_eval/tasks/leaderboard/math/_leaderboard_math.yaml
+++ b/lm_eval/tasks/leaderboard/math/_leaderboard_math.yaml
@@ -7,3 +7,7 @@ task:
   - leaderboard_math_num_theory_hard
   - leaderboard_math_prealgebra_hard
   - leaderboard_math_precalculus_hard
+aggregate_metric_list:
+  - metric: exact_match
+    aggregation: mean
+    weight_by_size: true
diff --git a/lm_eval/tasks/leaderboard/musr/_musr.yaml b/lm_eval/tasks/leaderboard/musr/_musr.yaml
index 060d231aae..accb197f26 100644
--- a/lm_eval/tasks/leaderboard/musr/_musr.yaml
+++ b/lm_eval/tasks/leaderboard/musr/_musr.yaml
@@ -3,3 +3,7 @@ task:
   - leaderboard_musr_murder_mysteries
   - leaderboard_musr_object_placements
   - leaderboard_musr_team_allocation
+aggregate_metric_list:
+  - metric: acc_norm
+    aggregation: mean
+    weight_by_size: true