diff --git a/lm_eval/tasks/leaderboard/leaderboard.yaml b/lm_eval/tasks/leaderboard/leaderboard.yaml
index d9c5aaac17..f453a272e6 100644
--- a/lm_eval/tasks/leaderboard/leaderboard.yaml
+++ b/lm_eval/tasks/leaderboard/leaderboard.yaml
@@ -6,3 +6,32 @@ task:
   - leaderboard_math_hard
   - leaderboard_ifeval
   - leaderboard_musr
+# Metrics reported for the group as a whole, aggregated over its subtasks.
+# `weight_by_size: true` (the default) gives a "micro" average: the mean over
+# every document's score, pooled across all subtasks. Setting it to `false`
+# gives a "macro" average: the sum of each subtask's average score divided by
+# the number of subtasks.
+aggregate_metric_list:
+  - metric: acc
+    aggregation: mean
+    weight_by_size: true
+  - metric: acc_norm
+    aggregation: mean
+    weight_by_size: true
+  - metric: exact_match
+    aggregation: mean
+    weight_by_size: true
+  - metric: inst_level_loose_acc
+    aggregation: mean
+    weight_by_size: true
+  - metric: inst_level_strict_acc
+    aggregation: mean
+    weight_by_size: true
+  - metric: prompt_level_loose_acc
+    aggregation: mean
+    weight_by_size: true
+  - metric: prompt_level_strict_acc
+    aggregation: mean
+    weight_by_size: true
+metadata:
+  version: 1.0