From 9cd798974542f361960b164f0e41cc14cb61d436 Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Thu, 21 Dec 2023 09:20:40 -0500 Subject: [PATCH] Correctly Print Task Versioning (#1173) * change version field formatting in metadata * mention versioning in new task guide * add instructions for changelog * run linters --- docs/new_task_guide.md | 19 +++++++++++++++++++ lm_eval/tasks/anli/anli_r1.yaml | 2 +- lm_eval/tasks/arc/arc_easy.yaml | 2 +- lm_eval/tasks/arithmetic/arithmetic_1dc.yaml | 2 +- lm_eval/tasks/asdiv/default.yaml | 2 +- lm_eval/tasks/babi/babi.yaml | 2 +- .../cot_fewshot/_cot_fewshot_template_yaml | 2 +- .../cot_zeroshot/_cot_zeroshot_template_yaml | 2 +- .../tasks/bbh/fewshot/_fewshot_template_yaml | 2 +- .../bbh/zeroshot/_zeroshot_template_yaml | 2 +- lm_eval/tasks/belebele/_default_template_yaml | 2 +- .../bigbench/generate_until_template_yaml | 2 +- .../multiple_choice/causal_judgement.yaml | 4 ++++ .../bigbench/multiple_choice_template_yaml | 2 +- lm_eval/tasks/blimp/_template_yaml | 2 +- lm_eval/tasks/ceval/_default_ceval_yaml | 2 +- lm_eval/tasks/cmmlu/_default_template_yaml | 2 +- lm_eval/tasks/code_x_glue/code-text/go.yaml | 2 +- lm_eval/tasks/code_x_glue/code-text/java.yaml | 2 +- .../code_x_glue/code-text/javascript.yaml | 2 +- lm_eval/tasks/code_x_glue/code-text/php.yaml | 2 +- .../tasks/code_x_glue/code-text/python.yaml | 2 +- lm_eval/tasks/code_x_glue/code-text/ruby.yaml | 2 +- lm_eval/tasks/coqa/default.yaml | 2 +- .../crows_pairs/crows_pairs_english.yaml | 2 +- lm_eval/tasks/csatqa/_default_csatqa_yaml | 2 +- lm_eval/tasks/drop/default.yaml | 2 +- lm_eval/tasks/fld/fld_default.yaml | 2 ++ lm_eval/tasks/glue/cola/default.yaml | 2 +- lm_eval/tasks/glue/mnli/default.yaml | 2 +- lm_eval/tasks/glue/mrpc/default.yaml | 2 +- lm_eval/tasks/glue/qnli/default.yaml | 2 +- lm_eval/tasks/glue/qqp/default.yaml | 2 +- lm_eval/tasks/glue/rte/default.yaml | 2 +- 
lm_eval/tasks/glue/sst2/default.yaml | 2 +- lm_eval/tasks/glue/wnli/default.yaml | 2 +- .../gsm8k/gsm8k-cot-self-consistency.yaml | 2 +- lm_eval/tasks/gsm8k/gsm8k-cot.yaml | 2 +- lm_eval/tasks/gsm8k/gsm8k.yaml | 2 +- lm_eval/tasks/headqa/headqa_en.yaml | 2 +- lm_eval/tasks/hellaswag/hellaswag.yaml | 2 +- .../tasks/hendrycks_ethics/commonsense.yaml | 2 +- .../tasks/hendrycks_ethics/deontology.yaml | 2 +- lm_eval/tasks/hendrycks_ethics/justice.yaml | 2 +- .../hendrycks_ethics/utilitarianism.yaml | 2 +- .../utilitarianism_original_yaml | 2 +- lm_eval/tasks/hendrycks_ethics/virtue.yaml | 2 +- lm_eval/tasks/ifeval/ifeval.yaml | 2 +- lm_eval/tasks/lambada/lambada_openai.yaml | 2 +- lm_eval/tasks/lambada/lambada_standard.yaml | 2 +- .../lambada_cloze/lambada_openai_cloze.yaml | 2 +- .../lambada_cloze/lambada_standard_cloze.yaml | 2 +- .../lambada_multilingual/lambada_mt_en.yaml | 2 +- lm_eval/tasks/logiqa/logiqa.yaml | 2 +- lm_eval/tasks/logiqa2/logieval.yaml | 2 +- lm_eval/tasks/logiqa2/logiqa2.yaml | 2 +- lm_eval/tasks/mathqa/mathqa.yaml | 2 +- lm_eval/tasks/mc_taco/default.yaml | 2 +- lm_eval/tasks/mgsm/direct/direct_yaml | 2 +- lm_eval/tasks/mgsm/en_cot/cot_yaml | 2 +- lm_eval/tasks/mgsm/native_cot/cot_yaml | 2 +- .../minerva_math/minerva_math_algebra.yaml | 2 +- .../tasks/mmlu/default/_default_template_yaml | 2 +- .../_mmlu_flan_cot_fewshot_template_yaml | 2 +- .../_mmlu_flan_cot_zeroshot_template_yaml | 2 +- .../_mmlu_flan_generative_template_yaml | 2 +- .../_mmlu_flan_loglikelihood_template_yaml | 2 +- .../advanced_ai_risk/_template_yaml | 2 +- .../persona/_template_yaml | 2 +- .../sycophancy/sycophancy_on_nlp_survey.yaml | 2 +- .../sycophancy_on_philpapers2020.yaml | 2 +- ...sycophancy_on_political_typology_quiz.yaml | 2 +- .../winogenerated/_template_yaml | 2 +- lm_eval/tasks/mutual/mutual.yaml | 2 +- lm_eval/tasks/nq_open/nq_open.yaml | 2 +- lm_eval/tasks/openbookqa/openbookqa.yaml | 2 +- lm_eval/tasks/paws-x/pawsx_template_yaml | 2 +- 
lm_eval/tasks/pile/pile_arxiv.yaml | 2 +- lm_eval/tasks/piqa/piqa.yaml | 2 +- lm_eval/tasks/polemo2/polemo2_in.yaml | 2 +- lm_eval/tasks/prost/corypaik_prost.yaml | 2 +- lm_eval/tasks/pubmedqa/pubmedqa.yaml | 2 +- lm_eval/tasks/qa4mre/qa4mre_2011.yaml | 2 +- lm_eval/tasks/qasper/bool.yaml | 2 +- lm_eval/tasks/qasper/freeform.yaml | 2 +- lm_eval/tasks/race/race.yaml | 2 +- .../realtoxicityprompts.yaml | 2 +- lm_eval/tasks/sciq/sciq.yaml | 2 +- lm_eval/tasks/siqa/default.yml | 2 +- lm_eval/tasks/storycloze/storycloze_2016.yaml | 2 +- lm_eval/tasks/super_glue/boolq/default.yaml | 2 +- lm_eval/tasks/super_glue/boolq/seq2seq.yaml | 2 +- lm_eval/tasks/super_glue/boolq/t5-prompt.yaml | 2 +- lm_eval/tasks/super_glue/cb/default.yaml | 2 +- lm_eval/tasks/super_glue/cb/t5-prompt.yaml | 2 +- lm_eval/tasks/super_glue/copa/default.yaml | 2 +- lm_eval/tasks/super_glue/copa/t5-prompt.yaml | 2 +- lm_eval/tasks/super_glue/multirc/default.yaml | 2 +- .../tasks/super_glue/multirc/t5-prompt.yaml | 2 +- lm_eval/tasks/super_glue/record/default.yaml | 2 +- .../tasks/super_glue/record/t5-prompt.yaml | 2 +- lm_eval/tasks/super_glue/rte/default.yaml | 2 +- lm_eval/tasks/super_glue/rte/t5-prompt.yaml | 2 +- lm_eval/tasks/super_glue/wic/default.yaml | 2 +- lm_eval/tasks/super_glue/wic/t5-prompt.yaml | 2 +- lm_eval/tasks/super_glue/wsc/default.yaml | 2 +- lm_eval/tasks/super_glue/wsc/t5-prompt.yaml | 2 +- lm_eval/tasks/swag/swag.yaml | 2 +- lm_eval/tasks/toxigen/toxigen.yaml | 2 +- lm_eval/tasks/translation/wmt_common_yaml | 2 +- lm_eval/tasks/triviaqa/default.yaml | 2 +- lm_eval/tasks/truthfulqa/truthfulqa_gen.yaml | 2 +- lm_eval/tasks/truthfulqa/truthfulqa_mc1.yaml | 2 +- lm_eval/tasks/truthfulqa/truthfulqa_mc2.yaml | 2 +- lm_eval/tasks/unscramble/anagrams1.yaml | 2 +- lm_eval/tasks/unscramble/anagrams2.yaml | 2 +- lm_eval/tasks/unscramble/cycle_letters.yaml | 2 +- .../tasks/unscramble/random_insertion.yaml | 2 +- lm_eval/tasks/unscramble/reversed_words.yaml | 2 +- 
lm_eval/tasks/webqs/webqs.yaml | 2 +- lm_eval/tasks/wikitext/wikitext.yaml | 2 +- lm_eval/tasks/winogrande/default.yaml | 2 +- lm_eval/tasks/wmt2016/ro_en-t5_prompt.yaml | 2 +- lm_eval/tasks/wsc273/default.yaml | 2 +- lm_eval/tasks/xcopa/default_et.yaml | 2 +- lm_eval/tasks/xnli/xnli_common_yaml | 2 +- lm_eval/tasks/xstorycloze/default_ar.yaml | 2 +- lm_eval/tasks/xwinograd/xwinograd_common_yaml | 2 +- 128 files changed, 150 insertions(+), 125 deletions(-) create mode 100644 lm_eval/tasks/bigbench/multiple_choice/causal_judgement.yaml diff --git a/docs/new_task_guide.md b/docs/new_task_guide.md index 26ffd3aa4f..cfcf0e4d98 100644 --- a/docs/new_task_guide.md +++ b/docs/new_task_guide.md @@ -315,6 +315,25 @@ python -m scripts.write_out \ Open the file specified at the `--output_base_path ` and ensure it passes a simple eye test. +## Versioning + +One key feature in LM Evaluation Harness is the ability to version tasks--that is, mark them with a specific version number that can be bumped whenever a breaking change is made. + +This version info can be provided by adding the following to your new task config file: + +``` +metadata: + version: 0 +``` + +Now, whenever a change needs to be made to your task in the future, please increase the version number by 1 so that users can differentiate the different task iterations and versions. + +If you are incrementing a task's version, please also consider adding a changelog to the task's README.md noting the date, PR number, what version you have updated to, and a one-liner describing the change. + +For example, + +* \[Dec 25, 2023\] (PR #999) Version 0.0 -> 1.0: Fixed a bug with answer extraction that led to underestimated performance. + ## Checking performance + equivalence It's now time to check models' performance on your task! 
In the evaluation harness, we intend to support a wide range of evaluation tasks and setups, but prioritize the inclusion of already-proven benchmarks following the precise evaluation setups in the literature where possible. diff --git a/lm_eval/tasks/anli/anli_r1.yaml b/lm_eval/tasks/anli/anli_r1.yaml index 493a3a3f24..bcf7674ee1 100644 --- a/lm_eval/tasks/anli/anli_r1.yaml +++ b/lm_eval/tasks/anli/anli_r1.yaml @@ -23,4 +23,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/arc/arc_easy.yaml b/lm_eval/tasks/arc/arc_easy.yaml index 1ec12090c5..9c0d312bac 100644 --- a/lm_eval/tasks/arc/arc_easy.yaml +++ b/lm_eval/tasks/arc/arc_easy.yaml @@ -20,4 +20,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/arithmetic/arithmetic_1dc.yaml b/lm_eval/tasks/arithmetic/arithmetic_1dc.yaml index 6efbb2cc8b..0e2c7ac8dd 100644 --- a/lm_eval/tasks/arithmetic/arithmetic_1dc.yaml +++ b/lm_eval/tasks/arithmetic/arithmetic_1dc.yaml @@ -13,4 +13,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/asdiv/default.yaml b/lm_eval/tasks/asdiv/default.yaml index d448e867c5..350198be39 100644 --- a/lm_eval/tasks/asdiv/default.yaml +++ b/lm_eval/tasks/asdiv/default.yaml @@ -11,4 +11,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/babi/babi.yaml b/lm_eval/tasks/babi/babi.yaml index 31c421f50f..d1193ec859 100644 --- a/lm_eval/tasks/babi/babi.yaml +++ b/lm_eval/tasks/babi/babi.yaml @@ -17,4 +17,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/bbh/cot_fewshot/_cot_fewshot_template_yaml b/lm_eval/tasks/bbh/cot_fewshot/_cot_fewshot_template_yaml index 30f1aafd1e..50bf5e8b36 100644 --- 
a/lm_eval/tasks/bbh/cot_fewshot/_cot_fewshot_template_yaml +++ b/lm_eval/tasks/bbh/cot_fewshot/_cot_fewshot_template_yaml @@ -27,4 +27,4 @@ filter_list: - function: "take_first" num_fewshot: 0 metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/bbh/cot_zeroshot/_cot_zeroshot_template_yaml b/lm_eval/tasks/bbh/cot_zeroshot/_cot_zeroshot_template_yaml index c6bffa31a7..650f91bfb5 100644 --- a/lm_eval/tasks/bbh/cot_zeroshot/_cot_zeroshot_template_yaml +++ b/lm_eval/tasks/bbh/cot_zeroshot/_cot_zeroshot_template_yaml @@ -24,4 +24,4 @@ filter_list: - function: "take_first" num_fewshot: 0 metadata: - - version: 0 + version: 0 diff --git a/lm_eval/tasks/bbh/fewshot/_fewshot_template_yaml b/lm_eval/tasks/bbh/fewshot/_fewshot_template_yaml index 6134c86f05..6bc65079c0 100644 --- a/lm_eval/tasks/bbh/fewshot/_fewshot_template_yaml +++ b/lm_eval/tasks/bbh/fewshot/_fewshot_template_yaml @@ -18,4 +18,4 @@ generation_kwargs: temperature: 0.0 num_fewshot: 0 metadata: - - version: 0 + version: 0 diff --git a/lm_eval/tasks/bbh/zeroshot/_zeroshot_template_yaml b/lm_eval/tasks/bbh/zeroshot/_zeroshot_template_yaml index a0734aeceb..94a671409d 100644 --- a/lm_eval/tasks/bbh/zeroshot/_zeroshot_template_yaml +++ b/lm_eval/tasks/bbh/zeroshot/_zeroshot_template_yaml @@ -18,4 +18,4 @@ generation_kwargs: temperature: 0.0 num_fewshot: 0 metadata: - - version: 0 + version: 0 diff --git a/lm_eval/tasks/belebele/_default_template_yaml b/lm_eval/tasks/belebele/_default_template_yaml index a16d1ad1fc..ef7c1a2374 100644 --- a/lm_eval/tasks/belebele/_default_template_yaml +++ b/lm_eval/tasks/belebele/_default_template_yaml @@ -18,4 +18,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/bigbench/generate_until_template_yaml b/lm_eval/tasks/bigbench/generate_until_template_yaml index 7dff331292..b370418953 100644 --- a/lm_eval/tasks/bigbench/generate_until_template_yaml +++ 
b/lm_eval/tasks/bigbench/generate_until_template_yaml @@ -15,4 +15,4 @@ metric_list: higher_is_better: true ignore_punctuation: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/bigbench/multiple_choice/causal_judgement.yaml b/lm_eval/tasks/bigbench/multiple_choice/causal_judgement.yaml new file mode 100644 index 0000000000..e8011772b9 --- /dev/null +++ b/lm_eval/tasks/bigbench/multiple_choice/causal_judgement.yaml @@ -0,0 +1,4 @@ +# Generated by utils.py +dataset_name: causal_judgment_zero_shot +include: ../multiple_choice_template_yaml +task: bigbench_causal_judgement_multiple_choice diff --git a/lm_eval/tasks/bigbench/multiple_choice_template_yaml b/lm_eval/tasks/bigbench/multiple_choice_template_yaml index 7f299060fe..10fce5c1c3 100644 --- a/lm_eval/tasks/bigbench/multiple_choice_template_yaml +++ b/lm_eval/tasks/bigbench/multiple_choice_template_yaml @@ -12,4 +12,4 @@ metric_list: - metric: acc # TODO: brier score and other metrics metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/blimp/_template_yaml b/lm_eval/tasks/blimp/_template_yaml index 920076c72a..fb1dd31360 100644 --- a/lm_eval/tasks/blimp/_template_yaml +++ b/lm_eval/tasks/blimp/_template_yaml @@ -11,4 +11,4 @@ doc_to_decontamination_query: "{{sentence_good}} {{sentence_bad}}" metric_list: - metric: acc metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/ceval/_default_ceval_yaml b/lm_eval/tasks/ceval/_default_ceval_yaml index 11392f0526..a94d87cb54 100644 --- a/lm_eval/tasks/ceval/_default_ceval_yaml +++ b/lm_eval/tasks/ceval/_default_ceval_yaml @@ -16,4 +16,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/cmmlu/_default_template_yaml b/lm_eval/tasks/cmmlu/_default_template_yaml index 0c8bc28d3c..d2e0a8876c 100644 --- a/lm_eval/tasks/cmmlu/_default_template_yaml +++ b/lm_eval/tasks/cmmlu/_default_template_yaml @@ -16,4 +16,4 @@ metric_list: aggregation: mean 
higher_is_better: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/code_x_glue/code-text/go.yaml b/lm_eval/tasks/code_x_glue/code-text/go.yaml index 8b004f5f54..c88067458e 100644 --- a/lm_eval/tasks/code_x_glue/code-text/go.yaml +++ b/lm_eval/tasks/code_x_glue/code-text/go.yaml @@ -18,4 +18,4 @@ metric_list: aggregation: mean higher_is_better: True metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/code_x_glue/code-text/java.yaml b/lm_eval/tasks/code_x_glue/code-text/java.yaml index 36585cdf2d..ac1ad955cf 100644 --- a/lm_eval/tasks/code_x_glue/code-text/java.yaml +++ b/lm_eval/tasks/code_x_glue/code-text/java.yaml @@ -18,4 +18,4 @@ metric_list: aggregation: mean higher_is_better: True metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/code_x_glue/code-text/javascript.yaml b/lm_eval/tasks/code_x_glue/code-text/javascript.yaml index 93002a57fc..ec8b0a6bd0 100644 --- a/lm_eval/tasks/code_x_glue/code-text/javascript.yaml +++ b/lm_eval/tasks/code_x_glue/code-text/javascript.yaml @@ -18,4 +18,4 @@ metric_list: aggregation: mean higher_is_better: True metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/code_x_glue/code-text/php.yaml b/lm_eval/tasks/code_x_glue/code-text/php.yaml index 6f1861aa93..ebc3691afb 100644 --- a/lm_eval/tasks/code_x_glue/code-text/php.yaml +++ b/lm_eval/tasks/code_x_glue/code-text/php.yaml @@ -18,4 +18,4 @@ metric_list: aggregation: mean higher_is_better: True metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/code_x_glue/code-text/python.yaml b/lm_eval/tasks/code_x_glue/code-text/python.yaml index 8faeebe4b7..92768f9bea 100644 --- a/lm_eval/tasks/code_x_glue/code-text/python.yaml +++ b/lm_eval/tasks/code_x_glue/code-text/python.yaml @@ -18,4 +18,4 @@ metric_list: aggregation: mean higher_is_better: True metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/code_x_glue/code-text/ruby.yaml b/lm_eval/tasks/code_x_glue/code-text/ruby.yaml 
index 124c644c4e..c2c939b63a 100644 --- a/lm_eval/tasks/code_x_glue/code-text/ruby.yaml +++ b/lm_eval/tasks/code_x_glue/code-text/ruby.yaml @@ -18,4 +18,4 @@ metric_list: aggregation: mean higher_is_better: True metadata: - - version: 2.0 + version: 2.0 diff --git a/lm_eval/tasks/coqa/default.yaml b/lm_eval/tasks/coqa/default.yaml index 4154ac528a..f9494d5db8 100644 --- a/lm_eval/tasks/coqa/default.yaml +++ b/lm_eval/tasks/coqa/default.yaml @@ -19,4 +19,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 2.0 + version: 2.0 diff --git a/lm_eval/tasks/crows_pairs/crows_pairs_english.yaml b/lm_eval/tasks/crows_pairs/crows_pairs_english.yaml index 929e0a6205..d95c83d01c 100644 --- a/lm_eval/tasks/crows_pairs/crows_pairs_english.yaml +++ b/lm_eval/tasks/crows_pairs/crows_pairs_english.yaml @@ -20,4 +20,4 @@ metric_list: aggregation: mean higher_is_better: false metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/csatqa/_default_csatqa_yaml b/lm_eval/tasks/csatqa/_default_csatqa_yaml index 98c23e559e..a4a5db84b5 100644 --- a/lm_eval/tasks/csatqa/_default_csatqa_yaml +++ b/lm_eval/tasks/csatqa/_default_csatqa_yaml @@ -14,4 +14,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/drop/default.yaml b/lm_eval/tasks/drop/default.yaml index 4b8848072d..7e425660ac 100644 --- a/lm_eval/tasks/drop/default.yaml +++ b/lm_eval/tasks/drop/default.yaml @@ -21,4 +21,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 2.0 + version: 2.0 diff --git a/lm_eval/tasks/fld/fld_default.yaml b/lm_eval/tasks/fld/fld_default.yaml index afcbebd03e..ee84f73bc5 100644 --- a/lm_eval/tasks/fld/fld_default.yaml +++ b/lm_eval/tasks/fld/fld_default.yaml @@ -12,3 +12,5 @@ metric_list: - metric: exact_match aggregation: mean higher_is_better: true +metadata: + version: 0.0 diff --git a/lm_eval/tasks/glue/cola/default.yaml 
b/lm_eval/tasks/glue/cola/default.yaml index 291c94e2ac..a46003c276 100644 --- a/lm_eval/tasks/glue/cola/default.yaml +++ b/lm_eval/tasks/glue/cola/default.yaml @@ -13,4 +13,4 @@ doc_to_decontamination_query: sentence metric_list: - metric: mcc metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/glue/mnli/default.yaml b/lm_eval/tasks/glue/mnli/default.yaml index 81de19a3a8..6caffa85a2 100644 --- a/lm_eval/tasks/glue/mnli/default.yaml +++ b/lm_eval/tasks/glue/mnli/default.yaml @@ -11,4 +11,4 @@ doc_to_choice: ["True", "Neither", "False"] metric_list: - metric: acc metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/glue/mrpc/default.yaml b/lm_eval/tasks/glue/mrpc/default.yaml index 455ef682b0..f0bc24510c 100644 --- a/lm_eval/tasks/glue/mrpc/default.yaml +++ b/lm_eval/tasks/glue/mrpc/default.yaml @@ -12,4 +12,4 @@ metric_list: - metric: acc - metric: f1 metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/glue/qnli/default.yaml b/lm_eval/tasks/glue/qnli/default.yaml index b31e16259a..49a6216a5e 100644 --- a/lm_eval/tasks/glue/qnli/default.yaml +++ b/lm_eval/tasks/glue/qnli/default.yaml @@ -11,4 +11,4 @@ doc_to_choice: ["yes", "no"] metric_list: - metric: acc metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/glue/qqp/default.yaml b/lm_eval/tasks/glue/qqp/default.yaml index 1fa7a796ea..34b6e10375 100644 --- a/lm_eval/tasks/glue/qqp/default.yaml +++ b/lm_eval/tasks/glue/qqp/default.yaml @@ -12,4 +12,4 @@ metric_list: - metric: acc - metric: f1 metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/glue/rte/default.yaml b/lm_eval/tasks/glue/rte/default.yaml index c9cc837d6d..7b12096a46 100644 --- a/lm_eval/tasks/glue/rte/default.yaml +++ b/lm_eval/tasks/glue/rte/default.yaml @@ -11,4 +11,4 @@ doc_to_choice: ["True", "False"] metric_list: - metric: acc metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/glue/sst2/default.yaml b/lm_eval/tasks/glue/sst2/default.yaml index 
f561d76566..838afeb218 100644 --- a/lm_eval/tasks/glue/sst2/default.yaml +++ b/lm_eval/tasks/glue/sst2/default.yaml @@ -11,4 +11,4 @@ doc_to_choice: ["negative", "positive"] metric_list: - metric: acc metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/glue/wnli/default.yaml b/lm_eval/tasks/glue/wnli/default.yaml index 8b3a8e7fc5..a8e57a35d6 100644 --- a/lm_eval/tasks/glue/wnli/default.yaml +++ b/lm_eval/tasks/glue/wnli/default.yaml @@ -11,4 +11,4 @@ doc_to_choice: ["False", "True"] metric_list: - metric: acc metadata: - - version: 2.0 + version: 2.0 diff --git a/lm_eval/tasks/gsm8k/gsm8k-cot-self-consistency.yaml b/lm_eval/tasks/gsm8k/gsm8k-cot-self-consistency.yaml index 080dc34cf7..b076d4efbe 100644 --- a/lm_eval/tasks/gsm8k/gsm8k-cot-self-consistency.yaml +++ b/lm_eval/tasks/gsm8k/gsm8k-cot-self-consistency.yaml @@ -31,4 +31,4 @@ filter_list: - function: "majority_vote" - function: "take_first" metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/gsm8k/gsm8k-cot.yaml b/lm_eval/tasks/gsm8k/gsm8k-cot.yaml index 65da50575b..2df407b772 100644 --- a/lm_eval/tasks/gsm8k/gsm8k-cot.yaml +++ b/lm_eval/tasks/gsm8k/gsm8k-cot.yaml @@ -41,4 +41,4 @@ filter_list: regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)." 
- function: "take_first" metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/gsm8k/gsm8k.yaml b/lm_eval/tasks/gsm8k/gsm8k.yaml index eb3c9af777..ccf6a5a341 100644 --- a/lm_eval/tasks/gsm8k/gsm8k.yaml +++ b/lm_eval/tasks/gsm8k/gsm8k.yaml @@ -34,4 +34,4 @@ filter_list: regex_pattern: "#### (\\-?[0-9\\.\\,]+)" - function: "take_first" metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/headqa/headqa_en.yaml b/lm_eval/tasks/headqa/headqa_en.yaml index 96eaa2f109..eeb2ff12dd 100644 --- a/lm_eval/tasks/headqa/headqa_en.yaml +++ b/lm_eval/tasks/headqa/headqa_en.yaml @@ -20,4 +20,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/hellaswag/hellaswag.yaml b/lm_eval/tasks/hellaswag/hellaswag.yaml index 4df871e0c2..ec627da7d4 100644 --- a/lm_eval/tasks/hellaswag/hellaswag.yaml +++ b/lm_eval/tasks/hellaswag/hellaswag.yaml @@ -19,4 +19,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/hendrycks_ethics/commonsense.yaml b/lm_eval/tasks/hendrycks_ethics/commonsense.yaml index 0dbd29a7da..f127074367 100644 --- a/lm_eval/tasks/hendrycks_ethics/commonsense.yaml +++ b/lm_eval/tasks/hendrycks_ethics/commonsense.yaml @@ -12,4 +12,4 @@ doc_to_choice: ['no', 'yes'] metric_list: - metric: acc metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/hendrycks_ethics/deontology.yaml b/lm_eval/tasks/hendrycks_ethics/deontology.yaml index 05493d9456..a2f3685222 100644 --- a/lm_eval/tasks/hendrycks_ethics/deontology.yaml +++ b/lm_eval/tasks/hendrycks_ethics/deontology.yaml @@ -5,5 +5,5 @@ doc_to_text: "Question: Would most people believe this reasonable or unreasonabl doc_to_target: label doc_to_choice: ['unreasonable', 'reasonable'] metadata: - - version: 1.0 + version: 1.0 # TODO: implement exact-match metric for this subset diff --git a/lm_eval/tasks/hendrycks_ethics/justice.yaml 
b/lm_eval/tasks/hendrycks_ethics/justice.yaml index 668f05ed82..2ba79d16b3 100644 --- a/lm_eval/tasks/hendrycks_ethics/justice.yaml +++ b/lm_eval/tasks/hendrycks_ethics/justice.yaml @@ -6,4 +6,4 @@ dataset_name: justice doc_to_text: "Question: Would most people believe this reasonable or unreasonable to say? \"{{scenario}}\"\nAnswer:" # TODO: impl. exact match for this and deontology metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/hendrycks_ethics/utilitarianism.yaml b/lm_eval/tasks/hendrycks_ethics/utilitarianism.yaml index 99b464683f..8960a31934 100644 --- a/lm_eval/tasks/hendrycks_ethics/utilitarianism.yaml +++ b/lm_eval/tasks/hendrycks_ethics/utilitarianism.yaml @@ -9,4 +9,4 @@ doc_to_choice: ['no', 'yes'] metric_list: - metric: acc metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/hendrycks_ethics/utilitarianism_original_yaml b/lm_eval/tasks/hendrycks_ethics/utilitarianism_original_yaml index 65b95a4f28..5583c1b682 100644 --- a/lm_eval/tasks/hendrycks_ethics/utilitarianism_original_yaml +++ b/lm_eval/tasks/hendrycks_ethics/utilitarianism_original_yaml @@ -13,4 +13,4 @@ # - metric: acc # TODO: we want this to be implemented as a winograd_schema task type, actually # metadata: -# - version: 1.0 +# version: 1.0 diff --git a/lm_eval/tasks/hendrycks_ethics/virtue.yaml b/lm_eval/tasks/hendrycks_ethics/virtue.yaml index 85bb552660..8c236a983d 100644 --- a/lm_eval/tasks/hendrycks_ethics/virtue.yaml +++ b/lm_eval/tasks/hendrycks_ethics/virtue.yaml @@ -7,4 +7,4 @@ doc_to_text: "Sentence: {{scenario}}\nQuestion: Does the character in this sente doc_to_target: label doc_to_choice: ['no', 'yes'] metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/ifeval/ifeval.yaml b/lm_eval/tasks/ifeval/ifeval.yaml index bbaaa2f2a2..7913549cea 100644 --- a/lm_eval/tasks/ifeval/ifeval.yaml +++ b/lm_eval/tasks/ifeval/ifeval.yaml @@ -26,4 +26,4 @@ metric_list: aggregation: !function utils.agg_inst_level_acc higher_is_better: true 
metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/lambada/lambada_openai.yaml b/lm_eval/tasks/lambada/lambada_openai.yaml index d9a9ccc37d..2fcccbd59f 100644 --- a/lm_eval/tasks/lambada/lambada_openai.yaml +++ b/lm_eval/tasks/lambada/lambada_openai.yaml @@ -17,4 +17,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/lambada/lambada_standard.yaml b/lm_eval/tasks/lambada/lambada_standard.yaml index 3521053e50..900e181163 100644 --- a/lm_eval/tasks/lambada/lambada_standard.yaml +++ b/lm_eval/tasks/lambada/lambada_standard.yaml @@ -18,4 +18,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/lambada_cloze/lambada_openai_cloze.yaml b/lm_eval/tasks/lambada_cloze/lambada_openai_cloze.yaml index 42aea6de7d..d25e26d9ef 100644 --- a/lm_eval/tasks/lambada_cloze/lambada_openai_cloze.yaml +++ b/lm_eval/tasks/lambada_cloze/lambada_openai_cloze.yaml @@ -17,4 +17,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/lambada_cloze/lambada_standard_cloze.yaml b/lm_eval/tasks/lambada_cloze/lambada_standard_cloze.yaml index 3e412d63d1..7cde8fdebc 100644 --- a/lm_eval/tasks/lambada_cloze/lambada_standard_cloze.yaml +++ b/lm_eval/tasks/lambada_cloze/lambada_standard_cloze.yaml @@ -18,4 +18,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/lambada_multilingual/lambada_mt_en.yaml b/lm_eval/tasks/lambada_multilingual/lambada_mt_en.yaml index 768b0dd7e3..7e63a6d1bc 100644 --- a/lm_eval/tasks/lambada_multilingual/lambada_mt_en.yaml +++ b/lm_eval/tasks/lambada_multilingual/lambada_mt_en.yaml @@ -17,4 +17,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/logiqa/logiqa.yaml 
b/lm_eval/tasks/logiqa/logiqa.yaml index 912de6342f..181ef4d8c7 100644 --- a/lm_eval/tasks/logiqa/logiqa.yaml +++ b/lm_eval/tasks/logiqa/logiqa.yaml @@ -18,4 +18,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/logiqa2/logieval.yaml b/lm_eval/tasks/logiqa2/logieval.yaml index f0552b7c5b..f2593beb77 100644 --- a/lm_eval/tasks/logiqa2/logieval.yaml +++ b/lm_eval/tasks/logiqa2/logieval.yaml @@ -24,4 +24,4 @@ filter_list: regex_pattern: "^\\s*([A-D])" - function: "take_first" metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/logiqa2/logiqa2.yaml b/lm_eval/tasks/logiqa2/logiqa2.yaml index 568692b01a..0bcd97b131 100644 --- a/lm_eval/tasks/logiqa2/logiqa2.yaml +++ b/lm_eval/tasks/logiqa2/logiqa2.yaml @@ -18,4 +18,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/mathqa/mathqa.yaml b/lm_eval/tasks/mathqa/mathqa.yaml index 73439072af..e37ba11807 100644 --- a/lm_eval/tasks/mathqa/mathqa.yaml +++ b/lm_eval/tasks/mathqa/mathqa.yaml @@ -19,4 +19,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/mc_taco/default.yaml b/lm_eval/tasks/mc_taco/default.yaml index e3708e3224..16aee3f7e7 100644 --- a/lm_eval/tasks/mc_taco/default.yaml +++ b/lm_eval/tasks/mc_taco/default.yaml @@ -12,4 +12,4 @@ metric_list: - metric: acc - metric: f1 metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/mgsm/direct/direct_yaml b/lm_eval/tasks/mgsm/direct/direct_yaml index 58af06d5d1..6cb89f90e8 100644 --- a/lm_eval/tasks/mgsm/direct/direct_yaml +++ b/lm_eval/tasks/mgsm/direct/direct_yaml @@ -26,4 +26,4 @@ metric_list: ignore_case: true ignore_punctuation: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/mgsm/en_cot/cot_yaml b/lm_eval/tasks/mgsm/en_cot/cot_yaml index ec7937860d..a6307e3d7e 100644 --- 
a/lm_eval/tasks/mgsm/en_cot/cot_yaml +++ b/lm_eval/tasks/mgsm/en_cot/cot_yaml @@ -28,4 +28,4 @@ filter_list: regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)" - function: "take_first" metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/mgsm/native_cot/cot_yaml b/lm_eval/tasks/mgsm/native_cot/cot_yaml index 4d4e6fb380..e6f96160aa 100644 --- a/lm_eval/tasks/mgsm/native_cot/cot_yaml +++ b/lm_eval/tasks/mgsm/native_cot/cot_yaml @@ -28,4 +28,4 @@ filter_list: regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)" - function: "take_first" metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml b/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml index 8dc9b34a38..65b5a6442f 100644 --- a/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml +++ b/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml @@ -21,4 +21,4 @@ metric_list: higher_is_better: true num_fewshot: 0 metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/mmlu/default/_default_template_yaml b/lm_eval/tasks/mmlu/default/_default_template_yaml index e2b54acf29..37e8bb1649 100644 --- a/lm_eval/tasks/mmlu/default/_default_template_yaml +++ b/lm_eval/tasks/mmlu/default/_default_template_yaml @@ -12,4 +12,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml index 0f75fa3001..87662b3c17 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml @@ -23,4 +23,4 @@ metric_list: ignore_case: true ignore_punctuation: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml index 
ae8214d859..c2c1ff67f1 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml @@ -23,4 +23,4 @@ metric_list: ignore_case: true ignore_punctuation: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml index 863d85cc87..d480001132 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml @@ -13,4 +13,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu_flan_loglikelihood_template_yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu_flan_loglikelihood_template_yaml index 5828843577..4bd5e44e45 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu_flan_loglikelihood_template_yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu_flan_loglikelihood_template_yaml @@ -13,4 +13,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/model_written_evals/advanced_ai_risk/_template_yaml b/lm_eval/tasks/model_written_evals/advanced_ai_risk/_template_yaml index 78ce72cfd8..6409360bdc 100644 --- a/lm_eval/tasks/model_written_evals/advanced_ai_risk/_template_yaml +++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/_template_yaml @@ -11,4 +11,4 @@ doc_to_decontamination_query: "{{sentence_good}} {{sentence_bad}}" metric_list: - metric: acc metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/model_written_evals/persona/_template_yaml b/lm_eval/tasks/model_written_evals/persona/_template_yaml index fc4babb2b6..5702df8e0b 100644 --- 
a/lm_eval/tasks/model_written_evals/persona/_template_yaml +++ b/lm_eval/tasks/model_written_evals/persona/_template_yaml @@ -9,4 +9,4 @@ doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}" metric_list: - metric: acc metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_nlp_survey.yaml b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_nlp_survey.yaml index f06c9959f8..303e33906a 100644 --- a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_nlp_survey.yaml +++ b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_nlp_survey.yaml @@ -11,4 +11,4 @@ doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}" metric_list: - metric: acc metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_philpapers2020.yaml b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_philpapers2020.yaml index 26d2e3f6b1..2339894b1e 100644 --- a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_philpapers2020.yaml +++ b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_philpapers2020.yaml @@ -11,4 +11,4 @@ doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}" metric_list: - metric: acc metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_political_typology_quiz.yaml b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_political_typology_quiz.yaml index f96fbde1f1..c7772c1d67 100644 --- a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_political_typology_quiz.yaml +++ b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_political_typology_quiz.yaml @@ -11,4 +11,4 @@ doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}" metric_list: - metric: acc metadata: - - version: 0.0 + version: 0.0 diff --git 
a/lm_eval/tasks/model_written_evals/winogenerated/_template_yaml b/lm_eval/tasks/model_written_evals/winogenerated/_template_yaml index 0dfa3d3163..6b16788923 100644 --- a/lm_eval/tasks/model_written_evals/winogenerated/_template_yaml +++ b/lm_eval/tasks/model_written_evals/winogenerated/_template_yaml @@ -9,4 +9,4 @@ doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}" metric_list: - metric: acc metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/mutual/mutual.yaml b/lm_eval/tasks/mutual/mutual.yaml index dae7b374f7..f313010182 100644 --- a/lm_eval/tasks/mutual/mutual.yaml +++ b/lm_eval/tasks/mutual/mutual.yaml @@ -22,4 +22,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 2.0 + version: 2.0 diff --git a/lm_eval/tasks/nq_open/nq_open.yaml b/lm_eval/tasks/nq_open/nq_open.yaml index 4051c0630c..99ee83327a 100644 --- a/lm_eval/tasks/nq_open/nq_open.yaml +++ b/lm_eval/tasks/nq_open/nq_open.yaml @@ -29,4 +29,4 @@ metric_list: regexes_to_ignore: - "\ban|a|the\b" metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/openbookqa/openbookqa.yaml b/lm_eval/tasks/openbookqa/openbookqa.yaml index 401bb03fd3..bdfcd19635 100644 --- a/lm_eval/tasks/openbookqa/openbookqa.yaml +++ b/lm_eval/tasks/openbookqa/openbookqa.yaml @@ -18,4 +18,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/paws-x/pawsx_template_yaml b/lm_eval/tasks/paws-x/pawsx_template_yaml index a393f625b8..4756473829 100644 --- a/lm_eval/tasks/paws-x/pawsx_template_yaml +++ b/lm_eval/tasks/paws-x/pawsx_template_yaml @@ -17,4 +17,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/pile/pile_arxiv.yaml b/lm_eval/tasks/pile/pile_arxiv.yaml index 2328665deb..58760cc86e 100644 --- a/lm_eval/tasks/pile/pile_arxiv.yaml +++ b/lm_eval/tasks/pile/pile_arxiv.yaml @@ -20,4 +20,4 @@ 
metric_list: aggregation: bits_per_byte higher_is_better: false metadata: - - version: 2.0 + version: 2.0 diff --git a/lm_eval/tasks/piqa/piqa.yaml b/lm_eval/tasks/piqa/piqa.yaml index 23a523ebfa..5a07250ab9 100644 --- a/lm_eval/tasks/piqa/piqa.yaml +++ b/lm_eval/tasks/piqa/piqa.yaml @@ -18,4 +18,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/polemo2/polemo2_in.yaml b/lm_eval/tasks/polemo2/polemo2_in.yaml index c1da822bc3..6fc02fc908 100644 --- a/lm_eval/tasks/polemo2/polemo2_in.yaml +++ b/lm_eval/tasks/polemo2/polemo2_in.yaml @@ -42,4 +42,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/prost/corypaik_prost.yaml b/lm_eval/tasks/prost/corypaik_prost.yaml index b6f3e60e05..adf7a8d232 100644 --- a/lm_eval/tasks/prost/corypaik_prost.yaml +++ b/lm_eval/tasks/prost/corypaik_prost.yaml @@ -16,4 +16,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/pubmedqa/pubmedqa.yaml b/lm_eval/tasks/pubmedqa/pubmedqa.yaml index 9d2d19606a..47de2fa098 100644 --- a/lm_eval/tasks/pubmedqa/pubmedqa.yaml +++ b/lm_eval/tasks/pubmedqa/pubmedqa.yaml @@ -13,4 +13,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/qa4mre/qa4mre_2011.yaml b/lm_eval/tasks/qa4mre/qa4mre_2011.yaml index 5e585b5927..b9ceb78094 100644 --- a/lm_eval/tasks/qa4mre/qa4mre_2011.yaml +++ b/lm_eval/tasks/qa4mre/qa4mre_2011.yaml @@ -19,4 +19,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/qasper/bool.yaml b/lm_eval/tasks/qasper/bool.yaml index 3446121944..468da5c6d1 100644 --- a/lm_eval/tasks/qasper/bool.yaml +++ b/lm_eval/tasks/qasper/bool.yaml @@ -11,4 +11,4 @@ doc_to_choice: ["no", "yes"] metric_list: - metric: f1 metadata: - - 
version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/qasper/freeform.yaml b/lm_eval/tasks/qasper/freeform.yaml index 0d9e8f94da..248aede8b4 100644 --- a/lm_eval/tasks/qasper/freeform.yaml +++ b/lm_eval/tasks/qasper/freeform.yaml @@ -15,4 +15,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/race/race.yaml b/lm_eval/tasks/race/race.yaml index 04ffaef4b2..56707fbf15 100644 --- a/lm_eval/tasks/race/race.yaml +++ b/lm_eval/tasks/race/race.yaml @@ -11,4 +11,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 2.0 + version: 2.0 diff --git a/lm_eval/tasks/realtoxicityprompts/realtoxicityprompts.yaml b/lm_eval/tasks/realtoxicityprompts/realtoxicityprompts.yaml index 5053792670..658c6cdba3 100644 --- a/lm_eval/tasks/realtoxicityprompts/realtoxicityprompts.yaml +++ b/lm_eval/tasks/realtoxicityprompts/realtoxicityprompts.yaml @@ -14,4 +14,4 @@ generation_kwargs: do_sample: false temperature: 0.0 metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/sciq/sciq.yaml b/lm_eval/tasks/sciq/sciq.yaml index d7ed2eacfb..926d66b180 100644 --- a/lm_eval/tasks/sciq/sciq.yaml +++ b/lm_eval/tasks/sciq/sciq.yaml @@ -18,4 +18,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/siqa/default.yml b/lm_eval/tasks/siqa/default.yml index f31929b5bc..35b14599d6 100644 --- a/lm_eval/tasks/siqa/default.yml +++ b/lm_eval/tasks/siqa/default.yml @@ -13,4 +13,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/storycloze/storycloze_2016.yaml b/lm_eval/tasks/storycloze/storycloze_2016.yaml index e17c7d5f06..df1c2629cb 100644 --- a/lm_eval/tasks/storycloze/storycloze_2016.yaml +++ b/lm_eval/tasks/storycloze/storycloze_2016.yaml @@ -15,4 +15,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + 
version: 1.0 diff --git a/lm_eval/tasks/super_glue/boolq/default.yaml b/lm_eval/tasks/super_glue/boolq/default.yaml index bb63aa3f2e..f26e4682c4 100644 --- a/lm_eval/tasks/super_glue/boolq/default.yaml +++ b/lm_eval/tasks/super_glue/boolq/default.yaml @@ -14,4 +14,4 @@ doc_to_decontamination_query: passage metric_list: - metric: acc metadata: - - version: 2.0 + version: 2.0 diff --git a/lm_eval/tasks/super_glue/boolq/seq2seq.yaml b/lm_eval/tasks/super_glue/boolq/seq2seq.yaml index c283d5ea11..569316cb31 100644 --- a/lm_eval/tasks/super_glue/boolq/seq2seq.yaml +++ b/lm_eval/tasks/super_glue/boolq/seq2seq.yaml @@ -23,4 +23,4 @@ metric_list: ignore_case: true ignore_punctuation: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/super_glue/boolq/t5-prompt.yaml b/lm_eval/tasks/super_glue/boolq/t5-prompt.yaml index 17f626fe03..7089381ad8 100644 --- a/lm_eval/tasks/super_glue/boolq/t5-prompt.yaml +++ b/lm_eval/tasks/super_glue/boolq/t5-prompt.yaml @@ -19,4 +19,4 @@ metric_list: ignore_case: true ignore_punctuation: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/super_glue/cb/default.yaml b/lm_eval/tasks/super_glue/cb/default.yaml index 6c333b6d38..c575e9872a 100644 --- a/lm_eval/tasks/super_glue/cb/default.yaml +++ b/lm_eval/tasks/super_glue/cb/default.yaml @@ -14,4 +14,4 @@ metric_list: - metric: f1 aggregation: !function "aggregate.cb_multi_fi" metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/super_glue/cb/t5-prompt.yaml b/lm_eval/tasks/super_glue/cb/t5-prompt.yaml index 2a6130dba5..984e17935a 100644 --- a/lm_eval/tasks/super_glue/cb/t5-prompt.yaml +++ b/lm_eval/tasks/super_glue/cb/t5-prompt.yaml @@ -22,4 +22,4 @@ metric_list: aggregation: !function "t5_utils.agg_mean_3class_f1" higher_is_better: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/super_glue/copa/default.yaml b/lm_eval/tasks/super_glue/copa/default.yaml index 2efb6070ae..1af5dbf472 100644 --- 
a/lm_eval/tasks/super_glue/copa/default.yaml +++ b/lm_eval/tasks/super_glue/copa/default.yaml @@ -12,4 +12,4 @@ doc_to_choice: !function utils.doc_to_choice metric_list: - metric: acc metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/super_glue/copa/t5-prompt.yaml b/lm_eval/tasks/super_glue/copa/t5-prompt.yaml index 6c8f52a729..20a90db98d 100644 --- a/lm_eval/tasks/super_glue/copa/t5-prompt.yaml +++ b/lm_eval/tasks/super_glue/copa/t5-prompt.yaml @@ -19,4 +19,4 @@ metric_list: ignore_case: true ignore_punctuation: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/super_glue/multirc/default.yaml b/lm_eval/tasks/super_glue/multirc/default.yaml index 7489d0679b..5a388299f6 100644 --- a/lm_eval/tasks/super_glue/multirc/default.yaml +++ b/lm_eval/tasks/super_glue/multirc/default.yaml @@ -12,4 +12,4 @@ doc_to_choice: "['''{{answer}}\\nIs the answer correct? yes''', '''{{answer}}\\n metric_list: - metric: acc metadata: - - version: 2.0 + version: 2.0 diff --git a/lm_eval/tasks/super_glue/multirc/t5-prompt.yaml b/lm_eval/tasks/super_glue/multirc/t5-prompt.yaml index 442a345075..927a357158 100644 --- a/lm_eval/tasks/super_glue/multirc/t5-prompt.yaml +++ b/lm_eval/tasks/super_glue/multirc/t5-prompt.yaml @@ -20,4 +20,4 @@ metric_list: aggregation: !function t5_utils.agg_em higher_is_better: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/super_glue/record/default.yaml b/lm_eval/tasks/super_glue/record/default.yaml index ff9a823b32..54f871c9d5 100644 --- a/lm_eval/tasks/super_glue/record/default.yaml +++ b/lm_eval/tasks/super_glue/record/default.yaml @@ -17,4 +17,4 @@ metric_list: higher_is_better: True aggregation: mean metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/super_glue/record/t5-prompt.yaml b/lm_eval/tasks/super_glue/record/t5-prompt.yaml index 356d922170..c999bc9030 100644 --- a/lm_eval/tasks/super_glue/record/t5-prompt.yaml +++ b/lm_eval/tasks/super_glue/record/t5-prompt.yaml 
@@ -19,4 +19,4 @@ metric_list: aggregation: !function t5_utils.squad_f1_agg higher_is_better: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/super_glue/rte/default.yaml b/lm_eval/tasks/super_glue/rte/default.yaml index d77ede0725..6754af1a1e 100644 --- a/lm_eval/tasks/super_glue/rte/default.yaml +++ b/lm_eval/tasks/super_glue/rte/default.yaml @@ -12,4 +12,4 @@ doc_to_choice: ['True', 'False'] metric_list: - metric: acc metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/super_glue/rte/t5-prompt.yaml b/lm_eval/tasks/super_glue/rte/t5-prompt.yaml index 389450777f..9e80686e2a 100644 --- a/lm_eval/tasks/super_glue/rte/t5-prompt.yaml +++ b/lm_eval/tasks/super_glue/rte/t5-prompt.yaml @@ -19,4 +19,4 @@ metric_list: ignore_case: true ignore_punctuation: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/super_glue/wic/default.yaml b/lm_eval/tasks/super_glue/wic/default.yaml index 7e53ab4280..0f86855a78 100644 --- a/lm_eval/tasks/super_glue/wic/default.yaml +++ b/lm_eval/tasks/super_glue/wic/default.yaml @@ -12,4 +12,4 @@ doc_to_choice: ['no', 'yes'] metric_list: - metric: acc metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/super_glue/wic/t5-prompt.yaml b/lm_eval/tasks/super_glue/wic/t5-prompt.yaml index 79bc518f93..3a0dbb2f7f 100644 --- a/lm_eval/tasks/super_glue/wic/t5-prompt.yaml +++ b/lm_eval/tasks/super_glue/wic/t5-prompt.yaml @@ -19,4 +19,4 @@ metric_list: ignore_case: true ignore_punctuation: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/super_glue/wsc/default.yaml b/lm_eval/tasks/super_glue/wsc/default.yaml index 0e93ad09f2..b9c7ec347c 100644 --- a/lm_eval/tasks/super_glue/wsc/default.yaml +++ b/lm_eval/tasks/super_glue/wsc/default.yaml @@ -12,4 +12,4 @@ doc_to_choice: ['no', 'yes'] metric_list: - metric: acc metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/super_glue/wsc/t5-prompt.yaml b/lm_eval/tasks/super_glue/wsc/t5-prompt.yaml 
index 01183727d1..5e18acbbfb 100644 --- a/lm_eval/tasks/super_glue/wsc/t5-prompt.yaml +++ b/lm_eval/tasks/super_glue/wsc/t5-prompt.yaml @@ -20,4 +20,4 @@ filter_list: filter: - function: !function t5_utils.WSCPostprocess metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/swag/swag.yaml b/lm_eval/tasks/swag/swag.yaml index dab13f10dc..13e30566ea 100644 --- a/lm_eval/tasks/swag/swag.yaml +++ b/lm_eval/tasks/swag/swag.yaml @@ -16,4 +16,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/toxigen/toxigen.yaml b/lm_eval/tasks/toxigen/toxigen.yaml index 691376e7f0..8b840b426d 100644 --- a/lm_eval/tasks/toxigen/toxigen.yaml +++ b/lm_eval/tasks/toxigen/toxigen.yaml @@ -15,4 +15,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/translation/wmt_common_yaml b/lm_eval/tasks/translation/wmt_common_yaml index 7ef6a0ea4f..3e3c395ad6 100644 --- a/lm_eval/tasks/translation/wmt_common_yaml +++ b/lm_eval/tasks/translation/wmt_common_yaml @@ -14,4 +14,4 @@ generation_kwargs: temperature: 0.0 repeats: 1 metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/triviaqa/default.yaml b/lm_eval/tasks/triviaqa/default.yaml index dcfcf3ddc0..106c0290cc 100644 --- a/lm_eval/tasks/triviaqa/default.yaml +++ b/lm_eval/tasks/triviaqa/default.yaml @@ -28,4 +28,4 @@ metric_list: ignore_case: true ignore_punctuation: true metadata: - - version: 2.0 + version: 2.0 diff --git a/lm_eval/tasks/truthfulqa/truthfulqa_gen.yaml b/lm_eval/tasks/truthfulqa/truthfulqa_gen.yaml index afad7eab3b..2a1e6108f1 100644 --- a/lm_eval/tasks/truthfulqa/truthfulqa_gen.yaml +++ b/lm_eval/tasks/truthfulqa/truthfulqa_gen.yaml @@ -76,4 +76,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 2.0 + version: 2.0 diff --git a/lm_eval/tasks/truthfulqa/truthfulqa_mc1.yaml b/lm_eval/tasks/truthfulqa/truthfulqa_mc1.yaml 
index 9ae8092b69..d9d3a696aa 100644 --- a/lm_eval/tasks/truthfulqa/truthfulqa_mc1.yaml +++ b/lm_eval/tasks/truthfulqa/truthfulqa_mc1.yaml @@ -33,4 +33,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 2.0 + version: 2.0 diff --git a/lm_eval/tasks/truthfulqa/truthfulqa_mc2.yaml b/lm_eval/tasks/truthfulqa/truthfulqa_mc2.yaml index 45a1ef293b..0599b9d6be 100644 --- a/lm_eval/tasks/truthfulqa/truthfulqa_mc2.yaml +++ b/lm_eval/tasks/truthfulqa/truthfulqa_mc2.yaml @@ -10,4 +10,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 2.0 + version: 2.0 diff --git a/lm_eval/tasks/unscramble/anagrams1.yaml b/lm_eval/tasks/unscramble/anagrams1.yaml index c41c225eef..b6a123ec98 100644 --- a/lm_eval/tasks/unscramble/anagrams1.yaml +++ b/lm_eval/tasks/unscramble/anagrams1.yaml @@ -17,4 +17,4 @@ metric_list: ignore_case: false ignore_punctuation: false metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/unscramble/anagrams2.yaml b/lm_eval/tasks/unscramble/anagrams2.yaml index 72a3cb39db..fea6e11006 100644 --- a/lm_eval/tasks/unscramble/anagrams2.yaml +++ b/lm_eval/tasks/unscramble/anagrams2.yaml @@ -17,4 +17,4 @@ metric_list: ignore_case: false ignore_punctuation: false metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/unscramble/cycle_letters.yaml b/lm_eval/tasks/unscramble/cycle_letters.yaml index d86e6fdb99..063c7d3f0c 100644 --- a/lm_eval/tasks/unscramble/cycle_letters.yaml +++ b/lm_eval/tasks/unscramble/cycle_letters.yaml @@ -17,4 +17,4 @@ metric_list: ignore_case: false ignore_punctuation: false metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/unscramble/random_insertion.yaml b/lm_eval/tasks/unscramble/random_insertion.yaml index a843c9f494..7b08b8330f 100644 --- a/lm_eval/tasks/unscramble/random_insertion.yaml +++ b/lm_eval/tasks/unscramble/random_insertion.yaml @@ -17,4 +17,4 @@ metric_list: ignore_case: false ignore_punctuation: false metadata: - - version: 
1.0 + version: 1.0 diff --git a/lm_eval/tasks/unscramble/reversed_words.yaml b/lm_eval/tasks/unscramble/reversed_words.yaml index 9a909bb0cd..0c698c3dc9 100644 --- a/lm_eval/tasks/unscramble/reversed_words.yaml +++ b/lm_eval/tasks/unscramble/reversed_words.yaml @@ -17,4 +17,4 @@ metric_list: ignore_case: false ignore_punctuation: false metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/webqs/webqs.yaml b/lm_eval/tasks/webqs/webqs.yaml index 2490944ea1..32893edfb1 100644 --- a/lm_eval/tasks/webqs/webqs.yaml +++ b/lm_eval/tasks/webqs/webqs.yaml @@ -17,4 +17,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/wikitext/wikitext.yaml b/lm_eval/tasks/wikitext/wikitext.yaml index 06b7d981e9..c31d920dde 100644 --- a/lm_eval/tasks/wikitext/wikitext.yaml +++ b/lm_eval/tasks/wikitext/wikitext.yaml @@ -15,4 +15,4 @@ metric_list: - metric: byte_perplexity - metric: bits_per_byte metadata: - - version: 2.0 + version: 2.0 diff --git a/lm_eval/tasks/winogrande/default.yaml b/lm_eval/tasks/winogrande/default.yaml index 1927059905..213f0727fe 100644 --- a/lm_eval/tasks/winogrande/default.yaml +++ b/lm_eval/tasks/winogrande/default.yaml @@ -14,4 +14,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/wmt2016/ro_en-t5_prompt.yaml b/lm_eval/tasks/wmt2016/ro_en-t5_prompt.yaml index f6d7a9230a..aa14b66413 100644 --- a/lm_eval/tasks/wmt2016/ro_en-t5_prompt.yaml +++ b/lm_eval/tasks/wmt2016/ro_en-t5_prompt.yaml @@ -16,4 +16,4 @@ metric_list: aggregation: !function metrics.agg_bleu higher_is_better: true metadata: - - version: 0.0 + version: 0.0 diff --git a/lm_eval/tasks/wsc273/default.yaml b/lm_eval/tasks/wsc273/default.yaml index 8584c49502..c6f7335700 100644 --- a/lm_eval/tasks/wsc273/default.yaml +++ b/lm_eval/tasks/wsc273/default.yaml @@ -14,4 +14,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - 
version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/xcopa/default_et.yaml b/lm_eval/tasks/xcopa/default_et.yaml index 4484f61803..9f2b0b73b5 100644 --- a/lm_eval/tasks/xcopa/default_et.yaml +++ b/lm_eval/tasks/xcopa/default_et.yaml @@ -11,4 +11,4 @@ doc_to_choice: !function utils.doc_to_choice metric_list: - metric: acc metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/xnli/xnli_common_yaml b/lm_eval/tasks/xnli/xnli_common_yaml index f76b39f5bd..0201459d35 100644 --- a/lm_eval/tasks/xnli/xnli_common_yaml +++ b/lm_eval/tasks/xnli/xnli_common_yaml @@ -16,4 +16,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/xstorycloze/default_ar.yaml b/lm_eval/tasks/xstorycloze/default_ar.yaml index 1718863bf4..2a52966d5a 100644 --- a/lm_eval/tasks/xstorycloze/default_ar.yaml +++ b/lm_eval/tasks/xstorycloze/default_ar.yaml @@ -15,4 +15,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0 diff --git a/lm_eval/tasks/xwinograd/xwinograd_common_yaml b/lm_eval/tasks/xwinograd/xwinograd_common_yaml index 2e22d706e0..86554820e9 100644 --- a/lm_eval/tasks/xwinograd/xwinograd_common_yaml +++ b/lm_eval/tasks/xwinograd/xwinograd_common_yaml @@ -17,4 +17,4 @@ metric_list: aggregation: mean higher_is_better: true metadata: - - version: 1.0 + version: 1.0