diff --git a/lm_eval/__main__.py b/lm_eval/__main__.py
index 2db13c3271..85645f4cca 100644
--- a/lm_eval/__main__.py
+++ b/lm_eval/__main__.py
@@ -201,6 +201,12 @@ def parse_eval_args() -> argparse.Namespace:
             "E.g, `--seed 42` sets all three seeds to 42."
         ),
     )
+    parser.add_argument(
+        "--trust_remote_code",
+        default=True,
+        help="Sets trust_remote_code to True to execute code to create HF Datasets from the Hub",
+    )
+
     return parser.parse_args()
 
 
@@ -290,6 +296,16 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
             path.mkdir(parents=True, exist_ok=True)
             output_path_file = path.joinpath("results.json")
 
+    # Respect user's value passed in via CLI, otherwise default to True and add to comma-separated model args
+    if args.trust_remote_code:
+        # os.environ accepts only str values, and the argparse default for this
+        # option is the bool True, so convert explicitly to avoid a TypeError.
+        os.environ["HF_DATASETS_TRUST_REMOTE_CODE"] = str(args.trust_remote_code)
+        args.model_args = (
+            args.model_args
+            + f",trust_remote_code={os.environ['HF_DATASETS_TRUST_REMOTE_CODE']}"
+        )
+
     eval_logger.info(f"Selected Tasks: {task_names}")
     eval_logger.info("Loading selected tasks...")
 
diff --git a/lm_eval/tasks/arithmetic/arithmetic_1dc.yaml b/lm_eval/tasks/arithmetic/arithmetic_1dc.yaml
index 0e2c7ac8dd..3e8d414a60 100644
--- a/lm_eval/tasks/arithmetic/arithmetic_1dc.yaml
+++ b/lm_eval/tasks/arithmetic/arithmetic_1dc.yaml
@@ -14,3 +14,5 @@ metric_list:
     higher_is_better: true
 metadata:
   version: 1.0
+dataset_kwargs:
+  trust_remote_code: true
diff --git a/lm_eval/tasks/arithmetic/arithmetic_2da.yaml b/lm_eval/tasks/arithmetic/arithmetic_2da.yaml
index 529ed983b6..a186d76e89 100644
--- a/lm_eval/tasks/arithmetic/arithmetic_2da.yaml
+++ b/lm_eval/tasks/arithmetic/arithmetic_2da.yaml
@@ -1,3 +1,5 @@
 include: arithmetic_1dc.yaml
 task: arithmetic_2da
 dataset_name: arithmetic_2da
+dataset_kwargs:
+  trust_remote_code: true
diff --git a/lm_eval/tasks/arithmetic/arithmetic_2dm.yaml b/lm_eval/tasks/arithmetic/arithmetic_2dm.yaml
index b0e21c3742..471bd4b444 100644
---
a/lm_eval/tasks/arithmetic/arithmetic_2dm.yaml +++ b/lm_eval/tasks/arithmetic/arithmetic_2dm.yaml @@ -1,3 +1,5 @@ include: arithmetic_1dc.yaml task: arithmetic_2dm dataset_name: arithmetic_2dm +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/arithmetic/arithmetic_2ds.yaml b/lm_eval/tasks/arithmetic/arithmetic_2ds.yaml index 40d90054f5..f8e762486b 100644 --- a/lm_eval/tasks/arithmetic/arithmetic_2ds.yaml +++ b/lm_eval/tasks/arithmetic/arithmetic_2ds.yaml @@ -1,3 +1,5 @@ include: arithmetic_1dc.yaml task: arithmetic_2ds dataset_name: arithmetic_2ds +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/arithmetic/arithmetic_3da.yaml b/lm_eval/tasks/arithmetic/arithmetic_3da.yaml index 5c21ac0590..a4870d04f0 100644 --- a/lm_eval/tasks/arithmetic/arithmetic_3da.yaml +++ b/lm_eval/tasks/arithmetic/arithmetic_3da.yaml @@ -1,3 +1,5 @@ include: arithmetic_1dc.yaml task: arithmetic_3da dataset_name: arithmetic_3da +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/arithmetic/arithmetic_3ds.yaml b/lm_eval/tasks/arithmetic/arithmetic_3ds.yaml index dd62a3cb51..37f9ff0d25 100644 --- a/lm_eval/tasks/arithmetic/arithmetic_3ds.yaml +++ b/lm_eval/tasks/arithmetic/arithmetic_3ds.yaml @@ -1,3 +1,5 @@ include: arithmetic_1dc.yaml task: arithmetic_3ds dataset_name: arithmetic_3ds +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/arithmetic/arithmetic_4da.yaml b/lm_eval/tasks/arithmetic/arithmetic_4da.yaml index 36608fdd80..4c04c6249f 100644 --- a/lm_eval/tasks/arithmetic/arithmetic_4da.yaml +++ b/lm_eval/tasks/arithmetic/arithmetic_4da.yaml @@ -1,3 +1,5 @@ include: arithmetic_1dc.yaml task: arithmetic_4da dataset_name: arithmetic_4da +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/arithmetic/arithmetic_4ds.yaml b/lm_eval/tasks/arithmetic/arithmetic_4ds.yaml index 43f22e58c2..282b3d1e51 100644 --- a/lm_eval/tasks/arithmetic/arithmetic_4ds.yaml +++ 
b/lm_eval/tasks/arithmetic/arithmetic_4ds.yaml @@ -1,3 +1,5 @@ include: arithmetic_1dc.yaml task: arithmetic_4ds dataset_name: arithmetic_4ds +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/arithmetic/arithmetic_5da.yaml b/lm_eval/tasks/arithmetic/arithmetic_5da.yaml index 8070b380d4..5365cfbeb9 100644 --- a/lm_eval/tasks/arithmetic/arithmetic_5da.yaml +++ b/lm_eval/tasks/arithmetic/arithmetic_5da.yaml @@ -1,3 +1,5 @@ include: arithmetic_1dc.yaml task: arithmetic_5da dataset_name: arithmetic_5da +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/arithmetic/arithmetic_5ds.yaml b/lm_eval/tasks/arithmetic/arithmetic_5ds.yaml index 961fce06ce..51d95da007 100644 --- a/lm_eval/tasks/arithmetic/arithmetic_5ds.yaml +++ b/lm_eval/tasks/arithmetic/arithmetic_5ds.yaml @@ -1,3 +1,5 @@ include: arithmetic_1dc.yaml task: arithmetic_5ds dataset_name: arithmetic_5ds +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/asdiv/default.yaml b/lm_eval/tasks/asdiv/default.yaml index 350198be39..bd3917c3c2 100644 --- a/lm_eval/tasks/asdiv/default.yaml +++ b/lm_eval/tasks/asdiv/default.yaml @@ -12,3 +12,5 @@ metric_list: higher_is_better: true metadata: version: 1.0 +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/coqa/default.yaml b/lm_eval/tasks/coqa/default.yaml index c1ed84f7d1..de398c242d 100644 --- a/lm_eval/tasks/coqa/default.yaml +++ b/lm_eval/tasks/coqa/default.yaml @@ -20,3 +20,5 @@ metric_list: higher_is_better: true metadata: version: 3.0 +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/drop/default.yaml b/lm_eval/tasks/drop/default.yaml index 22c60f14d2..4a93612152 100644 --- a/lm_eval/tasks/drop/default.yaml +++ b/lm_eval/tasks/drop/default.yaml @@ -22,3 +22,5 @@ metric_list: higher_is_better: true metadata: version: 3.0 +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/kobest/kobest_sentineg.yaml b/lm_eval/tasks/kobest/kobest_sentineg.yaml index 
caf565c734..64319dca39 100644 --- a/lm_eval/tasks/kobest/kobest_sentineg.yaml +++ b/lm_eval/tasks/kobest/kobest_sentineg.yaml @@ -21,3 +21,5 @@ metric_list: higher_is_better: True metadata: version: 1.0 +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/kobest/kobest_wic.yaml b/lm_eval/tasks/kobest/kobest_wic.yaml index 87bd74b161..569d3393db 100644 --- a/lm_eval/tasks/kobest/kobest_wic.yaml +++ b/lm_eval/tasks/kobest/kobest_wic.yaml @@ -21,3 +21,5 @@ metric_list: higher_is_better: True metadata: version: 1.0 +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/lambada/lambada_openai.yaml b/lm_eval/tasks/lambada/lambada_openai.yaml index 2fcccbd59f..e9fd3a90d5 100644 --- a/lm_eval/tasks/lambada/lambada_openai.yaml +++ b/lm_eval/tasks/lambada/lambada_openai.yaml @@ -18,3 +18,5 @@ metric_list: higher_is_better: true metadata: version: 1.0 +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/logiqa/logiqa.yaml b/lm_eval/tasks/logiqa/logiqa.yaml index 181ef4d8c7..3e318b7d16 100644 --- a/lm_eval/tasks/logiqa/logiqa.yaml +++ b/lm_eval/tasks/logiqa/logiqa.yaml @@ -19,3 +19,5 @@ metric_list: higher_is_better: true metadata: version: 1.0 +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/logiqa2/logieval.yaml b/lm_eval/tasks/logiqa2/logieval.yaml index f2593beb77..f83f274b65 100644 --- a/lm_eval/tasks/logiqa2/logieval.yaml +++ b/lm_eval/tasks/logiqa2/logieval.yaml @@ -25,3 +25,5 @@ filter_list: - function: "take_first" metadata: version: 0.0 +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml b/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml index e630fa8ac9..c0a1547bf4 100644 --- a/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml +++ b/lm_eval/tasks/minerva_math/minerva_math_algebra.yaml @@ -23,3 +23,5 @@ num_fewshot: 0 metadata: version: 1.0 num_fewshot: 4 +dataset_kwargs: + trust_remote_code: true diff --git 
a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_nlp_survey.yaml b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_nlp_survey.yaml index 303e33906a..e8e9b865c0 100644 --- a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_nlp_survey.yaml +++ b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_nlp_survey.yaml @@ -12,3 +12,5 @@ metric_list: - metric: acc metadata: version: 0.0 +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_philpapers2020.yaml b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_philpapers2020.yaml index 2339894b1e..f726d6cedd 100644 --- a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_philpapers2020.yaml +++ b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_philpapers2020.yaml @@ -12,3 +12,5 @@ metric_list: - metric: acc metadata: version: 0.0 +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_political_typology_quiz.yaml b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_political_typology_quiz.yaml index c7772c1d67..95b3280b8b 100644 --- a/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_political_typology_quiz.yaml +++ b/lm_eval/tasks/model_written_evals/sycophancy/sycophancy_on_political_typology_quiz.yaml @@ -12,3 +12,5 @@ metric_list: - metric: acc metadata: version: 0.0 +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/mutual/mutual.yaml b/lm_eval/tasks/mutual/mutual.yaml index f313010182..102da1559f 100644 --- a/lm_eval/tasks/mutual/mutual.yaml +++ b/lm_eval/tasks/mutual/mutual.yaml @@ -23,3 +23,5 @@ metric_list: higher_is_better: true metadata: version: 2.0 +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/race/race.yaml b/lm_eval/tasks/race/race.yaml index 56707fbf15..b90b809f61 100644 --- a/lm_eval/tasks/race/race.yaml +++ b/lm_eval/tasks/race/race.yaml @@ -12,3 +12,5 @@ 
metric_list: higher_is_better: true metadata: version: 2.0 +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/unscramble/anagrams1.yaml b/lm_eval/tasks/unscramble/anagrams1.yaml index 555d416dbd..392fc78f94 100644 --- a/lm_eval/tasks/unscramble/anagrams1.yaml +++ b/lm_eval/tasks/unscramble/anagrams1.yaml @@ -18,3 +18,5 @@ metric_list: ignore_punctuation: false metadata: version: 2.0 +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/unscramble/anagrams2.yaml b/lm_eval/tasks/unscramble/anagrams2.yaml index 41a1438aa5..9f1dbe6fb5 100644 --- a/lm_eval/tasks/unscramble/anagrams2.yaml +++ b/lm_eval/tasks/unscramble/anagrams2.yaml @@ -18,3 +18,5 @@ metric_list: ignore_punctuation: false metadata: version: 2.0 +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/unscramble/cycle_letters.yaml b/lm_eval/tasks/unscramble/cycle_letters.yaml index 194bd399fe..dc7ccf977c 100644 --- a/lm_eval/tasks/unscramble/cycle_letters.yaml +++ b/lm_eval/tasks/unscramble/cycle_letters.yaml @@ -18,3 +18,5 @@ metric_list: ignore_punctuation: false metadata: version: 2.0 +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/unscramble/random_insertion.yaml b/lm_eval/tasks/unscramble/random_insertion.yaml index 8e64a010d4..189c2415f1 100644 --- a/lm_eval/tasks/unscramble/random_insertion.yaml +++ b/lm_eval/tasks/unscramble/random_insertion.yaml @@ -18,3 +18,5 @@ metric_list: ignore_punctuation: false metadata: version: 2.0 +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/wikitext/wikitext.yaml b/lm_eval/tasks/wikitext/wikitext.yaml index c31d920dde..cc95b10261 100644 --- a/lm_eval/tasks/wikitext/wikitext.yaml +++ b/lm_eval/tasks/wikitext/wikitext.yaml @@ -16,3 +16,5 @@ metric_list: - metric: bits_per_byte metadata: version: 2.0 +dataset_kwargs: + trust_remote_code: true diff --git a/pyproject.toml b/pyproject.toml index 3ce91939df..c4d8fb5eaf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,7 
@@ license = { "text" = "MIT" } dependencies = [ "accelerate>=0.21.0", "evaluate", - "datasets>=2.14.0", + "datasets>=2.16.0", "evaluate>=0.4.0", "jsonlines", "numexpr",