diff --git a/docs/interface.md b/docs/interface.md
index 89e0397b9f..8e2fa26b4a 100644
--- a/docs/interface.md
+++ b/docs/interface.md
@@ -58,12 +58,15 @@ This mode supports a number of command-line arguments, the details of which can
 
 * `--hf_hub_log_args` : Logs evaluation results to Hugging Face Hub. Accepts a string with the arguments separated by commas. Available arguments:
   * `hub_results_org` - organization name on Hugging Face Hub, e.g., `EleutherAI`. If not provided, the results will be pushed to the owner of the Hugging Face token,
-  * `hub_repo_name` - repository name on Hugging Face Hub, e.g., `lm-eval-results`,
+  * `hub_repo_name` - repository name on Hugging Face Hub (deprecated, `details_repo_name` and `results_repo_name` should be used instead), e.g., `lm-eval-results`,
+  * `details_repo_name` - repository name on Hugging Face Hub to store details, e.g., `lm-eval-results`,
+  * `results_repo_name` - repository name on Hugging Face Hub to store results, e.g., `lm-eval-results`,
   * `push_results_to_hub` - whether to push results to Hugging Face Hub, can be `True` or `False`,
   * `push_samples_to_hub` - whether to push samples results to Hugging Face Hub, can be `True` or `False`. Requires `--log_samples` to be set,
   * `public_repo` - whether the repository is public, can be `True` or `False`,
   * `leaderboard_url` - URL to the leaderboard, e.g., `https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard`.
   * `point_of_contact` - Point of contact for the results dataset, e.g., `yourname@example.com`.
+  * `gated` - whether to gate the details dataset, can be `True` or `False`.
 
 ## External Library Usage
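For reference, the comma-separated value documented above is parsed into keyword arguments for the tracker defined in `lm_eval/loggers/evaluation_tracker.py` (diff below). A minimal programmatic sketch, assuming the constructor shown in that diff is exposed as `EvaluationTracker`; the org, repo names, and token are placeholders:

```python
# Sketch only: rough programmatic equivalent of
#   --hf_hub_log_args "hub_results_org=my-org,details_repo_name=lm-eval-results,results_repo_name=lm-eval-results,push_results_to_hub=True,push_samples_to_hub=True,public_repo=False,gated=True"
# "my-org", the repo names, and the token below are placeholders, not real resources.
from lm_eval.loggers.evaluation_tracker import EvaluationTracker

tracker = EvaluationTracker(
    output_path="./eval-output",
    hub_results_org="my-org",             # falls back to the token owner if empty
    details_repo_name="lm-eval-results",  # per-sample details dataset
    results_repo_name="lm-eval-results",  # aggregated results dataset
    push_results_to_hub=True,
    push_samples_to_hub=True,             # requires --log_samples when run via the CLI
    public_repo=False,                    # pushes to the "-private" variant of the repo
    gated=True,                           # ask the Hub to gate the details dataset
    token="hf_...",                       # needs write access to hub_results_org
)
```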
diff --git a/lm_eval/loggers/evaluation_tracker.py b/lm_eval/loggers/evaluation_tracker.py
index 6aa7f25bef..429d70b27a 100644
--- a/lm_eval/loggers/evaluation_tracker.py
+++ b/lm_eval/loggers/evaluation_tracker.py
@@ -1,4 +1,5 @@
 import json
+import os
 import re
 import time
 from collections import defaultdict
@@ -14,6 +15,7 @@
     HfApi,
     hf_hub_url,
 )
+from huggingface_hub.utils import build_hf_headers, get_session, hf_raise_for_status
 
 from lm_eval.utils import (
     eval_logger,
@@ -112,12 +114,15 @@ def __init__(
         output_path: str = None,
         hub_results_org: str = "",
         hub_repo_name: str = "",
+        details_repo_name: str = "",
+        results_repo_name: str = "",
         push_results_to_hub: bool = False,
         push_samples_to_hub: bool = False,
         public_repo: bool = False,
         token: str = "",
         leaderboard_url: str = "",
         point_of_contact: str = "",
+        gated: bool = False,
     ) -> None:
         """
         Creates all the necessary loggers for evaluation tracking.
@@ -126,12 +131,15 @@ def __init__(
             output_path (str): Path to save the results. If not provided, the results won't be saved.
             hub_results_org (str): The Hugging Face organization to push the results to. If not provided, the results will be pushed to the owner of the Hugging Face token.
             hub_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will be pushed to `lm-eval-results`.
+            details_repo_name (str): The name of the Hugging Face repository to push the details to. If not provided, the details will be pushed to `lm-eval-results`.
+            results_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will be pushed to the details repository.
             push_results_to_hub (bool): Whether to push the results to the Hugging Face hub.
             push_samples_to_hub (bool): Whether to push the samples to the Hugging Face hub.
             public_repo (bool): Whether to push the results to a public or private repository.
             token (str): Token to use when pushing to the Hugging Face hub. This token should have write access to `hub_results_org`.
             leaderboard_url (str): URL to the leaderboard on the Hugging Face hub on the dataset card.
             point_of_contact (str): Contact information on the Hugging Face hub dataset card.
+            gated (bool): Whether to gate the details dataset.
         """
         self.general_config_tracker = GeneralConfigTracker()
@@ -142,6 +150,7 @@ def __init__(
         self.leaderboard_url = leaderboard_url
         self.point_of_contact = point_of_contact
         self.api = HfApi(token=token) if token else None
+        self.gated_repo = gated
 
         if not self.api and (push_results_to_hub or push_samples_to_hub):
             raise ValueError(
@@ -159,9 +168,24 @@ def __init__(
                 f"hub_results_org was not specified. Results will be pushed to '{hub_results_org}'."
             )
 
-        hub_repo_name = hub_repo_name if hub_repo_name else "lm-eval-results"
-        self.hub_results_repo = f"{hub_results_org}/{hub_repo_name}"
-        self.hub_results_repo_private = f"{hub_results_org}/{hub_repo_name}-private"
+        if hub_repo_name == "":
+            details_repo_name = (
+                details_repo_name if details_repo_name != "" else "lm-eval-results"
+            )
+            results_repo_name = (
+                results_repo_name if results_repo_name != "" else details_repo_name
+            )
+        else:
+            details_repo_name = hub_repo_name
+            results_repo_name = hub_repo_name
+            eval_logger.warning(
+                "hub_repo_name was specified. Both details and results will be pushed to the same repository. Using hub_repo_name is no longer recommended, details_repo_name and results_repo_name should be used instead."
+            )
+
+        self.details_repo = f"{hub_results_org}/{details_repo_name}"
+        self.details_repo_private = f"{hub_results_org}/{details_repo_name}-private"
+        self.results_repo = f"{hub_results_org}/{results_repo_name}"
+        self.results_repo_private = f"{hub_results_org}/{results_repo_name}-private"
 
     def save_results_aggregated(
         self,
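To make the fallback above easier to scan, here is a small illustrative sketch of the same precedence (the org and repo names are placeholders): an explicit `hub_repo_name` wins for both datasets, otherwise details default to `lm-eval-results` and results default to whatever the details repository resolved to.

```python
# Illustrative sketch of the repo-name resolution implemented in __init__ above.
# "my-org" and the repo names are placeholders.
def resolve_repos(hub_results_org, hub_repo_name="", details_repo_name="", results_repo_name=""):
    if hub_repo_name == "":
        details = details_repo_name or "lm-eval-results"
        results = results_repo_name or details
    else:
        # Deprecated path: one repository holds both details and results.
        details = results = hub_repo_name
    return {
        "details_repo": f"{hub_results_org}/{details}",
        "details_repo_private": f"{hub_results_org}/{details}-private",
        "results_repo": f"{hub_results_org}/{results}",
        "results_repo_private": f"{hub_results_org}/{results}-private",
    }

# Defaults: everything lands in <org>/lm-eval-results.
assert resolve_repos("my-org")["results_repo"] == "my-org/lm-eval-results"
# Only details_repo_name given: results follow the details repository.
assert resolve_repos("my-org", details_repo_name="my-details")["results_repo"] == "my-org/my-details"
```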
@@ -211,9 +235,9 @@ def save_results_aggregated(
         if self.api and self.push_results_to_hub:
             repo_id = (
-                self.hub_results_repo
+                self.results_repo
                 if self.public_repo
-                else self.hub_results_repo_private
+                else self.results_repo_private
             )
             self.api.create_repo(
                 repo_id=repo_id,
@@ -221,10 +245,15 @@ def save_results_aggregated(
                 private=not self.public_repo,
                 exist_ok=True,
             )
-            self.api.upload_folder(
+            self.api.upload_file(
                 repo_id=repo_id,
-                folder_path=str(path),
-                path_in_repo=self.general_config_tracker.model_name_sanitized,
+                path_or_fileobj=str(
+                    path.joinpath(f"results_{self.date_id}.json")
+                ),
+                path_in_repo=os.path.join(
+                    self.general_config_tracker.model_name,
+                    f"results_{self.date_id}.json",
+                ),
                 repo_type="dataset",
                 commit_message=f"Adding aggregated results for {self.general_config_tracker.model_name}",
             )
@@ -278,6 +307,7 @@ def save_results_samples(
                 sample["resps"] = sanitize_list(sample["resps"])
                 sample["filtered_resps"] = sanitize_list(sample["filtered_resps"])
                 sample["arguments"] = arguments
+                sample["target"] = str(sample["target"])
 
                 sample_dump = (
                     json.dumps(
@@ -293,9 +323,9 @@ def save_results_samples(
         if self.api and self.push_samples_to_hub:
             repo_id = (
-                self.hub_results_repo
+                self.details_repo
                 if self.public_repo
-                else self.hub_results_repo_private
+                else self.details_repo_private
             )
             self.api.create_repo(
                 repo_id=repo_id,
@@ -303,6 +333,18 @@ def save_results_samples(
                 private=not self.public_repo,
                 exist_ok=True,
             )
+            try:
+                if self.gated_repo:
+                    headers = build_hf_headers()
+                    r = get_session().put(
+                        url=f"https://huggingface.co/api/datasets/{repo_id}/settings",
+                        headers=headers,
+                        json={"gated": "auto"},
+                    )
+                    hf_raise_for_status(r)
+            except Exception as e:
+                eval_logger.warning("Could not gate the repository")
+                eval_logger.info(repr(e))
             self.api.upload_folder(
                 repo_id=repo_id,
                 folder_path=str(path),
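The gating logic added above is a plain REST call to the Hub's dataset settings endpoint; `{"gated": "auto"}` appears to request automatic approval of access requests (an assumption about Hub behavior worth double-checking). A standalone sketch with a placeholder repo id, assuming a write token is already configured for `huggingface_hub`:

```python
# Standalone sketch of the gating call used above, runnable outside the tracker.
# "my-org/lm-eval-results" is a placeholder dataset; a token with write access
# must be available to huggingface_hub (e.g. via HF_TOKEN or `huggingface-cli login`).
from huggingface_hub.utils import build_hf_headers, get_session, hf_raise_for_status

def gate_dataset(repo_id: str) -> None:
    # Ask the Hub to enable automatic gating via the dataset's settings endpoint.
    r = get_session().put(
        url=f"https://huggingface.co/api/datasets/{repo_id}/settings",
        headers=build_hf_headers(),
        json={"gated": "auto"},
    )
    hf_raise_for_status(r)

gate_dataset("my-org/lm-eval-results")
```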
@@ -327,9 +369,7 @@ def recreate_metadata_card(self) -> None:
         """
         eval_logger.info("Recreating metadata card")
-        repo_id = (
-            self.hub_results_repo if self.public_repo else self.hub_results_repo_private
-        )
+        repo_id = self.details_repo if self.public_repo else self.details_repo_private
         files_in_repo = self.api.list_repo_files(repo_id=repo_id, repo_type="dataset")
         results_files = get_results_filenames(files_in_repo)
@@ -360,7 +400,10 @@ def recreate_metadata_card(self) -> None:
                 results_datetime,
             )
             latest_task_results_datetime[samples_key] = latest_datetime
-            latest_task_results_datetime[results_key] = latest_datetime
+            latest_task_results_datetime[results_key] = max(
+                latest_task_results_datetime[results_key],
+                latest_datetime,
+            )
 
         # Create metadata card
         card_metadata = MetadataConfigs()
@@ -377,14 +420,15 @@ def recreate_metadata_card(self) -> None:
             sanitized_last_eval_date_results = re.sub(
                 r"[^\w\.]", "_", latest_task_results_datetime[config_name]
             )
-            # Ensure that all results files are listed in the metadata card
-            current_results = card_metadata.get(config_name, {"data_files": []})
-            current_results["data_files"].append(
-                {"split": eval_date_sanitized, "path": [str(results_filename)]}
-            )
-            card_metadata[config_name] = current_results
-            # If the results file is the newest, update the "latest" field in the metadata card
+            if eval_date_sanitized == sanitized_last_eval_date_results:
+                # Ensure that all results files are listed in the metadata card
+                current_results = card_metadata.get(config_name, {"data_files": []})
+                current_results["data_files"].append(
+                    {"split": eval_date_sanitized, "path": [str(results_filename)]}
+                )
+                card_metadata[config_name] = current_results
+                # If the results file is the newest, update the "latest" field in the metadata card
                 card_metadata[config_name]["data_files"].append(
                     {"split": "latest", "path": [str(results_filename)]}
                 )
@@ -403,65 +447,20 @@ def recreate_metadata_card(self) -> None:
             sanitized_last_eval_date_results = re.sub(
                 r"[^\w\.]", "_", latest_task_results_datetime[config_name]
             )
-            # Ensure that all sample results files are listed in the metadata card
-            current_details_for_task = card_metadata.get(
-                config_name, {"data_files": []}
-            )
-            current_details_for_task["data_files"].append(
-                {"split": eval_date_sanitized, "path": [str(results_filename)]}
-            )
-            card_metadata[config_name] = current_details_for_task
-            # If the samples results file is the newest, update the "latest" field in the metadata card
             if eval_date_sanitized == sanitized_last_eval_date_results:
+                # Ensure that all sample results files are listed in the metadata card
+                current_details_for_task = card_metadata.get(
+                    config_name, {"data_files": []}
+                )
+                current_details_for_task["data_files"].append(
+                    {"split": eval_date_sanitized, "path": [str(results_filename)]}
+                )
+                card_metadata[config_name] = current_details_for_task
+                # If the samples results file is the newest, update the "latest" field in the metadata card
                 card_metadata[config_name]["data_files"].append(
                     {"split": "latest", "path": [str(results_filename)]}
                 )
-            # Special case for MMLU with a single split covering it all
-            # We add another config with all MMLU splits results together for easy inspection
-            SPECIAL_TASKS = ["mmlu", "gpqa", "minerva_math"]
-            for special_task in SPECIAL_TASKS:
-                if special_task in config_name:
-                    special_task = f"{model_name}__{special_task}"
-                    former_entry = card_metadata.get(special_task, {"data_files": []})
-
-                    former_split = [
-                        (i, entry)
-                        for i, entry in enumerate(former_entry["data_files"])
-                        if entry.get("split", None) == eval_date_sanitized
-                    ]
-
-                    if len(former_split) == 0:
-                        former_entry["data_files"].append(
-                            {
-                                "split": eval_date_sanitized,
-                                "path": [str(results_filename)],
-                            }
-                        )
-                    else:
-                        split_index, _ = former_split[0]
-                        former_entry["data_files"][split_index]["path"].append(
-                            str(results_filename)
-                        )
-
-                    if eval_date_sanitized == sanitized_last_eval_date_results:
-                        latest_split = [
-                            (i, entry)
-                            for i, entry in enumerate(former_entry["data_files"])
-                            if entry.get("split", None) == "latest"
-                        ]
-                        if len(latest_split) == 0:
-                            former_entry["data_files"].append(
-                                {"split": "latest", "path": [str(results_filename)]}
-                            )
-                        else:
-                            latest_index, _ = latest_split[0]
-                            former_entry["data_files"][latest_index]["path"].append(
-                                str(results_filename)
-                            )
-
-                    card_metadata[special_task] = former_entry
-
         # Get latest results and extract info to update metadata card examples
         latest_datetime = max(latest_task_results_datetime.values())
         latest_model_name = max(
diff --git a/lm_eval/loggers/utils.py b/lm_eval/loggers/utils.py
index 348175a954..ded8f820ec 100644
--- a/lm_eval/loggers/utils.py
+++ b/lm_eval/loggers/utils.py
@@ -118,15 +118,15 @@ def add_tokenizer_info(storage: Dict[str, Any], lm):
         tokenizer_info = {
             "tokenizer_pad_token": [
                 lm.tokenizer.pad_token,
-                lm.tokenizer.pad_token_id,
+                str(lm.tokenizer.pad_token_id),
             ],
             "tokenizer_eos_token": [
                 lm.tokenizer.eos_token,
-                lm.tokenizer.eos_token_id,
+                str(lm.tokenizer.eos_token_id),
             ],
             "tokenizer_bos_token": [
                 lm.tokenizer.bos_token,
-                lm.tokenizer.bos_token_id,
+                str(lm.tokenizer.bos_token_id),
             ],
             "eot_token_id": getattr(lm, "eot_token_id", None),
             "max_length": getattr(lm, "max_length", None),
diff --git a/lm_eval/utils.py b/lm_eval/utils.py
index 62d500ccb1..652be78e42 100644
--- a/lm_eval/utils.py
+++ b/lm_eval/utils.py
@@ -163,7 +163,7 @@ def get_file_datetime(filename: str) -> str:
     """
     Given the results and sample results filenames, extracts and returns the datetime.
     """
-    return filename[filename.rfind("_") + 1 :].replace(".json", "")
+    return filename[filename.rfind("_") + 1 :].replace(".jsonl", "")
 
 
 def sanitize_model_name(model_name: str) -> str:
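The `get_file_datetime` change strips a `.jsonl` suffix instead of `.json`; with the old replacement, a sample filename like the made-up one below would keep a trailing `l` in the extracted datetime. A quick check (the filename is hypothetical):

```python
# Quick illustration of the .json -> .jsonl suffix change in get_file_datetime.
# The filename is a made-up example of a per-sample details file.
def get_file_datetime(filename: str) -> str:
    """Given the results and sample results filenames, extracts and returns the datetime."""
    return filename[filename.rfind("_") + 1 :].replace(".jsonl", "")

print(get_file_datetime("samples_hellaswag_2024-06-01T12-00-00.123456.jsonl"))
# -> 2024-06-01T12-00-00.123456
# With the old .replace(".json", ""), this would have returned
# "2024-06-01T12-00-00.123456l" (a stray trailing "l").
```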