diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..12324c0 Binary files /dev/null and b/.DS_Store differ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..aa318e2 --- /dev/null +++ b/LICENSE @@ -0,0 +1,56 @@ +# For the main pipeline structure-related code, we maintain the original license provided with lm-evaluation-harness, which is the MIT License. + +MIT License + +Copyright (c) 2024 LMMs-Lab + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +# For the multimodal models and datasets that we have added (defined as code in the lmms_eval/tasks and lmms_eval/models folders), we apply the Apache License. + +Apache 2.0 License + +Copyright (c) 2024 LMMs-Lab + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +When modifying the code, please include the following information about the original lmms-eval source: +# Adopted from lmms-eval from https://github.com/EvolvingLMMs-Lab/lmms-eval. Below is the original copyright: +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/README.md b/README.md index 70d7054..c7d01f2 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,84 @@ -## Hi there šŸ‘‹ - - +

+ +

+ +# The Evaluation Suite of Large Multimodal Models + +[![PyPI](https://img.shields.io/pypi/v/lmms-eval)](https://pypi.org/project/lmms-eval) +![PyPI - Downloads](https://img.shields.io/pypi/dm/lmms-eval) +![GitHub contributors](https://img.shields.io/github/contributors/EvolvingLMMs-Lab/lmms-eval) +[![issue resolution](https://img.shields.io/github/issues-closed-raw/EvolvingLMMs-Lab/lmms-eval)](https://github.com/EvolvingLMMs-Lab/lmms-eval/issues) +[![open issues](https://img.shields.io/github/issues-raw/EvolvingLMMs-Lab/lmms-eval)](https://github.com/EvolvingLMMs-Lab/lmms-eval/issues)
+ +> Accelerating the development of large multimodal models (LMMs) with `lmms-eval`
+ +šŸ  [LMMs-Lab Homepage](https://lmms-lab.framer.ai) | šŸ¤— [Huggingface Datasets](https://huggingface.co/lmms-lab) | Discord Thread [discord/lmms-eval](https://discord.gg/zdkwKUqrPy)
+ +šŸ“– [Supported Tasks (90+)](https://github.com/EvolvingLMMs-Lab/lmms-eval/blob/main/docs/current_tasks.md) | šŸŒŸ [Supported Models (30+)](https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/main/lmms_eval/models) | šŸ“š [Documentation](docs/README.md)
+ +--- + +## Announcement +- [2025-1] šŸŽ‰šŸŽ‰ We introduce [VideoMMMU](https://videommmu.github.io/), a massive, multi-modal, multi-disciplinary video benchmark that evaluates the ability to acquire knowledge from educational videos.
+ +## Installation + +For standard usage, you can install the package from PyPI by running the following command: +```bash +pip install lmms-eval +```
+ +For development, you can install the package by cloning the repository and running the following command: +```bash +git clone https://github.com/EvolvingLMMs-Lab/lmms-eval +cd lmms-eval +pip install -e . +```
+ +If you want to test LLaVA, you will have to clone and install the [LLaVA-NeXT](https://github.com/LLaVA-VL/LLaVA-NeXT) repo: +```bash +git clone https://github.com/LLaVA-VL/LLaVA-NeXT +cd LLaVA-NeXT +pip install -e . +```
+ +## Evaluation + +**Evaluation of LLaVA-OneVision on VideoMMMU**
+ +```bash +accelerate launch --num_processes=1 --main_process_port 12345 -m lmms_eval \ +--model llava_onevision \ +--model_args pretrained=lmms-lab/llava-onevision-qwen2-7b-ov,conv_template=qwen_1_5,model_name=llava_qwen,max_frames_num=32,torch_dtype=bfloat16 \ + --tasks video_mmmu \ + --batch_size 1 \ + --log_samples \ + --log_samples_suffix debug \ + --output_path ./logs/ +```
+ +**Evaluate a single track of VideoMMMU**
+ +```bash +accelerate launch --num_processes=1 --main_process_port 12345 -m lmms_eval \ +--model llava_onevision \ +--model_args pretrained=lmms-lab/llava-onevision-qwen2-7b-ov,conv_template=qwen_1_5,model_name=llava_qwen,max_frames_num=32,torch_dtype=bfloat16 \ + --tasks video_mmmu_perception \ + --batch_size 1 \ + --log_samples \ + --log_samples_suffix debug \ + --output_path ./logs/ +```
+ +**Evaluate the question_only track of VideoMMMU (Knowledge Acquisition Experiment)**
+ +```bash +accelerate launch --num_processes=1 --main_process_port 12345 -m lmms_eval \ +--model llava_onevision \ +--model_args pretrained=lmms-lab/llava-onevision-qwen2-7b-ov,conv_template=qwen_1_5,model_name=llava_qwen,max_frames_num=1,torch_dtype=bfloat16 \ + --tasks video_mmmu_adaptation_question_only \ + --batch_size 1 \ + --log_samples \ + --log_samples_suffix debug \ + --output_path ./logs/ +``` diff --git a/docs/README.md b/docs/README.md new file mode 100755 index 0000000..2c91236 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,12 @@ +# LMMs Eval Documentation + +Welcome to the docs for `lmms-eval`! 
+ +The majority of this documentation is adapted from [lm-eval-harness](https://github.com/EleutherAI/lm-evaluation-harness/). + +## Table of Contents + +* To learn about the command-line flags, see the [commands](commands.md) guide. +* To learn how to add a new model, see the [Model Guide](model_guide.md). +* For a crash course on adding new tasks to the library, see our [Task Guide](task_guide.md). +* If you need to upload your datasets into the correct HF format with dataset viewer support, please refer to [tools](https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/pufanyi/hf_dataset_docs/tools). diff --git a/docs/commands.md b/docs/commands.md new file mode 100755 index 0000000..3dd3427 --- /dev/null +++ b/docs/commands.md @@ -0,0 +1,231 @@ +# User Guide +This document details the interface exposed by `lmms_eval` and the flags available to users. + +## Command-line Interface + + +Running the library can be done via the `lmms_eval` entrypoint at the command line (equivalently, `python3 -m lmms_eval`).
+ +This mode supports a number of command-line arguments, the details of which can also be seen by running with `-h` or `--help`:
+ +- `--model` : Selects which model type or provider is evaluated. Must be a string corresponding to the name of the model type/provider being used. See [the main README](https://github.com/EleutherAI/lm-evaluation-harness/tree/main#model-apis-and-inference-servers) for a full list of enabled model names and supported libraries or APIs.
+ +* `--model_args` : Controls parameters passed to the model constructor. Accepts a string containing comma-separated keyword arguments to the model class of the format `"arg1=val1,arg2=val2,..."`, such as, for example, `--model_args pretrained=liuhaotian/llava-v1.5-7b,batch_size=1`. For a full list of supported keyword arguments, see the initialization of the corresponding model class in `lmms_eval/models/`.
+ +* `--tasks` : Determines which tasks or task groups are evaluated. Accepts a comma-separated list of task names or task group names. Must be solely comprised of valid tasks/groups. You can use `--tasks list` to see all the available tasks. If you add your own tasks but they are not shown in the list, you can try setting `--verbosity=DEBUG` to view the error message. You can also use `--tasks list_with_num` to check every task and the number of questions each task contains. However, `list_with_num` will download all the available datasets and may require lots of memory and time.
+ +- `--num_fewshot` : Sets the number of few-shot examples to place in context. Must be an integer.
+ +- `--gen_kwargs` : Takes an arg string in the same format as `--model_args` and creates a dictionary of keyword arguments. These will be passed to the models for all called `generate_until` (free-form or greedy generation task) tasks, to set options such as the sampling temperature or `top_p` / `top_k`. For a list of what args are supported for each model type, reference the respective library's documentation (for example, the documentation for `transformers.AutoModelForCausalLM.generate()`.) These kwargs will be applied to all `generate_until` tasks called--we do not currently support unique gen_kwargs or batch_size values per task in a single run of the library. To control these on a per-task level, set them in that task's YAML file.
+ +- `--batch_size` : Sets the batch size used for evaluation. Can be a positive integer or `"auto"` to automatically select the largest batch size that will fit in memory, speeding up evaluation. 
One can pass `--batch_size auto:N` to re-select the maximum batch size `N` times during evaluation. This can help accelerate evaluation further, since `lm-eval` sorts documents in descending order of context length. + +- `--max_batch_size` : Sets the maximum batch size to try to fit in memory, if `--batch_size auto` is passed. + +- `--device` : Sets which device to place the model onto. Must be a string, for example, `"cuda", "cuda:0", "cpu", "mps"`. Defaults to "cuda", and can be ignored if running multi-GPU or running a non-local model type. + +- `--output_path` : A string of the form `dir/file.jsonl` or `dir/`. Provides a path where high-level results will be saved, either into the file named or into the directory named. If `--log_samples` is passed as well, then per-document outputs and metrics will be saved into the directory as well. + +- `--log_samples` : If this flag is passed, then the model's outputs, and the text fed into the model, will be saved at per-document granularity. Must be used with `--output_path`. + +- `--limit` : Accepts an integer, or a float between 0.0 and 1.0 . If passed, will limit the number of documents to evaluate to the first X documents (if an integer) per task or first X% of documents per task. Useful for debugging, especially on costly API models. + +- `--use_cache` : Should be a path where a sqlite db file can be written to. Takes a string of format `/path/to/sqlite_cache_` in order to create a cache db at `/path/to/sqlite_cache_rank{i}.db` for each process (0-NUM_GPUS). This allows results of prior runs to be cached, so that there is no need to re-run results in order to re-score or re-run a given (model, task) pair again. + +- `--cache_requests` : Can be "true", "refresh", or "delete". "true" means that the cache should be used. "refresh" means that you wish to regenerate the cache, which you should run if you change your dataset configuration for a given task. "delete" will delete the cache. Cached files are stored under lm_eval/cache/.cache unless you specify a different path via the environment variable: `LM_HARNESS_CACHE_PATH`. e.g. `LM_HARNESS_CACHE_PATH=~/Documents/cache_for_lm_harness`. + +- `--check_integrity` : If this flag is used, the library tests for each task selected are run to confirm task integrity. + +- `--write_out` : Used for diagnostic purposes to observe the format of task documents passed to a model. If this flag is used, then prints the prompt and gold target string for the first document of each task. + +- `--show_config` : If used, prints the full `lm_eval.api.task.TaskConfig` contents (non-default settings the task YAML file) for each task which was run, at the completion of an evaluation. Useful for when one is modifying a task's configuration YAML locally to transmit the exact configurations used for debugging or for reproducibility purposes. + +- `--include_path` : Accepts a path to a folder. If passed, then all YAML files containing `lm-eval` compatible task configurations will be added to the task registry as available tasks. Used for when one is writing config files for their own task in a folder other than `lm_eval/tasks/`. + +- `--system_instruction`: Specifies a system instruction string to prepend to the prompt. + +- `--apply_chat_template` : This flag specifies whether to apply a chat template to the prompt. It can be used in the following ways: + - `--apply_chat_template` : When used without an argument, applies the only available chat template to the prompt. 
For Hugging Face models, if no dedicated chat template exists, the default chat template will be applied. + - `--apply_chat_template template_name` : If the model has multiple chat templates, apply the specified template to the prompt. + + For Hugging Face models, the default chat template can be found in the [`default_chat_template`](https://github.com/huggingface/transformers/blob/fc35907f95459d7a6c5281dfadd680b6f7b620e3/src/transformers/tokenization_utils_base.py#L1912) property of the Transformers Tokenizer. + +- `--fewshot_as_multiturn` : If this flag is on, the Fewshot examples are treated as a multi-turn conversation. Questions are provided as user content and answers are provided as assistant responses. Requires `--num_fewshot` to be set to be greater than 0, and `--apply_chat_template` to be on. + +- `--predict_only`: Generates the model outputs without computing metrics. Use with `--log_samples` to retrieve decoded results. + +* `--seed`: Set seed for python's random, numpy and torch. Accepts a comma-separated list of 3 values for python's random, numpy, and torch seeds, respectively, or a single integer to set the same seed for all three. The values are either an integer or 'None' to not set the seed. Default is `0,1234,1234` (for backward compatibility). E.g. `--seed 0,None,8` sets `random.seed(0)` and `torch.manual_seed(8)`. Here numpy's seed is not set since the second value is `None`. E.g, `--seed 42` sets all three seeds to 42. + +* `--wandb_args`: Tracks logging to Weights and Biases for evaluation runs and includes args passed to `wandb.init`, such as `project` and `job_type`. Full list [here](https://docs.wandb.ai/ref/python/init). e.g., ```--wandb_args project=test-project,name=test-run``` + +* `--hf_hub_log_args` : Logs evaluation results to Hugging Face Hub. Accepts a string with the arguments separated by commas. Available arguments: + * `hub_results_org` - organization name on Hugging Face Hub, e.g., `EleutherAI`. If not provided, the results will be pushed to the owner of the Hugging Face token, + * `hub_repo_name` - repository name on Hugging Face Hub (deprecated, `details_repo_name` and `results_repo_name` should be used instead), e.g., `lm-eval-results`, + * `details_repo_name` - repository name on Hugging Face Hub to store details, e.g., `lm-eval-results`, + * `results_repo_name` - repository name on Hugging Face Hub to store results, e.g., `lm-eval-results`, + * `push_results_to_hub` - whether to push results to Hugging Face Hub, can be `True` or `False`, + * `push_samples_to_hub` - whether to push samples results to Hugging Face Hub, can be `True` or `False`. Requires `--log_samples` to be set, + * `public_repo` - whether the repository is public, can be `True` or `False`, + * `leaderboard_url` - URL to the leaderboard, e.g., `https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard`. + * `point_of_contact` - Point of contact for the results dataset, e.g., `yourname@example.com`. + * `gated` - whether to gate the details dataset, can be `True` or `False`. + +## External Library Usage + +We also support using the library's external API for use within model training loops or other scripts. + +`lmms_eval` supplies two functions for external import and use: `lmms_eval.evaluate()` and `lmms_eval.simple_evaluate()`. 
+ +`simple_evaluate()` can be used by simply creating an `lmms_eval.api.model.LM` subclass that implements the methods described in the [Model Guide](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/docs/model_guide.md), and wrapping your custom model in that class as follows: + +```python +import lmms_eval +... + +my_model = initialize_my_model() # create your model (could be running finetuning with some custom modeling code) +... +# instantiate an LM subclass that takes your initialized model and can run +# - `Your_LMM.loglikelihood()` +# - `Your_LMM.generate_until()` +lmm_obj = Your_LMM(model=my_model, batch_size=16) + +# indexes all tasks from the `lmms_eval/tasks` subdirectory. +# Alternatively, you can set `TaskManager(include_path="path/to/my/custom/task/configs")` +# to include a set of tasks in a separate directory. +task_manager = lmms_eval.tasks.TaskManager() + +# Setting `task_manager` to the one above is optional and should generally be done +# if you want to include tasks from paths other than ones in `lmms_eval/tasks`. +# `simple_evaluate` will instantiate its own task_manager if it is set to None here. +results = lmms_eval.simple_evaluate( # call simple_evaluate + model=lmm_obj, + tasks=["taskname1", "taskname2"], + num_fewshot=0, + task_manager=task_manager, + ... +) +``` + +See the `simple_evaluate()` and `evaluate()` functions in [lmms_eval/evaluator.py](../lmms_eval/evaluator.py#:~:text=simple_evaluate) for a full description of all arguments available. All keyword arguments to simple_evaluate share the same role as the command-line flags described previously. + +Additionally, the `evaluate()` function offers the core evaluation functionality provided by the library, but without some of the special handling and simplification + abstraction provided by `simple_evaluate()`. + +As a brief example usage of `evaluate()`: + +```python +import lmms_eval + +# suppose you've defined a custom lm_eval.api.Task subclass in your own external codebase +from my_tasks import MyTask1 +... + +# create your model (could be running finetuning with some custom modeling code) +my_model = initialize_my_model() +... + +# instantiate an LM subclass that takes your initialized model and can run +# - `Your_LM.loglikelihood()` +# - `Your_LM.loglikelihood_rolling()` +# - `Your_LM.generate_until()` +lmm_obj = Your_LMM(model=my_model, batch_size=16) + +# optional: the task_manager indexes tasks including ones +# specified by the user through `include_path`. +task_manager = lmms_eval.tasks.TaskManager( + include_path="/path/to/custom/yaml" + ) + +# To get a task dict for `evaluate` +task_dict = lmms_eval.tasks.get_task_dict( + [ + "mmlu", # A stock task + "my_custom_task", # A custom task + { + "task": ..., # A dict that configures a task + "doc_to_text": ..., + }, + MyTask1 # A task object from `lm_eval.task.Task` + ], + task_manager # A task manager that allows lm_eval to + # load the task during evaluation. + # If none is provided, `get_task_dict` + # will instantiate one itself, but this + # only includes the stock tasks so users + # will need to set this if including + # custom paths is required. + ) + +results = evaluate( + lm=lmm_obj, + task_dict=task_dict, + ... 
+) +``` + +## Usage with SRT API + +> install sglang + +```bash +git clone https://github.com/sgl-project/sglang.git +cd sglang; +pip install -e "python[srt]" +python3 -m pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/ +``` + +> run sglang backend service with the following command + +```bash + +CKPT_PATH=$1 +TASK=$2 +MODALITY=$3 +TP_SIZE=$4 +echo $TASK +TASK_SUFFIX="${TASK//,/_}" +echo $TASK_SUFFIX + +python3 -m lmms_eval \ + --model srt_api \ + --model_args modality=$MODALITY,model_version=$CKPT_PATH,tp=$TP_SIZE,host=127.0.0.1,port=30000,timeout=600 \ + --tasks $TASK \ + --batch_size 1 \ + --log_samples \ + --log_samples_suffix $TASK_SUFFIX \ + --output_path ./logs/ + +``` + +You may need to install some dependencies for the above command to work (if you encounter some errors). + +```bash +pip install httpx==0.23.3 +pip install protobuf==3.20 +pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/ +``` + + + +## Regression Test + +Now after each PR, we need to run the regression test to make sure the performance of the model is not degraded. + +```bash +python3 tools/regression.py +``` + +```bash +Already on 'dev/fix_output_path' + +|task|llava-onevision-qwen2-0.5b-ov| +|--|--| +|ocrbench (dev/fix_output_path)|0.70 Ā± 0.70| +|mmmu_val (dev/fix_output_path)|50.00 Ā± 50.00| +|ai2d (dev/fix_output_path)|50.00 Ā± 50.00| +|muirbench (dev/fix_output_path)|12.50 Ā± 12.50| +|videomme (dev/fix_output_path)|2500.00 Ā± 2500.00| + +|branch|runtime|%| +|--|--|--| +|dev/fix_output_path|87.7s|100%| +``` diff --git a/docs/current_tasks.md b/docs/current_tasks.md new file mode 100644 index 0000000..8dc0a4a --- /dev/null +++ b/docs/current_tasks.md @@ -0,0 +1,322 @@ +# Current Tasks + +> () indicates the task name in the lmms_eval. The task name is also used to specify the dataset in the configuration file. +> The following is manually updated documentation. You could use `lmms_eval task --list` to list all supported tasks and their task names. + +## 1. 
Image tasks: + +- [AI2D](https://arxiv.org/abs/1603.07396) (ai2d) +- [ChartQA](https://github.com/vis-nlp/ChartQA) (chartqa) +- [COCO Caption](https://github.com/tylin/coco-caption) (coco_cap) + - COCO 2014 Caption (coco2014_cap) + - COCO 2014 Caption Validation (coco2014_cap_val) + - COCO 2014 Caption Test (coco2014_cap_test) + - COCO 2017 Caption (coco2017_cap) + - COCO 2017 Caption MiniVal (coco2017_cap_val) + - COCO 2017 Caption MiniTest (coco2017_cap_test) +- [ConBench](https://github.com/foundation-multimodal-models/ConBench) (conbench) +- [DetailCaps-4870](https://github.com/foundation-multimodal-models/CAPTURE) (detailcaps) +- [DOCVQA](https://github.com/anisha2102/docvqa) (docvqa) + - DOCVQA Validation (docvqa_val) + - DOCVQA Test (docvqa_test) +- [Ferret](https://github.com/apple/ml-ferret) (ferret) +- [Flickr30K](https://github.com/BryanPlummer/flickr30k_entities) (flickr30k) + - Flickr30K Test (flickr30k_test) +- [GQA](https://cs.stanford.edu/people/dorarad/gqa/index.html) (gqa) +- [GQA-ru](https://huggingface.co/datasets/deepvk/GQA-ru) (gqa_ru) +- [II-Bench](https://github.com/II-Bench/II-Bench) (ii_bench) +- [Infographic VQA](https://www.docvqa.org/datasets/infographicvqa) (infovqa) + - Infographic VQA Validation (infovqa_val) + - Infographic VQA Test (infovqa_test) +- [LiveBench](https://huggingface.co/datasets/lmms-lab/LiveBench) (live_bench) + - LiveBench 06/2024 (live_bench_2406) + - LiveBench 07/2024 (live_bench_2407) +- [LLaVA-Bench-Wilder](https://huggingface.co/datasets/lmms-lab/LLaVA-Bench-Wilder) (llava_wilder_small) +- [LLaVA-Bench-COCO](https://llava-vl.github.io/) (llava_bench_coco) +- [LLaVA-Bench](https://llava-vl.github.io/) (llava_in_the_wild) +- [MathVerse](https://github.com/ZrrSkywalker/MathVerse) (mathverse) + - MathVerse Text Dominant (mathverse_testmini_text_dominant) + - MathVerse Text Only (mathverse_testmini_text_only) + - MathVerse Text Lite (mathverse_testmini_text_lite) + - MathVerse Vision Dominant (mathverse_testmini_vision_dominant) + - MathVerse Vision Intensive (mathverse_testmini_vision_intensive) + - MathVerse Vision Only (mathverse_testmini_vision_only) +- [MathVista](https://mathvista.github.io/) (mathvista) + - MathVista Validation (mathvista_testmini) + - MathVista Test (mathvista_test) +- [MMBench](https://github.com/open-compass/MMBench) (mmbench) + - MMBench English (mmbench_en) + - MMBench English Dev (mmbench_en_dev) + - MMBench English Test (mmbench_en_test) + - MMBench Chinese (mmbench_cn) + - MMBench Chinese Dev (mmbench_cn_dev) + - MMBench Chinese Test (mmbench_cn_test) +- [MME](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Evaluation) (mme) +- [MMStar](https://github.com/MMStar-Benchmark/MMStar) (mmstar) +- [MMUPD](https://huggingface.co/datasets/MM-UPD/MM-UPD) (mmupd) + - MMUPD Base (mmupd_base) + - MMAAD Base (mmaad_base) + - MMIASD Base (mmiasd_base) + - MMIVQD Base (mmivqd_base) + - MMUPD Option (mmupd_option) + - MMAAD Option (mmaad_option) + - MMIASD Option (mmiasd_option) + - MMIVQD Option (mmivqd_option) + - MMUPD Instruction (mmupd_instruction) + - MMAAD Instruction (mmaad_instruction) + - MMIASD Instruction (mmiasd_instruction) + - MMIVQD Instruction (mmivqd_instruction) +- [MMVet](https://github.com/yuweihao/MM-Vet) (mmvet) +- [Multilingual LlaVa Bench](https://huggingface.co/datasets/gagan3012/multilingual-llava-bench) + - llava_in_the_wild_arabic + - llava_in_the_wild_bengali + - llava_in_the_wild_chinese + - llava_in_the_wild_french + - llava_in_the_wild_hindi + - 
llava_in_the_wild_japanese + - llava_in_the_wild_russian + - llava_in_the_wild_spanish + - llava_in_the_wild_urdu +- [NaturalBench](https://huggingface.co/datasets/BaiqiL/NaturalBench) +- [NoCaps](https://nocaps.org/) (nocaps) + - NoCaps Validation (nocaps_val) + - NoCaps Test (nocaps_test) +- [OCRBench](https://github.com/Yuliang-Liu/MultimodalOCR) (ocrbench) +- [OKVQA](https://okvqa.allenai.org/) (ok_vqa) + - OKVQA Validation 2014 (ok_vqa_val2014) +- [POPE](https://github.com/RUCAIBox/POPE) (pope) +- [RefCOCO](https://github.com/lichengunc/refer) (refcoco) + - refcoco_seg_test + - refcoco_seg_val + - refcoco_seg_testA + - refcoco_seg_testB + - refcoco_bbox_test + - refcoco_bbox_val + - refcoco_bbox_testA + - refcoco_bbox_testB +- [RefCOCO+](https://github.com/lichengunc/refer) (refcoco+) + - refcoco+\_seg + - refcoco+\_seg_val + - refcoco+\_seg_testA + - refcoco+\_seg_testB + - refcoco+\_bbox + - refcoco+\_bbox_val + - refcoco+\_bbox_testA + - refcoco+\_bbox_testB +- [RefCOCOg](https://github.com/lichengunc/refer) (refcocog) + - refcocog_seg_test + - refcocog_seg_val + - refcocog_bbox_test + - refcocog_bbox_val +- [ScienceQA](https://scienceqa.github.io/) (scienceqa_full) + - ScienceQA Full (scienceqa) + - ScienceQA IMG (scienceqa_img) +- [ScreenSpot](https://github.com/njucckevin/SeeClick) (screenspot) + - ScreenSpot REC / Grounding (screenspot_rec) + - ScreenSpot REG / Instruction Generation (screenspot_reg) +- [ST-VQA](https://rrc.cvc.uab.es/?ch=11) (stvqa) +- [synthdog](https://github.com/clovaai/donut) (synthdog) + - synthdog English (synthdog_en) + - synthdog Chinese (synthdog_zh) +- [TextCaps](https://textvqa.org/textcaps/) (textcaps) + - TextCaps Validation (textcaps_val) + - TextCaps Test (textcaps_test) +- [TextVQA](https://textvqa.org/) (textvqa) + - TextVQA Validation (textvqa_val) + - TextVQA Test (textvqa_test) +- [VCR-Wiki](https://github.com/tianyu-z/VCR) + - VCR-Wiki English + - VCR-Wiki English easy 100 (vcr_wiki_en_easy_100) + - VCR-Wiki English easy 500 (vcr_wiki_en_easy_500) + - VCR-Wiki English easy (vcr_wiki_en_easy) + - VCR-Wiki English hard 100 (vcr_wiki_en_hard_100) + - VCR-Wiki English hard 500 (vcr_wiki_en_hard_500) + - VCR-Wiki English hard (vcr_wiki_en_hard) + - VCR-Wiki Chinese + - VCR-Wiki Chinese easy 100 (vcr_wiki_zh_easy_100) + - VCR-Wiki Chinese easy 500 (vcr_wiki_zh_easy_500) + - VCR-Wiki Chinese easy (vcr_wiki_zh_easy) + - VCR-Wiki Chinese hard 100 (vcr_wiki_zh_hard_100) + - VCR-Wiki Chinese hard 500 (vcr_wiki_zh_hard_500) + - VCR-Wiki Chinese hard (vcr_wiki_zh_hard) +- [VibeEval](https://github.com/reka-ai/reka-vibe-eval) (vibe_eval) +- [VizWizVQA](https://vizwiz.org/tasks-and-datasets/vqa/) (vizwiz_vqa) + - VizWizVQA Validation (vizwiz_vqa_val) + - VizWizVQA Test (vizwiz_vqa_test) +- [VQAv2](https://visualqa.org/) (vqav2) + - VQAv2 Validation (vqav2_val) + - VQAv2 Test (vqav2_test) +- [WebSRC](https://x-lance.github.io/WebSRC/) (websrc) + - WebSRC Validation (websrc_val) + - WebSRC Test (websrc_test) +- [WildVision-Bench](https://github.com/WildVision-AI/WildVision-Bench) (wildvision) + - WildVision 0617(wildvision_0617) + - WildVision 0630 (wildvision_0630) +- [SeedBench 2 Plus](https://huggingface.co/datasets/AILab-CVC/SEED-Bench-2-plus) (seedbench_2_plus) + +## 2. 
Multi-image tasks: + +- [CMMMU](https://cmmmu-benchmark.github.io/) (cmmmu) + - CMMMU Validation (cmmmu_val) + - CMMMU Test (cmmmu_test) +- [HallusionBench](https://github.com/tianyi-lab/HallusionBench) (hallusion_bench_image) +- [ICON-QA](https://iconqa.github.io/) (iconqa) + - ICON-QA Validation (iconqa_val) + - ICON-QA Test (iconqa_test) +- [JMMMU](https://mmmu-japanese-benchmark.github.io/JMMMU/) (jmmmu) +- [LLaVA-NeXT-Interleave-Bench](https://huggingface.co/datasets/lmms-lab/LLaVA-NeXT-Interleave-Bench) (llava_interleave_bench) + - llava_interleave_bench_in_domain + - llava_interleave_bench_out_domain + - llava_interleave_bench_multi_view +- [MIRB](https://github.com/ys-zong/MIRB) (mirb) +- [MMMU](https://mmmu-benchmark.github.io/) (mmmu) + - MMMU Validation (mmmu_val) + - MMMU Test (mmmu_test) +- [MMMU_Pro](https://huggingface.co/datasets/MMMU/MMMU_Pro) + - MMMU Pro (mmmu_pro) + - MMMU Pro Original (mmmu_pro_original) + - MMMU Pro Vision (mmmu_pro_vision) + - MMMU Pro COT (mmmu_pro_cot) + - MMMU Pro Original COT (mmmu_pro_original_cot) + - MMMU Pro Vision COT (mmmu_pro_vision_cot) + - MMMU Pro Composite COT (mmmu_pro_composite_cot) +- [MMT Multiple Image](https://mmt-bench.github.io/) (mmt_mi) + - MMT Multiple Image Validation (mmt_mi_val) + - MMT Multiple Image Test (mmt_mi_test) +- [MuirBench](https://muirbench.github.io/) (muirbench) +- [MP-DocVQA](https://github.com/rubenpt91/MP-DocVQA-Framework) (multidocvqa) + - MP-DocVQA Validation (multidocvqa_val) + - MP-DocVQA Test (multidocvqa_test) +- [OlympiadBench](https://github.com/OpenBMB/OlympiadBench) (olympiadbench) + - OlympiadBench Test English (olympiadbench_test_en) + - OlympiadBench Test Chinese (olympiadbench_test_cn) +- [Q-Bench](https://q-future.github.io/Q-Bench/) (qbenchs_dev) + - Q-Bench2-HF (qbench2_dev) + - Q-Bench-HF (qbench_dev) + - A-Bench-HF (abench_dev) + +## 3. 
Videos tasks: + +- [ActivityNet-QA](https://github.com/MILVLG/activitynet-qa) (activitynetqa_generation) +- [SeedBench](https://github.com/AILab-CVC/SEED-Bench) (seedbench) +- [SeedBench 2](https://github.com/AILab-CVC/SEED-Bench) (seedbench_2) +- [CVRR-ES](https://github.com/mbzuai-oryx/CVRR-Evaluation-Suite) (cvrr) + - cvrr_continuity_and_object_instance_count + - cvrr_fine_grained_action_understanding + - cvrr_interpretation_of_social_context + - cvrr_interpretation_of_visual_context + - cvrr_multiple_actions_in_a_single_video + - cvrr_non_existent_actions_with_existent_scene_depictions + - cvrr_non_existent_actions_with_non_existent_scene_depictions + - cvrr_partial_actions + - cvrr_time_order_understanding + - cvrr_understanding_emotional_context + - cvrr_unusual_and_physically_anomalous_activities +- [EgoSchema](https://github.com/egoschema/EgoSchema) (egoschema) + - egoschema_mcppl + - egoschema_subset_mcppl + - egoschema_subset +- [LongVideoBench](https://github.com/longvideobench/LongVideoBench) +- [MovieChat](https://github.com/rese1f/MovieChat) (moviechat) + - Global Mode for entire video (moviechat_global) + - Breakpoint Mode for specific moments (moviechat_breakpoint) +- [MLVU](https://github.com/JUNJIE99/MLVU) (mlvu) +- [MMT-Bench](https://mmt-bench.github.io/) (mmt) + - MMT Validation (mmt_val) + - MMT Test (mmt_test) +- [MVBench](https://github.com/OpenGVLab/Ask-Anything/blob/main/video_chat2/MVBENCH.md) (mvbench) + + - mvbench_action_sequence + - mvbench_moving_count + - mvbench_action_prediction + - mvbench_episodic_reasoning + - mvbench_action_antonym + - mvbench_action_count + - mvbench_scene_transition + - mvbench_object_shuffle + - mvbench_object_existence + - mvbench_fine_grained_pose + - mvbench_unexpected_action + - mvbench_moving_direction + - mvbench_state_change + - mvbench_object_interaction + - mvbench_character_order + - mvbench_action_localization + - mvbench_counterfactual_inference + - mvbench_fine_grained_action + - mvbench_moving_attribute + - mvbench_egocentric_navigation + +- [NExT-QA](https://github.com/doc-doc/NExT-QA) (nextqa) + + - NExT-QA Multiple Choice Test (nextqa_mc_test) + - NExT-QA Open Ended Validation (nextqa_oe_val) + - NExT-QA Open Ended Test (nextqa_oe_test) + +- [PerceptionTest](https://github.com/google-deepmind/perception_test) + + - PerceptionTest Test + - perceptiontest_test_mc + - perceptiontest_test_mcppl + - PerceptionTest Validation + - perceptiontest_val_mc + - perceptiontest_val_mcppl + +- [TempCompass](https://github.com/llyx97/TempCompass) (tempcompass) + + - tempcompass_multi_choice + - tempcompass_yes_no + - tempcompass_caption_matching + - tempcompass_captioning + + +- [TemporalBench](https://huggingface.co/datasets/microsoft/TemporalBench) (temporalbench) + + - temporalbench_short_qa + - temporalbench_long_qa + - temporalbench_short_caption + + +- [Vatex](https://eric-xw.github.io/vatex-website/index.html) (vatex) + + - Vatex Chinese (vatex_val_zh) + - Vatex Test (vatex_test) + +- [VideoDetailDescription](https://huggingface.co/datasets/lmms-lab/VideoDetailCaption) (video_dc499) +- [Video-ChatGPT](https://github.com/mbzuai-oryx/Video-ChatGPT) (videochatgpt) + - Video-ChatGPT Generic (videochatgpt_gen) + - Video-ChatGPT Temporal (videochatgpt_temporal) + - Video-ChatGPT Consistency (videochatgpt_consistency) +- [Video-MME](https://video-mme.github.io/) (videomme) +- [Vinoground](https://vinoground.github.io) (vinoground) +- [VITATECS](https://github.com/lscpku/VITATECS) (vitatecs) + + - VITATECS Direction 
(vitatecs_direction) + - VITATECS Intensity (vitatecs_intensity) + - VITATECS Sequence (vitatecs_sequence) + - VITATECS Compositionality (vitatecs_compositionality) + - VITATECS Localization (vitatecs_localization) + - VITATECS Type (vitatecs_type) + +- [WorldQA](https://zhangyuanhan-ai.github.io/WorldQA/) (worldqa) + + - WorldQA Generation (worldqa_gen) + - WorldQA Multiple Choice (worldqa_mc) + +- [YouCook2](http://youcook2.eecs.umich.edu/) (youcook2_val) + +- [VDC](https://github.com/rese1f/aurora) (vdc) + - VDC Detailed Caption (detailed_test) + - VDC Camera Caption (camera_test) + - VDC Short Caption (short_test) + - VDC Background Caption (background_test) + - VDC Main Object Caption (main_object_test) + + +## 4. Text Tasks + +- [GSM8K](https://github.com/openai/grade-school-math) (gsm8k) +- [HellaSwag](https://rowanzellers.com/hellaswag/) (hellaswag) +- [IFEval](https://github.com/google-research/google-research/tree/master/instruction_following_eval) (ifeval) +- [MMLU](https://github.com/hendrycks/test) (mmlu) +- [MMLU_pro](https://github.com/TIGER-AI-Lab/MMLU-Pro) (mmlu_pro) diff --git a/docs/model_guide.md b/docs/model_guide.md new file mode 100755 index 0000000..bc95509 --- /dev/null +++ b/docs/model_guide.md @@ -0,0 +1,78 @@ +# New Model Guide +In order to properly evaluate a given LM, we require implementation of a wrapper class subclassing the `lmms_eval.api.model.lmms` class, that defines how the lmms_eval should interface with your model. This guide walks through how to write this `lmms` subclass via adding it to the library! + +## Setup + +To get started contributing, go ahead and fork the main repo, clone it, create a branch with the name of your task, and install the project requirements in your environment: + +```sh +# After forking... +git clone https://github.com//lmms-eval.git +cd lmms-eval +git checkout -b +pip install -e . +``` + +Now, we'll create a new file where we'll be adding our model: + +```sh +touch lmms_eval/models/.py +``` + +**As a rule of thumb, we recommend you to use `lmms_eval/models/qwen_vl.py` and `lmms_eval/models/instructblip.py` as reference implementations for your model. You can copy and paste the contents of one of these files into your new file to get started.** + +## Interface + +All models must subclass the `lmms_eval.api.model.lmms` class. + +The lmms class enforces a common interface via which we can extract responses from a model: + +```python +class MyCustomLM(lmms): + #... + def loglikelihood(self, requests: list[Instance]) -> list[tuple[float, bool]]: + #... + + def generate_until(self, requests: list[Instance]) -> list[str]: + #... + #... +``` +Where `Instance` is a dataclass defined in [`lmms_eval.api.instance`](https://github.com/EvolvingLMMs-Lab/lmms-eval/tree/main/lmms_eval/api/instance.py) with property `args` of request-dependent type signature described below. + +We support three types of requests, consisting of different interactions / measurements with an autoregressive LM. + +All three request types take as input `requests` of type `list[Instance]` that have a matching `Instance.request_type` to the method name. Overall, you can check the [construct_requests](https://github.com/EvolvingLMMs-Lab/lmms-eval/blob/main/lmms_eval/api/task.py#L918) to see how the arguments are being constructed for different types of output type requests. + +- `generate_until` + - Each request contains `Instance.args : Tuple[str, dict]` containing 1. an input string to the LM and 2. 
a dictionary of keyword arguments used to control generation parameters. + - In each `Instance.args` there will be 6 elements, which are `contexts, all_gen_kwargs, doc_to_visual, doc_id, task, split`. `contexts` refers to the formatted question and is the text input for the LMM. Sometimes it might contain image tokens and need to be handled differently for different models. `all_gen_kwargs` refers to the dict that contains all the generation configuration for the model. We use `doc_id`, `task`, and `split` to access the dataset, and then you can use `doc_to_visual`, which is a function reference, to process the image. When you implement your own model, you should use these to write your own `generate_until` function. + - Using this input and these generation parameters, text will be sampled from the language model (typically until a maximum output length or specific stopping string sequences--for example, `{"until": ["\n\n", "."], "max_gen_toks": 128}`). + - The generated input+output text from the model will then be returned.
+ +- `loglikelihood` + - Each request contains `Instance.args : Tuple[str, str]` containing 1. an input string to the LM and 2. a target string on which the loglikelihood of the LM producing this target, conditioned on the input, will be returned. + - In each `Instance.args` there will be 6 elements, which are `contexts, doc_to_target, doc_to_visual, doc_id, task, split`. `contexts` refers to the formatted question and is the text input for the LMM. Sometimes it might contain image tokens and need to be handled differently for different models. `doc_to_target` is a function reference that gets the answer from the doc. This will be the continuation of the answer, and only tokens belonging to this part should be counted for the loglikelihood. + - Each request will have, as result, `(ll, is_greedy): Tuple[float, int]` returned, where `ll` is a floating point number representing the log probability of generating the target string conditioned on the input, and `is_greedy` being either the value `0` or `1`, with it being `1` if and only if the target string *would be generated by greedy sampling from the LM* (that is, if the target string is the *most likely* N-token string to be output by the LM given the input).
+ + + + +## Registration + +Congrats on implementing your model! Now it's time to test it out.
+ +To make your model usable via the command line interface to `lmms_eval`, you'll need to tell `lmms_eval` what your model's name is.
+ +This is done via a *decorator*, `lmms_eval.api.registry.register_model`. Using `register_model()`, one can both tell the package what the model's name(s) to be used are when invoking it with `python -m lmms_eval --model <model_name>` and alert `lmms_eval` to the model's existence.
+ +```python +from lmms_eval.api.registry import register_model + +@register_model("<name1>", "<name2>") +class MyCustomLM(lmms): +```
+ +The final step is to import your model in `lmms_eval/models/__init__.py`: +```python +from .my_model_filename import MyCustomLM +``` diff --git a/docs/run_examples.md b/docs/run_examples.md new file mode 100644 index 0000000..a63b22b --- /dev/null +++ b/docs/run_examples.md @@ -0,0 +1,507 @@ +# User Guide +This document provides running examples for different models in `lmms_eval`. We include commands on how to prepare the environment for each model and commands to run these models.
+ +## Environment Variables + +Before running experiments and evaluations, we recommend you export the following environment variables. 
Some are necessary for certain tasks to run. + +```bash +export OPENAI_API_KEY="" +export HF_HOME="" +export HF_TOKEN="" +export HF_HUB_ENABLE_HF_TRANSFER="1" +export REKA_API_KEY="" +# Other possible environment variables include +# ANTHROPIC_API_KEY,DASHSCOPE_API_KEY etc. +``` + +## Some common environment issue +Sometimes you might encounter some common issues for example error related to `httpx` or `protobuf`. To solve these issues, you can first try + +```bash +python3 -m pip install httpx==0.23.3; +python3 -m pip install protobuf==3.20; +# If you are using numpy==2.x, sometimes may causing errors +python3 -m pip install numpy==1.26; +# Someties sentencepiece are required for tokenizer to work +python3 -m pip install sentencepiece; +``` + +# Image Model + +### LLaVA +First, you will need to clone repo of `lmms_eval` and repo of [`llava`](https://github.com/LLaVA-VL/LLaVA-NeXT/tree/inference) + +```bash +cd /path/to/lmms-eval +python3 -m pip install -e .; + +cd /path/to/LLaVA-NeXT; +python3 -m pip install -e ".[train]"; + + +TASK=$1 +CKPT_PATH=$2 +CONV_TEMPLATE=$3 +MODEL_NAME=$4 +echo $TASK +TASK_SUFFIX="${TASK//,/_}" +echo $TASK_SUFFIX + +#mmbench_en_dev,mathvista_testmini,llava_in_the_wild,mmvet +accelerate launch --num_processes 8 --main_process_port 12345 -m lmms_eval \ + --model llava \ + --model_args pretrained=$CKPT_PATH,conv_template=$CONV_TEMPLATE,model_name=$MODEL_NAME \ + --tasks $TASK \ + --batch_size 1 \ + --log_samples \ + --log_samples_suffix $TASK_SUFFIX \ + --output_path ./logs/ +``` +If you are trying to use large LLaVA models such as LLaVA-NeXT-Qwen1.5-72B, you can try adding `device_map=auto` in model_args and change `num_processes` to 1. + +### IDEFICS2 + +You won't need to clone any other repos to run idefics. Making sure your transformers version supports idefics2 would be enough + +```bash +cd /path/to/lmms-eval +python3 -m pip install -e .; + +python3 -m pip install transformers --upgrade; + +TASK=$1 +echo $TASK +TASK_SUFFIX="${TASK//,/_}" +echo $TASK_SUFFIX + +accelerate launch --num_processes 8 --main_process_port 12345 -m lmms_eval \ + --model idefics2 \ + --model_args pretrained=HuggingFaceM4/idefics2-8b \ + --tasks $TASK \ + --batch_size 1 \ + --log_samples \ + --log_samples_suffix $TASK_SUFFIX \ + --output_path ./logs/ +``` + +### InternVL2 + +```bash +cd /path/to/lmms-eval +python3 -m pip install -e .; + + +python3 -m pip install flash-attn --no-build-isolation; +python3 -m pip install torchvision einops timm sentencepiece; + + +TASK=$1 +CKPT_PATH=$2 +echo $TASK +TASK_SUFFIX="${TASK//,/_}" +echo $TASK_SUFFIX + +accelerate launch --num_processes 8 --main_process_port 12380 -m lmms_eval \ + --model internvl2 \ + --model_args pretrained=$CKPT_PATH \ + --tasks $TASK \ + --batch_size 1 \ + --log_samples \ + --log_samples_suffix $TASK_SUFFIX \ + --output_path ./logs/ +``` + + +### InternVL-1.5 +First you need to fork [`InternVL`](https://github.com/OpenGVLab/InternVL) + +```bash +cd /path/to/lmms-eval +python3 -m pip install -e .; + +cd /path/to/InternVL/internvl_chat +python3 -m pip install -e .; + +python3 -m pip install flash-attn==2.3.6 --no-build-isolation; + + +TASK=$1 +echo $TASK +TASK_SUFFIX="${TASK//,/_}" +echo $TASK_SUFFIX + +accelerate launch --num_processes 8 --main_process_port 12345 -m lmms_eval \ + --model internvl \ + --model_args pretrained="OpenGVLab/InternVL-Chat-V1-5"\ + --tasks $TASK \ + --batch_size 1 \ + --log_samples \ + --log_samples_suffix $TASK_SUFFIX \ + --output_path ./logs/ + +``` + +### Xcomposer-4KHD and Xcomposer-2d5 + +Both of 
these two models does not require external repo + +```bash +cd /path/to/lmms-eval +python3 -m pip install -e .; + + +python3 -m pip install flash-attn --no-build-isolation; +python3 -m pip install torchvision einops timm sentencepiece; + +TASK=$1 +MODALITY=$2 +echo $TASK +TASK_SUFFIX="${TASK//,/_}" +echo $TASK_SUFFIX + +# For Xcomposer2d5 +accelerate launch --num_processes 8 --main_process_port 10000 -m lmms_eval \ + --model xcomposer2d5 \ + --model_args pretrained="internlm/internlm-xcomposer2d5-7b",device="cuda",modality=$MODALITY\ + --tasks $TASK \ + --batch_size 1 \ + --log_samples \ + --log_samples_suffix $TASK_SUFFIX \ + --output_path ./logs/ + +# For Xcomposer-4kHD +accelerate launch --num_processes 8 --main_process_port 12345 -m lmms_eval \ + --model xcomposer2_4khd \ + --model_args pretrained="internlm/internlm-xcomposer2-4khd-7b" \ + --tasks $TASK \ + --batch_size 1 \ + --log_samples \ + --log_samples_suffix $TASK_SUFFIX \ + --output_path ./logs/ + +``` + +### InstructBLIP + +```bash +cd /path/to/lmms-eval +python3 -m pip install -e .; + +python3 -m pip install transformers --upgrade; + +CKPT_PATH=$1 +TASK=$2 +echo $TASK +TASK_SUFFIX="${TASK//,/_}" +echo $TASK_SUFFIX + +accelerate launch --num_processes 8 --main_process_port 12345 -m lmms_eval \ + --model instructblip \ + --model_args pretrained=$CKPT_PATH \ + --tasks $TASK \ + --batch_size 1 \ + --log_samples \ + --log_samples_suffix instructblip \ + --output_path ./logs/ + +``` + +### SRT API MODEL +To enable faster testing speed for larger llava model, you can use this srt api model to enable testing through sglang. +You will need to first glone sglang from "https://github.com/sgl-project/sglang". Current version is tested on the commit #1222 of sglang + +Here are the scripts if you want to test the result in one script. 
+```bash +cd /path/to/lmms-eval +python3 -m pip install -e .; + +cd /path/to/sglang; +python3 -m pip install -e "python[all]"; + + +python3 -m pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/ + + +CKPT_PATH=$1 +TASK=$2 +MODALITY=$3 +TP_SIZE=$4 +echo $TASK +TASK_SUFFIX="${TASK//,/_}" +echo $TASK_SUFFIX + +python3 -m lmms_eval \ + --model srt_api \ + --model_args modality=$MODALITY,model_version=$CKPT_PATH,tp=$TP_SIZE,host=127.0.0.1,port=30000,timeout=600 \ + --tasks $TASK \ + --batch_size 1 \ + --log_samples \ + --log_samples_suffix $TASK_SUFFIX \ + --output_path ./logs/ + +``` + +You can use the script in `sglang` under `test` folder to kill all sglang service + +# API Model + +### GPT + +```bash +cd /path/to/lmms-eval +python3 -m pip install -e .; + +export OPENAI_API_KEY="" + +TASK=$1 +MODEL_VERSION=$2 +MODALITIES=$3 +echo $TASK +TASK_SUFFIX="${TASK//,/_}" +echo $TASK_SUFFIX + +accelerate launch --num_processes 8 --main_process_port 30000 -m lmms_eval \ + --model gpt4v \ + --model_args model_version=$MODEL_VERSION,modality=$MODALITIES\ + --tasks $TASK \ + --batch_size 1 \ + --log_samples \ + --log_samples_suffix $TASK_SUFFIX \ + --output_path ./logs/ + +``` + + +### Claude + +```bash +cd /path/to/lmms-eval +python3 -m pip install -e .; + +export ANTHROPIC_API_KEY="" + +TASK=$1 +MODEL_VERSION=$2 +MODALITIES=$3 +echo $TASK +TASK_SUFFIX="${TASK//,/_}" +echo $TASK_SUFFIX + +accelerate launch --num_processes 8 --main_process_port 12345 -m lmms_eval \ + --model claude \ + --model_args model_version=$MODEL_VERSION\ + --tasks $TASK \ + --batch_size 1 \ + --log_samples \ + --log_samples_suffix $TASK_SUFFIX \ + --output_path ./logs/ +``` + + +# Video Model + +### LLaVA-VID + +```bash +cd /path/to/lmms-eval +python3 -m pip install -e .; + +cd /path/to/LLaVA-NeXT; +python3 -m pip install -e ".[train]"; + +python3 -m pip install flash-attn --no-build-isolation; + +python3 -m pip install av; + + +TASK=$1 +CKPT_PATH=$2 +CONV_TEMPLATE=$3 +echo $TASK +TASK_SUFFIX="${TASK//,/_}" +echo $TASK_SUFFIX + +accelerate launch --num_processes 8 --main_process_port 12345 -m lmms_eval \ + --model llavavid \ + --model_args pretrained=$CKPT_PATH,conv_template=$CONV_TEMPLATE,video_decode_backend=decord,max_frames_num=32 \ + --tasks $TASK \ + --batch_size 1 \ + --log_samples \ + --log_samples_suffix $TASK_SUFFIX \ + --output_path ./logs/ + +``` + + +### LLaMA-VID + +```bash +cd /path/to/lmms-eval +python3 -m pip install -e .; + +# Notice that you should not leave the folder of LLaMA-VID when calling lmms-eval +# Because they left their processor's config inside the repo +cd /path/to/LLaMA-VID; +python3 -m pip install -e . 
+ +python3 -m pip install av sentencepiece; + +TASK=$1 +echo $TASK +TASK_SUFFIX="${TASK//,/_}" +echo $TASK_SUFFIX + +accelerate launch --num_processes 8 --main_process_port 12345 -m lmms_eval \ + --model llama_vid \ + --tasks $TASK \ + --batch_size 1 \ + --log_samples \ + --log_samples_suffix $TASK_SUFFIX \ + --output_path ./logs/ +``` + +### Video-LLaVA + +```bash +cd /path/to/lmms-eval +python3 -m pip install -e .; + +python3 -m pip install transformers --upgrade; +python3 -m pip install av sentencepiece; + + +TASK=$1 +echo $TASK +TASK_SUFFIX="${TASK//,/_}" +echo $TASK_SUFFIX + +accelerate launch --num_processes 8 --main_process_port 12345 -m lmms_eval \ + --model video_llava \ + --tasks $TASK \ + --batch_size 1 \ + --log_samples \ + --log_samples_suffix $TASK_SUFFIX \ + --output_path ./logs/ + +``` + + +### MPlug-Owl +Notice that this model will takes long time to load, please be patient :) + +```bash +cd /path/to/lmms-eval +python3 -m pip install -e .; + +# It has to use an old transformers version to run +python3 -m pip install av sentencepiece protobuf==3.20 transformers==4.28.1 einops; + +TASK=$1 +echo $TASK +TASK_SUFFIX="${TASK//,/_}" +echo $TASK_SUFFIX + +accelerate launch --num_processes 8 --main_process_port 12345 -m lmms_eval \ + --model mplug_owl_video \ + --tasks $TASK \ + --batch_size 1 \ + --log_samples \ + --log_samples_suffix $TASK_SUFFIX \ + --output_path ./logs/ + +``` + + +### Video-ChatGPT + +```bash +cd /path/to/lmms-eval +python3 -m pip install -e .; + +python3 -m pip install sentencepiece av; + +TASK=$1 +echo $TASK +TASK_SUFFIX="${TASK//,/_}" +echo $TASK_SUFFIX + +accelerate launch --num_processes 8 --main_process_port 12345 -m lmms_eval \ + --model video_chatgpt \ + --tasks $TASK \ + --batch_size 1 \ + --log_samples \ + --log_samples_suffix $TASK_SUFFIX \ + --output_path ./logs/ +``` + +### MovieChat + +```bash +cd /path/to/lmms-eval +python3 -m pip install -e .; + +python -m pip install torch==2.0.1 torchvision==0.15.2 --extra-index-url https://download.pytorch.org/whl/cu118 + +git clone https://github.com/rese1f/MovieChat.git +mv /path/to/MovieChat /path/to/lmms-eval/lmms_eval/models/ + +TASK=$1 +echo $TASK +TASK_SUFFIX="${TASK//,/_}" +echo $TASK_SUFFIX + +accelerate launch --num_processes 8 --main_process_port 12345 -m lmms_eval \ + --model moviechat \ + --tasks $TASK \ + --batch_size 1 \ + --log_samples \ + --log_samples_suffix $TASK_SUFFIX \ + --output_path ./logs/ +``` + +### LLaVA-OneVision-MovieChat + +```bash +cd /path/to/lmms-eval +python3 -m pip install -e .; + +git clone https://github.com/rese1f/MovieChat.git +mv /path/to/MovieChat/MovieChat_OneVision/llava /path/to/lmms-eval/ + +TASK=$1 +echo $TASK +TASK_SUFFIX="${TASK//,/_}" +echo $TASK_SUFFIX + +accelerate launch --num_processes 8 --main_process_port 12345 -m lmms_eval \ + --model llava_onevision_moviechat \ + --tasks $TASK \ + --batch_size 1 \ + --log_samples \ + --log_samples_suffix $TASK_SUFFIX \ + --output_path ./logs/ +``` + +### LLaVA-OneVision-MovieChat + +```bash +cd /path/to/lmms-eval +python3 -m pip install -e .; + +git clone https://github.com/rese1f/aurora.git +mv /path/to/aurora/src/xtuner/xtuner /path/to/lmms-eval/xtuner-aurora + +TASK=$1 +echo $TASK +TASK_SUFFIX="${TASK//,/_}" +echo $TASK_SUFFIX + +accelerate launch --num_processes 8 --main_process_port 12345 -m lmms_eval \ + --model auroracap \ + --tasks $TASK \ + --batch_size 1 \ + --log_samples \ + --log_samples_suffix $TASK_SUFFIX \ + --output_path ./logs/ +``` diff --git a/docs/task_guide.md b/docs/task_guide.md new file mode 
100644 index 0000000..4313aad --- /dev/null +++ b/docs/task_guide.md @@ -0,0 +1,329 @@ +# Task Configuration + +`lmms_eval` is meant to be an extensible and flexible framework within which many different evaluation tasks can be defined. All tasks in the new version of the harness are built around a YAML configuration file format.
+ +These YAML configuration files, along with the current codebase commit hash, are intended to be shareable, such that providing the YAML config enables another researcher to precisely replicate the evaluation setup used by another, in the case that the prompt or setup differs from standard `lmms_eval` task implementations.
+ +While adding a standard evaluation task on a new dataset can occasionally be as simple as swapping out a Hugging Face dataset path in an existing file, more specialized evaluation setups also exist. Here we'll provide a crash course on the more advanced logic that can be implemented in YAML form.
+ +## Configurations + +Tasks are configured via the `TaskConfig` object. Below, we describe all fields usable within the object, and their role in defining a task.
+ +### Parameters + +Task naming + registration: +- **task** (`str`, defaults to None) — name of the task. +- **group** (`str`, *optional*) — name of the task group(s) a task belongs to. Enables one to run all tasks with a specified tag or group name at once. This will be deprecated in the future, and we recommend using `tag` to replace it. +- **task_alias** (`str`, defaults to None) — alias of the task name that will be printed in the final table of results. +- **tag** (`str`, *optional*) — name of the task tag(s) a task belongs to. Enables one to run all tasks with a specified tag name at once. This is an improved naming scheme over `group`.
+ +Dataset configuration options: +- **dataset_path** (`str`) — The name of the dataset as listed by HF in the datasets Hub. +- **dataset_name** (`str`, *optional*, defaults to None) — The name of what HF calls a `config` or `subset` of the benchmark. If your task does not contain any data instances, just leave this to default to None. (If you're familiar with the HF `datasets.load_dataset` function, these are just the first 2 arguments to it.) +- **dataset_kwargs** (`dict`, *optional*) — Auxiliary arguments that `datasets.load_dataset` accepts. This can be used to specify arguments such as `data_files` or `data_dir` if you want to use local data files such as json or csv. +- **test_split** (`str`, *optional*) — Split in the dataset to use as the test split. This is required for denoting the `split` of the HF dataset. +- **training_split** (`str`, *optional*) — Split in the dataset to use as the training split. +- **validation_split** (`str`, *optional*) — Split in the dataset to use as the validation split. +- **fewshot_split** (`str`, *optional*) — Split in the dataset to draw few-shot exemplars from. It is asserted that this is not None if num_fewshot > 0. **This function is not well tested so far** +- **process_docs** (`Callable`, *optional*) — Optionally define a function to apply to each HF dataset split, to preprocess all documents before being fed into prompt template rendering or other evaluation steps. Can be used to rename dataset columns, or to process documents into a format closer to that expected by a prompt template. A minimal sketch is shown after this list. 
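+
+As a rough illustration of `process_docs`: the module name `utils.py` and the column names below are hypothetical, and wiring it up via `process_docs: !function utils.process_docs` in the task YAML is an assumption based on existing task configs, not something spelled out here. With those caveats, such a function could look like this:
+
+```python
+# utils.py for a hypothetical task; the columns "question" and "answers" are illustrative only.
+import datasets
+
+
+def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
+    """Applied once to an HF split before prompt rendering or other evaluation steps."""
+
+    def _clean(doc):
+        # Strip stray whitespace from the field the prompt template will consume.
+        doc["question"] = doc["question"].strip()
+        # Derive a single target string from a list-valued reference column.
+        doc["answer"] = doc["answers"][0] if doc["answers"] else ""
+        return doc
+
+    return dataset.map(_clean)
+```
+
+Because the hook runs once per split rather than once per request, it is the natural place for column renames and light clean-up; per-sample prompt formatting still belongs in `doc_to_text`.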
+ +Prompting / in-context formatting options: +- **doc_to_text** (`Union[Callable, str]`, *optional*) — Column name or function to process a sample into the appropriate input for the model.
+ + For multi-round generation (e.g., MMSearch), the function accepts additional parameters about the round index, previous round information and previous model output. It should return the input image for the next round, input text for the next round, a boolean indicating if round inference should terminate, model outputs from all rounds, and extra information from previous rounds. +- **doc_to_target** (`Union[Callable, str]`, *optional*) — Column name or function to process a sample into the appropriate target output for the model. For multiple choice tasks, this should return an index into the list of choices given by `doc_to_choice`. +- **doc_to_choice** (`Union[Callable, str]`, *optional*) — Column name or function to process a sample into a list of possible string choices for `multiple_choice` tasks. Left undefined for `generate_until` tasks.
+ +Runtime configuration options: +- **num_fewshot** (`int`, *optional*, defaults to 0) — Number of few-shot examples before the input. **This function is not well tested so far** +- **batch_size** (`int`, *optional*, defaults to 1) — Batch size.
+ +**So far some models (such as qwen) may not support batch size > 1. Some models (such as llava) will generate different scores for different batch sizes. We recommend setting batch size to 1 for final benchmarking runs.**
+ +Scoring details: +- **metric_list** (`list`, *optional*, defaults to None) — A list of metrics to use for evaluation. +- **output_type** (`str`, *optional*, defaults to "generate_until") — Selects the type of model output for the given task. Options are `generate_until`, `loglikelihood`, and `multiple_choice`. +- **generation_kwargs** (`dict`, *optional*) — Auxiliary arguments for the `generate` function from the HF transformers library. Advanced keyword arguments may not be supported for non-HF LM classes.
+ +Other: +- **metadata** (`dict`, *optional*) — An optional field where arbitrary metadata can be passed. Most tasks should include a `version` key in this field that is used to denote the version of the yaml config. Other special metadata keys are: `num_fewshot`, to override the printed `n-shot` table column for a task.
+ +## Using YAML Configurations to Define Tasks + +We recommend browsing existing tasks in the `lmms_eval/tasks` folder to get a sense of the different options available.
+ +Here we will provide some explanations of the existing tasks and how to define new tasks, using MME as an example.
+ +```yaml +dataset_path: lmms-lab/MME # The name of the dataset as listed by HF in the datasets Hub. +dataset_kwargs: +  token: True # Auxiliary arguments that `datasets.load_dataset` accepts. This can be used to specify arguments such as `data_files` or `data_dir` if you want to use local datafiles such as json or csv. +task: "mme" # The name of the task; this should be registered in the task manager. If successful, you can call lmms_eval with this task name by setting `--tasks mme`. +test_split: test # The split of the dataset to use as the test split. +output_type: generate_until # The type of model output for the given task. Options are `generate_until`, `loglikelihood`, and `multiple_choice`. +doc_to_visual: !function utils.mme_doc_to_visual # The function to process a sample into the appropriate visual input for the model. 
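+# (Assumption based on existing task utilities rather than stated here: doc_to_visual receives the doc
+# and returns its visual inputs for the model, e.g. a list of images or a video path.)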
+doc_to_text: !function utils.mme_doc_to_text # The function to process a sample into the text input (prompt) for the model.
+doc_to_target: "answer" # The column (or function) that provides the gold answer for the sample.
+generation_kwargs: # Auxiliary arguments for the `generate` function from the HF transformers library. These are consumed by the individual model files.
+  max_new_tokens: 16
+  temperature: 0
+  top_p: 1.0
+  num_beams: 1
+  do_sample: false
+# The return value of process_results will be used by metrics
+process_results: !function utils.mme_process_results
+# Note that the metric name can be either a registered metric function (as is the case for GQA) or a key name returned by process_results.
+# e.g., the metric `mme_perception_score` below is custom defined,
+# so the `mme_process_results` function should return the dict `{"mme_perception_score": {sub_k: sub_v, ...}}`,
+# and the `mme_aggregate_results` function then receives the dict `{sub_k: sub_v, ...}` and uses it to compute the final accuracy.
+metric_list:
+  - metric: mme_perception_score # The name of the metric to use for evaluation. The process_results function should return the metric name and value in the format `{metric_name: results}`; the aggregation function then uses these results to compute the final score.
+    aggregation: !function utils.mme_aggregate_results # The aggregation function to use for evaluation.
+    higher_is_better: true # Whether the metric is better when the value is higher.
+  - metric: mme_cognition_score
+    aggregation: !function utils.mme_aggregate_results
+    higher_is_better: true
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "\nAnswer the question using a single word or phrase."
+  qwen_vl:
+    pre_prompt: ""
+    post_prompt: " Answer:"
+metadata:
+  - version: 0.0
+```
+
+### Embedded Python Code
+
+As the example above shows, you can use Python functions for certain arguments by prefixing the value with the `!function` operator, followed by the function's location as `<module>.<function>` (e.g., `utils.mme_doc_to_text`). This feature can be used for the following arguments (and, as the MME example shows, for `doc_to_visual` and `process_results` as well):
+1. `doc_to_text`
+2. `doc_to_target`
+3. `doc_to_choice`
+4. `aggregation` for a `metric` in `metric_list`
+
+You can base a YAML on another YAML file as a template. This can be handy when you only need to change the prompt for `doc_to_text` but keep the rest the same, or change `filters` to compare which works better. Simply use `include` in the YAML file and write the name of the template you want to base it on. This assumes that the base template is in the same directory.
+
+Otherwise, you will need to provide the full path.
+
+```yaml
+include:
+...
+```
+
+### Passing Arguments to Metrics
+
+Metrics can be defined in the `metric_list` argument when building the YAML config. Multiple metrics can be listed along with any auxiliary arguments. For example, when using the [`exact_match` metric](https://github.com/huggingface/evaluate/tree/main/metrics/exact_match), auxiliary arguments such as `ignore_case`, `ignore_punctuation`, and `regexes_to_ignore` can be listed as well. They will be passed to the metric function as `kwargs`. Some metrics have predefined values for `aggregation` and `higher_is_better`, so listing only the metric name can be sufficient.
+
+```yaml
+metric_list:
+  - metric: acc
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: false
+    regexes_to_ignore:
+      - ","
+      - "\\$"
+```
+
+### Natively Supported Metrics
+
+Here we list all metrics currently supported natively in `lmms_eval`:
+
+Metrics:
+* `acc` (accuracy)
+* `acc_norm` (length-normalized accuracy)
+* `acc_all` (accuracy where all answers must be correct for each question)
+* `anls` (Average Normalized Levenshtein Similarity, used for evaluating text similarity)
+* `acc_mutual_info` (baseline loglikelihood - normalized accuracy)
+* `by_pass` (by-pass score; doesn't calculate anything and simply returns the model output as the result)
+* `exact_match` (exact match score, bound to `output_type: generate_until` and `aggregation: mean`)
+* `perplexity`
+* `word_perplexity` (perplexity per word)
+* `byte_perplexity` (perplexity per byte)
+* `bits_per_byte`
+* `brier_score` (a scoring rule for probabilistic predictions)
+* `matthews_corrcoef` (Matthews correlation coefficient)
+* `f1` (F1 score)
+* `bleu`
+* `chrf`
+* `ter`
+
+Aggregation functions:
+* `mean`
+* `median`
+* `perplexity`
+* `weighted_perplexity`
+* `bits_per_byte`
+
+### Adding a Multiple Choice Metric
+
+Adding a multiple choice metric has a few steps. To get it working you need to:
+
+1. register a metric function
+2. register an aggregation function
+3. update the `Task` definition to make sure the correct arguments are passed
+
+The default metric and aggregation functions are in `lmms_eval/api/metrics.py`, and you can add a function there if it's for general use. The metrics are towards the bottom of the file and look like this:
+
+```python
+    @register_metric(
+        metric="mcc",
+        higher_is_better=True,
+        output_type="multiple_choice",
+        aggregation="matthews_corrcoef",
+    )
+    def mcc_fn(items):  # This is a passthrough function
+        return items
+```
+Note that many of these are passthrough functions, and for multiple choice (at least) this function is never actually called.
+
+Aggregation functions are defined towards the top of the file; here's an example:
+
+    @register_aggregation("matthews_corrcoef")
+    def matthews_corrcoef(items):
+        unzipped_list = list(zip(*items))
+        golds = unzipped_list[0]
+        preds = unzipped_list[1]
+        return sklearn.metrics.matthews_corrcoef(golds, preds)
+
+This function returns a single numeric value. The input is defined in `Task.process_results` in `lmms_eval/api/task.py`. There's a section that looks like this:
+
+```python
+    result_dict = {
+        **({"acc": acc} if "acc" in use_metric else {}),
+        **({"f1": (gold, pred)} if "f1" in use_metric else {}),
+        **({"mcc": (gold, pred)} if "mcc" in use_metric else {}),
+        **({"acc_norm": acc_norm} if "acc_norm" in use_metric else {}),
+        **({"exact_match": exact_match} if "exact_match" in use_metric else {}),
+    }
+```
+
+The value here determines the input to the aggregation function, while the key name matches the metric function. These metrics all have simple needs and just require the accuracy or the gold and predicted values, but immediately below this there are examples of metrics with more complicated needs that you can use as reference.
+
+## Good Reference Tasks
+
+Contributing a new task can be daunting! Luckily, much of the work has often been done for you in a different, similarly evaluated task.
Good examples of task implementations to study include:
+
+**Generation-based tasks:**
+
+- MME (`lmms_eval/tasks/mme/mme.yaml`)
+
+```yaml
+dataset_path: lmms-lab/MME
+dataset_kwargs:
+  token: True
+task: "mme"
+test_split: test
+output_type: generate_until
+doc_to_visual: !function utils.mme_doc_to_visual
+doc_to_text: !function utils.mme_doc_to_text
+doc_to_target: "answer"
+generation_kwargs:
+  max_new_tokens: 16
+  temperature: 0
+  top_p: 1.0
+  num_beams: 1
+  do_sample: false
+# The return value of process_results will be used by metrics
+process_results: !function utils.mme_process_results
+# Note that the metric name can be either a registered metric function (as is the case for GQA) or a key name returned by process_results.
+# e.g., the metric `mme_perception_score` below is custom defined,
+# so the `mme_process_results` function should return the dict `{"mme_perception_score": {sub_k: sub_v, ...}}`,
+# and the `mme_aggregate_results` function then receives the dict `{sub_k: sub_v, ...}` and uses it to compute the final accuracy.
+metric_list:
+  - metric: mme_perception_score
+    aggregation: !function utils.mme_aggregate_results
+    higher_is_better: true
+  - metric: mme_cognition_score
+    aggregation: !function utils.mme_aggregate_results
+    higher_is_better: true
+lmms_eval_specific_kwargs:
+  default:
+    pre_prompt: ""
+    post_prompt: "\nAnswer the question using a single word or phrase."
+  qwen_vl:
+    pre_prompt: ""
+    post_prompt: " Answer:"
+metadata:
+  - version: 0.0
+```
+
+Other generation-based tasks include:
+- MMBench (`lmms_eval/tasks/mmbench/mmbench.yaml`) (Group: `mmbench`)
+
+**Notes:**
+Pay special attention to the `process_results` and `metric_list` fields, which define how the model output is post-processed and scored.
+
+**`process_results`** is executed in parallel (multi-GPU). We recommend using it to collect and parse model outputs into formatted results. If your evaluation requires external models (e.g., GPT-4) as a judge or answer extractor, we also suggest integrating the judging process within this function.
+
+**`aggregate_results`** is executed in the main process (rank 0). We recommend using it to calculate the final score or accuracy.
+
+Also, the `lmms_eval_specific_kwargs` field is used to define model-specific prompt configurations. The default is set to follow LLaVA.
+
+**Generation-based Tasks (GPT-Eval)**
+
+You can check the following task to see how we incorporate GPT-4 as a judge model into our evaluation pipeline.
+ +- LLaVA-In-The-Wild (https://github.com/EvolvingLMMs-Lab/lmms-eval/blob/main/lmms_eval/tasks/llava-in-the-wild/llava-in-the-wild.yaml) + +**PPL-based tasks:** +- Seedbench (`lmms_eval/tasks/seedbench/seedbench_ppl.yaml`) + +```yaml +dataset_path: lmms-lab/SEED-Bench +dataset_kwargs: + token: True +task: "seedbench_ppl" +test_split: test +output_type: multiple_choice +doc_to_visual: !function utils.seed_doc_to_visual +doc_to_text: !function utils.seed_doc_to_text_mc +doc_to_choice : !function utils.seed_doc_to_choice +doc_to_target: !function utils.seed_doc_to_mc_target +# Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results +metric_list: + - metric: acc +metadata: + - version: 0.0 +``` + +**Multi-round-generation-based tasks:** + +- MMSearch(`lmms_eval/tasks/mmsearch/mmsearch_end2end.yaml`) + +```yaml +dataset_path: CaraJ/MMSearch +dataset_name: end2end +dataset_kwargs: + token: False +task: "mmsearch_end2end" +test_split: end2end +output_type: generate_until_multi_round # Note that here we use the new output_type here for multi-round generation. It basicly follows generate_until but incorporate multi-round inference +doc_to_visual: !function lmms_eval_utils.mmsearch_end2end_doc_to_visual +doc_to_text: !function lmms_eval_utils.mmsearch_end2end_doc_to_text +doc_to_target: "answer" +generation_kwargs: + until: + - "ASSISTANT:" + max_new_tokens: 512 + temperature: 0 + top_p: 0 + num_beams: 1 + do_sample: false +process_results: !function lmms_eval_utils.mmsearch_end2end_process_results +metric_list: + - metric: end2end_f1_score + aggregation: !function lmms_eval_utils.mmsearch_aggregate_results_f1_score + higher_is_better: true + - metric: requery_score + aggregation: !function lmms_eval_utils.mmsearch_aggregate_results_req_score + higher_is_better: true +lmms_eval_specific_kwargs: # Note that here we cache the result of every sample whenever the it is inferenced + middle_resules_dir: /data1/zrr/jdz/mmsearch/mmsearch_middile_results + result_cache_dir: /data1/zrr/jdz/mmsearch/mmsearch_result_cache_dir + +``` diff --git a/experiment_on_delta/.DS_Store b/experiment_on_delta/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/experiment_on_delta/.DS_Store differ diff --git a/experiment_on_delta/check_correct.py b/experiment_on_delta/check_correct.py new file mode 100644 index 0000000..271b3bb --- /dev/null +++ b/experiment_on_delta/check_correct.py @@ -0,0 +1,57 @@ +import json + +# Function to normalize strings +def normalize_str(answer): + if isinstance(answer, str): + return [answer.strip().lower()] + return [str(answer).strip().lower()] + +# Function to evaluate open-ended questions +def eval_open(gold_i, pred_i): + correct = False + if isinstance(gold_i, list): + norm_answers = [] + for answer in gold_i: + norm_answers.extend(normalize_str(answer)) + else: + norm_answers = normalize_str(gold_i) + + for pred in pred_i: # pred is already normalized + if isinstance(pred, str): # check if normalized answer is in the prediction + for norm_ans in norm_answers: + if isinstance(norm_ans, str) and norm_ans in pred: + correct = True + break + else: # for numeric comparison + if pred in norm_answers: + correct = True + break + return correct + +# Input and output file paths +input_file = "" +output_file = "" + +correct_ids = [] + +with open(input_file, "r") as f: + for line in f: + entry = json.loads(line.strip()) + if entry["mmmu_acc"]["parsed_pred"] == "No Answere Found": + continue 
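+        # Score the remaining entries below: open-ended answers are checked with
+        # eval_open (normalized substring matching), while multiple-choice answers
+        # require the parsed prediction to exactly equal the gold answer.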
+ question_type = entry.get("question_type", "open") + answer = entry["mmmu_acc"]["answer"] + parsed_pred = entry["mmmu_acc"]["parsed_pred"] + + if question_type == "open": + pred_list = normalize_str(parsed_pred) if isinstance(parsed_pred, str) else parsed_pred + if eval_open(answer, pred_list): + correct_ids.append(entry["doc"]["id"]) + else: + if answer == parsed_pred: + correct_ids.append(entry["doc"]["id"]) + +with open(output_file, "w") as f: + json.dump({"correct_ids": correct_ids}, f, indent=4) + +print(f"Matched IDs have been saved to {output_file}") diff --git a/experiment_on_delta/check_wrong2right.py b/experiment_on_delta/check_wrong2right.py new file mode 100644 index 0000000..aaf534f --- /dev/null +++ b/experiment_on_delta/check_wrong2right.py @@ -0,0 +1,42 @@ +import json + +# Paths to your JSON files +file1 = " " # Replace with your actual adaptation JSON file path +file2 = " " # Replace with your actual question_only JSON file path + +# Load the IDs from each file +with open(file1, "r") as f1, open(file2, "r") as f2: + data1 = json.load(f1) + data2 = json.load(f2) + +# Convert IDs lists to sets for set operations +ids1 = set(data1["correct_ids"]) +ids2 = set(data2["correct_ids"]) + +# Calculate differences +ids_in_file1_not_in_file2 = ids1 - ids2 +ids_in_file2_not_in_file1 = ids2 - ids1 + +output_w2r_path = "" +output_r2w_path = "" +# Save to JSON files +with open(output_w2r_path, "w") as w2r_file: + json.dump({"total_ids_in_adaptation_not_in_question_only": len(ids_in_file1_not_in_file2), "ids": sorted(ids_in_file1_not_in_file2)}, w2r_file, indent=4) + +with open(output_r2w_path, "w") as r2w_file: + json.dump({"total_ids_in_question_only_not_in_adaptation": len(ids_in_file2_not_in_file1), "ids": sorted(ids_in_file2_not_in_file1)}, r2w_file, indent=4) + +# Optional: Print summary +print("Summary of ID comparison:") +print(f"1. Total IDs in adaptation but not in question_only: {len(ids_in_file1_not_in_file2)}") +print(f"2. Total IDs in question_only but not in adaptation: {len(ids_in_file2_not_in_file1)}") +print(f"3. Total IDs in both files: {len(ids1 & ids2)}") +print(f"4. Total correct in adaptation: {len(ids1)}") +print(f"5. Total correct in question_only: {len(ids2)}") + +# Calculate wrong2right and right2wrong rate +wrong2right_rate = len(ids_in_file1_not_in_file2) / (300 - len(ids2)) * 100 +right2wrong_rate = len(ids_in_file2_not_in_file1) / len(ids2) * 100 + +print(f"6. Wrong2Right_Rate: {wrong2right_rate}") +print(f"7. 
Right2Wrong_Rate: {right2wrong_rate}") diff --git a/lmms_eval/__init__.py b/lmms_eval/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/lmms_eval/__main__.py b/lmms_eval/__main__.py new file mode 100755 index 0000000..7c89535 --- /dev/null +++ b/lmms_eval/__main__.py @@ -0,0 +1,532 @@ +import argparse +import datetime +import importlib +import json +import os +import sys +import traceback +import warnings +from functools import partial + +import numpy as np +import yaml + +warnings.simplefilter("ignore", category=DeprecationWarning) + +import hashlib +from pathlib import Path +from typing import Union + +from accelerate import Accelerator +from accelerate.utils import InitProcessGroupKwargs +from loguru import logger as eval_logger + +from lmms_eval import evaluator, utils +from lmms_eval.api.registry import ALL_TASKS +from lmms_eval.evaluator import request_caching_arg_to_dict +from lmms_eval.loggers import EvaluationTracker, WandbLogger +from lmms_eval.tasks import TaskManager +from lmms_eval.utils import ( + handle_non_serializable, + make_table, + simple_parse_args_string, +) + + +def _int_or_none_list_arg_type(min_len: int, max_len: int, defaults: str, value: str, split_char: str = ","): + def parse_value(item): + item = item.strip().lower() + if item == "none": + return None + try: + return int(item) + except ValueError: + raise argparse.ArgumentTypeError(f"{item} is not an integer or None") + + items = [parse_value(v) for v in value.split(split_char)] + num_items = len(items) + + if num_items == 1: + # Makes downstream handling the same for single and multiple values + items = items * max_len + elif num_items < min_len or num_items > max_len: + raise argparse.ArgumentTypeError(f"Argument requires {max_len} integers or None, separated by '{split_char}'") + elif num_items != max_len: + logging.warning(f"Argument requires {max_len} integers or None, separated by '{split_char}'. " "Missing values will be filled with defaults.") + default_items = [parse_value(v) for v in defaults.split(split_char)] + items.extend(default_items[num_items:]) # extend items list with missing defaults + + return items + + +def check_argument_types(parser: argparse.ArgumentParser): + """ + Check to make sure all CLI args are typed, raises error if not + """ + for action in parser._actions: + if action.dest != "help" and not action.const: + if action.type is None: + raise ValueError(f"Argument '{action.dest}' doesn't have a type specified.") + else: + continue + + +def _handle_non_serializable(o): + if isinstance(o, np.int64) or isinstance(o, np.int32): + return int(o) + elif isinstance(o, set): + return list(o) + else: + return str(o) + + +def parse_eval_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument("--config", default="", help="Path to a yaml file specifying all eval arguments, will ignore cli arguments if specified") + parser.add_argument("--model", default="hf", help="Name of model e.g. `hf`") + parser.add_argument( + "--tasks", + default=None, + help="To get full list of tasks, use the command lmms-eval --tasks list", + ) + parser.add_argument( + "--model_args", + default="", + help="String arguments for model, e.g. 
`pretrained=EleutherAI/pythia-160m,dtype=float32`", + ) + parser.add_argument( + "--num_fewshot", + type=int, + default=None, + help="Number of examples in few-shot context", + ) + parser.add_argument( + "--batch_size", + "-b", + type=str, + default=1, + metavar="auto|auto:N|N", + help="Acceptable values are 'auto', 'auto:N' or N, where N is an integer. Default 1.", + ) + parser.add_argument( + "--max_batch_size", + type=int, + default=None, + metavar="N", + help="Maximal batch size to try with --batch_size auto.", + ) + parser.add_argument( + "--device", + type=str, + default=None, + help="Device to use (e.g. cuda, cuda:0, cpu)", + ) + parser.add_argument( + "--output_path", + default=None, + type=str, + metavar="= [dir/file.jsonl] [DIR]", + help="The path to the output file where the result metrics will be saved. If the path is a directory and log_samples is true, the results will be saved in the directory. Else the parent directory will be used.", + ) + parser.add_argument( + "--limit", + type=float, + default=None, + help="Limit the number of examples per task. " "If <1, limit is a percentage of the total number of examples.", + ) + parser.add_argument( + "--use_cache", + "-c", + type=str, + default=None, + metavar="DIR", + help="A path to a sqlite db file for caching model responses. `None` if not caching.", + ) + parser.add_argument( + "--cache_requests", + type=str, + default=None, + choices=["true", "refresh", "delete"], + help="Speed up evaluation by caching the building of dataset requests. `None` if not caching.", + ) + parser.add_argument( + "--check_integrity", + action="store_true", + help="Whether to run the relevant part of the test suite for the tasks", + ) + parser.add_argument( + "--write_out", + "-w", + action="store_true", + default=False, + help="Prints the prompt for the first few documents.", + ) + parser.add_argument( + "--log_samples", + action="store_true", + default=False, + help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis", + ) + parser.add_argument( + "--wandb_log_samples", + action="store_true", + default=False, + help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis to Weights and Biases", + ) + parser.add_argument( + "--log_samples_suffix", + type=str, + default="model_outputs", + help="Specify a suffix for the log_samples file name.", + ) + parser.add_argument( + "--system_instruction", + type=str, + default=None, + help="System instruction to be used in the prompt", + ) + parser.add_argument( + "--apply_chat_template", + action="store_true", + default=False, + help="If True, applies the chat template to the prompt", + ) + parser.add_argument( + "--fewshot_as_multiturn", + action="store_true", + default=False, + help="If True, uses the fewshot as a multi-turn conversation", + ) + parser.add_argument( + "--show_config", + action="store_true", + default=False, + help="If True, shows the the full config of all tasks at the end of the evaluation.", + ) + parser.add_argument( + "--include_path", + type=str, + default=None, + help="Additional path to include if there are external tasks to include.", + ) + parser.add_argument( + "--gen_kwargs", + default="", + help=("String arguments for model generation on greedy_until tasks," " e.g. 
`temperature=0,top_k=0,top_p=0`"), + ) + parser.add_argument( + "--verbosity", + type=str, + default="INFO", + help="Log error when tasks are not registered.", + ) + parser.add_argument( + "--wandb_args", + default="", + help="Comma separated string arguments passed to wandb.init, e.g. `project=lmms-eval,job_type=eval", + ) + parser.add_argument( + "--timezone", + default="Asia/Singapore", + help="Timezone for datetime string, e.g. Asia/Singapore, America/New_York, America/Los_Angeles. You can check the full list via `import pytz; print(pytz.common_timezones)`", + ) + parser.add_argument( + "--hf_hub_log_args", + type=str, + default="", + help="Comma separated string arguments passed to Hugging Face Hub's log function, e.g. `hub_results_org=EleutherAI,hub_repo_name=lm-eval-results`", + ) + parser.add_argument( + "--predict_only", + "-x", + action="store_true", + default=False, + help="Use with --log_samples. Only model outputs will be saved and metrics will not be evaluated.", + ) + default_seed_string = "0,1234,1234,1234" + parser.add_argument( + "--seed", + type=partial(_int_or_none_list_arg_type, 3, 4, default_seed_string), + default=default_seed_string, # for backward compatibility + help=( + "Set seed for python's random, numpy, torch, and fewshot sampling.\n" + "Accepts a comma-separated list of 4 values for python's random, numpy, torch, and fewshot sampling seeds, " + "respectively, or a single integer to set the same seed for all four.\n" + f"The values are either an integer or 'None' to not set the seed. Default is `{default_seed_string}` " + "(for backward compatibility).\n" + "E.g. `--seed 0,None,8,52` sets `random.seed(0)`, `torch.manual_seed(8)`, and fewshot sampling seed to 52. " + "Here numpy's seed is not set since the second value is `None`.\n" + "E.g, `--seed 42` sets all four seeds to 42." + ), + ) + parser.add_argument( + "--trust_remote_code", + action="store_true", + help="Sets trust_remote_code to True to execute code to create HF Datasets from the Hub", + ) + args = parser.parse_args() + return args + + +def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: + if not args: + args = parse_eval_args() + + # Check if no arguments were passed after parsing + if len(sys.argv) == 1: + print("ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”") + print("ā”‚ Please provide arguments to evaluate the model. e.g. ā”‚") + print("ā”‚ `lmms-eval --model llava --model_path liuhaotian/llava-v1.6-7b --tasks okvqa` ā”‚") + print("ā”‚ Use `lmms-eval --help` for more information. 
ā”‚") + print("ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜") + sys.exit(1) + + if args.wandb_args: + if "name" not in args.wandb_args: + name = f"{args.model}_{args.model_args}_{utils.get_datetime_str(timezone=args.timezone)}" + name = utils.sanitize_long_string(name) + args.wandb_args += f",name={name}" + wandb_logger = WandbLogger(**simple_parse_args_string(args.wandb_args)) + + # reset logger + eval_logger.remove() + eval_logger.add(sys.stdout, colorize=True, level=args.verbosity) + eval_logger.info(f"Verbosity set to {args.verbosity}") + os.environ["VERBOSITY"] = args.verbosity + os.environ["TOKENIZERS_PARALLELISM"] = "false" + + args_list = [] + results_list = [] + if args.config: + if not os.path.exists(args.config): + raise ValueError(f"Config file does not exist: {args.config}") + + with open(args.config, "r") as file: + config_args = yaml.safe_load(file) + config_args = [config_args] if type(config_args) != list else config_args + # multiple configs, create args list first + for config in config_args: + args_copy = argparse.Namespace(**vars(args)) + for key, value in config.items(): + setattr(args_copy, key, value) + args_list.append(args_copy) + else: + args_list.append(args) + + # initialize Accelerator + kwargs_handler = InitProcessGroupKwargs(timeout=datetime.timedelta(seconds=60000)) + accelerator = Accelerator(kwargs_handlers=[kwargs_handler]) + if accelerator.is_main_process: + is_main_process = True + else: + is_main_process = False + + for args in args_list: + try: + # if is_main_process and args.wandb_args: # thoughtfully we should only init wandb once, instead of multiple ranks to avoid network traffics and unwanted behaviors. + # wandb_logger = WandbLogger() + + results, samples = cli_evaluate_single(args) + results_list.append(results) + + accelerator.wait_for_everyone() + if is_main_process and args.wandb_args: + try: + wandb_logger.post_init(results) + wandb_logger.log_eval_result() + if args.wandb_log_samples and samples is not None: + wandb_logger.log_eval_samples(samples) + except Exception as e: + eval_logger.info(f"Logging to Weights and Biases failed due to {e}") + # wandb_logger.finish() + + except Exception as e: + if args.verbosity == "DEBUG": + raise e + else: + traceback.print_exc() + eval_logger.error(f"Error during evaluation: {e}. 
Please set `--verbosity=DEBUG` to get more information.") + results_list.append(None) + + for args, results in zip(args_list, results_list): + # cli_evaluate will return none if the process is not the main process (rank 0) + if results is not None: + print(f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, " f"batch_size: {args.batch_size}") + print(make_table(results)) + if "groups" in results: + print(make_table(results, "groups")) + + if args.wandb_args: + wandb_logger.run.finish() + + +def cli_evaluate_single(args: Union[argparse.Namespace, None] = None) -> None: + selected_task_list = args.tasks.split(",") if args.tasks else None + + if args.include_path is not None: + eval_logger.info(f"Including path: {args.include_path}") + task_manager = TaskManager(args.verbosity, include_path=args.include_path, model_name=args.model) + + # update the evaluation tracker args with the output path and the HF token + if args.output_path: + args.hf_hub_log_args += f",output_path={args.output_path}" + if os.environ.get("HF_TOKEN", None): + args.hf_hub_log_args += f",token={os.environ.get('HF_TOKEN')}" + + evaluation_tracker_args = simple_parse_args_string(args.hf_hub_log_args) + eval_logger.info(f"Evaluation tracker args: {evaluation_tracker_args}") + + evaluation_tracker = EvaluationTracker(**evaluation_tracker_args) + + if args.predict_only: + args.log_samples = True + if (args.log_samples or args.predict_only) and not args.output_path: + raise ValueError("Specify --output_path if providing --log_samples or --predict_only") + + if args.fewshot_as_multiturn and args.apply_chat_template is False: + raise ValueError("If fewshot_as_multiturn is set, apply_chat_template must be set to True.") + + if (args.num_fewshot is None or args.num_fewshot == 0) and args.fewshot_as_multiturn: + raise ValueError("If fewshot_as_multiturn is set, num_fewshot must be greater than 0.") + + if args.include_path is not None: + eval_logger.info(f"Including path: {args.include_path}") + + if "push_samples_to_hub" in evaluation_tracker_args and not args.log_samples: + eval_logger.warning("Pushing samples to the Hub requires --log_samples to be set. Samples will not be pushed to the Hub.") + + if args.limit: + eval_logger.warning(" --limit SHOULD ONLY BE USED FOR TESTING." "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.") + + if os.environ.get("LMMS_EVAL_PLUGINS", None): + args.include_path = [args.include_path] if args.include_path else [] + for plugin in os.environ["LMMS_EVAL_PLUGINS"].split(","): + package_tasks_location = importlib.util.find_spec(f"{plugin}.tasks").submodule_search_locations[0] + args.include_path.append(package_tasks_location) + + if args.tasks is None: + eval_logger.error("Need to specify task to evaluate.") + sys.exit() + elif args.tasks == "list": + eval_logger.info("Available Tasks:\n - {}".format(f"\n - ".join(sorted(task_manager.list_all_tasks())))) + sys.exit() + elif args.tasks == "list_groups": + eval_logger.info(task_manager.list_all_tasks(list_subtasks=False, list_tags=False)) + sys.exit() + elif args.tasks == "list_tags": + eval_logger.info(task_manager.list_all_tasks(list_groups=False, list_subtasks=False)) + sys.exit() + elif args.tasks == "list_subtasks": + eval_logger.info(task_manager.list_all_tasks(list_groups=False, list_tags=False)) + sys.exit() + elif args.tasks == "list_with_num": + log_message = ( + "\n" + "=" * 70 + "\n" + "\n\tYou are trying to check all the numbers in each task." 
+ "\n\tThis action will download the complete dataset." + "\n\tIf the results are not clear initially, call this again." + "\n\n" + "=" * 70 + ) + eval_logger.info(log_message) + for task_name in sorted(task_manager.list_all_tasks()): + try: + task_dict = get_task_dict([task_name], model_name="llava") + task_obj = task_dict[task_name] + if type(task_obj) == tuple: + group, task_obj = task_obj + if task_obj is None: + continue + eval_logger.info(f"\nTask : {task_obj.config.task}\n - #num : {len(task_obj.test_docs()) if task_obj.has_test_docs() else len(task_obj.validation_docs())}") + except Exception as e: + eval_logger.debug(f"\nTask : {task_name} fail to load \n Exception : \n {e}") + sys.exit() + else: + if os.path.isdir(args.tasks): + import glob + + task_names = [] + yaml_path = os.path.join(args.tasks, "*.yaml") + for yaml_file in glob.glob(yaml_path): + config = utils.load_yaml_config(yaml_file) + task_names.append(config) + else: + task_list = args.tasks.split(",") + task_names = task_manager.match_tasks(task_list) + for task in [task for task in task_list if task not in task_names]: + if os.path.isfile(task): + config = utils.load_yaml_config(task) + task_names.append(config) + task_missing = [task for task in task_list if task not in task_names and "*" not in task] # we don't want errors if a wildcard ("*") task name was used + + if task_missing: + missing = ", ".join(task_missing) + eval_logger.error( + f"Tasks were not found: {missing}\n" f"{utils.SPACING}Try `lmms-eval --tasks list` for list of available tasks", + ) + raise ValueError( + f"Tasks not found: {missing}. Try `lmms-eval --tasks {{list_groups,list_subtasks,list_tags,list}}` to list out all available names for task groupings; only (sub)tasks; tags; or all of the above, or pass '--verbosity DEBUG' to troubleshoot task registration issues." 
+ ) + + eval_logger.info(f"Selected Tasks: {task_names}") + request_caching_args = request_caching_arg_to_dict(cache_requests=args.cache_requests) + datetime_str = utils.get_datetime_str(timezone=args.timezone) + + results = evaluator.simple_evaluate( + model=args.model, + model_args=args.model_args, + tasks=task_names, + num_fewshot=args.num_fewshot, + batch_size=args.batch_size, + max_batch_size=args.max_batch_size, + device=args.device, + use_cache=args.use_cache, + limit=args.limit, + check_integrity=args.check_integrity, + write_out=args.write_out, + log_samples=args.log_samples, + evaluation_tracker=evaluation_tracker, + system_instruction=args.system_instruction, + apply_chat_template=args.apply_chat_template, + fewshot_as_multiturn=args.fewshot_as_multiturn, + gen_kwargs=args.gen_kwargs, + task_manager=task_manager, + verbosity=args.verbosity, + predict_only=args.predict_only, + random_seed=args.seed[0], + numpy_random_seed=args.seed[1], + torch_random_seed=args.seed[2], + fewshot_random_seed=args.seed[3], + cli_args=args, + datetime_str=datetime_str, + **request_caching_args, + ) + + if results is not None: + if args.log_samples: + samples = results.pop("samples") + else: + samples = None + dumped = json.dumps(results, indent=4, default=_handle_non_serializable) + if args.show_config: + print(dumped) + + batch_sizes = ",".join(map(str, results["config"]["batch_sizes"])) + + evaluation_tracker.save_results_aggregated(results=results, samples=samples if args.log_samples else None, datetime_str=datetime_str) + + if args.log_samples: + for task_name, config in results["configs"].items(): + evaluation_tracker.save_results_samples(task_name=task_name, samples=samples[task_name]) + + if evaluation_tracker.push_results_to_hub or evaluation_tracker.push_samples_to_hub: + evaluation_tracker.recreate_metadata_card() + + return results, samples + return None, None + + +def print_results(args, results): + print(f"{args.model} ({args.model_args}),\ngen_kwargs: ({args.gen_kwargs}),\nlimit: {args.limit},\nnum_fewshot: {args.num_fewshot},\nbatch_size: {args.batch_size}") + print(evaluator.make_table(results)) + if "groups" in results: + print(evaluator.make_table(results, "groups")) + + +if __name__ == "__main__": + cli_evaluate() diff --git a/lmms_eval/api/__init__.py b/lmms_eval/api/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/lmms_eval/api/filter.py b/lmms_eval/api/filter.py new file mode 100755 index 0000000..654ea63 --- /dev/null +++ b/lmms_eval/api/filter.py @@ -0,0 +1,54 @@ +from dataclasses import dataclass +from typing import List + +from datasets import Dataset + +from lmms_eval.api.instance import Instance + + +class Filter: + """ + Filter classes operate on a per-task level. + They take all model outputs (`instance.resps` for all `task.instances`) + across all instances of a task, and perform operations. + In a single run, one can configure any number of separate filters or lists of filters. + + """ + + def __init__(self, *args, **kwargs) -> None: + """ + Can define custom behavior here, if an individual instantiation of a Filter class should have state. + """ + + def apply(self, resps, docs): + """ + Defines the operation to perform on a list of the `inst.resps` properties of `Instance` objects. + Should return the list of (filtered) response lists *in the same order as they were input*, e.g. + if pass in [, ] should return + [, ] + """ + return resps + + +@dataclass +class FilterEnsemble: + """ + FilterEnsemble creates a pipeline applying multiple filters. 
+ Its intended usage is to stack multiple post-processing steps in order. + `task.apply_filters` should use a list of FilterEnsemble classes that it stores, to apply each + pipeline separately. + """ + + name: str + filters: List[Filter] + + def apply(self, instances: List[Instance], docs: List[Dataset]) -> None: + resps = [inst.resps for inst in instances] # operate just on the model responses + for f in self.filters: + # apply filters in sequence + resps = f.apply(resps, docs) + + # add the end results after filtering to filtered_requests of their respective source instances. + # has key `self.name`: each FilterEnsemble applied in a given run should use a different name. + for inst, resp in zip(instances, resps): + inst.filtered_resps[self.name] = resp diff --git a/lmms_eval/api/group.py b/lmms_eval/api/group.py new file mode 100644 index 0000000..c03f805 --- /dev/null +++ b/lmms_eval/api/group.py @@ -0,0 +1,104 @@ +import abc +from dataclasses import asdict, dataclass +from inspect import getsource +from typing import Any, Callable, List, Optional, Union + + +@dataclass +class AggMetricConfig(dict): + metric: Optional[str] = None + aggregation: Optional[str] = "mean" + weight_by_size: Optional[str] = False + # list of filter names which should be incorporated into the aggregated metric. + filter_list: Optional[Union[str, list]] = "none" + + def __post_init__(self): + if self.aggregation != "mean" and not callable(self.aggregation): + raise ValueError(f"Currently, 'mean' is the only pre-defined aggregation across groups' subtasks. Got '{self.aggregation}'.") + + if isinstance(self.filter_list, str): + self.filter_list = [self.filter_list] + + +@dataclass +class GroupConfig(dict): + group: Optional[str] = None + group_alias: Optional[str] = None + task: Optional[Union[str, list]] = None + aggregate_metric_list: Optional[Union[List[AggMetricConfig], AggMetricConfig, dict]] = None + metadata: Optional[dict] = None # by default, not used in the code. allows for users to pass arbitrary info to tasks + + def __getitem__(self, item): + return getattr(self, item) + + def __setitem__(self, item, value): + return setattr(self, item, value) + + def __post_init__(self): + if self.aggregate_metric_list is not None: + if isinstance(self.aggregate_metric_list, dict): + self.aggregate_metric_list = [self.aggregate_metric_list] + + self.aggregate_metric_list = [AggMetricConfig(**item) if isinstance(item, dict) else item for item in self.aggregate_metric_list] + + def to_dict(self, keep_callable: bool = False) -> dict: + """dumps the current config as a dictionary object, as a printable format. + null fields will not be printed. + Used for dumping results alongside full task configuration + + :return: dict + A printable dictionary version of the TaskConfig object. + + # TODO: should any default value in the TaskConfig not be printed? + """ + cfg_dict = asdict(self) + # remove values that are `None` + for k, v in list(cfg_dict.items()): + if callable(v): + cfg_dict[k] = self.serialize_function(v, keep_callable=keep_callable) + return cfg_dict + + def serialize_function(self, value: Union[Callable, str], keep_callable=False) -> Union[Callable, str]: + """Serializes a given function or string. + + If 'keep_callable' is True, the original callable is returned. + Otherwise, attempts to return the source code of the callable using 'getsource'. 
+ """ + if keep_callable: + return value + else: + try: + return getsource(value) + except (TypeError, OSError): + return str(value) + + +class ConfigurableGroup(abc.ABC): + def __init__( + self, + config: Optional[dict] = None, + ) -> None: + self._config = GroupConfig(**config) + + @property + def group(self): + return self._config.group + + @property + def group_alias(self): + return self._config.group_alias + + @property + def version(self): + return self._config.version + + @property + def config(self): + return self._config.to_dict() + + @property + def group_name(self) -> Any: + return self._config.group + + def __repr__(self): + return f"ConfigurableGroup(group={self.group}," f"group_alias={self.group_alias})" diff --git a/lmms_eval/api/instance.py b/lmms_eval/api/instance.py new file mode 100755 index 0000000..18cfb73 --- /dev/null +++ b/lmms_eval/api/instance.py @@ -0,0 +1,29 @@ +from dataclasses import dataclass, field +from typing import Literal, Tuple + + +@dataclass +class Instance: + request_type: Literal["loglikelihood", "generate_until", "generate_until_multi_round"] + arguments: tuple + idx: int + metadata: Tuple[str, int, int] = field(default_factory=lambda: (None, None, None)) # TODO: better typehints here + resps: list = field(default_factory=list) + filtered_resps: dict = field(default_factory=dict) + + # initialized after init + task_name: str = None + doc_id: str = None + repeats: str = None + doc: dict = None + + def __post_init__(self) -> None: + # unpack metadata field + self.task_name, self.doc_id, self.repeats = self.metadata["task"], self.metadata["doc_id"], self.metadata["repeats"] + + @property + def args(self): + """ + Returns (string,) where `string` is the string to calculate loglikelihood over + """ + return self.arguments if isinstance(self.arguments, tuple) else (self.arguments,) diff --git a/lmms_eval/api/metrics.py b/lmms_eval/api/metrics.py new file mode 100755 index 0000000..8a29833 --- /dev/null +++ b/lmms_eval/api/metrics.py @@ -0,0 +1,606 @@ +# the code is adapted from https://github.com/EleutherAI/lm-evaluation-harness +import logging +import math +import random +import re +import string +from collections.abc import Iterable +from typing import List + +import numpy as np +import sacrebleu + +from lmms_eval.api.registry import register_aggregation, register_metric + +eval_logger = logging.getLogger("lm-eval") + + +# Register Aggregations First +@register_aggregation("bypass") +def bypass_agg(arr): + return 999 + + +@register_aggregation("mean") +def mean(arr): + return sum(arr) / len(arr) + + +@register_aggregation("median") +def median(arr): + return arr[len(arr) // 2] + + +# Certain metrics must be calculated across all documents in a benchmark. +# We use them as aggregation metrics, paired with no-op passthrough metric fns. 
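+# Illustrative example: for per-token log-likelihoods items = [-1.0, -1.0],
+# perplexity(items) = exp(-mean(items)) = exp(1.0), i.e. roughly 2.718.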
+@register_aggregation("perplexity") +def perplexity(items): + return math.exp(-mean(items)) + + +@register_aggregation("weighted_perplexity") +def weighted_perplexity(items): + return math.exp(-weighted_mean(items)) + + +@register_aggregation("bits_per_byte") +def bits_per_byte(items): + return -weighted_mean(items) / math.log(2) + + +@register_aggregation("f1") +def f1_score(items): + from sklearn.metrics import f1_score + + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds) + + return np.max(fscore) + + +@register_aggregation("matthews_corrcoef") +def matthews_corrcoef(items): + from sklearn.metrics import matthews_corrcoef + + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + return matthews_corrcoef(golds, preds) + + +@register_aggregation("bleu") +def bleu(items): + """The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric + for evaluating a generated sentence to a reference sentence. It counts matching + n-grams in the candidate translation to n-grams in the reference text, where + 1-gram or unigram would be each token and a bigram comparison would be each + word pair. The comparison is made regardless of word order + Source: https://machinelearningmastery.com/calculate-bleu-score-for-text-python/ + Paper: https://www.aclweb.org/anthology/P02-1040/ + + Higher is better + """ + refs = list(zip(*items))[0] + preds = list(zip(*items))[1] + refs, preds = _sacreformat(refs, preds) + return sacrebleu.corpus_bleu(preds, refs).score + + +@register_aggregation("chrf") +def chrf(items): + """chrF++ is a tool for automatic evaluation of machine translation output + based on character n-gram precision and recall enhanced with word n-grams. 
+ Source: https://github.com/m-popovic/chrF + Paper: https://www.aclweb.org/anthology/W15-3049.pdf + + Higher is better # TODO I think + """ + refs = list(zip(*items))[0] + preds = list(zip(*items))[1] + refs, preds = _sacreformat(refs, preds) + return sacrebleu.corpus_chrf(preds, refs).score + + +@register_aggregation("ter") +def ter(items): + """Translation Error Rate is an error metric for machine translation that + measures the number of edits required to change a system output into one + of the references + Source: http://www.cs.umd.edu/~snover/tercom/ + Paper: http://mt-archive.info/AMTA-2006-Snover.pdf + + Lower is better + """ + refs = list(zip(*items))[0] + preds = list(zip(*items))[1] + refs, preds = _sacreformat(refs, preds) + return sacrebleu.corpus_ter(preds, refs).score + + +@register_aggregation("brier_score") +def brier_score(items): # This is a passthrough function + gold, predictions = list(zip(*items)) + bs, num_class = np.array(predictions).shape + + gold = list(gold) + gold_one_hot = np.eye(num_class)[gold] + return np.mean(np.sum((predictions - gold_one_hot) ** 2, axis=1)) + + +@register_metric( + metric="brier_score", + higher_is_better=False, + output_type=["multiple_choice"], + aggregation="brier_score", +) +def brier_score_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="acc", + higher_is_better=True, + output_type=["loglikelihood", "multiple_choice"], + aggregation="mean", +) +def acc_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="acc_norm", + higher_is_better=True, + output_type=["loglikelihood", "multiple_choice"], + aggregation="mean", +) +def acc_norm_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="acc_mutual_info", + higher_is_better=True, + output_type="multiple_choice", + aggregation="mean", +) +def acc_mutual_info_fn(items): # This is a passthrough function + return items + + +### the code used in the `exact_match_hf_evaluate` function is ported from +### https://github.com/huggingface/evaluate/blob/main/metrics/exact_match/exact_match.py +### which is under the apache license. + +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
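+# Illustrative usage: exact_match_hf_evaluate(predictions=["Paris"], references=["paris"], ignore_case=True)
+# returns {"exact_match": 1.0}, since predictions and references are compared element-wise after normalization.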
+def exact_match_hf_evaluate( + predictions, + references, + regexes_to_ignore=None, + ignore_case=False, + ignore_punctuation=False, + ignore_numbers=False, +): + if regexes_to_ignore is not None: + for s in regexes_to_ignore: + predictions = np.array([re.sub(s, "", x) for x in predictions]) + references = np.array([re.sub(s, "", x) for x in references]) + else: + predictions = np.asarray(predictions) + references = np.asarray(references) + + if ignore_case: + predictions = np.char.lower(predictions) + references = np.char.lower(references) + + if ignore_punctuation: + repl_table = string.punctuation.maketrans("", "", string.punctuation) + predictions = np.char.translate(predictions, table=repl_table) + references = np.char.translate(references, table=repl_table) + + if ignore_numbers: + repl_table = string.digits.maketrans("", "", string.digits) + predictions = np.char.translate(predictions, table=repl_table) + references = np.char.translate(references, table=repl_table) + + score_list = predictions == references + + return {"exact_match": np.mean(score_list)} + + +### + + +@register_metric( + metric="exact_match", + higher_is_better=True, + output_type="generate_until", + aggregation="mean", +) +def exact_match_fn(**kwargs): + return exact_match_hf_evaluate(**kwargs) + + +@register_metric( + metric="perplexity", + higher_is_better=False, + output_type="loglikelihood", + aggregation="perplexity", +) +def perplexity_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="word_perplexity", + higher_is_better=False, + output_type="loglikelihood_rolling", + aggregation="weighted_perplexity", +) +def word_perplexity_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="byte_perplexity", + higher_is_better=False, + output_type="loglikelihood_rolling", + aggregation="weighted_perplexity", +) +def byte_perplexity_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="bits_per_byte", + higher_is_better=False, + output_type="loglikelihood_rolling", + aggregation="bits_per_byte", +) +def bits_per_byte_fn(items): # This is a passthrough function + return items + + +def levenshtein_distance(s1, s2): + if len(s1) > len(s2): + s1, s2 = s2, s1 + + distances = range(len(s1) + 1) + for i2, c2 in enumerate(s2): + distances_ = [i2 + 1] + for i1, c1 in enumerate(s1): + if c1 == c2: + distances_.append(distances[i1]) + else: + distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1]))) + distances = distances_ + return distances[-1] + + +@register_metric( + metric="anls", + higher_is_better=True, + output_type="generate_until", + aggregation="mean", +) +def anls( + references, + predictions, + thresh_hold=0.5, +): # This is a passthrough function + """https://github.com/QwenLM/Qwen-VL/blob/master/eval_mm/infographicsvqa_eval.py""" + values = [] + for answer in references: + # preprocess both the answers - gt and prediction + gt_answer = " ".join(answer.strip().lower().split()) + det_answer = " ".join(predictions[0].strip().lower().split()) + + # dist = levenshtein_distance(answer.lower(), detObject['answer'].lower()) + dist = levenshtein_distance(gt_answer, det_answer) + length = max(len(answer.upper()), len(predictions[0].upper())) + values.append(0.0 if length == 0 else float(dist) / float(length)) + + question_result = 1 - min(values) + + if question_result < thresh_hold: + question_result = 0 + return {"anls": question_result} + + +def pop_stddev(arr): + mu = mean(arr) + return 
math.sqrt(sum([(x - mu) ** 2 for x in arr]) / len(arr)) + + +def sample_stddev(arr): + mu = mean(arr) + return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / (len(arr) - 1)) + + +def mean_stderr(arr): + return sample_stddev(arr) / math.sqrt(len(arr)) + + +@register_metric( + metric="bypass", + higher_is_better=True, + output_type=["loglikelihood", "multiple_choice", "generate_until", "generate_until_multi_round"], + aggregation="bypass", +) +def bypass(items): + return items + + +@register_metric( + metric="mcc", + higher_is_better=True, + output_type="multiple_choice", + aggregation="matthews_corrcoef", +) +def mcc_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="f1", + higher_is_better=True, + output_type="multiple_choice", + aggregation="f1", +) +def f1_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="bleu", + higher_is_better=True, + output_type=["generate_until", "generate_until_multi_round"], + aggregation="bleu", +) +def bleu_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="chrf", + higher_is_better=True, + output_type=["generate_until", "generate_until_multi_round"], + aggregation="chrf", +) +def chrf_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="ter", + higher_is_better=True, + output_type=["generate_until", "generate_until_multi_round"], + aggregation="ter", +) +def ter_fn(items): # This is a passthrough function + return items + + +@register_metric( + metric="acc_all", + higher_is_better=True, + output_type="loglikelihood", + aggregation="mean", +) +def acc_all(items): + # Only count as correct if all answers are labeled correctly for each question + question_scoring_dict = {} + preds = list(zip(*items))[0] + docs = list(zip(*items))[1] + + for doc, pred in zip(docs, preds): + paragraph_id = doc["idx"]["paragraph"] + question_id = doc["idx"]["question"] + if (paragraph_id, question_id) not in question_scoring_dict: + question_scoring_dict[(paragraph_id, question_id)] = [] + + gold_label = doc["label"] == 1 + + question_scoring_dict[(paragraph_id, question_id)].append(gold_label == pred) + acc = np.mean([int(all(x)) for x in question_scoring_dict.values()]) + return acc + + +def acc_all_stderr(items): + # Only count as correct if all answers are labeled correctly for each question + question_scoring_dict = {} + preds = list(zip(*items))[0] + docs = list(zip(*items))[1] + + for doc, pred in zip(docs, preds): + question_id = doc["idx"]["question"] + if question_id not in question_scoring_dict: + question_scoring_dict[question_id] = [] + + gold_label = doc["label"] == 1 + question_scoring_dict[question_id].append(gold_label == pred) + + acc = mean_stderr([int(all(x)) for x in question_scoring_dict.values()]) + return acc + + +def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): + """Compute max metric between prediction and each ground truth.""" + scores_for_ground_truths = [] + for ground_truth in ground_truths: + score = metric_fn(prediction, ground_truth) + scores_for_ground_truths.append(score) + return max(scores_for_ground_truths) + + +def weighted_mean(items): + a, b = zip(*items) + return sum(a) / sum(b) + + +def is_non_str_iterable(obj): + return isinstance(obj, Iterable) and not isinstance(obj, str) + + +def _sacreformat(refs, preds): + """Format refs and preds for sacrebleu corpus calculation. It is very particular""" + # Sacrebleu expects (List[str], List[List[str]) + # e.g. 
sacrebleu.corpus_bleu([pred_t], [[ref1_stream], [ref2_stream], ...]) + + # Note [ref1_stream] is the first reference for each pred. + # So lists are size N and (M, N) for N preds and M possible refs for each pred + # This is a different order of dimensions that I would expect + + # We expect refs to be List[str] or List[List[str]], the outer list corresponding to preds + # Must become List[List[str]] with the inner list corresponding to preds + if not is_non_str_iterable(refs): + refs = list(refs) + if not is_non_str_iterable(refs[0]): + refs = [[ref] for ref in refs] + refs = list(zip(*refs)) + # Note the number of refs in each ref list much match the number of preds + + # We expect preds to be List[str] or List[List[str]]. Must become List[str] + if not is_non_str_iterable(preds): + preds = list(preds) + if is_non_str_iterable(preds[0]): + assert len(preds[0]) == 1, f"Pred must be a str, was {preds[0]}" + preds = [pred[0] for pred in preds] + + return refs, preds + + +# stderr stuff + + +class _bootstrap_internal: + def __init__(self, f, n) -> None: + self.f = f + self.n = n + + def __call__(self, v): + i, xs = v + rnd = random.Random() + rnd.seed(i) + res = [] + for _ in range(self.n): + res.append(self.f(rnd.choices(xs, k=len(xs)))) + return res + + +def bootstrap_stderr(f, xs, iters): + import multiprocessing as mp + + pool = mp.Pool(mp.cpu_count()) + # this gives a biased estimate of the stderr (i.e w/ the mean, it gives something + # equivalent to stderr calculated without Bessel's correction in the stddev. + # Unfortunately, I haven't been able to figure out what the right correction is + # to make the bootstrap unbiased - i considered multiplying by sqrt(n/(n-1)) but + # that would be ad-hoc and I can't prove that that would actually be an unbiased estimator) + # Thankfully, shouldn't matter because our samples are pretty big usually anyways + res = [] + chunk_size = min(1000, iters) + from tqdm import tqdm + + print("bootstrapping for stddev:", f.__name__) + for bootstrap in tqdm( + pool.imap( + _bootstrap_internal(f, chunk_size), + [(i, xs) for i in range(iters // chunk_size)], + ), + total=iters // chunk_size, + ): + # sample w replacement + res.extend(bootstrap) + + pool.close() + return sample_stddev(res) + + +def stderr_for_metric(metric, bootstrap_iters: int): + if bootstrap_iters <= 0: + # return no function (don't compute stderr) if bootstrap iters = 0 + return None + + bootstrappable = [ + median, + matthews_corrcoef, + f1_score, + perplexity, + bleu, + chrf, + ter, + ] + + if metric in bootstrappable: + return lambda x: bootstrap_stderr(metric, x, iters=bootstrap_iters) + + stderr = {mean: mean_stderr, acc_all: acc_all_stderr} + + return stderr.get(metric, None) + + +def pooled_sample_stderr(stderrs: List[float], sizes: List[int]): + # Used to aggregate bootstrapped stderrs across subtasks in a group, + # when we are weighting by the size of each subtask. + # + + assert len(stderrs) == len(sizes) + + # formula source: https://en.wikipedia.org/wiki/Pooled_variance + # and: https://stats.stackexchange.com/a/4841331 + # this empirically seems to match running `stderr_for_metric` on all instances + # from the subtasks concatenated with each other. 
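+    # Since each stderr_i is s_i / sqrt(n_i), stderr_i**2 * n_i recovers the per-subtask
+    # sample variance s_i**2, so the expression below is the standard pooled-variance
+    # estimate sum((n_i - 1) * s_i**2) / (sum(n_i) - k); dividing by the total sample
+    # size before taking the square root yields the standard error of the overall mean.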
+ pooled_sample_var = (sum([(size - 1) * stderr**2 * size for size, stderr in zip(sizes, stderrs)])) / (sum(sizes) - len(sizes)) + + return np.sqrt(pooled_sample_var / sum(sizes)) + + +def combined_sample_stderr(stderrs: List[float], sizes: List[int], metrics=None): + assert metrics is not None, "Need to pass a list of each subtask's metric for this stderr aggregation" + assert len(stderrs) == len(sizes) and len(sizes) == len(metrics) + + # See https://github.com/EleutherAI/lm-evaluation-harness/pull/1390 for more documentation. + # This formula depends on sample means. + # removed because it seems to give erroneously huge stderrs for groupings of tasks + # and does not seem to match up with bootstrap-calculated stderrs for groups. + + ### don't use this unless a statistician has told you it's the right thing to do ### + + # accumulators: we'll aggregate pairwise N - 1 times + variance = stderrs[0] ** 2 + curr_size = sizes[0] + curr_score = metrics[0] + + for stderr, size, score in zip(stderrs[1:], sizes[1:], metrics[1:]): + curr_score = ((curr_score * curr_size) + (score * size)) / (curr_size + size) # NOTE: this assumes our aggregation fn is "mean" + + variance = ((curr_size - 1) * variance + (size - 1) * (stderr**2)) / (curr_size + size - 1) + curr_size * size / ((curr_size + size) * (curr_size + size - 1)) * (curr_score - score) ** 2 + + return np.sqrt(variance) + + +def aggregate_subtask_metrics(metrics, sizes, weight_by_size=True): + # A helper function that is used to aggregate + # subtask scores cross-task. + # TODO: does not hold for non-mean aggregations + if not weight_by_size: + sizes = [1] * len(sizes) + + assert len(metrics) == len(sizes) + + return sum([metric * size for metric, size in zip(metrics, sizes)]) / sum(sizes) diff --git a/lmms_eval/api/model.py b/lmms_eval/api/model.py new file mode 100755 index 0000000..d99ba6e --- /dev/null +++ b/lmms_eval/api/model.py @@ -0,0 +1,221 @@ +import abc +import hashlib +import json +import os +from typing import List, Optional, Tuple, Type, TypeVar, Union + +from loguru import logger as eval_logger +from sqlitedict import SqliteDict +from tqdm import tqdm + +from lmms_eval import utils +from lmms_eval.api.instance import Instance + +T = TypeVar("T", bound="lmms") + + +class lmms(abc.ABC): + def __init__(self) -> None: + """Defines the interface that should be implemented by all lmms subclasses. + lmmss are assumed to take image-text as input and yield strings as output + (inputs/outputs should be tokenization-agnostic.) + """ + # set rank and world size to a single process, by default. + self._rank = 0 + self._world_size = 1 + self.cache_hook = CacheHook(None) + self.task_dict = {} + + @abc.abstractmethod + def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: + """Compute log-likelihood of generating a continuation from a context. + Downstream tasks should attempt to use loglikelihood instead of other + LMM calls whenever possible. + + :param requests: list[Instance] + A list of Instance objects, with property `args` which returns a tuple (context, continuation). + `context: str` + Context string. Implementations of LMM must be able to handle an + empty context string. + `continuation: str` + The continuation over which log likelihood will be calculated. If + there is a word boundary, the space should be in the continuation. + For example, context="hello" continuation=" world" is correct. + 'visual_list: list[dict]' + Visual input to the model. Can be None. 
+ + :return: list[tuple[float, bool]] + A list of pairs (logprob, isgreedy) + `logprob: float` + The log probability of `continuation`. + `isgreedy`: + Whether `continuation` would be generated by greedy sampling from `context`. + """ + pass + + # TODO: Add an optional max length + @abc.abstractmethod + def generate_until(self, requests) -> List[str]: + """Generate greedily until a stopping sequence + + :param requests: list[Instance] + A list of Instance objects with property `args` which returns a tuple (context, until). + context: str + Context string + generation_kwargs: dict + Generation Kwargs + 'visual_list: list[dict]' + Visual input to the model. Can be None. + :return: list[str] + A list of strings continuation + continuation: str + The generated continuation. + """ + pass + + @abc.abstractmethod + def generate_until_multi_round(self, requests) -> List[str]: + """Generate greedily until a stopping sequence + + :param requests: list[Instance] + A list of Instance objects with property `args` which returns a tuple (context, until). + context: str + Context string + generation_kwargs: dict + Generation Kwargs + 'visual_list: list[dict]' + Visual input to the model. Can be None. + :return: list[str] + A list of strings continuation + continuation: str + The generated continuation. + """ + pass + + @classmethod + def create_from_arg_string(cls: Type[T], arg_string: str, additional_config: Optional[dict] = None) -> T: + """ + Creates an instance of the LMM class using the given argument string and additional config. + + Parameters: + - arg_string: A string containing arguments in the format key1=value1,key2=value2. + - additional_config: Optional dictionary containing additional configuration parameters. + + Returns: + - Instance of the LMM class. + """ + additional_config = {} if additional_config is None else additional_config + args = utils.simple_parse_args_string(arg_string) + args2 = {k: v for k, v in additional_config.items() if v is not None} + return cls(**args, **args2) + + @property + def rank(self): + # used in the case of parallelism. Hardcoded to + # ensure no errors arise using API models which do + # not support multi-device parallelism nor expect it. + return self._rank + + @property + def world_size(self): + # used in the case of parallelism. Hardcoded to + # ensure no errors arise using API models which do + # not support multi-device parallelism nor expect it. + return self._world_size + + def set_cache_hook(self, cache_hook) -> None: + self.cache_hook = cache_hook + + +### SQLite-based caching of LMM responses +def hash_args(attr, args): + dat = json.dumps([attr] + list(args)) + return hashlib.sha256(dat.encode("utf-8")).hexdigest() + + +class CacheHook: + def __init__(self, cachinglm) -> None: + if cachinglm is None: + self.dbdict = None + return + + self.dbdict = cachinglm.dbdict + + def add_partial(self, attr, req, res) -> None: + if self.dbdict is None: + return + hsh = hash_args(attr, req) + self.dbdict[hsh] = res + + +class CachingLMM: + def __init__(self, lm, cache_db) -> None: + """LMM wrapper that returns cached results if they exist, and uses the underlying LMM if not. 
+ + :param lm: LMM + Underlying LMM + :param cache_db: str + Path to cache db + """ + self.lm = lm + self.cache_db = cache_db + if os.path.dirname(cache_db): + os.makedirs(os.path.dirname(cache_db), exist_ok=True) + self.dbdict = SqliteDict(cache_db, autocommit=True) + + # add hook to lm + lm.set_cache_hook(self.get_cache_hook()) + + def __getattr__(self, attr): + lm_attr = getattr(self.lm, attr) + if not callable(lm_attr): + return lm_attr + + def fn(requests): + res = [] + remaining_reqs = [] + warned = False + # figure out which ones are cached and which ones are new + eval_logger.info(f"Loading '{attr}' responses from cache '{self.cache_db}' where possible...") + for req in tqdm(requests): + hsh = hash_args(attr, req.args) + if attr in ["generate_until", "generate_until_multi_round"] and req.args[1].get("do_sample", False): + # when we are doing non-greedy generation, don't use the cache + # (else every "randomly sampled" generation would be identical for repeats > 1). + if not warned: + eval_logger.warning(f"Arguments to lm.generate_until() '{req.args[1]}' include non-deterministic sampling. Caching will not be performed for such requests.") + warned = True + res.append(None) + remaining_reqs.append(req) + elif hsh in self.dbdict: + ob = self.dbdict[hsh] + + assert ob is not None + + res.append(ob) + else: + res.append(None) + remaining_reqs.append(req) + + # actually run the LMM on the requests that do not have cached results + rem_res = getattr(self.lm, attr)(remaining_reqs) + + # stick the new ones back into the list and also cache any of the new ones + resptr = 0 + for req, r in zip(remaining_reqs, rem_res): + while res[resptr] is not None: + resptr += 1 + + res[resptr] = r + + # caching + hsh = hash_args(attr, req.args) + self.dbdict[hsh] = r + self.dbdict.commit() + + return res + + return fn + + def get_cache_hook(self): + return CacheHook(self) diff --git a/lmms_eval/api/registry.py b/lmms_eval/api/registry.py new file mode 100755 index 0000000..b037ae8 --- /dev/null +++ b/lmms_eval/api/registry.py @@ -0,0 +1,158 @@ +from typing import Callable, Dict + +import evaluate as hf_evaluate +from loguru import logger as eval_logger + +from lmms_eval.api.model import lmms + +MODEL_REGISTRY = {} + + +def register_model(*names): + # either pass a list or a single alias. + # function receives them as a tuple of strings + + def decorate(cls): + for name in names: + assert issubclass(cls, lmms), f"Model '{name}' ({cls.__name__}) must extend lmms class" + + assert name not in MODEL_REGISTRY, f"Model named '{name}' conflicts with existing model! Please register with a non-conflicting alias instead." + + MODEL_REGISTRY[name] = cls + return cls + + return decorate + + +def get_model(model_name): + try: + return MODEL_REGISTRY[model_name] + except KeyError: + raise ValueError(f"Attempted to load model '{model_name}', but no model for this name found! Supported model names: {', '.join(MODEL_REGISTRY.keys())}") + + +TASK_REGISTRY = {} # Key: task name, Value: task ConfigurableTask class +GROUP_REGISTRY = {} # Key: group name, Value: list of task names or group names +TASK_INITIALIZED = False +ALL_TASKS = set() # Set of all task names and group names +func2task_index = {} # Key: task ConfigurableTask class, Value: task name + + +def register_task(name): + def decorate(fn): + assert name not in TASK_REGISTRY, f"task named '{name}' conflicts with existing registered task!" 
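Stepping back from the registry internals for a moment, here is a minimal sketch of how the `register_model` / `get_model` pair and the `lmms` interface above are intended to fit together. The `toy_echo` alias and stub class are hypothetical, purely for illustration; real wrappers live under `lmms_eval/models`.

```python
from typing import List, Tuple

from lmms_eval.api.instance import Instance
from lmms_eval.api.model import lmms
from lmms_eval.api.registry import get_model, register_model


@register_model("toy_echo")  # hypothetical alias
class ToyEcho(lmms):
    """Stub model: returns dummy loglikelihoods and echoes the context back."""

    def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
        # One (logprob, is_greedy) pair per request.
        return [(0.0, False) for _ in requests]

    def generate_until(self, requests) -> List[str]:
        # req.args[0] is the context string built by the task.
        return [req.args[0] for req in requests]

    def generate_until_multi_round(self, requests) -> List[str]:
        return self.generate_until(requests)


# The evaluator looks the wrapper up by alias; in practice it is then built
# from a "key1=value1,key2=value2" string via create_from_arg_string.
model_cls = get_model("toy_echo")
model = model_cls()
```

Note that `register_model` enforces both that the class subclasses `lmms` and that the alias is unused, so a typo in the model name surfaces as a clear `get_model` error rather than a silent fallback.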
+ + TASK_REGISTRY[name] = fn + ALL_TASKS.add(name) + func2task_index[fn.__name__] = name + return fn + + return decorate + + +def register_group(name): + def decorate(fn): + func_name = func2task_index[fn.__name__] + if name in GROUP_REGISTRY: + GROUP_REGISTRY[name].append(func_name) + else: + GROUP_REGISTRY[name] = [func_name] + ALL_TASKS.add(name) + return fn + + return decorate + + +OUTPUT_TYPE_REGISTRY = {} +METRIC_REGISTRY = {} +METRIC_AGGREGATION_REGISTRY = {} +AGGREGATION_REGISTRY = {} +HIGHER_IS_BETTER_REGISTRY = {} + +DEFAULT_METRIC_REGISTRY = { + "loglikelihood": [ + "perplexity", + "acc", + ], + "multiple_choice": ["acc", "acc_norm"], + "generate_until": ["exact_match"], + "generate_until_multi_round": ["exact_match"], +} + + +def register_metric(**args): + # TODO: do we want to enforce a certain interface to registered metrics? + def decorate(fn): + assert "metric" in args + name = args["metric"] + + for key, registry in [ + ("metric", METRIC_REGISTRY), + ("higher_is_better", HIGHER_IS_BETTER_REGISTRY), + ("aggregation", METRIC_AGGREGATION_REGISTRY), + ]: + if key in args: + value = args[key] + assert value not in registry, f"{key} named '{value}' conflicts with existing registered {key}!" + + if key == "metric": + registry[name] = fn + elif key == "aggregation": + registry[name] = AGGREGATION_REGISTRY[value] + else: + registry[name] = value + + return fn + + return decorate + + +def get_metric(name: str, hf_evaluate_metric=False) -> Callable: + if not hf_evaluate_metric: + if name in METRIC_REGISTRY: + return METRIC_REGISTRY[name] + else: + eval_logger.warning(f"Could not find registered metric '{name}' in lm-eval, searching in HF Evaluate library...") + + try: + metric_object = hf_evaluate.load(name) + return metric_object.compute + except Exception: + eval_logger.error( + f"{name} not found in the evaluate library! Please check https://huggingface.co/evaluate-metric", + ) + + +def register_aggregation(name): + def decorate(fn): + assert name not in AGGREGATION_REGISTRY, f"aggregation named '{name}' conflicts with existing registered aggregation!" + + AGGREGATION_REGISTRY[name] = fn + return fn + + return decorate + + +def get_aggregation(name): + try: + return AGGREGATION_REGISTRY[name] + except KeyError: + eval_logger.warning( + "{} not a registered aggregation metric!".format(name), + ) + + +def get_metric_aggregation(name): + try: + return METRIC_AGGREGATION_REGISTRY[name] + except KeyError: + eval_logger.warning( + "{} metric is not assigned a default aggregation!".format(name), + ) + + +def is_higher_better(metric_name): + try: + return HIGHER_IS_BETTER_REGISTRY[metric_name] + except KeyError: + eval_logger.warning(f"higher_is_better not specified for metric '{metric_name}'!") diff --git a/lmms_eval/api/samplers.py b/lmms_eval/api/samplers.py new file mode 100755 index 0000000..2cecfe2 --- /dev/null +++ b/lmms_eval/api/samplers.py @@ -0,0 +1,96 @@ +class ContextSampler: + def __init__(self, docs, task, fewshot_indices=None, rnd=None) -> None: + self.rnd = rnd + assert self.rnd, "must pass rnd to FewShotSampler!" 
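Returning to the registry above, a minimal hypothetical example of adding a new metric: `register_aggregation` must run before any `register_metric(aggregation=...)` that names it, because the aggregation is resolved through `AGGREGATION_REGISTRY` at decoration time. The `toy_mean` / `toy_score` names are invented for illustration.

```python
from lmms_eval.api.registry import register_aggregation, register_metric


@register_aggregation("toy_mean")  # hypothetical aggregation name
def toy_mean(items):
    return sum(items) / len(items)


@register_metric(
    metric="toy_score",            # hypothetical metric name
    higher_is_better=True,
    output_type="generate_until",
    aggregation="toy_mean",        # looked up in AGGREGATION_REGISTRY right here
)
def toy_score(items):  # passthrough, like the built-in bleu/chrf/ter metrics
    return items
```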
+ + self.task = task + self.config = task._config + + self.target_delimiter = self.config.target_delimiter + self.fewshot_delimiter = self.config.fewshot_delimiter + + self.doc_to_text = self.task.doc_to_text + self.doc_to_target = self.task.doc_to_target + self.doc_to_choice = self.task.doc_to_choice + + self.docs = docs # HF dataset split, provided by task._fewshot_docs() + if fewshot_indices: # subset few-shot docs from + self.docs = self.docs.select(fewshot_indices) + + def get_context(self, doc, num_fewshot): + # draw an extra fewshot sample if using same split as evaluating on + n_samples = num_fewshot + 1 if self.config.fewshot_split == self.config.test_split else num_fewshot + + # draw `n_samples` docs from fewshot_docs + fewshotex = self.sample(n_samples) + + # get rid of the doc that's the one we're evaluating, if it's in the fewshot + # TODO: should we just stop people from using fewshot from same split as evaluating? + selected_docs = [x for x in fewshotex if x != doc][:num_fewshot] + + labeled_examples = ( + self.fewshot_delimiter.join( + [ + # TODO: is separating doc_to_text and doc_to_target by one space always desired? + (self.doc_to_text(doc) if (self.config.doc_to_choice is None or type(self.doc_to_text(doc)) is str) else self.doc_to_choice(doc)[self.doc_to_text(doc)]) + + self.target_delimiter + + ( + str(self.doc_to_target(doc)[0]) + if type(self.doc_to_target(doc)) is list + else self.doc_to_target(doc) + if (self.config.doc_to_choice is None or type(self.doc_to_target(doc)) is str) + else str(self.doc_to_choice(doc)[self.doc_to_target(doc)]) + ) + for doc in selected_docs + ] + ) + + self.fewshot_delimiter + ) + + return labeled_examples + + def sample(self, n): + """ + Draw `n` samples from our fewshot docs. This method should be overridden by subclasses. + """ + + return self.rnd.sample(self.docs, n) + + +class FirstNSampler(ContextSampler): + def sample(self, n) -> None: + """ + Draw the first `n` samples in order from the specified split. + Used for tasks with "canonical" ordered fewshot examples, such as MMLU and CMMLU. + """ + assert n <= len(self.docs), f"Error: number of fewshot samples requested exceeds the {len(self.docs)} that are available." + return self.docs[:n] + + +class BalancedSampler(ContextSampler): + def sample(self, n) -> None: + """ + TODO: this should return approximately class-balanced samples from our fewshot examples. + TODO: what order should they be in? maybe random? + """ + + pass + + +class ManualSampler(ContextSampler): + def sample(self, n) -> None: + """ """ + pass + + +SAMPLER_REGISTRY = { + "default": ContextSampler, + "first_n": FirstNSampler, +} + + +def get_sampler(name): + try: + return SAMPLER_REGISTRY[name] + except KeyError: + raise ValueError(f"Attempted to use contextsampler '{name}', but no sampling strategy for this name found! 
Supported model names: {', '.join(SAMPLER_REGISTRY.keys())}") diff --git a/lmms_eval/api/task.py b/lmms_eval/api/task.py new file mode 100755 index 0000000..937b0fc --- /dev/null +++ b/lmms_eval/api/task.py @@ -0,0 +1,1578 @@ +import abc +import ast +import copy +import inspect +import itertools +import json +import os +import random +import re +import shutil +import subprocess +from collections.abc import Callable +from dataclasses import asdict, dataclass, field +from functools import partial +from glob import glob +from typing import ( + Any, + Dict, + Iterable, + Iterator, + List, + Literal, + Mapping, + Optional, + Tuple, + Union, +) + +import datasets +import numpy as np +from accelerate import Accelerator +from datasets import DownloadConfig, Image, Sequence +from huggingface_hub import snapshot_download +from loguru import logger as eval_logger +from PIL import ImageFile +from tenacity import retry, stop_after_attempt, stop_after_delay, wait_fixed +from tqdm import tqdm + +from lmms_eval import utils +from lmms_eval.api import samplers +from lmms_eval.api.instance import Instance +from lmms_eval.api.registry import ( + AGGREGATION_REGISTRY, + DEFAULT_METRIC_REGISTRY, + METRIC_REGISTRY, + OUTPUT_TYPE_REGISTRY, + get_aggregation, + get_metric, + get_metric_aggregation, + is_higher_better, +) +from lmms_eval.caching.cache import load_from_cache, save_to_cache +from lmms_eval.filters import build_filter_ensemble + +# HuggingfaceM4/NoCaps contains truncated image in test split +# Include this inside code block to avoid error +ImageFile.LOAD_TRUNCATED_IMAGES = True + +ALL_OUTPUT_TYPES = [ + "loglikelihood", + "multiple_choice", + "generate_until", + "generate_until_multi_round", +] + + +@dataclass +class TaskConfig(dict): + # task naming/registry + task: str = None + task_alias: str = None + tag: str = None + group: Union[str, list] = None + group_alias: Union[str, list] = None + # HF dataset options. + # which dataset to use, + # and what splits for what purpose + dataset_path: str = None + dataset_name: str = None + dataset_kwargs: dict = None + training_split: str = None + validation_split: str = None + test_split: str = None + fewshot_split: str = None # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaling (?) + full_docs: bool = False + # formatting / prompting options. + # see docs/advanced_task_guide.md for more info + process_results_use_image: bool = False + process_docs: Callable = None + doc_to_visual: Union[Callable, str] = None + doc_to_text: Union[Callable, str] = None + doc_to_target: Union[Callable, str] = None + doc_to_choice: Union[Callable, str, dict, list] = None + process_results: Union[Callable, str] = None + use_prompt: str = None + description: str = "" + target_delimiter: str = " " + fewshot_delimiter: str = "\n\n" + fewshot_config: dict = None + # runtime configuration options + num_fewshot: int = None + # scoring options + metric_list: list = None + output_type: str = "generate_until" + generation_kwargs: dict = None + repeats: int = 1 + filter_list: Union[str, list] = None + should_decontaminate: bool = False + doc_to_decontamination_query: str = None + + metadata: Union[str, list] = None # by default, not used in the code. 
allows for users to pass arbitrary info to tasks + + lmms_eval_specific_kwargs: dict = None + model_specific_generation_kwargs: dict = None + model_specific_target_kwargs: dict = None + + def __post_init__(self) -> None: + if self.dataset_path and os.path.exists(os.path.dirname(self.dataset_path)): + import inspect + from importlib import import_module + + # self.dataset_path = inspect.getfile(import_module(self.dataset_path)) + + if self.group is not None: + eval_logger.warning( + "A task YAML file was found to contain a `group` key. Groups which provide aggregate scores over several subtasks now require a separate config file--if not aggregating, you may want to use the `tag` config option instead within your config. Setting `group` within a TaskConfig will be deprecated in v0.4.4. Please see https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/task_guide.md for more information." + ) + + if self.tag is None: + self.tag = self.group + else: + raise ValueError("Got both a `group` and `tag` entry within a TaskConfig. Please use one or the other--`group` values will be deprecated in v0.4.4.") + + if self.generation_kwargs is not None: + if "generate_until" not in self.output_type: + eval_logger.warning(f"[{self.task}] passed `generation_kwargs`, but not using `output_type: generate_until`!") + assert "generate_until" not in self.output_type + + if "temperature" in self.generation_kwargs: + self.generation_kwargs["temperature"] = float(self.generation_kwargs["temperature"]) + + if "until" not in self.generation_kwargs: + self.generation_kwargs["until"] = [self.fewshot_delimiter] + else: + if "generate_until" in self.output_type: + # ensure that we greedily generate in absence of explicit arguments otherwise + self.generation_kwargs = { + "until": None if self.fewshot_delimiter is None else [self.fewshot_delimiter], + "do_sample": False, + } + + # TODO: how to make TaskConfigs be de- and re-serializable, even when using the !function constructor? + + def __getitem__(self, item): + return getattr(self, item) + + def __setitem__(self, item, value): + return setattr(self, item, value) + + def to_dict(self): + """dumps the current config as a dictionary object, as a printable format. + null fields will not be printed. + Used for dumping results alongside full task configuration + + :return: dict + A printable dictionary version of the TaskConfig object. + + # TODO: should any default value in the TaskConfig not be printed? + """ + cfg_dict = asdict(self) + # remove values that are `None` + for k, v in list(cfg_dict.items()): + if v is None: + cfg_dict.pop(k) + elif isinstance(v, Callable): + # TODO: this should handle Promptsource template objects as a separate case? + cfg_dict[k] = str(v) + return cfg_dict + + +class Task(abc.ABC): + """A task represents an entire benchmark including its dataset, problems, + answers, and evaluation methods. See BoolQ for a simple example implementation + + A `doc` can be any python object which represents one instance of evaluation. + This is usually a dictionary e.g. + {"question": ..., "answer": ...} or + {"question": ..., question, answer) + """ + + VERSION = None + + # The name of the `Task` benchmark as denoted in the HuggingFace datasets Hub + # or a path to a custom `datasets` loading script. + DATASET_PATH: str = None + + # The name of a subset within `DATASET_PATH`. 
+ DATASET_NAME: str = None + + OUTPUT_TYPE: str = None + + def __init__( + self, + data_dir=None, + cache_dir=None, + download_mode=None, + config=None, + ) -> None: + """ + :param data_dir: str + Stores the path to a local folder containing the `Task`'s data files. + Use this to specify the path to manually downloaded data (usually when + the dataset is not publicly accessible). + :param cache_dir: str + The directory to read/write the `Task` dataset. This follows the + HuggingFace `datasets` API with the default cache directory located at: + `~/.cache/huggingface/datasets` + NOTE: You can change the cache location globally for a given process + to another directory: + `export HF_DATASETS_CACHE="/path/to/another/directory"` + :param download_mode: datasets.DownloadMode + How to treat pre-existing `Task` downloads and data. + - `datasets.DownloadMode.REUSE_DATASET_IF_EXISTS` + Reuse download and reuse dataset. + - `datasets.DownloadMode.REUSE_CACHE_IF_EXISTS` + Reuse download with fresh dataset. + - `datasets.DownloadMode.FORCE_REDOWNLOAD` + Fresh download and fresh dataset. + """ + self.download(data_dir, cache_dir, download_mode) + self._training_docs = None + self._fewshot_docs = None + self._instances = None + + self._config = TaskConfig({**config}) if config else TaskConfig() + + self._filters = [build_filter_ensemble("none", [["take_first", None]])] + + def download(self, data_dir=None, cache_dir=None, download_mode=None) -> None: + """Downloads and returns the task dataset. + Override this method to download the dataset from a custom API. + + :param data_dir: str + Stores the path to a local folder containing the `Task`'s data files. + Use this to specify the path to manually downloaded data (usually when + the dataset is not publicly accessible). + :param cache_dir: str + The directory to read/write the `Task` dataset. This follows the + HuggingFace `datasets` API with the default cache directory located at: + `~/.cache/huggingface/datasets` + NOTE: You can change the cache location globally for a given process + by setting the shell environment variable, `HF_DATASETS_CACHE`, + to another directory: + `export HF_DATASETS_CACHE="/path/to/another/directory"` + :param download_mode: datasets.DownloadMode + How to treat pre-existing `Task` downloads and data. + - `datasets.DownloadMode.REUSE_DATASET_IF_EXISTS` + Reuse download and reuse dataset. + - `datasets.DownloadMode.REUSE_CACHE_IF_EXISTS` + Reuse download with fresh dataset. + - `datasets.DownloadMode.FORCE_REDOWNLOAD` + Fresh download and fresh dataset. + """ + self.dataset = datasets.load_dataset( + path=self.DATASET_PATH, + name=self.DATASET_NAME, + data_dir=data_dir, + cache_dir=cache_dir, + download_mode=download_mode, + ) + self.dataset_no_image = datasets.load_dataset( + path=self.DATASET_PATH, + name=self.DATASET_NAME, + data_dir=data_dir, + cache_dir=cache_dir, + download_mode=download_mode, + ) + for doc_name in self.dataset_no_image: + remove_cols = [] + features = self.dataset_no_image[doc_name].features + # If it is an Image instance or a Sequence of Image instance. 
Remove it + for feature in features: + if isinstance(features[feature], Image): + remove_cols.append(feature) + elif isinstance(features[feature], Sequence) and isinstance(features[feature].feature, Image): + remove_cols.append(feature) + for remove_col in remove_cols: + self.dataset_no_image[doc_name] = self.dataset_no_image[doc_name].remove_columns(remove_col) + + @property + def config(self): + """Returns the TaskConfig associated with this class.""" + return self._config + + @abc.abstractmethod + def has_training_docs(self): + """Whether the task has a training set""" + pass + + @abc.abstractmethod + def has_validation_docs(self): + """Whether the task has a validation set""" + pass + + @abc.abstractmethod + def has_test_docs(self): + """Whether the task has a test set""" + pass + + def training_docs(self): + """ + :return: Iterable[obj] + A iterable of any object, that doc_to_text can handle + """ + return [] + + def validation_docs(self): + """ + :return: Iterable[obj] + A iterable of any object, that doc_to_text can handle + """ + return [] + + def test_docs(self): + """ + :return: Iterable[obj] + A iterable of any object, that doc_to_text can handle + """ + return [] + + def fewshot_docs(self): + """ + :return: Iterable[obj] + A iterable of any object, that doc_to_text can handle + """ + if self.has_training_docs(): + return self.training_docs() + elif self.has_validation_docs(): + return self.validation_docs() + else: + if self.config.num_fewshot is not None: + eval_logger.warning("has_training_docs and has_validation_docs are False" ", using test_docs as fewshot_docs but this is not recommended.") + return self.test_docs() + + def _process_doc(self, doc): + """ + Override this to process (detokenize, strip, replace, etc.) individual + documents. This can be used in a map over documents of a data split. + E.g. `map(self._process_doc, self.dataset["validation"])` + + :return: dict + The processed version of the specified `doc`. + """ + return doc + + @property + def instances(self): + """After calling `task.build_all_requests()`, tasks + maintain a list of the dataset instances which will be evaluated. + """ + return self._instances + + def fewshot_examples(self, k, rnd): + if self._training_docs is None: + self._training_docs = list(self.training_docs()) + + return rnd.sample(self._training_docs, k) + + def doc_to_decontamination_query(self, doc) -> None: + print("Override doc_to_decontamination_query with document specific decontamination query.") + assert False + + @abc.abstractmethod + def doc_to_text(self, doc): + pass + + @abc.abstractmethod + def doc_to_target(self, doc): + pass + + # @profile + def build_all_requests( + self, + *, + limit: Union[int, None] = None, + rank: int = 0, + world_size: int = 1, + cache_requests: bool = False, + rewrite_requests_cache: bool = False, + system_instruction: Optional[str] = None, + apply_chat_template: bool = False, + fewshot_as_multiturn: bool = False, + chat_template: Optional[Callable] = None, + tokenizer_name: str = "", + ) -> None: + """Build a set of Instances for a task, and store them in task.instances""" + if self.has_test_docs(): + docs = self.test_docs() + split = self.config.test_split + elif self.has_validation_docs(): + docs = self.validation_docs() + split = self.config.validation_split + else: + assert False, f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!" 
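As a concrete reference for the `TaskConfig` dataclass defined earlier, here is a small hypothetical configuration of the kind a task YAML is parsed into. The task name, dataset id, and column names are invented for illustration.

```python
from lmms_eval.api.task import TaskConfig

toy_config = TaskConfig(
    task="toy_vqa",                    # hypothetical task name
    dataset_path="my-org/toy-vqa",     # hypothetical HF dataset id
    test_split="test",
    output_type="generate_until",
    doc_to_visual="image",             # column holding the image(s)
    doc_to_text="question",
    doc_to_target="answer",
    generation_kwargs={"max_new_tokens": 32, "do_sample": False},
    metric_list=[{"metric": "exact_match", "aggregation": "mean", "higher_is_better": True}],
)

# to_dict() drops None fields and stringifies callables; this is the form in
# which the config is dumped alongside results.
print(toy_config.to_dict())
```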
+ + # used with caching + og_limit = limit + + cache_key = f"requests-{self._config.task}-{self.config.num_fewshot}shot-rank{rank}-world_size{world_size}" + cache_key += "-chat_template" if apply_chat_template else "" + cache_key += "-fewshot_as_multiturn" if fewshot_as_multiturn else "" + cache_key += f"-system_prompt_hash{utils.hash_string(system_instruction)}" if system_instruction is not None else "" + cache_key += f"-tokenizer{tokenizer_name}" + + cached_instances = load_from_cache(file_name=cache_key) + + if cache_requests and cached_instances and not rewrite_requests_cache: + cached_instances = cached_instances[:limit] + + flattened_instances = [instance for instance_group in cached_instances for instance in instance_group] + + self._instances = flattened_instances + return + + eval_logger.info(f"Building contexts for {self.config.task} on rank {rank}...") + + instances = [] + + # process all documents when caching is specified for simplicity + if cache_requests and (not cached_instances or rewrite_requests_cache) and limit is not None: + limit = None + + doc_id_docs = list(self.doc_iterator(rank=rank, limit=limit, world_size=world_size)) + + num_docs = len(doc_id_docs) + + for doc_id, doc in tqdm( + doc_id_docs, + total=num_docs, + ): + # sample fewshot context #TODO: need to offset doc_id by rank now! + fewshot_ctx = self.fewshot_context( + doc, + 0 if self.config.num_fewshot is None else self.config.num_fewshot, + system_instruction, + apply_chat_template, + fewshot_as_multiturn, + chat_template, + ) + + # TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute + per_task_metadata = {"task": self.config["task"], "doc_id": doc_id, "repeats": self.config.repeats, "split": split} + if self.config.metadata and type(self.config.metadata) == dict: # TODO: temporary fix for metadata loading, ignore the list of dict type. + per_task_metadata.update(self.config.metadata) + + inst = self.construct_requests(doc_id=doc_id, ctx=fewshot_ctx, metadata=per_task_metadata) + + if not isinstance(inst, list): + inst = [inst] + + instances.append(inst) + + # now flatten, this is to allow slicing to work with pickles + + sliced_instances = instances[:og_limit] + + flattened_instances = [instance for instance_group in sliced_instances for instance in instance_group] + + self._instances = flattened_instances + + if len(self._instances) == 0: + raise ValueError("task.build_requests() did not find any docs!") + + if cache_requests and (not cached_instances or rewrite_requests_cache): + save_to_cache(file_name=cache_key, obj=instances) + + # FIXME: Bo - We need to check if the doc_to_visual if it's exists and restore it. If we use cache, the doc_to_visual will be None since it's not serializable + for instance in self._instances: + if instance.arguments[2] is None: + arguments = (instance.arguments[0], instance.arguments[1], self.doc_to_visual, *instance.arguments[3:]) + else: + arguments = instance.arguments + + instance.arguments = arguments + + @abc.abstractmethod + def construct_requests(self, doc_id, ctx, **kwargs): + """Uses RequestFactory to construct Requests and returns an iterable of + Requests which will be sent to the LMM. + + :param doc_id: int + The index of a document within `self.test_docs()` or `self.validation_docs()`, + whichever is the main split used. + :param ctx: str + The context string, generated by fewshot_context. 
This includes the natural + language description, as well as the few shot examples, and the question + part of the document for `doc`. + :param repeats: int + TODO: update this docstring + The number of times each instance in a dataset is inferred on. Defaults to 1, + can be increased for techniques like majority voting. + """ + pass + + @abc.abstractmethod + def process_results(self, doc, results): + """Take a single document and the LMM results and evaluates, returning a + dict where keys are the names of submetrics and values are the values of + the metric for that one document + + :param doc: + The document as returned from training_docs, validation_docs, or test_docs. + :param results: + The results of the requests created in construct_requests. + """ + pass + + @abc.abstractmethod + def aggregation(self): + """ + :returns: {str: [metric_score] -> float} + A dictionary where keys are the names of submetrics and values are + functions that aggregate a list of metric scores + """ + pass + + @abc.abstractmethod + def higher_is_better(self): + """ + :returns: {str: bool} + A dictionary where keys are the names of submetrics and values are + whether a higher value of the submetric is better + """ + pass + + @classmethod + def count_bytes(cls, doc): + """Used for byte-level perplexity metrics in rolling loglikelihood""" + return len(doc.encode("utf-8")) + + @utils.positional_deprecated + def fewshot_context( + self, + doc_id, + num_fewshot, + split, + rnd=random.Random(1234), + description=None, + ): + """Returns a fewshot context string that is made up of a prepended description + (if provided), the `num_fewshot` number of examples, and an appended prompt example. + + :param doc_id: int + The document id as returned from training_docs, validation_docs, or test_docs. + :param num_fewshot: int + The number of fewshot examples to provide in the returned context string. + :param split: str + The split of the document to retrieve from the dataset + :param rnd: random.Random + The pseudo-random number generator used to randomly sample examples. + WARNING: This is currently a required arg although it's optionalized with a default `None`. + :param description: str + The task's description that will be prepended to the fewshot examples. + :returns: str + The fewshot context. 
+ """ + assert rnd is not None, "A `random.Random` generator argument must be provided to `rnd`" + + description = description if description else "" + doc = self.dataset_no_image[split][doc_id] + + if num_fewshot == 0: + labeled_examples = "" + else: + # for sets with no training docs, draw from other set *but ensure no overlap with current doc* + if self.has_training_docs(): + fewshotex = self.fewshot_examples(k=num_fewshot, rnd=rnd) + else: + if self._fewshot_docs is None: + self._fewshot_docs = list(self.validation_docs() if self.has_validation_docs() else self.test_docs()) + + fewshotex = rnd.sample(self._fewshot_docs, num_fewshot + 1) + + # get rid of the doc that's the one we're evaluating, if it's in the fewshot + fewshotex = [x for x in fewshotex if x != doc][:num_fewshot] + + labeled_examples = "\n\n".join([self.doc_to_text(doc) + self.doc_to_target(doc) for doc in fewshotex]) + "\n\n" + + example = self.doc_to_text(doc) + return description + labeled_examples + example + + def apply_filters(self): + if hasattr(self, "_filters"): + for f in self._filters: + f.apply(self._instances, None) + else: + eval_logger.warning("No filter defined, passing through instances") + return self._instances + + def dump_config(self) -> dict: + """Returns a dictionary representing the task's config. + + :returns: str + The fewshot context. + """ + # TODO: this should only return the overrides applied to a non-YAML task's configuration. + # (num_fewshot) + return self.config.to_dict() + + def set_config(self, key: str, value: Any, update: bool = False) -> None: + """Set or update the configuration for a given key.""" + if key is None: + raise ValueError("Key must be provided.") + + if update: + current_value = getattr(self._config, key, {}) + if not isinstance(current_value, dict): + raise TypeError(f"Expected a dict for key '{key}', got {type(current_value).__name__} instead.") + current_value.update(value) + else: + setattr(self._config, key, value) + + def override_metric(self, metric_name: str) -> None: + """ + Override the default metrics used for evaluation with custom metrics. + + Parameters: + - metric_name (str): The name of the custom metric to override. Should be registered in api.metrics. 
+ """ + ( + self._metric_fn_list, + self._aggregation_list, + self._metric_fn_kwargs, + self._higher_is_better, + ) = ({}, {}, {}, {}) + self._metric_fn_list[metric_name] = get_metric(metric_name) + self._aggregation_list[metric_name] = get_metric_aggregation(metric_name) + self._higher_is_better[metric_name] = is_higher_better(metric_name) + self._metric_fn_kwargs[metric_name] = {} + if not isinstance(self, ConfigurableTask): + self.process_results = lambda x, y: {metric_name: get_metric(metric_name)} + self.aggregation = lambda: {metric_name: get_metric_aggregation(metric_name)} + setattr(self._config, "metric_list", [{"metric": metric_name}]) + setattr(self._config, "process_results", None) + + def set_fewshot_seed(self, seed: Optional[int] = None) -> None: + self.fewshot_rnd = random.Random(seed) + if hasattr(self, "sampler"): + self.sampler.rnd = self.fewshot_rnd + + @property + def eval_docs(self) -> Union[datasets.Dataset, List[dict]]: + if self.has_test_docs(): + return self.test_docs() + elif self.has_validation_docs(): + return self.validation_docs() + else: + raise ValueError(f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!") + + def doc_iterator(self, *, rank: int = 0, limit: Union[int, None] = None, world_size: int = 1) -> Iterator[Tuple[int, Any]]: + limit = int(limit) if limit else None + doc_iterator = utils.create_iterator( + enumerate(self.eval_docs), + rank=int(rank), + limit=limit, + world_size=int(world_size), + ) + return doc_iterator + + +class ConfigurableTask(Task): + VERSION = "Yaml" + OUTPUT_TYPE = None + CONFIG = None + + def __init__( + self, + data_dir=None, + cache_dir=None, + download_mode=None, + config: Optional[dict] = None, + model_name: Optional[str] = None, + ) -> None: # TODO no super() call here + # Get pre-configured attributes + self._config = self.CONFIG + + # Use new configurations if there was no preconfiguration + if self.config is None: + self._config = TaskConfig(**config) + # Overwrite configs + else: + if config is not None: + self._config.__dict__.update(config) + + if self.config is None: + raise ValueError("Must pass a config to ConfigurableTask, either in cls.CONFIG or `config` kwarg") + + if isinstance(self.config.metadata, dict): + if "version" in self.config.metadata: + self.VERSION = self.config.metadata["version"] + + self.model_name = model_name + self._prepare_model_specific_config() + + if self.config.output_type is not None: + if self.config.output_type not in ALL_OUTPUT_TYPES: + raise ValueError(f"Got invalid output_type '{self.config.output_type}', must be in '{','.join(ALL_OUTPUT_TYPES)}'") + self.OUTPUT_TYPE = self.config.output_type + + if self.config.dataset_path is not None: + self.DATASET_PATH = self.config.dataset_path + + if self.config.dataset_name is not None: + self.DATASET_NAME = self.config.dataset_name + + self._prepare_metric_and_aggregation() + + self.download(self.config.dataset_kwargs) + self._training_docs = None + self._fewshot_docs = None + + if self.config.filter_list is not None: + self._filters = [] + for filter_config in self.config.filter_list: + for filter_pipeline in filter_config: + filter_name = filter_config["name"] + filter_functions = filter_config["filter"] + components = [] + for function in filter_functions: + kwargs = {key: function[key] for key in function if key != "function"} + components.append([function["function"], kwargs]) + filter_pipeline = build_filter_ensemble(filter_name, components) + self._filters.append(filter_pipeline) + 
else: + self._filters = [build_filter_ensemble("none", [["take_first", None]])] + if self.config.fewshot_config is not None: + self.sampler = samplers.get_sampler(self.config.fewshot_config.get("sampler", "default") if self.config.fewshot_config else "default")(list(self.fewshot_docs()), self, rnd=random.Random(1234)) + + if self.has_test_docs(): + self.task_docs = self.test_docs() + elif self.has_validation_docs(): + self.task_docs = self.validation_docs() + else: + assert False, f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!" + + # Test One Doc + self.features = list(self.task_docs.features.keys()) + self.multiple_input = 0 + self.multiple_target = 0 + test_doc = self.task_docs[0] + test_text = self.doc_to_text(test_doc) + test_target = self.doc_to_target(test_doc) + + if self.config.doc_to_choice is not None: + test_choice = self.doc_to_choice(test_doc) + if type(test_choice) is not list: + eval_logger.error("doc_to_choice must return list") + else: + num_choice = len(test_choice) + + if type(test_text) is int: + self.multiple_input = num_choice + else: + test_choice = None + + if type(test_target) is list: + self.multiple_target = len(test_target) + else: + if (type(test_target) is int) and (test_choice is not None): + test_target = test_choice[test_target] + else: + test_target = str(test_target) + + if test_choice is not None: + check_choices = test_choice + else: + check_choices = [test_target] + if self.config.doc_to_choice is not None: + for choice in check_choices: + choice_has_whitespace = True if choice[0].isspace() else False + delimiter_has_whitespace = True if self.config.target_delimiter.rstrip() != self.config.target_delimiter else False + + if delimiter_has_whitespace and choice_has_whitespace: + eval_logger.warning(f'Both target_delimiter and target choice: "{choice}" have whitespace') + elif (not delimiter_has_whitespace) and (not choice_has_whitespace): + eval_logger.warning(f'Both target_delimiter "{self.config.target_delimiter}" and target choice: "{choice}" do not have whitespace, ignore if the language you are evaluating on does not require/use whitespace') + + def _prepare_model_specific_config(self): + self.lmms_eval_specific_kwargs = self.config.lmms_eval_specific_kwargs + if self.lmms_eval_specific_kwargs is not None: + if self.model_name in self.lmms_eval_specific_kwargs: + self.lmms_eval_specific_kwargs = self.lmms_eval_specific_kwargs[self.model_name] + elif "default" in self.lmms_eval_specific_kwargs: + self.lmms_eval_specific_kwargs.update(self.lmms_eval_specific_kwargs.get("default", {})) + elif "dataset" in self.lmms_eval_specific_kwargs: + self.lmms_eval_specific_kwargs.update(self.lmms_eval_specific_kwargs.get("dataset", {})) + + self.model_specific_target_kwargs = self.config.model_specific_target_kwargs + if self.model_specific_target_kwargs is not None: + if self.model_name in self.model_specific_target_kwargs: + self.model_specific_target_kwargs = self.model_specific_target_kwargs[self.model_name] + else: + self.model_specific_target_kwargs = self.model_specific_target_kwargs.get("default", None) + self.model_specific_generation_kwargs = self.config.model_specific_generation_kwargs + if self.model_specific_generation_kwargs is not None: + if self.model_name in self.model_specific_generation_kwargs: + self.model_specific_generation_kwargs = self.model_specific_generation_kwargs[self.model_name] + else: + self.model_specific_generation_kwargs = self.model_specific_generation_kwargs.get("default", {}) 
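To make the lookups in `_prepare_model_specific_config` above concrete, here is a hedged illustration of the override dictionaries it expects: keyed by model name, with a `"default"` fallback. The model names and values are invented.

```python
# Resolution order used above for the model_specific_*_kwargs dictionaries:
# exact model name first, then "default". Names and values are illustrative.
model_specific_generation_kwargs = {
    "default": {"max_new_tokens": 32},
    "toy_model": {"max_new_tokens": 64},
}


def resolve(overrides: dict, model_name: str) -> dict:
    """Mimic the fallback used in _prepare_model_specific_config."""
    if model_name in overrides:
        return overrides[model_name]
    return overrides.get("default", {})


assert resolve(model_specific_generation_kwargs, "toy_model") == {"max_new_tokens": 64}
assert resolve(model_specific_generation_kwargs, "another_model") == {"max_new_tokens": 32}
```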
+ + self.config.generation_kwargs.update(self.model_specific_generation_kwargs) + + def _prepare_metric_and_aggregation(self): + self._metric_fn_list = {} + self._metric_fn_kwargs = {} + self._aggregation_list = {} + self._higher_is_better = {} + + if self.config.metric_list is None: + # TODO: handle this in TaskConfig.__post_init__ ? + _metric_list = DEFAULT_METRIC_REGISTRY[self.config.output_type] + + for metric_name in _metric_list: + self._metric_fn_list[metric_name] = METRIC_REGISTRY[metric_name] + self._metric_fn_kwargs[metric_name] = {} + self._aggregation_list[metric_name] = get_metric_aggregation(metric_name) + self._higher_is_better[metric_name] = is_higher_better(metric_name) + else: + for metric_config in self.config.metric_list: + assert "metric" in metric_config + metric_name = metric_config["metric"] + kwargs = {key: metric_config[key] for key in metric_config if key not in ["metric", "aggregation", "higher_is_better"]} + + if self.config.process_results is not None: + self._metric_fn_list[metric_name] = None + self._metric_fn_kwargs[metric_name] = {} + elif callable(metric_name): + metric_fn = metric_name.__call__ + metric_name = metric_name.__name__ + self._metric_fn_list[metric_name] = metric_fn + self._metric_fn_kwargs[metric_name] = kwargs + else: + self._metric_fn_list[metric_name] = METRIC_REGISTRY[metric_name] + self._metric_fn_kwargs[metric_name] = kwargs + + if "aggregation" in metric_config: + agg_name = metric_config["aggregation"] + if type(agg_name) == str: + self._aggregation_list[metric_name] = get_aggregation(agg_name) + elif callable(agg_name): + self._aggregation_list[metric_name] = metric_config["aggregation"] + else: + INV_AGG_REGISTRY = {v: k for k, v in AGGREGATION_REGISTRY.items()} + metric_agg = get_metric_aggregation(metric_name) + eval_logger.warning(f"[Task: {self._config.task}] metric {metric_name} is defined, but aggregation is not. " f"using default " f"aggregation={INV_AGG_REGISTRY[metric_agg]}") + self._aggregation_list[metric_name] = metric_agg + + if "higher_is_better" in metric_config: + self._higher_is_better[metric_name] = metric_config["higher_is_better"] + else: + eval_logger.warning(f"[Task: {self._config.task}] metric {metric_name} is defined, but higher_is_better is not. 
" f"using default " f"higher_is_better={is_higher_better(metric_name)}") + self._higher_is_better[metric_name] = is_higher_better(metric_name) + + @retry(stop=(stop_after_attempt(5) | stop_after_delay(60)), wait=wait_fixed(2)) + def download(self, dataset_kwargs=None) -> None: + # If the dataset is a video dataset, + # Recursively search whether their is a zip and unzip it to the huggingface home + download_config = DownloadConfig() + download_config.max_retries = dataset_kwargs.get("max_retries", 10) if dataset_kwargs is not None else 10 + download_config.num_proc = dataset_kwargs.get("num_proc", 8) if dataset_kwargs is not None else 8 + download_config.local_files_only = dataset_kwargs.get("local_files_only", False) if dataset_kwargs is not None else False + if dataset_kwargs is not None: + if "From_YouTube" in dataset_kwargs: + + def _download_from_youtube(path): + try: + for video in tqdm(self.all_dataset[split]): + video_id = video["videoID"] + target_path = os.path.join(path, f"{video_id}.mp4") + assert shutil.which("yt-dlp") is not None, "yt-dlp must be installed and available in the system's PATH" + command = f"yt-dlp -o {target_path} -f mp4 https://www.youtube.com/watch?v={video_id}" + subprocess.run(command, shell=True) + with open(os.path.join(cache_path, f"{task}_download_status.json"), "w") as f: + f.write(json.dumps({task: "downloaded"})) + except Exception as e: + eval_logger.error(f"Error while downloading {task} data: {e}") + with open(os.path.join(cache_path, f"{task}_download_status.json"), "w") as f: + f.write(json.dumps({task: "not downloaded"})) + + hf_home = os.getenv("HF_HOME", "~/.cache/huggingface/") + accelerator = Accelerator() + if accelerator.is_main_process: + dataset_kwargs.pop("From_YouTube") + assert "load_from_disk" not in dataset_kwargs, "load_from_disk must not be True when From_YouTube is True" + self.all_dataset = datasets.load_dataset( + path=self.DATASET_PATH, + name=self.DATASET_NAME, + download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, + **dataset_kwargs if dataset_kwargs is not None else {}, + ) + dataset_kwargs["From_YouTube"] = True + cache_path = snapshot_download(repo_id=self.DATASET_PATH, repo_type="dataset") # download_parquet + split = vars(self.config)["test_split"] + task = vars(self.config)["task"] + + video_path = os.path.join(hf_home, task) + if os.path.exists(os.path.join(cache_path, f"{task}_download_status.json")): + download_status = json.load(open(os.path.join(cache_path, f"{task}_download_status.json"), "r")) + if download_status[task] == "downloaded": + eval_logger.info(f"Data for {task} already download!") + else: + eval_logger.info(f"Start downloading YouTube data to {video_path}...") + _download_from_youtube(video_path) + else: + eval_logger.info(f"Start downloading YouTube data to {video_path}...") + _download_from_youtube(video_path) + + accelerator.wait_for_everyone() + if "builder_script" in dataset_kwargs: + builder_script = dataset_kwargs["builder_script"] + self.DATASET_PATH = os.path.join(cache_path, builder_script) + dataset_kwargs.pop("builder_script") + + downloaded_video_ids = [i.split(".mp4")[0] for i in os.listdir(os.path.expanduser(video_path)) if i.endswith(".mp4")] + # Filtered the existing dataset with the downloaded video ids + self.dataset = datasets.DatasetDict({split: self.all_dataset[split].filter(lambda x: x["videoID"] in downloaded_video_ids)}) + + self.dataset_no_image = self.dataset + dataset_kwargs.pop("From_YouTube") + return + + if "video" in dataset_kwargs and dataset_kwargs["video"]: 
+ hf_home = os.getenv("HF_HOME", "~/.cache/huggingface/") + hf_home = os.path.expanduser(hf_home) + cache_dir = dataset_kwargs["cache_dir"] + cache_dir = os.path.join(hf_home, cache_dir) + accelerator = Accelerator() + if accelerator.is_main_process: + force_download = dataset_kwargs.get("force_download", False) + force_unzip = dataset_kwargs.get("force_unzip", False) + revision = dataset_kwargs.get("revision", "main") + create_link = dataset_kwargs.get("create_link", False) + cache_path = snapshot_download(repo_id=self.DATASET_PATH, revision=revision, repo_type="dataset", force_download=force_download, etag_timeout=60) + zip_files = glob(os.path.join(cache_path, "**/*.zip"), recursive=True) + tar_files = glob(os.path.join(cache_path, "**/*.tar*"), recursive=True) + + def unzip_video_data(zip_file): + import os + import zipfile + + with zipfile.ZipFile(zip_file, "r") as zip_ref: + for file_info in zip_ref.infolist(): + target_path = os.path.join(cache_dir, file_info.filename) + if not os.path.exists(target_path): + zip_ref.extract(file_info, cache_dir) + else: + eval_logger.info(f"Skipping existing file: {target_path}") + + eval_logger.info(f"Extracted all files from {zip_file} to {cache_dir}") + + def untar_video_data(tar_file): + import tarfile + + with tarfile.open(tar_file, "r") as tar_ref: + tar_ref.extractall(cache_dir) + eval_logger.info(f"Extracted all files from {tar_file} to {cache_dir}") + + def concat_tar_parts(tar_parts, output_tar): + with open(output_tar, "wb") as out_tar: + from tqdm import tqdm + + for part in tqdm(sorted(tar_parts)): + with open(part, "rb") as part_file: + out_tar.write(part_file.read()) + eval_logger.info(f"Concatenated parts {tar_parts} into {output_tar}") + + # Unzip zip files if needed + if force_unzip or (not os.path.exists(cache_dir) and len(zip_files) > 0): + for zip_file in zip_files: + unzip_video_data(zip_file) + + # Concatenate and extract tar files if needed + if force_unzip or (not os.path.exists(cache_dir) and len(tar_files) > 0): + tar_parts_dict = {} + + # Group tar parts together + for tar_file in tar_files: + base_name = tar_file.split(".tar")[0] + if base_name not in tar_parts_dict: + tar_parts_dict[base_name] = [] + tar_parts_dict[base_name].append(tar_file) + + # Concatenate and untar split parts + for base_name, parts in tar_parts_dict.items(): + eval_logger.info(f"Extracting following tar files: {parts}") + output_tar = base_name + ".tar" + if not os.path.exists(output_tar): + eval_logger.info(f"Start concatenating tar files") + + concat_tar_parts(parts, output_tar) + eval_logger.info(f"Finish concatenating tar files") + + if not os.path.exists(os.path.join(cache_dir, os.path.basename(base_name))): + untar_video_data(output_tar) + + # Link cache_path to cache_dir if needed. 
+ if create_link: + if not os.path.exists(cache_dir) or os.path.islink(cache_dir): + if os.path.islink(cache_dir): + os.remove(cache_dir) + eval_logger.info(f"Removed existing symbolic link: {cache_dir}") + # Create a new symbolic link + os.symlink(cache_path, cache_dir) + eval_logger.info(f"Symbolic link created successfully: {cache_path} -> {cache_dir}") + + accelerator.wait_for_everyone() + dataset_kwargs.pop("cache_dir") + dataset_kwargs.pop("video") + + if "builder_script" in dataset_kwargs: + builder_script = dataset_kwargs["builder_script"] + self.DATASET_PATH = os.path.join(cache_path, builder_script) + dataset_kwargs.pop("builder_script") + + if "force_download" in dataset_kwargs: + dataset_kwargs.pop("force_download") + + if "force_unzip" in dataset_kwargs: + dataset_kwargs.pop("force_unzip") + + if "local_files_only" in dataset_kwargs: + dataset_kwargs.pop("local_files_only") + + if "create_link" in dataset_kwargs: + dataset_kwargs.pop("create_link") + + if dataset_kwargs is not None and "load_from_disk" in dataset_kwargs and dataset_kwargs["load_from_disk"]: + dataset_kwargs.pop("load_from_disk") + # using local task in offline environment, need to process the online dataset into local format via + # `ds = load_datasets("lmms-lab/MMMU")` + self.dataset = datasets.load_from_disk(path=self.DATASET_PATH, name=self.DATASET_NAME) + else: + self.dataset = datasets.load_dataset( + path=self.DATASET_PATH, + name=self.DATASET_NAME, + download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, + download_config=download_config, + **dataset_kwargs if dataset_kwargs is not None else {}, + ) + + if self.config.process_docs is not None: + for split in self.dataset: + if split in [self.config.training_split, self.config.validation_split, self.config.test_split, self.config.fewshot_split]: + self.dataset[split] = self.config.process_docs(self.dataset[split]) + + # copy dataset, remove image features + self.dataset_no_image = self.dataset.copy() + for doc_name in self.dataset_no_image: + remove_cols = [] + features = self.dataset_no_image[doc_name].features + # If it is an Image instance or a Sequence of Image instance. 
Remove it + for feature in features: + if isinstance(features[feature], Image): + remove_cols.append(feature) + elif isinstance(features[feature], Sequence) and isinstance(features[feature].feature, Image): + remove_cols.append(feature) + for remove_col in remove_cols: + self.dataset_no_image[doc_name] = self.dataset_no_image[doc_name].remove_columns(remove_col) + + def has_training_docs(self) -> bool: + if self.config.training_split is not None: + return True + else: + return False + + def has_validation_docs(self) -> bool: + if self.config.validation_split is not None: + return True + else: + return False + + def has_test_docs(self) -> bool: + if self.config.test_split is not None: + return True + else: + return False + + def training_docs(self) -> datasets.Dataset: + if self.has_training_docs(): + return self.dataset[self.config.training_split] + + def validation_docs(self) -> datasets.Dataset: + if self.has_validation_docs(): + return self.dataset[self.config.validation_split] + + def test_docs(self) -> datasets.Dataset: + if self.has_test_docs(): + return self.dataset[self.config.test_split] + + def fewshot_docs(self): + if self.config.fewshot_split is not None: + return self.dataset[self.config.fewshot_split] + else: + if (self.config.num_fewshot is not None) and (self.config.num_fewshot > 0): + eval_logger.warning(f"Task '{self.config.task}': " "num_fewshot > 0 but fewshot_split is None. " "using preconfigured rule.") + return super().fewshot_docs() + + @utils.positional_deprecated + def fewshot_context( + self, + doc: str, + num_fewshot: int, + system_instruction: Optional[str] = None, + apply_chat_template: bool = False, + fewshot_as_multiturn: bool = False, + chat_template: Optional[Callable] = None, + ) -> str: + """Returns a fewshot context string that is made up of a prepended description + (if provided), the `num_fewshot` number of examples, and an appended prompt example. + + :param doc: str + The document as returned from training_docs, validation_docs, or test_docs. + :param num_fewshot: int + The number of fewshot examples to provide in the returned context string. + :param system_instruction: str + System instruction to be applied to the prompt. + :param apply_chat_template: bool + Whether to apply the chat template to the fewshot context. + :param fewshot_as_multiturn: bool + Whether to provide the fewshot examples as a multiturn conversation or a single user turn. + :param chat_template: + callable (from lm.apply_chat_template) that takes in a list[Dict] chat transcript and renders it into a string. + :returns: str + The fewshot context. 
+ """ + + if apply_chat_template: + labeled_examples = [] + else: + labeled_examples = "" + + # get task description + if description := self.config.description: + description = utils.apply_template(self.config.description, doc) + + # create system prompt based on the provided system instruction and description + if system_instruction is not None and description: + system_prompt = f"{system_instruction}{self.sampler.fewshot_delimiter}{description}" + elif system_instruction is not None: + system_prompt = system_instruction + elif description: + system_prompt = description + else: + system_prompt = "" + + # add system prompt if specified + if system_prompt: + if apply_chat_template: + labeled_examples.append({"role": "system", "content": system_prompt}) + else: + labeled_examples = system_prompt + + # if few-shot - append examples after the system prompt + if num_fewshot > 0: + if apply_chat_template: + labeled_examples.extend(self.sampler.get_chat_context(doc, num_fewshot, fewshot_as_multiturn)) + else: + labeled_examples += self.sampler.get_context(doc, num_fewshot) + + example = self.doc_to_text(doc) + if apply_chat_template: + if self.multiple_input: + return chat_template(labeled_examples) + if isinstance(example, str): + self.append_target_question(labeled_examples, example, fewshot_as_multiturn) + # for loglikelihood create a list of questions with appended choices + elif isinstance(example, list): + labeled_examples_list = [] + # copy chat history for each example and append the answer + for ex in example: + chat = deepcopy(labeled_examples) + self.append_target_question(chat, ex, fewshot_as_multiturn) + labeled_examples_list.append(chat_template(chat)) + return labeled_examples_list + # if example is an integer, append the choice or convert to string + elif isinstance(example, int): + if self.config.doc_to_choice is not None: + choices = self.doc_to_choice(doc) + self.append_target_question(labeled_examples, choices[example], fewshot_as_multiturn) + else: + self.append_target_question(labeled_examples, str(example), fewshot_as_multiturn) + # return lm.apply_chat_template(labeled_examples) + return chat_template(labeled_examples) + else: + if self.multiple_input: + return labeled_examples + if isinstance(example, str): + return labeled_examples + example + elif isinstance(example, list): + return [labeled_examples + ex for ex in example] + elif isinstance(example, int): + if self.config.doc_to_choice is not None: + choices = self.doc_to_choice(doc) + return labeled_examples + choices[example] + else: + return labeled_examples + str(example) + + def apply_filters(self): + if hasattr(self, "_filters"): + for f in self._filters: + f.apply(self._instances, self.task_docs) + else: + eval_logger.warning("No filter defined, passing through instances") + return self._instances + + def should_decontaminate(self): + return self.config.should_decontaminate + + def doc_to_decontamination_query(self, doc): + if self.config.should_decontaminate: + if self.config.doc_to_decontamination_query is None: + return self.doc_to_text(doc) + else: + doc_to_decontamination_query = self.config.doc_to_decontamination_query + if doc_to_decontamination_query in self.features: + return doc[doc_to_decontamination_query] + elif callable(doc_to_decontamination_query): + return doc_to_decontamination_query(doc) + else: + return ast.literal_eval(utils.apply_template(self.config.doc_to_decontamination_query, doc)) + + def _process_doc(self, doc): + """ + Override this to process (detokenize, strip, replace, etc.) 
individual + documents. This can be used in a map over documents of a data split. + E.g. `map(self._process_doc, self.dataset["validation"])` + + :return: dict + The processed version of the specified `doc`. + """ + return doc + + def doc_to_text(self, doc): + doc_to_text = self.config.doc_to_text + + if type(doc_to_text) == int: + return doc_to_text + elif type(doc_to_text) == str: + if doc_to_text in self.features: + # if self.config.doc_to_choice is not None: + # return self.doc_to_choice(doc)[doc[doc_to_text]] + # else: + return doc[doc_to_text] + else: + text_string = utils.apply_template(doc_to_text, doc) + if text_string.isdigit() and self._config.doc_to_choice is not None: + return ast.literal_eval(text_string) + else: + return text_string + elif callable(doc_to_text): + return ( + doc_to_text(doc, self.lmms_eval_specific_kwargs) + if self.lmms_eval_specific_kwargs is not None + else doc_to_text( + doc, + ) + ) + # Used when applying a Promptsource template + elif hasattr(doc_to_text, "apply"): + applied_prompt = doc_to_text.apply(doc) + if len(applied_prompt) == 2: + return applied_prompt[0] + else: + eval_logger.warning("Applied prompt returns empty string") + return self.config.fewshot_delimiter + else: + print(type(doc_to_text)) + raise TypeError + + def doc_to_target(self, doc: dict) -> Union[int, str, list]: + doc_to_target = self.config.doc_to_target + + if type(doc_to_target) == int: + return doc_to_target + elif type(doc_to_target) == str: + if doc_to_target in self.features: + # if self.config.doc_to_choice is not None: + # return self.doc_to_choice(doc)[doc[doc_to_target]] + # else: + return doc[doc_to_target] + else: + target_string = utils.apply_template(doc_to_target, doc) + if target_string.isdigit() and self._config.doc_to_choice is not None: + return ast.literal_eval(target_string) + elif len(target_string) >= 2 and (target_string[0] == "[") and (target_string[-1] == "]"): + try: + return ast.literal_eval(target_string) + except (SyntaxError, ValueError): + return target_string + else: + return target_string + elif type(doc_to_target) == list: + return doc_to_target + elif callable(doc_to_target): + return doc_to_target(doc, self.model_specific_target_kwargs) if self.model_specific_target_kwargs is not None else doc_to_target(doc) + # Used when applying a Promptsource template + elif hasattr(doc_to_target, "apply"): + applied_prompt = doc_to_target.apply(doc) + if len(applied_prompt) == 2: + return applied_prompt[1] + else: + eval_logger.warning("Applied prompt returns empty string") + return self.config.fewshot_delimiter + else: + raise TypeError + + def doc_to_visual(self, doc: dict) -> Union[int, str, list]: + self.config.doc_to_visual + if type(self.config.doc_to_visual) == str: + assert self.config.doc_to_visual in self.features + # Single image. Still return a list for consistency. + return [doc[self.config.doc_to_visual]] + elif callable(self.config.doc_to_visual): + return ( + self.config.doc_to_visual(doc, self.lmms_eval_specific_kwargs) + if self.lmms_eval_specific_kwargs is not None and len(inspect.signature(self.config.doc_to_visual).parameters) == 2 + else self.config.doc_to_visual( + doc, + ) + ) + else: + # eval_logger.warning("Note that doc_to_visual was called but not set in config. 
Please check if this is a text-only task.") + return self.config.doc_to_visual + + def doc_to_choice(self, doc: Any) -> List[str]: + if self.config.doc_to_choice is None: + eval_logger.error("Note that doc_to_choice was called but not set in config.") + else: + doc_to_choice = self.config.doc_to_choice + + if type(doc_to_choice) == str: + if doc_to_choice in self.features: + return doc[doc_to_choice] + else: + return ast.literal_eval(utils.apply_template(doc_to_choice, doc)) + elif type(doc_to_choice) == list: + return doc_to_choice + elif type(doc_to_choice) == dict: + return list(doc_to_choice.values()) + elif callable(doc_to_choice): + return doc_to_choice(doc) + elif hasattr(doc_to_choice, "get_answer_choices_list"): + return doc_to_choice.get_answer_choices_list(doc) + else: + raise TypeError + + def construct_requests(self, doc_id: int, ctx: str, **kwargs) -> Union[List[Instance], Instance]: + split = kwargs.get("metadata").get("split") + # kwargs.pop("split") + if self.OUTPUT_TYPE == "loglikelihood": + arguments = (ctx, self.doc_to_target, self.doc_to_visual, doc_id, self.config.task, split) + elif self.OUTPUT_TYPE == "multiple_choice": + doc = self.dataset[split][doc_id] + choices = self.doc_to_choice(doc) + target_delimiter = self.config.target_delimiter + if self.multiple_input: + # If there are multiple inputs, choices are placed in the ctx + cont = self.doc_to_target(doc) + arguments = [(ctx, f"{target_delimiter}{cont}", self.doc_to_visual, doc_id, self.config.task, split) for ctx in choices] + else: + # Otherwise they are placed in the continuation + arguments = [(ctx, f"{target_delimiter}{cont}", self.doc_to_visual, doc_id, self.config.task, split) for cont in choices] + request_list = [ + Instance( + request_type="loglikelihood", + # doc=doc, + arguments=arg, + idx=i, + **kwargs, + ) + for i, arg in enumerate(arguments) + ] + # TODO: we should raise a warning telling users this will at most ~2x runtime. + if "acc_mutual_info" in self._metric_fn_list.keys(): + # if we are calculating multiple choice accuracy + # using mutual information instead of raw loglikelihood as metric, need unconditional lls. + + # here mutual info refers to calculating + # log(P(choice|ctx) / P(choice)) = log(P(choice|ctx)) - log(P(choice)) + # in other words normalizing by subtracting the unconditional logprob of each choice. + request_list.extend( + [ + Instance( + request_type="loglikelihood", + # doc=doc, + arguments=("", "{}".format(choice)), + idx=i, + **kwargs, + ) + for i, choice in enumerate(choices) + ] + ) + return request_list + + elif self.OUTPUT_TYPE == "generate_until": + arguments = (ctx, copy.deepcopy(self.config.generation_kwargs), self.doc_to_visual, doc_id, self.config.task, split) + elif self.OUTPUT_TYPE == "generate_until_multi_round": + arguments = (ctx, copy.deepcopy(self.config.generation_kwargs), self.doc_to_visual, partial(self.config.doc_to_text, lmms_eval_specific_kwargs=self.lmms_eval_specific_kwargs), doc_id, self.config.task, split) + return Instance(request_type=self.OUTPUT_TYPE, arguments=arguments, idx=0, **kwargs) + + # TODO: we add a full_docs interface here for some evaluations that needs to access the full datasets during process_results function. we may have better ways to handle this. 
+ @retry(stop=(stop_after_attempt(5) | stop_after_delay(1200)), wait=wait_fixed(2)) + def process_results(self, doc, results, full_docs=None): + if self.OUTPUT_TYPE == "generate_until": + results[0] = results[0].strip() + + kwargs = {} + if full_docs is not None: + kwargs["full_docs"] = full_docs + if callable(self.config.process_results): + return self.config.process_results(doc, results, **kwargs) + + result_dict = {} + use_metric = list(self._metric_fn_list.keys()) + if self.OUTPUT_TYPE == "loglikelihood": + results = results[0] + ll, is_greedy = results + return { + **({"perplexity": ll} if "perplexity" in use_metric else {}), + **({"acc": int(is_greedy)} if "acc" in use_metric else {}), + } + elif self.OUTPUT_TYPE == "multiple_choice": + lls, is_greedy = zip(*results) + + # retrieve choices in List[str] form, to compute choice lengths, etc. + choices = self.doc_to_choice(doc) + completion_len = np.array([float(len(i)) for i in choices]) + + if 2 * len(choices) == len(lls) and "acc_mutual_info" in self._metric_fn_list.keys(): + # then we are doing mutual info. + # this stores the "dryrun" / unconditional answer loglikelihoods + lls_unconditional = lls[1::2] + assert len(lls_unconditional) == len(choices) + # and this stores our "regular" conditional loglikelihoods + lls = lls[::2] + + # Warning : + # Here may be different from original lm-eval + # since we return the actual loss in many model loglikelihood + # we just use the argmin here + pred = np.argmin(lls) + pred_norm = np.argmin(lls / completion_len) + + if self.multiple_input: + gold = self.doc_to_text(doc) + else: + gold = self.doc_to_target(doc) + + gold_index_error = False + if type(gold) is list: + gold = [i if i < len(choices) else -100 for i in gold] + if -100 in gold: + gold_index_error = True + else: + if type(gold) is int: + gold = gold if gold < len(choices) else -100 + elif type(gold) is str: + gold = choices.index(gold) if gold in choices else -100 + + if gold == -100: + gold_index_error = True + + if gold_index_error: + eval_logger.warning(f"Label index was not in within range of available choices," f"Sample:\n\n{doc}\n\n") + + if self.multiple_target: + acc = 1.0 if pred in gold else 0.0 + acc_norm = 1.0 if pred_norm in gold else 0.0 + exact_match = int(any([is_greedy[i] if i != -100 else 0 for i in gold])) + else: + acc = 1.0 if pred == gold else 0.0 + acc_norm = 1.0 if pred_norm == gold else 0.0 + # TODO: this gets score of 0 on arc_challenge for pythia-70m. need to test that this works properly + exact_match = int(is_greedy[gold]) if gold != -100 else 0 + + result_dict = { + **({"acc": acc} if "acc" in use_metric else {}), + **({"f1": (gold, pred)} if "f1" in use_metric else {}), + **({"mcc": (gold, pred)} if "mcc" in use_metric else {}), + **({"acc_norm": acc_norm} if "acc_norm" in use_metric else {}), + **({"exact_match": exact_match} if "exact_match" in use_metric else {}), + } + + if "acc_mutual_info" in use_metric: + lls_mutual_info = [ll_c - ll_u for ll_c, ll_u in zip(lls, lls_unconditional)] + acc_mutual_info = 1.0 if np.argmax(lls_mutual_info) == gold else 0.0 + result_dict["acc_mutual_info"] = acc_mutual_info + + elif "generate_until" in self.OUTPUT_TYPE: + gold = self.doc_to_target(doc) + result = results[0] + if self.config.doc_to_choice is not None: + # If you set doc_to_choice, + # it assumes that doc_to_target returns a number. + choices = self.doc_to_choice(doc) + gold = choices[gold] + # we expect multiple_targets to be a list. 
+ elif self.multiple_target: + gold = list(gold) + elif type(gold) != type(result): + # cast gold to the same type as result + gold = type(result)(gold) + + for metric in self._metric_fn_list.keys(): + if self.multiple_target and metric != "anls": + # in the case where we have multiple targets, + # return true if any are true + # TODO: this may break for multipLe_target, non zero-or-1 metrics + scores = [] + if not isinstance(gold, list): + # sometimes, a multiple_target dataset has exceptions where one doc has only one string answer + # print(gold) + gold = [gold] + for gold_option in gold: + try: + result_score = self._metric_fn_list[metric]( + references=[gold_option], + predictions=[result], + **self._metric_fn_kwargs[metric], + ) + except TypeError: # TODO: this is hacky and I don't want to do it + result_score = self._metric_fn_list[metric]([gold_option, result]) + if isinstance(result_score, dict): + # TODO: this handles the case where HF evaluate returns a dict. + result_score = result_score[metric] + scores.append(result_score) + if any(scores): + result_score = 1.0 + else: + result_score = 0.0 + else: + if not isinstance(gold, list): + gold = [gold] + try: + result_score = self._metric_fn_list[metric]( + references=gold, + predictions=[result], + **self._metric_fn_kwargs[metric], + ) + except TypeError: # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics + result_score = self._metric_fn_list[metric]([gold, result]) + if isinstance(result_score, dict): + # TODO: this handles the case where HF evaluate returns a dict. + result_score = result_score[metric] + result_dict[metric] = result_score + else: + raise ValueError( + f"Passed invalid output_type '{self.OUTPUT_TYPE}' ! Please use one of ", + "'loglikelihood','generate_until', 'generate_until_multi_round', or 'multiple_choice'", + ) + + return result_dict + + def aggregation(self): + return self._aggregation_list + + def higher_is_better(self): + return self._higher_is_better + + def get_config(self, key: str) -> Any: + return getattr(self._config, key, None) + + @property + def task_name(self) -> Any: + return getattr(self.config, "task", None) + + def __repr__(self): + return f"ConfigurableTask(task_name={getattr(self.config, 'task', None)}," f"output_type={self.OUTPUT_TYPE}," f"num_fewshot={getattr(self.config, 'num_fewshot', None)}," f"num_samples={len(self.eval_docs)})" diff --git a/lmms_eval/caching/__init__.py b/lmms_eval/caching/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/lmms_eval/caching/cache.py b/lmms_eval/caching/cache.py new file mode 100644 index 0000000..7bc3791 --- /dev/null +++ b/lmms_eval/caching/cache.py @@ -0,0 +1,66 @@ +import hashlib +import os + +import dill + +from lmms_eval.loggers.utils import _handle_non_serializable +from lmms_eval.utils import eval_logger + +MODULE_DIR = os.path.dirname(os.path.realpath(__file__)) + +OVERRIDE_PATH = os.getenv("LM_HARNESS_CACHE_PATH") + + +PATH = OVERRIDE_PATH if OVERRIDE_PATH else f"{MODULE_DIR}/.cache" + +# This should be sufficient for uniqueness +HASH_INPUT = "EleutherAI-lm-evaluation-harness" + +HASH_PREFIX = hashlib.sha256(HASH_INPUT.encode("utf-8")).hexdigest() + +FILE_SUFFIX = f".{HASH_PREFIX}.pickle" + + +def load_from_cache(file_name): + try: + path = f"{PATH}/{file_name}{FILE_SUFFIX}" + + with open(path, "rb") as file: + cached_task_dict = dill.loads(file.read()) + return cached_task_dict + + except Exception: + eval_logger.debug(f"{file_name} is not cached, generating...") + pass + 
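As a rough usage sketch of the caching pattern in this module (`load_from_cache` above, `save_to_cache` and `delete_cache` below): cache files are named `<key>.<sha256 prefix>.pickle` and serialized with `dill`. The `CACHE_DIR`, `file_name`, and payload below are hypothetical; the real helpers resolve the directory from `LM_HARNESS_CACHE_PATH` or a `.cache` folder next to the module.

```python
import hashlib
import os
import tempfile

import dill  # same serializer the cache helpers use

# Reproduce the naming scheme shown above: fixed sha256 prefix + ".pickle" suffix.
HASH_PREFIX = hashlib.sha256("EleutherAI-lm-evaluation-harness".encode("utf-8")).hexdigest()
FILE_SUFFIX = f".{HASH_PREFIX}.pickle"

# Hypothetical cache location and key, for illustration only.
CACHE_DIR = tempfile.mkdtemp()
file_name = "demo_requests"
path = os.path.join(CACHE_DIR, f"{file_name}{FILE_SUFFIX}")

# Round-trip an object the way save_to_cache / load_from_cache do.
payload = [["request-0", "request-1"], ["request-2"]]
with open(path, "wb") as f:
    f.write(dill.dumps(payload))
with open(path, "rb") as f:
    restored = dill.loads(f.read())

assert restored == payload
print(path)  # e.g. /tmp/tmpXXXX/demo_requests.<sha256>.pickle
```

Note that the real `save_to_cache` additionally nulls out any callable entries in `Instance.arguments` (such as `doc_to_visual`) before pickling, since bound methods are not reliably serializable.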
+ +def save_to_cache(file_name, obj): + if not os.path.exists(PATH): + os.mkdir(PATH) + + file_path = f"{PATH}/{file_name}{FILE_SUFFIX}" + + serializable_obj = [] + + for item in obj: + sub_serializable_obj = [] + for subitem in item: + if hasattr(subitem, "arguments"): # we need to handle the arguments specially since doc_to_visual is callable method and not serializable + serializable_arguments = tuple(arg if not callable(arg) else None for arg in subitem.arguments) + subitem.arguments = serializable_arguments + sub_serializable_obj.append(_handle_non_serializable(subitem)) + serializable_obj.append(sub_serializable_obj) + + eval_logger.debug(f"Saving {file_path} to cache...") + with open(file_path, "wb") as file: + file.write(dill.dumps(serializable_obj)) + + +# NOTE the "key" param is to allow for flexibility +def delete_cache(key: str = ""): + files = os.listdir(PATH) + + for file in files: + if file.startswith(key) and file.endswith(FILE_SUFFIX): + file_path = f"{PATH}/{file}" + os.unlink(file_path) diff --git a/lmms_eval/evaluator.py b/lmms_eval/evaluator.py new file mode 100755 index 0000000..0646e21 --- /dev/null +++ b/lmms_eval/evaluator.py @@ -0,0 +1,638 @@ +import collections +import inspect +import itertools +import json +import os +import random +import sys +import time +from collections import defaultdict +from dataclasses import dataclass +from typing import List, Optional, Union + +import numpy as np +import torch +import torch.distributed as dist +from datasets import Image, Sequence +from loguru import logger as eval_logger +from tqdm import tqdm + +import lmms_eval.api +import lmms_eval.api.metrics +import lmms_eval.api.registry +from lmms_eval.evaluator_utils import ( + consolidate_group_results, + consolidate_results, + get_sample_size, + get_subtask_list, + get_task_list, + prepare_print_tasks, + print_writeout, + run_task_tests, +) +from lmms_eval.loggers.evaluation_tracker import EvaluationTracker +from lmms_eval.models import get_model +from lmms_eval.tasks import TaskManager, get_task_dict +from lmms_eval.utils import ( + create_iterator, + get_datetime_str, + get_git_commit_hash, + handle_non_serializable, + hash_string, + make_table, + positional_deprecated, + run_task_tests, + simple_parse_args_string, +) + + +@positional_deprecated +def simple_evaluate( + model, + model_args: Optional[Union[str, dict]] = None, + tasks: Optional[List[Union[str, dict, object]]] = None, + num_fewshot: Optional[int] = None, + batch_size: Optional[Union[int, str]] = None, + max_batch_size: Optional[int] = None, + device: Optional[str] = None, + use_cache: Optional[str] = None, + cache_requests: bool = False, + rewrite_requests_cache: bool = False, + delete_requests_cache: bool = False, + limit: Optional[Union[int, float]] = None, + bootstrap_iters: int = 100000, + check_integrity: bool = False, + write_out: bool = False, + log_samples: bool = True, + evaluation_tracker: Optional[EvaluationTracker] = None, + system_instruction: Optional[str] = None, + apply_chat_template: bool = False, + fewshot_as_multiturn: bool = False, + gen_kwargs: Optional[str] = None, + task_manager: Optional[TaskManager] = None, + verbosity: str = "INFO", + predict_only: bool = False, + random_seed: int = 0, + numpy_random_seed: int = 1234, + torch_random_seed: int = 1234, + fewshot_random_seed: int = 1234, + datetime_str: str = get_datetime_str(), + cli_args=None, +): + """Instantiate and evaluate a model on a list of tasks. 
+ + :param model: Union[str, LM] + Name of model or LM object, see lm_eval.models.get_model + :param model_args: Optional[str, dict] + String or dict arguments for each model class, see LM.create_from_arg_string and LM.create_from_arg_object. + Ignored if `model` argument is a LM object. + :param tasks: list[Union[str, dict, Task]] + List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise. + :param num_fewshot: int + Number of examples in few-shot context + :param batch_size: int or str, optional + Batch size for model + :param max_batch_size: int, optional + Maximal batch size to try with automatic batch size detection + :param device: str, optional + PyTorch device (e.g. "cpu" or "cuda:0") for running models + :param use_cache: str, optional + A path to a sqlite db file for caching model responses. `None` if not caching. + :param cache_requests: bool, optional + Speed up evaluation by caching the building of dataset requests. `None` if not caching. + :param rewrite_requests_cache: bool, optional + Rewrites all of the request cache if set to `True`. `None` if not desired. + :param delete_requests_cache: bool, optional + Deletes all of the request cache if set to `True`. `None` if not desired. + :param limit: int or float, optional + Limit the number of examples per task (only use this for testing), If <1, limit is a percentage of the total number of examples. + :param bootstrap_iters: + Number of iterations for bootstrap statistics, used when calculating stderrs. set to 0 for no stderr calculations to be performed. + :param check_integrity: bool + Whether to run the relevant part of the test suite for the tasks + :param write_out: bool + If True, write out an example document and model input for checking task integrity + :param log_samples: bool + If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis + :param system_instruction: str + System instruction to be applied to the prompt + :param apply_chat_template: bool + If True, apply chat template to the prompt + :param fewshot_as_multiturn: bool + Whether to provide the fewshot examples as a multiturn conversation or a single user turn. + :param gen_kwargs: str + String arguments for model generation + Ignored for all tasks with loglikelihood output_type + :param predict_only: bool + If true only model outputs will be generated and returned. Metrics will not be evaluated + :param random_seed: int + Random seed for python's random module. If set to None, the seed will not be set. + :param numpy_random_seed: int + Random seed for numpy. If set to None, the seed will not be set. + :param torch_random_seed: int + Random seed for torch. If set to None, the seed will not be set. + :param fewshot_random_seed: int + Random seed for fewshot sampler random generator. If set to None, the seed of generator will be set to None. 
+ + :return + Dictionary of results + """ + seed_message = [] + if random_seed is not None: + # See https://github.com/EleutherAI/lm-evaluation-harness/pull/1412 + seed_message.append(f"Setting random seed to {random_seed}") + random.seed(random_seed) + + if numpy_random_seed is not None: + seed_message.append(f"Setting numpy seed to {numpy_random_seed}") + np.random.seed(numpy_random_seed) + + if torch_random_seed is not None: + seed_message.append(f"Setting torch manual seed to {torch_random_seed}") + torch.manual_seed(torch_random_seed) + + if seed_message: + eval_logger.info(" | ".join(seed_message)) + + assert tasks != [], "No tasks specified, or no tasks found. Please verify the task names." + + if gen_kwargs: + gen_kwargs = simple_parse_args_string(gen_kwargs) + eval_logger.warning(f"generation_kwargs specified through cli, these settings will be used over set parameters in yaml tasks.") + if gen_kwargs == "": + gen_kwargs = None + + if model_args is None: + model_args = "" + + if task_manager is None: + task_manager = TaskManager(verbosity, model_name=model) + + task_dict = get_task_dict(tasks, task_manager) + + ModelClass = get_model(model) + lm = ModelClass.create_from_arg_string( + model_args, + { + "batch_size": batch_size, + "device": device, + }, + ) + + # helper function to recursively apply config overrides to leaf subtasks, skipping their constituent groups. + # (setting of num_fewshot ; bypassing metric calculation ; setting fewshot seed) + def _adjust_config(task_dict): + adjusted_task_dict = {} + for task_name, task_obj in task_dict.items(): + if isinstance(task_obj, dict): + adjusted_task_dict = { + **adjusted_task_dict, + **{task_name: _adjust_config(task_obj)}, + } + + else: + task_obj = task_dict[task_name] + if type(task_obj) == tuple: + group, task_obj = task_obj + if task_obj is None: + continue + lm.task_dict[task_name] = task_obj.dataset + if "generate_until" in task_obj.get_config("output_type"): + if gen_kwargs is not None: + task_obj.set_config(key="generation_kwargs", value=gen_kwargs, update=True) + + if predict_only: + eval_logger.info(f"Processing {task_name} in output-only mode. Metrics will not be calculated!") + # we have to change the class properties post-hoc. This is pretty hacky. + task_obj.override_metric(metric_name="bypass") + + # override tasks' fewshot values to the provided num_fewshot arg value + # except if tasks have it set to 0 manually in their configs--then we should never overwrite that + if num_fewshot is not None: + if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0: + eval_logger.info(f"num_fewshot has been set to 0 for {task_name} in its config. Manual configuration will be ignored.") + else: + eval_logger.warning(f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}") + task_obj.set_config(key="num_fewshot", value=num_fewshot) + else: + # if num_fewshot not provided, and the task does not define a default one, default to 0 + if (default_num_fewshot := task_obj.get_config("num_fewshot")) is None: + task_obj.set_config(key="num_fewshot", value=0) + # fewshot_random_seed set for tasks, even with a default num_fewshot (e.g. 
in the YAML file) + task_obj.set_fewshot_seed(seed=fewshot_random_seed) + # eval_logger.info(f"Setting fewshot random generator seed to {fewshot_random_seed}") + + adjusted_task_dict[task_name] = task_obj + + return adjusted_task_dict + + task_dict = _adjust_config(task_dict) + + if check_integrity: + run_task_tests(task_list=tasks) + + if evaluation_tracker is not None: + evaluation_tracker.general_config_tracker.log_experiment_args( + model_source=model, + model_args=model_args, + system_instruction=system_instruction, + chat_template=lm.chat_template if apply_chat_template else None, + fewshot_as_multiturn=fewshot_as_multiturn, + ) + + results = evaluate( + lm=lm, + task_dict=task_dict, + limit=limit, + cache_requests=cache_requests, + rewrite_requests_cache=rewrite_requests_cache, + bootstrap_iters=bootstrap_iters, + write_out=write_out, + log_samples=True if predict_only else log_samples, + system_instruction=system_instruction, + apply_chat_template=apply_chat_template, + fewshot_as_multiturn=fewshot_as_multiturn, + verbosity=verbosity, + cli_args=cli_args, + ) + + if hasattr(lm, "_model"): + del lm._model + torch.cuda.empty_cache() + + if lm.rank == 0: + if isinstance(model, str): + model_name = model + elif hasattr(model, "config") and hasattr(model.config, "_name_or_path"): + model_name = model.config._name_or_path + else: + model_name = type(model).__name__ + + # add info about the model and few shot config + results["config"] = { + "model": model_name, + "model_args": model_args, + } + # add more detailed model info if available TODO: add model info + # if isinstance(lm, lm_eval.models.huggingface.HFLM): + # results["config"].update(lm.get_model_info()) + # add info about execution + results["config"].update( + { + "batch_size": batch_size, + "batch_sizes": (list(lm.batch_sizes.values()) if hasattr(lm, "batch_sizes") else []), + "device": device, + "use_cache": use_cache, + "limit": limit, + "bootstrap_iters": bootstrap_iters, + "gen_kwargs": gen_kwargs, + "random_seed": random_seed, + "numpy_seed": numpy_random_seed, + "torch_seed": torch_random_seed, + "fewshot_seed": fewshot_random_seed, + } + ) + results["git_hash"] = get_git_commit_hash() + results["date"] = datetime_str + # add_env_info(results) # additional environment info to results + # add_tokenizer_info(results, lm) # additional info about tokenizer + return results + else: + return None + + +decontaminate_suffix = "_decontaminate" + + +@positional_deprecated +def evaluate( + lm: "LM", + task_dict, + limit: Optional[int] = None, + cache_requests: bool = False, + rewrite_requests_cache: bool = False, + bootstrap_iters: Optional[int] = 100000, + write_out: bool = False, + log_samples: bool = True, + system_instruction: Optional[str] = None, + apply_chat_template: bool = False, + fewshot_as_multiturn: bool = False, + verbosity: str = "INFO", + cli_args=None, +): + """Instantiate and evaluate a model on a list of tasks. + + :param lm: obj + Language Model + :param task_dict: dict[str, Task] + Dictionary of tasks. Tasks will be taken to have name type(task).config.task . + :param limit: int, optional + Limit the number of examples per task (only use this for testing) + :param bootstrap_iters: + Number of iterations for bootstrap statistics, used when calculating stderr. Set to 0 for skipping all stderr calculations. 
+ :param write_out: bool + If True, write out an example document and model input for checking task integrity + :param log_samples: bool + If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis + :param system_instruction: str + System instruction to be applied to the prompt + :param apply_chat_template: bool + If True, apply chat template to the prompt + :param fewshot_as_multiturn: bool + Whether to provide the fewshot examples as a multiturn conversation or a single user turn. + :return + Dictionary of results + """ + + # stores the final result for each task, for each metric/filter pair. + results = collections.defaultdict(dict) + # Tracks each task's version. + versions = collections.defaultdict(dict) + # Tracks the YAML configs of all chosen tasks. + configs = collections.defaultdict(dict) + # logs info about each document evaluated. + samples = collections.defaultdict(list) + # tracks all Instances/requests a model must generate output on. + requests = collections.defaultdict(list) + # Aggregated task scores presented with groups + results_agg = collections.defaultdict(dict) + # Aggregated groups scores only + groups_agg = collections.defaultdict(dict) + # stores the amount to pad out reqs per req. type so that + # number of fwd passes per distributed rank is equal + padding_requests = collections.defaultdict(int) + # store the hierarchy to do proper ordering + task_hierarchy = collections.defaultdict(list) + # store the ordering of tasks and groups + task_order = collections.defaultdict(int) + task_group_alias = collections.defaultdict(dict) + # store num-fewshot value per task + num_fewshot = collections.defaultdict(int) + + # get lists of group hierarchy and each type of request + eval_tasks = get_task_list(task_dict) + name_to_task = {} + if not log_samples: + if not all("bypass" not in getattr(task_output.task, "_metric_fn_list", {}).keys() for task_output in eval_tasks): + raise ValueError("log_samples must be True for 'bypass' metric-only tasks") + + for task_output in eval_tasks: + task: Task = task_output.task + task_name = task_output.task_name + task.args = cli_args + + name_to_task[task_name] = task + + if type(task) == tuple: + group_name, task = task + task_hierarchy[group_name].append(task_name) + versions[group_name] = "N/A" + else: + group_name = None + task_hierarchy[task_name] = [] + + if task is None: + continue + + versions[task_name] = task.VERSION + configs[task_name] = dict(task.dump_config()) + + if "num_fewshot" in configs[task_name]: + n_shot = configs[task_name]["num_fewshot"] + else: + n_shot = 0 + num_fewshot[task_name] = n_shot + + if "task_alias" in configs[task_name]: + task_group_alias[task_name] = configs[task_name]["task_alias"] + + if ("group_alias" in configs[task_name]) and (group_name not in task_group_alias) and (group_name is not None): + task_group_alias[group_name] = configs[task_name]["group_alias"] + + limit = get_sample_size(task, limit) + task.build_all_requests( + limit=limit, + rank=lm.rank, + world_size=lm.world_size, + cache_requests=cache_requests, # later we will add them + rewrite_requests_cache=rewrite_requests_cache, + system_instruction=system_instruction, + apply_chat_template=apply_chat_template, + fewshot_as_multiturn=fewshot_as_multiturn, + chat_template=getattr(lm, "apply_chat_template") if apply_chat_template else None, + tokenizer_name=getattr(lm, "tokenizer_name", "") if apply_chat_template else "", + ) + eval_logger.debug(f"Task: {task_output.task_name}; number of requests 
on this rank: {len(task._instances)}") + if write_out: + print_writeout(task) + # aggregate Instances by LM method requested to get output. + for instance in task.instances: + reqtype = instance.request_type + requests[reqtype].append(instance) + + if lm.world_size > 1: + instances_rnk = torch.tensor(len(task._instances), device=lm.device) + gathered_item = lm.accelerator.gather(instances_rnk).cpu().detach().numpy().tolist() + # "multiple_choice" task types dispatch (several) "loglikelihood" request types + reqtype = "loglikelihood" if task.OUTPUT_TYPE == "multiple_choice" else task.OUTPUT_TYPE + # compute number of pseudo-batches to pad with (FSDP/DDP require even batches among ranks) + numpad = max(gathered_item) - gathered_item[lm.rank] + # todo: may not account for padding in cases like SquadV2 which has multiple req types + padding_requests[reqtype] += numpad + + ### Run LMM on inputs, get all outputs ### + # execute each type of request + for reqtype, reqs in requests.items(): + eval_logger.info("Running {} requests".format(reqtype)) + # create `K` copies of each request `req` based off `K = req.repeats` + cloned_reqs = [] + for req in reqs: + cloned_reqs.extend([req] * req.repeats) + + if (lm.world_size > 1) and (padding_requests[reqtype] > 0): + for _ in range(padding_requests[reqtype]): + cloned_reqs.extend([req] * req.repeats) + + # run requests through model + resps = getattr(lm, reqtype)(cloned_reqs) # Choiszt run generate until + + # put responses from model into a list of length K for each request. + for x, req in zip(resps, cloned_reqs): + req.resps.append(x) + + if lm.world_size > 1: + lm.accelerator.wait_for_everyone() + + RANK = lm.rank + WORLD_SIZE = lm.world_size + ### Postprocess outputs ### + # TODO: del model here, maybe (idea: allow user to specify device of e.g. 
reward model separately) + for task_output in eval_tasks: + task = task_output.task + task.apply_filters() + + ### Collect values of metrics on all datapoints ### + # # unpack results and sort back in order and return control to Task + # TODO: make it possible to use a different metric per filter + # Pre-process task.instances to group by doc_id + instances_by_doc_id = collections.defaultdict(list) + for instance in task.instances: + instances_by_doc_id[instance.doc_id].append(instance) + # Sort instances within each group + for instances in instances_by_doc_id.values(): + instances.sort(key=lambda x: x.idx) + # iterate over different filters used + for filter_key in task.instances[0].filtered_resps.keys(): + doc_iterator = task.doc_iterator(rank=RANK, limit=limit, world_size=WORLD_SIZE) + doc_iterator_for_counting = itertools.islice(range(len(task.test_docs())), RANK, limit, WORLD_SIZE) if task.has_test_docs() else itertools.islice(range(len(task.validation_docs())), RANK, limit, WORLD_SIZE) + total_docs = sum(1 for _ in doc_iterator_for_counting) + pbar = tqdm(total=total_docs, desc=f"Postprocessing", disable=(RANK != 0)) + for doc_id, doc in doc_iterator: + requests = instances_by_doc_id[doc_id] + metrics = task.process_results(doc, [req.filtered_resps[filter_key] for req in requests]) + if log_samples: + target = task.doc_to_target(doc) + saved_doc = {key: value for key, value in doc.items() if "image" not in key} + filtered_arguments = [] + for req in requests: + # check if req.args is a list of tuples, and each item in the list is a serializable object + for value in req.args: + if isinstance(value, (str, int, float, bool, list, dict, type(None))): + filtered_arguments.append(value) + # else: + # filtered_arguments.append(_handle_non_serializable(value)) + + example = { + "doc_id": doc_id, + "doc": saved_doc, + "target": target, + "arguments": filtered_arguments, + "resps": [req.resps for req in requests], + "filtered_resps": [req.filtered_resps[filter_key] for req in requests], + "doc_hash": hash_string( + json.dumps( + requests[0].doc, + indent=2, + default=handle_non_serializable, + ensure_ascii=False, + ) + ), + "prompt_hash": hash_string(requests[0].arguments[0]), + "target_hash": hash_string(str(target)), + } + example.update(metrics) + task_output.logged_samples.append(example) + for metric, value in metrics.items(): + task_output.sample_metrics[(metric, filter_key)].append(value) + pbar.update(1) + + pbar.close() + + if WORLD_SIZE > 1: + # if multigpu, then gather data across all ranks to rank 0 + # first gather logged samples across all ranks + for task_output in eval_tasks: + if log_samples: + # for task_name, task_samples in list(samples.items()): + full_samples = [None] * WORLD_SIZE if RANK == 0 else None + per_rank_samples = [] + for sample in task_output.logged_samples: + per_rank_samples.append(sample) + + torch.distributed.gather_object( + obj=per_rank_samples, + object_gather_list=full_samples, + dst=0, + ) + + if RANK == 0: + task_output.logged_samples = list(itertools.chain.from_iterable(full_samples)) + + # then collect metrics across all ranks + for metrics in task_output.sample_metrics: + metric_list = [None] * WORLD_SIZE if RANK == 0 else None + torch.distributed.gather_object( + obj=task_output.sample_metrics[metrics], + object_gather_list=metric_list, + dst=0, + ) + if RANK == 0: + task_output.sample_metrics[metrics] = list(itertools.chain.from_iterable(metric_list)) + + dist.barrier() # Ensure all processes are synced before proceeding + + if RANK == 0: + 
### Aggregate results over all datapoints ### + # aggregate results ; run bootstrap CIs + for task_output in eval_tasks: + task_output.calculate_aggregate_metric(bootstrap_iters=bootstrap_iters) + ( + results, + samples, + configs, + versions, + num_fewshot, + higher_is_better, + ) = consolidate_results(eval_tasks) + + ### Calculate group metrics ### + if bool(results): + results, versions, show_group_table, *_ = consolidate_group_results(results, versions, task_dict) + + results_agg, group_agg = prepare_print_tasks(task_dict, results) + subtask_list = get_subtask_list(task_dict) + + # collect all higher_is_better values for metrics + # in the group's subtasks. + # TODO: clean this up ; unify with the below metric_list loop? + _higher_is_better = {} + for group, task_list in subtask_list.items(): + if len(task_list) != 0: # subtask list will list "task_name": [] for solo tasks + for task in task_list: + for m, h in higher_is_better[task].items(): + if m not in _higher_is_better.keys(): + _higher_is_better[m] = h + + if m in _higher_is_better and _higher_is_better[m] is not None and _higher_is_better[m] != h: + eval_logger.warning(f"Higher_is_better values for metric {m} in group {group} are not consistent. Defaulting to None.") + _higher_is_better[m] = None + higher_is_better[group] = _higher_is_better + + results_dict = { + "results": dict(results_agg.items()), + **({"groups": dict(group_agg.items())} if (bool(group_agg) & show_group_table) else {}), + "group_subtasks": dict(reversed(subtask_list.items())), + "configs": dict(sorted(configs.items())), + "versions": dict(sorted(versions.items())), + "n-shot": dict(sorted(num_fewshot.items())), + "higher_is_better": dict(sorted(higher_is_better.items())), + "n-samples": { + task_output.task_name: { + "original": len(task_output.task.eval_docs), + "effective": min( + limit if limit else len(task_output.task.eval_docs), + len(task_output.task.eval_docs), + ), + } + for task_output in eval_tasks + }, + } + if log_samples: + results_dict["samples"] = dict(samples) + else: + results_dict = None + + if hasattr(lm, "accelerator"): + lm.accelerator.wait_for_everyone() + + return results_dict + + +def request_caching_arg_to_dict(cache_requests: str) -> dict: + request_caching_args = { + "cache_requests": cache_requests in {"true", "refresh"}, + "rewrite_requests_cache": cache_requests == "refresh", + "delete_requests_cache": cache_requests == "delete", + } + + return request_caching_args diff --git a/lmms_eval/evaluator_utils.py b/lmms_eval/evaluator_utils.py new file mode 100644 index 0000000..48b5c97 --- /dev/null +++ b/lmms_eval/evaluator_utils.py @@ -0,0 +1,486 @@ +# credit to https://github.com/EleutherAI/lm-evaluation-harness +import collections +import inspect +import math +import pathlib +import sys +from typing import List, Optional, Tuple, Union + +from lmms_eval.api.group import ConfigurableGroup +from lmms_eval.api.metrics import ( + aggregate_subtask_metrics, + pooled_sample_stderr, + stderr_for_metric, +) +from lmms_eval.api.task import Task +from lmms_eval.utils import eval_logger, positional_deprecated + + +class TaskOutput: + """ + Wrapper class for Task outputs.It contains various attributes and methods to manage and calculate metrics for the task. + + Attributes: + task (object): The task object. + task_name (str): The name of the task. + task_config (dict): The configuration of the task. + version (str): The version of the task. + group_name (str): The name of the task group. + n_shot (int): The number of shots for the task. 
+ task_alias (str): The alias of the task. + group_alias (str): The alias of the task group. + is_group (bool): Indicates if the task is a group. + logged_samples (list): The list of logged samples. + sample_len (int): The length of the samples. + sample_metrics (defaultdict): The dictionary of samples' metrics. + agg_metrics (defaultdict): The dictionary of aggregate metrics. + + Methods: + from_taskdict(cls, task_name: str, task): + Creates a TaskOutput instance from a task dictionary. + + calculate_aggregate_metric(bootstrap_iters=100000) -> None: + Calculates the aggregate metrics for the task. + """ + + def __init__( + self, + task=None, + task_name=None, + task_config=None, + version=None, + group_name=None, + n_shot=None, + task_alias=None, + group_alias=None, + is_group=None, + ): + self.task = task + self.task_config = task_config + self.task_name = task_name + self.group_name = group_name + self.version = version + self.n_shot = n_shot + self.task_alias = task_alias + self.group_alias = group_alias + self.is_group = is_group + self.logged_samples = [] + self.sample_len = None + self.sample_metrics = collections.defaultdict(list) + self.agg_metrics = collections.defaultdict(list) + self.args = None + + @classmethod + def from_taskdict(cls, task_name: str, task): + if isinstance(task, tuple): + group_name, task = task + else: + group_name = None + if not task: + # these gets filtered out in get_task_list + # once they are added to group hierarchy + is_group = True + return cls(task=task, task_name=task_name, is_group=is_group, group_name=group_name) + version = task.VERSION + task_config = dict(task.dump_config()) + if (n_shot := task_config.get("num_fewshot")) == 0: + meta_config = task_config.get("metadata", {}) + if isinstance(meta_config, dict): + n_shot = meta_config.get("num_fewshot", 0) + else: + eval_logger.info(f"No metadata found in task config for {task_name}, using default n_shot=0") + n_shot = 0 + task_alias = task_config.get("alias") + group_alias = task_config.get("group_alias") + return cls( + task=task, + task_name=task_name, + task_config=task_config, + group_name=group_name, + version=version, + n_shot=n_shot, + task_alias=task_alias, + group_alias=group_alias, + ) + + def calculate_aggregate_metric(self, bootstrap_iters=100000) -> None: + for (metric, filter_key), items in self.sample_metrics.items(): + if metric in self.task.aggregation(): + agg_fn = self.task.aggregation()[metric] + metric_key = f"{metric},{filter_key}" + if "args" in inspect.signature(agg_fn).parameters: + self.agg_metrics[metric_key] = agg_fn(items, args=self.task.args) + else: + self.agg_metrics[metric_key] = agg_fn(items) + self.sample_len = len(items) # TODO: same sample size for each metric? + if isinstance(bootstrap_iters, int): + stderr_fn = stderr_for_metric( + metric=agg_fn, + bootstrap_iters=min(bootstrap_iters, 100) if metric in ["bleu", "chrf", "ter"] else bootstrap_iters, + ) + self.agg_metrics[f"{metric}_stderr,{filter_key}"] = stderr_fn(items) if (stderr_fn and len(items) > 1) else "N/A" + else: + raise ValueError(f"Received bootstrap_iters '{bootstrap_iters}' but expected an integer. 
Set to 0 to turn off stderr calculations.") + + def __repr__(self): + return f"TaskOutput(task_name={self.task_name}, " f"group_name={self.group_name}, " f"version={self.version}, " f"n_shot={self.n_shot}, " f"task_alias={self.task_alias}, " f"group_alias={self.group_alias})" + + +def get_task_list(task_dict: dict) -> List[TaskOutput]: + outputs = [] + for task_name, task_obj in task_dict.items(): + if isinstance(task_obj, dict): + _outputs = get_task_list(task_obj) + outputs.extend(_outputs) + else: + task_output = TaskOutput.from_taskdict(task_name, task_obj) + outputs.append(task_output) + + return outputs + + +def get_subtask_list(task_dict, task_root=None, depth=0): + subtask_list = {} + for group_obj, task_obj in task_dict.items(): + if isinstance(group_obj, ConfigurableGroup): + # group_name = group_obj.group_name + group_name = group_obj.group_name + else: + group_name = group_obj + if isinstance(task_obj, dict): + _subtask_list = get_subtask_list(task_obj, task_root=group_name, depth=depth + 1) + if task_root: + subtask_list.setdefault((task_root, depth), []).extend([_task for (_task, _depth) in _subtask_list.keys() if (_depth - 1) == depth]) + + subtask_list = {**subtask_list, **_subtask_list} + else: + if isinstance(task_obj, ConfigurableGroup): + # group_or_task_name = task_obj.group_name + group_or_task_name = task_obj.group_name + elif isinstance(task_obj, Task): + # group_or_task_name = task_obj.task_name + group_or_task_name = task_obj.task_name + + if task_root is None: + subtask_list.setdefault((group_or_task_name, depth), []) + else: + subtask_list.setdefault((task_root, depth), []).append(group_or_task_name) + + if depth == 0: + _subtask_list = {} + for group_key, task_list in subtask_list.items(): + group_name, depth = group_key + _subtask_list[group_name] = task_list + subtask_list = _subtask_list + + return subtask_list + + +def print_writeout(task) -> None: + for inst in task.instances: + # print the prompt for the first few documents + if inst.doc_id < 1: + eval_logger.info( + f"Task: {task}; document {inst.doc_id}; context prompt (starting on next line):\ + \n{inst.args[0]}\n(end of prompt on previous line)\ntarget string or answer choice index (starting on next line):\n{task.doc_to_target(inst.doc)}\n(end of target on previous line)" + ) + eval_logger.info(f"Request: {str(inst)}") + + +def get_sample_size(task, limit: Optional[int]) -> Union[int, None]: + if limit is not None: + limit = int(math.ceil(len(task.eval_docs) * limit)) if limit < 1.0 else int(limit) + return limit + + +def prepare_print_tasks( + task_dict: dict, + results: dict, + task_depth=0, + group_depth=0, +) -> Tuple[dict, dict]: + """ + @param task_dict: Dictionary representing the group hierarchy of tasks. Each key is a group name and its + value is a list of task names. + @param results: Dictionary containing the results of each task. Each key is a + group name and its value is a dictionary of task results. + @param task_depth: The indentation level for printing the task + hierarchy. Default is 0. + @param group_depth: The indentation level for printing the group + hierarchy. Default is 0. + @return: A tuple of two dictionaries: results_agg and groups_agg. results_agg contains + aggregated results for each task, and groups_agg contains aggregated results for each group. + + Prepares the task hierarchy and aggregates the results for each task and group recursively for printing. + """ + + def _sort_task_dict(task_dict): + """ + Helper utility. 
Sorts the task dict at the current level of the hierarchy based on alphabetized task name. + Required so that we end up sorting within each sub-header correctly. + """ + + return dict( + sorted( + task_dict.items(), + key=lambda item: item[0].group_name if isinstance(item[0], ConfigurableGroup) else item[0], + ) + ) + + task_agg = collections.defaultdict(dict) + group_agg = collections.defaultdict(dict) + task_dict = _sort_task_dict(task_dict) + for task_or_group_name, task_or_group_obj in task_dict.items(): + tab_string = " " * task_depth + "- " if task_depth > 0 else "" + if isinstance(task_or_group_name, ConfigurableGroup): + # string_name = task_or_group_name.group_name + name = task_or_group_name.group_name + from_configurable_group = True + task_or_group_obj = _sort_task_dict(task_or_group_obj) + elif isinstance(task_or_group_name, str): + name = task_or_group_name + if isinstance(task_or_group_obj, Task): + # string_name = task_or_group_obj.task_name + name = task_or_group_obj.task_name + from_configurable_group = False + + task_agg[name] = results[name].copy() + if from_configurable_group: + if task_or_group_name.group_alias is not None: + alias = task_or_group_name.group_alias + else: + alias = task_or_group_name.group + else: + if "alias" in task_agg[name]: + alias = task_agg[name]["alias"] + else: + alias = name + + task_agg[name]["alias"] = tab_string + alias + if "samples" in task_agg[name]: + task_agg[name].pop("samples") + + if from_configurable_group and (" " not in results[name]): + group_tab_string = " " * group_depth + "- " if group_depth > 0 else "" + group_agg[name] = results[name].copy() + group_agg[name]["alias"] = group_tab_string + alias + if "samples" in group_agg[name]: + group_agg[name].pop("samples") + + if isinstance(task_or_group_obj, dict): + task_depth += 1 + group_depth += 1 + _task_agg, _group_agg = prepare_print_tasks(task_or_group_obj, results, task_depth, group_depth) + task_agg = { + **task_agg, + **_task_agg, + } + group_agg = {**group_agg, **_group_agg} + task_depth -= 1 + group_depth -= 1 + return task_agg, group_agg + + +def consolidate_results( + eval_tasks: List[TaskOutput], +) -> Tuple[dict, dict, dict, dict, dict, dict]: + """ + @param eval_tasks: list(TaskOutput). + @return: A tuple containing the consolidated results, samples, configs, versions, and num_fewshot. + + Consolidates the results of multiple evaluation tasks into a single structure. + + The method iterates over each evaluation instance and extracts relevant information to create the consolidated + results structure. The consolidated results structure has the following properties: + + - results: A defaultdict with task names as keys and dictionaries as values. Each dictionary contains + metric/filter pairs as keys and corresponding metric values as values. The "alias" key is used to store task + aliases specified in the task configuration. + - samples: A defaultdict with task names as keys and lists of log samples as values. + - configs: A defaultdict with task names as keys and task configurations as values. + - versions: A defaultdict with task names as keys and task versions as values. + - num_fewshot: A defaultdict with task names as keys and number of few-shot samples as values. + - higher_is_better: A defaultdict with task names as keys and indicators of whether higher values are better + for each metric as values. + + The method then returns the consolidated results, samples, configs, versions, and num_fewshot as a tuple. 
+ """ + # stores the final result for each task, for each metric/filter pair. + results = collections.defaultdict(dict) + # logs info about each document evaluated. + samples = collections.defaultdict(list) + # store num-fewshot value per task + num_fewshot = collections.defaultdict(int) + # Tracks the YAML configs of all chosen task + configs = collections.defaultdict(dict) + # Tracks each task's version. + versions = collections.defaultdict(dict) + # Track `higher_is_better` for each metric + higher_is_better = collections.defaultdict(dict) + + for task_output in eval_tasks: + if "task_alias" in (task_config := task_output.task_config): + results[task_output.task_name]["alias"] = task_config["task_alias"] + else: + results[task_output.task_name]["alias"] = task_output.task_name + if group_alias := task_output.group_alias: + if group_alias not in results and (group_name := task_output.group_name): + results[group_name]["alias"] = group_alias + num_fewshot[task_output.task_name] = task_output.n_shot + configs[task_output.task_name] = task_output.task_config + versions[task_output.task_name] = task_output.version + samples[task_output.task_name] = task_output.logged_samples + higher_is_better[task_output.task_name] = task_output.task.higher_is_better() + for (metric, filter_key), items in task_output.sample_metrics.items(): + metric_key = f"{metric},{filter_key}" + results[task_output.task_name][metric_key] = task_output.agg_metrics[metric_key] + results[task_output.task_name]["samples"] = task_output.sample_len + results[task_output.task_name][f"{metric}_stderr,{filter_key}"] = task_output.agg_metrics[f"{metric}_stderr,{filter_key}"] + return results, samples, configs, versions, num_fewshot, higher_is_better + + +def consolidate_group_results( + results, + versions, + task_dict, + task_root=None, + show_group_table=False, + task_aggregation_list=None, +) -> Tuple[dict, dict, bool, Union[None,]]: + """ + (Recursively) calculates groups' aggregated metrics and updates the results and versions dictionaries with this info. + + @return: a tuple [results, versions, show_group_table, task_aggregation_list] with formats described below: + + - results: A defaultdict with task names (and, after this function is called, group names of + groups that perform aggregation) as keys, and dictionaries with "alias" and metric,filter_name pairs as keys. + - versions: A defaultdict with task names (and, after this function is called, group names of + groups that perform aggregation) as keys, and float values representing the task or group's version if a version is specified. (defaulting to None). + - show_group_table: a boolean which is true if there exists a group that requires printing of its aggregated scores in a group table. + - task_aggregation_list: a defaultdict listing the subtasks to average over to produce a given group's end metric. + + The method then returns the updated results, versions, show_group_table, and task_aggregation_list as a tuple. + In the top-level invocation of this function, task_aggregation_list is ignored. 
+ """ + if task_root is None: + task_root = {} + + if task_aggregation_list is None: + task_aggregation_list = {} + + for group_or_task, group_or_task_info in task_dict.items(): + # Convert to string + if isinstance(group_or_task, ConfigurableGroup): + group_config = group_or_task.config + group_or_task = group_or_task.group_name + else: + group_config = None + + if isinstance(group_or_task_info, Task): + if task_root: + task_aggregation_list.setdefault(task_root, []).append(group_or_task_info.task_name) + else: + ( + results, + versions, + show_group_table, + _task_aggregation_list, + ) = consolidate_group_results( + results, + versions, + group_or_task_info, + group_or_task, + show_group_table, + task_aggregation_list, + ) + if task_root: + task_aggregation_list.setdefault(task_root, []).extend(task_aggregation_list.get(group_or_task, [])) + + if (group_config is None) or (group_config["aggregate_metric_list"] is None): + results[group_or_task][" "] = " " + continue + + if "aggregate_metric_list" in group_config: + agg_metric_list = group_config["aggregate_metric_list"] + + show_group_table = show_group_table | bool(group_config["aggregate_metric_list"]) + + task_list = _task_aggregation_list[group_or_task] + + metric_list = list({key for task in task_list for key in results[task].keys() if "_stderr" not in key and key not in ["task", "alias", "samples"]}) + for metric in metric_list: + stderr = "_stderr,".join(metric.split(",")) + + # gather metrics, sizes, and stderrs from subtasks + metrics = [results[task][metric] for task in task_list if metric in results[task]] # TODO: copy? + stderrs = [results[task][stderr] for task in task_list if stderr in results[task]] + sizes = [results[task]["samples"] for task in task_list if metric in results[task]] + + for metric_config in agg_metric_list: + for filter_name in metric_config["filter_list"]: + if metric != ",".join([metric_config["metric"], filter_name]): + continue + + # compute group's pooled metric and stderr + if metric_config["aggregation"] == "mean": + aggregate_fn = aggregate_subtask_metrics + elif callable(metric_config["aggregation"]): + aggregate_fn = metric_config["aggregation"] + else: + raise ValueError(f"Currently, only 'mean' is supported for automatically aggregating scores across groups' subtasks. Got '{metric_config['aggregation']}' for group '{group_or_task}'") + + results[group_or_task][metric] = aggregate_fn( + metrics, + sizes, + metric_config["weight_by_size"], + ) + # TODO: calculate groups' metrics using arbitrary agg fns + if "N/A" in stderrs: + results[group_or_task][stderr] = "N/A" + else: + # NOTE: this assumes we are using the mean to aggregate. 
There are warnings about this elsewhere + results[group_or_task][stderr] = pooled_sample_stderr(stderrs, sizes) + + results[group_or_task]["samples"] = sum(sizes) + group_metadata = group_config.get("metadata", None) + if group_metadata is not None: + versions[group_or_task] = group_metadata.get("version", None) + # print(results) + return results, versions, show_group_table, task_aggregation_list + + +@positional_deprecated +def find_test_root(start_path: pathlib.Path) -> pathlib.Path: + """ + Search upward in the directory tree to a maximum of three layers + to find and return the package root (containing the 'tests' folder) + """ + cur_path = start_path.resolve() + max_layers = 3 + for _ in range(max_layers): + if (cur_path / "tests" / "test_version_stable.py").exists(): + return cur_path + else: + cur_path = cur_path.parent.resolve() + raise FileNotFoundError(f"Unable to find package root within {max_layers} upwards" + f"of {start_path}") + + +@positional_deprecated +def run_task_tests(task_list: List[str]): + """ + Find the package root and run the tests for the given tasks + """ + import pytest + + package_root = find_test_root(start_path=pathlib.Path(__file__)) + task_string = " or ".join(task_list) + args = [ + f"{package_root}/tests/test_version_stable.py", + f"--rootdir={package_root}", + "-k", + f"{task_string}", + ] + sys.path.append(str(package_root)) + pytest_return_val = pytest.main(args) + if pytest_return_val: + raise ValueError(f"Not all tests for the specified tasks ({task_list}) ran successfully! Error code: {pytest_return_val}") diff --git a/lmms_eval/filters/__init__.py b/lmms_eval/filters/__init__.py new file mode 100755 index 0000000..f6c353f --- /dev/null +++ b/lmms_eval/filters/__init__.py @@ -0,0 +1,43 @@ +from lmms_eval.api.filter import Filter, FilterEnsemble + +from . import extraction, selection, transformation + +FILTER_REGISTRY = { + "take_first": selection.TakeFirstFilter, + "regex": extraction.RegexFilter, + "majority_vote": selection.MajorityVoteFilter, + "take_first_k": selection.TakeKFilter, + "remove_whitespace": extraction.WhitespaceFilter, + "lowercase": transformation.LowercaseFilter, + "uppercase": transformation.UppercaseFilter, + "map": transformation.MapFilter, + "multi_choice_regex": extraction.MultiChoiceRegexFilter, + # TODO: implement this filter. either it should take in an arbitrary "scoring"/reward function + # that takes an input and returns a scalar and then should select the max reward, + # or should implement different filters for different ways of handling a reward model's inference. + # "arg_max": selection.ArgMaxFilter, +} + + +def get_filter(filter_name): + if filter_name in FILTER_REGISTRY: + return FILTER_REGISTRY[filter_name] + else: + return filter_name + + +def build_filter_ensemble(filter_name, components): + """ + Create a filtering pipeline. 
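+    Example (illustrative; the ensemble name and regex pattern below are arbitrary):
+        # extract a "#### <number>" style answer, then keep only the first response per document
+        ensemble = build_filter_ensemble(
+            "get_answer",
+            [["regex", {"regex_pattern": r"#### (\-?[0-9\.\,]+)"}], ["take_first", None]],
+        )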
+ """ + filters = [] + for function, kwargs in components: + if kwargs is None: + f = get_filter(function)() + else: + # create a filter given its name in the registry + f = get_filter(function)(**kwargs) # TODO: pass kwargs to filters properly + # add the filter as a pipeline step + filters.append(f) + + return FilterEnsemble(name=filter_name, filters=filters) diff --git a/lmms_eval/filters/decontamination.py b/lmms_eval/filters/decontamination.py new file mode 100755 index 0000000..d2e1b45 --- /dev/null +++ b/lmms_eval/filters/decontamination.py @@ -0,0 +1,23 @@ +from lmms_eval.api.filter import Filter + + +class DecontaminationFilter(Filter): + """ + A filter which evaluates + """ + + name = "track_decontamination" + + def __init__(self, path) -> None: + """ + + TODO: make sure only ever run one time on the train set (should this be cached as a class var? keyed by value for "path"). + should further cache result on a given (task_name, doc_id) + """ + self._decontam_results = None + + def apply(self, resps, docs) -> None: + """ + Return {"no_contamination", "only_contamination"} keys for the 2 different subsets + """ + pass diff --git a/lmms_eval/filters/extraction.py b/lmms_eval/filters/extraction.py new file mode 100755 index 0000000..392e21a --- /dev/null +++ b/lmms_eval/filters/extraction.py @@ -0,0 +1,279 @@ +import re +import sys +import unicodedata + +from lmms_eval.api.filter import Filter + + +class WhitespaceFilter(Filter): + """ """ + + def __init__(self) -> None: + pass + + def apply(self, resps, docs): + def filter_set(inst): + filtered_resp = [] + for resp in inst: + if resp.startswith(" "): + resp = resp[1:] + + filtered_resp.append(resp) + + return filtered_resp + + filtered_resps = [filter_set(resp) for resp in resps] + + return filtered_resps + + +class RegexFilter(Filter): + """ """ + + def __init__( + self, + regex_pattern: str = r"#### (\-?[0-9\.\,]+)", + group_select=0, + fallback: str = "[invalid]", + ) -> None: + """ + pass a string `regex` to run `re.compile(r"regex")` on. + `fallback` defines the output returned if no matches for the regex are located. + """ + self.regex_pattern = regex_pattern + self.regex = re.compile(regex_pattern) + self.group_select = group_select + self.fallback = fallback + + def apply(self, resps, docs): + # here, we assume we have a list, in which each element is + # a list of model responses for some particular input/target pair. + # so we process each of these (same input/target response sets) + # independently (and keep them a list.) + def filter_set(inst): + filtered = [] + for resp in inst: + match = self.regex.findall(resp) + if match: + match = match[self.group_select] + if isinstance(match, tuple): + match = [m for m in match if m][0] + match = match.strip() + else: + match = self.fallback + filtered.append(match) + return filtered + + # print(resps) + filtered_resps = list(map(lambda x: filter_set(x), resps)) + # print(filtered_resps) + + return filtered_resps + + +class MultiChoiceRegexFilter(RegexFilter): + """ + A filter used to extract a model's answer on multiple choice questions with + letter answers. assumes each document has a "choices" field + containing the list of answer choices and that the answer label symbols + are of the form (A), (B), (C), ... or A, B, C. 
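+    Example (illustrative):
+        doc = {"choices": ["cat", "dog"]}
+        resp = "The answer is (B) dog"
+        # -> extracted as "(B)"; responses matching neither the regex nor any choice text
+        #    fall back to `fallback` ("[invalid]" by default)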
+ """ + + def __init__( + self, + regex_pattern: str = r"#### (\-?[0-9\.\,]+)", + group_select=0, + fallback: str = "[invalid]", + ignore_case=False, + ignore_punctuation=False, + regexes_to_ignore=None, + ) -> None: + """ + regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure + - step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response. + - step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices. + group_select: Selects the (group_select)th match from the findall result. + ignore_case: Ignores the case during step 1 matching + ignore_punctuation: Remove the punctuation during step 1 matching + regexes_to_ignore: Remove these regexes during step 1 matching + """ + super().__init__(regex_pattern, group_select, fallback) + self.ignore_case = ignore_case + self.ignore_punctuation = ignore_punctuation + self.regexes_to_ignore = regexes_to_ignore + + def apply(self, resps, docs): + # here, we assume we have a list, in which each element is + # a list of model responses for some particular input/target pair. + # so we process each of these (same input/target response sets) + # independently (and keep them a list.) + + def find_match(regex, resp, convert_dict={}): + match = regex.findall(resp) + if match: + match = match[self.group_select] + if isinstance(match, tuple): + match = [m for m in match if m][0] + match = match.strip() + if match and match in convert_dict: + match = convert_dict[match] + return match + + punct_tbl = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P")) + + def filter_ignores(st): + if self.regexes_to_ignore is not None: + for s in self.regexes_to_ignore: + st = re.sub(s, "", st) + + if self.ignore_case: + st = st.lower() + + if self.ignore_punctuation: + # https://stackoverflow.com/a/266162 + st = st.translate(punct_tbl) + return st + + filtered_resps = [] + + for r, doc in zip(resps, docs): + fallback_regexes = [] + choice_to_alpha = {} + next_alpha = "A" + + without_paren_fallback_regexes = [] + without_paren_to_target = {} + + choices = doc["choices"] + for c in choices: + m = filter_ignores(c.strip()) + fallback_regexes.append(f"{re.escape(m)}") + choice_to_alpha[m] = f"({next_alpha})" + + without_paren_fallback_regexes.append(next_alpha) + without_paren_to_target[next_alpha] = f"({next_alpha})" + + next_alpha = chr(ord(next_alpha) + 1) + fallback_regex = re.compile("|".join(fallback_regexes)) + without_paren_fallback_regex = "|".join(without_paren_fallback_regexes) + without_paren_fallback_regex = re.compile(f":[\s]*({without_paren_fallback_regex})") + + filtered = [] + for resp in r: + match = find_match(self.regex, resp) + if not match: + match = find_match(fallback_regex, filter_ignores(resp), choice_to_alpha) + if not match: + match = find_match(without_paren_fallback_regex, resp, without_paren_to_target) + if not match: + match = self.fallback + filtered.append(match) + filtered_resps.append(filtered) + + return filtered_resps + + +class ExtendedRegexFilter(RegexFilter): + punct_tbl = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P")) + + def __init__( + self, + regex_pattern: str = r"#### (\-?[0-9\.\,]+)", + group_select=0, + fallback: str = "[invalid]", + ignore_case=False, + ignore_punctuation=False, + regexes_to_ignore=None, + ) -> None: + super().__init__(regex_pattern, group_select, fallback) + self.ignore_case = ignore_case + 
self.ignore_punctuation = ignore_punctuation + self.regexes_to_ignore = regexes_to_ignore + + def filter_ignores(self, st): + if self.regexes_to_ignore is not None: + for s in self.regexes_to_ignore: + st = re.sub(s, "", st) + + if self.ignore_case: + st = st.lower() + + if self.ignore_punctuation: + # https://stackoverflow.com/a/266162 + st = st.translate(self.punct_tbl) + return st + + def find_match(self, regex, resp, convert_dict={}): + match = regex.findall(resp) + if match: + match = match[self.group_select] + if isinstance(match, tuple): + match = [m for m in match if m][0] + match = match.strip() + if match and match in convert_dict: + match = convert_dict[match] + return match + + +# Designed for the AI2D/RealworldQA dataset +class SimpleMultiChoiceRegexFilter(ExtendedRegexFilter): + def __init__(self, *args, **kwargs): + """ + regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure + - step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response. + - step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices. + group_select: Selects the (group_select)th match from the findall result. + ignore_case: Ignores the case during step 1 matching + ignore_punctuation: Remove the punctuation during step 1 matching + regexes_to_ignore: Remove these regexes during step 1 matching + """ + super().__init__(*args, **kwargs) + + def apply(self, resps, docs): + # here, we assume we have a list, in which each element is + # a list of model responses for some particular input/target pair. + # so we process each of these (same input/target response sets) + # independently (and keep them a list.) + + filtered_resps = [] + + for r, doc in zip(resps, docs): + fallback_regexes = [] + choice_to_alpha = {} + next_alpha = "A" + + without_paren_fallback_regexes = [] + without_paren_to_target = {} + + # Regex to extract multiple choice options from the question + multiple_choices_regex = re.compile(r"\b([A-Z])\.\s+([^\n]*)") + matches = multiple_choices_regex.findall(doc["question"]) + + # Build regex patterns and mappings for each choice + for m in matches: + choice_text = m[1].strip() + fallback_regexes.append(f"{re.escape(choice_text)}") + choice_to_alpha[choice_text] = next_alpha + + next_alpha = chr(ord(next_alpha) + 1) + + # Compile regex to match any of the extracted choices + fallback_regex = re.compile("|".join(fallback_regexes)) + + # Process each response + filtered = [] + for resp in r: + # Remove any punctuation and extra spaces + cleaned_resp = re.sub(r"[^\w\s]", "", resp).strip() + # Try to match cleaned response with the choice text + match = fallback_regex.search(cleaned_resp) + if match and match.group() in choice_to_alpha: + # Map the matched choice text back to its corresponding letter + filtered.append(choice_to_alpha[match.group()]) + else: + # If no match, return the cleaned response + filtered.append(cleaned_resp) + + filtered_resps.append(filtered[0]) + + return filtered_resps diff --git a/lmms_eval/filters/selection.py b/lmms_eval/filters/selection.py new file mode 100755 index 0000000..acbfb82 --- /dev/null +++ b/lmms_eval/filters/selection.py @@ -0,0 +1,48 @@ +from collections import Counter + +from lmms_eval.api.filter import Filter + + +class TakeFirstFilter(Filter): + def __init__(self) -> None: + """ + Can define custom behavior here, if an individual instantiation of a Filter class should have state. 
+ """ + + def apply(self, resps, docs): + """ + Assuming each entry of `resps` is a list of model responses, we discard all but the first response. + """ + return map(lambda r: r[0], resps) + + +class TakeKFilter(Filter): + def __init__(self, *args, **kwargs) -> None: + self.k = kwargs.pop("k") + + super().__init__(*args, **kwargs) + + def apply(self, resps, docs): + # check we have at least k responses per doc, else we can't take the first k + assert len(resps[0]) >= self.k, f"Need at least {self.k} responses per doc to take first {self.k}, but got {len(resps[0])} only! Please increase TaskConfig.repeats ." + return map(lambda r: r[: self.k], resps) + + +class MajorityVoteFilter(Filter): + def __init__(self) -> None: + """ + Can define custom behavior here, if an individual instantiation of a Filter class should have state. + """ + + def apply(self, resps, docs): + """ + Each entry of `resps` is a list of model responses. + We select the response that occurs most frequently in each entry of `resps`. + """ + + def select_majority(resp): + counts = Counter(resp) + vote = counts.most_common(1)[0][0] + return vote + + return map(lambda r: [select_majority(r)], resps) diff --git a/lmms_eval/filters/transformation.py b/lmms_eval/filters/transformation.py new file mode 100755 index 0000000..9842115 --- /dev/null +++ b/lmms_eval/filters/transformation.py @@ -0,0 +1,48 @@ +from lmms_eval.api.filter import Filter + + +class LowercaseFilter(Filter): + def __init__(self) -> None: + pass + + def apply(self, resps, docs): + def filter_set(inst): + return [resp.lower() for resp in inst] + + return [filter_set(resp) for resp in resps] + + +class UppercaseFilter(Filter): + def __init__(self) -> None: + pass + + def apply(self, resps, docs): + def filter_set(inst): + return [resp.upper() for resp in inst] + + return [filter_set(resp) for resp in resps] + + +class MapFilter(Filter): + def __init__(self, mapping_dict: dict = {}, default_value=None) -> None: + """ + Initializes the MapFilter with a given mapping dictionary and default value. + + Args: + - mapping_dict (dict): A dictionary containing the key-value mappings. + Default is an empty dictionary. + - default_value (Any): The value to be returned when a key is not found in the mapping_dict. + Default is None. 
+ + Example: + mapper = MapFilter({'A': 1, 'B': 2}, default_value=0) + """ + assert isinstance(mapping_dict, dict), "Provided mapping_dict is not a dictionary" + self.mapping_dict = mapping_dict + self.default_value = default_value + + def apply(self, resps, docs): + def filter_set(inst): + return [self.mapping_dict.get(resp, self.default_value) for resp in inst] + + return [filter_set(resp) for resp in resps] diff --git a/lmms_eval/loggers/__init__.py b/lmms_eval/loggers/__init__.py new file mode 100644 index 0000000..02b7a68 --- /dev/null +++ b/lmms_eval/loggers/__init__.py @@ -0,0 +1,2 @@ +from .evaluation_tracker import EvaluationTracker +from .wandb_logger import WandbLogger diff --git a/lmms_eval/loggers/evaluation_tracker.py b/lmms_eval/loggers/evaluation_tracker.py new file mode 100644 index 0000000..f13926b --- /dev/null +++ b/lmms_eval/loggers/evaluation_tracker.py @@ -0,0 +1,452 @@ +# code is adapted from https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/loggers/evaluation_tracker.py +import json +import os +import re +import time +from collections import defaultdict +from dataclasses import asdict, dataclass +from datetime import datetime +from pathlib import Path + +from datasets import load_dataset +from datasets.utils.metadata import MetadataConfigs +from huggingface_hub import DatasetCard, DatasetCardData, HfApi, hf_hub_url +from huggingface_hub.utils import build_hf_headers, get_session, hf_raise_for_status + +from lmms_eval.utils import ( + eval_logger, + get_datetime_str, + get_file_datetime, + get_file_task_name, + get_results_filenames, + get_sample_results_filenames, + handle_non_serializable, + hash_string, + sanitize_list, + sanitize_model_name, + sanitize_task_name, +) + + +@dataclass(init=False) +class GeneralConfigTracker: + """ + Tracker for the evaluation parameters. + + Attributes: + model_source (str): Source of the model (e.g. Hugging Face, GGUF, etc.) + model_name (str): Name of the model. + model_name_sanitized (str): Sanitized model name for directory creation. + start_time (float): Start time of the experiment. Logged at class init. + end_time (float): Start time of the experiment. Logged when calling [`GeneralConfigTracker.log_end_time`] + total_evaluation_time_seconds (str): Inferred total evaluation time in seconds (from the start and end times). + """ + + model_source: str = None + model_name: str = None + model_name_sanitized: str = None + system_instruction: str = None + system_instruction_sha: str = None + fewshot_as_multiturn: bool = None + chat_template: str = None + chat_template_sha: str = None + start_time: float = None + end_time: float = None + total_evaluation_time_seconds: str = None + + def __init__(self) -> None: + """Starts the evaluation timer.""" + self.start_time = time.perf_counter() + + @staticmethod + def _get_model_name(model_args: str) -> str: + """Extracts the model name from the model arguments.""" + + def extract_model_name(model_args: str, key: str) -> str: + """Extracts the model name from the model arguments using a key.""" + args_after_key = model_args.split(key)[1] + return args_after_key.split(",")[0] + + # order does matter, e.g. 
peft and delta are provided together with pretrained + prefixes = ["peft=", "delta=", "pretrained=", "model=", "path=", "engine="] + for prefix in prefixes: + if prefix in model_args: + return extract_model_name(model_args, prefix) + return "" + + def log_experiment_args( + self, + model_source: str, + model_args: str, + system_instruction: str, + chat_template: str, + fewshot_as_multiturn: bool, + ) -> None: + """Logs model parameters and job ID.""" + self.model_source = model_source + self.model_name = GeneralConfigTracker._get_model_name(model_args) + self.model_name_sanitized = sanitize_model_name(self.model_name) + self.system_instruction = system_instruction + self.system_instruction_sha = hash_string(system_instruction) if system_instruction else None + self.chat_template = chat_template + self.chat_template_sha = hash_string(chat_template) if chat_template else None + self.fewshot_as_multiturn = fewshot_as_multiturn + + def log_end_time(self) -> None: + """Logs the end time of the evaluation and calculates the total evaluation time.""" + self.end_time = time.perf_counter() + self.total_evaluation_time_seconds = str(self.end_time - self.start_time) + + +class EvaluationTracker: + """ + Keeps track and saves relevant information of the evaluation process. + Compiles the data from trackers and writes it to files, which can be published to the Hugging Face hub if requested. + """ + + def __init__( + self, + output_path: str = None, + hub_results_org: str = "", + hub_repo_name: str = "", + details_repo_name: str = "", + results_repo_name: str = "", + push_results_to_hub: bool = False, + push_samples_to_hub: bool = False, + public_repo: bool = False, + token: str = "", + leaderboard_url: str = "", + point_of_contact: str = "", + gated: bool = False, + ) -> None: + """ + Creates all the necessary loggers for evaluation tracking. + + Args: + output_path (str): Path to save the results. If not provided, the results won't be saved. + hub_results_org (str): The Hugging Face organization to push the results to. If not provided, the results will be pushed to the owner of the Hugging Face token. + hub_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will be pushed to `lm-eval-results`. + details_repo_name (str): The name of the Hugging Face repository to push the details to. If not provided, the results will be pushed to `lm-eval-results`. + result_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will not be pushed and will be found in the details_hub_repo. + push_results_to_hub (bool): Whether to push the results to the Hugging Face hub. + push_samples_to_hub (bool): Whether to push the samples to the Hugging Face hub. + public_repo (bool): Whether to push the results to a public or private repository. + token (str): Token to use when pushing to the Hugging Face hub. This token should have write access to `hub_results_org`. + leaderboard_url (str): URL to the leaderboard on the Hugging Face hub on the dataset card. + point_of_contact (str): Contact information on the Hugging Face hub dataset card. + gated (bool): Whether to gate the repository. 
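+        Example (illustrative; the output path, token, and organization below are placeholders):
+            tracker = EvaluationTracker(output_path="./logs")
+            # pushing to the Hub additionally requires token="hf_..." and, optionally, hub_results_org="my-org"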
+ """ + self.general_config_tracker = GeneralConfigTracker() + + self.output_path = output_path + self.push_results_to_hub = push_results_to_hub + self.push_samples_to_hub = push_samples_to_hub + self.public_repo = public_repo + self.leaderboard_url = leaderboard_url + self.point_of_contact = point_of_contact + self.api = HfApi(token=token) if token else None + self.gated_repo = gated + + if not self.api and (push_results_to_hub or push_samples_to_hub): + raise ValueError("Hugging Face token is not defined, but 'push_results_to_hub' or 'push_samples_to_hub' is set to True. " "Please provide a valid Hugging Face token by setting the HF_TOKEN environment variable.") + + if self.api and hub_results_org == "" and (push_results_to_hub or push_samples_to_hub): + hub_results_org = self.api.whoami()["name"] + eval_logger.warning(f"hub_results_org was not specified. Results will be pushed to '{hub_results_org}'.") + + if hub_repo_name == "": + details_repo_name = details_repo_name if details_repo_name != "" else "lmms-eval-results" + results_repo_name = results_repo_name if results_repo_name != "" else details_repo_name + else: + details_repo_name = hub_repo_name + results_repo_name = hub_repo_name + eval_logger.warning("hub_repo_name was specified. Both details and results will be pushed to the same repository. Using hub_repo_name is no longer recommended, details_repo_name and results_repo_name should be used instead.") + + self.details_repo = f"{hub_results_org}/{details_repo_name}" + self.details_repo_private = f"{hub_results_org}/{details_repo_name}-private" + self.results_repo = f"{hub_results_org}/{results_repo_name}" + self.results_repo_private = f"{hub_results_org}/{results_repo_name}-private" + + def save_results_aggregated( + self, + results: dict, + samples: dict, + datetime_str: str, + ) -> None: + """ + Saves the aggregated results and samples to the output path and pushes them to the Hugging Face hub if requested. + + Args: + results (dict): The aggregated results to save. + samples (dict): The samples results to save. + datetime_str (str): The datetime string to use for the results file. 
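+        Example (illustrative; `results` and `samples` are the dicts produced by the evaluation run):
+            tracker.save_results_aggregated(results=results, samples=samples, datetime_str=get_datetime_str())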
+ """ + self.general_config_tracker.log_end_time() + + if self.output_path: + try: + eval_logger.info("Saving results aggregated") + + # calculate cumulative hash for each task - only if samples are provided + task_hashes = {} + if samples: + for task_name, task_samples in samples.items(): + sample_hashes = [s["doc_hash"] + s["prompt_hash"] + s["target_hash"] for s in task_samples] + task_hashes[task_name] = hash_string("".join(sample_hashes)) + + # update initial results dict + results.update({"task_hashes": task_hashes}) + results.update(asdict(self.general_config_tracker)) + dumped = json.dumps( + results, + indent=2, + default=handle_non_serializable, + ensure_ascii=False, + ) + + path = Path(self.output_path if self.output_path else Path.cwd()) + path = path.joinpath(self.general_config_tracker.model_name_sanitized) + path.mkdir(parents=True, exist_ok=True) + + self.date_id = datetime_str.replace(":", "-") + file_results_aggregated = path.joinpath(f"{self.date_id}_results.json") + file_results_aggregated.open("w", encoding="utf-8").write(dumped) + + if self.api and self.push_results_to_hub: + repo_id = self.results_repo if self.public_repo else self.results_repo_private + self.api.create_repo( + repo_id=repo_id, + repo_type="dataset", + private=not self.public_repo, + exist_ok=True, + ) + self.api.upload_file( + repo_id=repo_id, + path_or_fileobj=str(path.joinpath(f"{self.date_id}_results.json")), + path_in_repo=os.path.join( + self.general_config_tracker.model_name, + f"{self.date_id}_results.json", + ), + repo_type="dataset", + commit_message=f"Adding aggregated results for {self.general_config_tracker.model_name}", + ) + eval_logger.info("Successfully pushed aggregated results to the Hugging Face Hub. " f"You can find them at: {repo_id}") + + except Exception as e: + eval_logger.warning("Could not save results aggregated") + eval_logger.info(repr(e)) + else: + eval_logger.info("Output path not provided, skipping saving results aggregated") + + def save_results_samples( + self, + task_name: str, + samples: dict, + ) -> None: + """ + Saves the samples results to the output path and pushes them to the Hugging Face hub if requested. + + Args: + task_name (str): The task name to save the samples for. + samples (dict): The samples results to save. 
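+        Example (illustrative; "video_mmmu" is a placeholder task name):
+            tracker.save_results_samples(task_name="video_mmmu", samples=samples["video_mmmu"])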
+ """ + if self.output_path: + try: + eval_logger.info(f"Saving per-sample results for: {task_name}") + + path = Path(self.output_path if self.output_path else Path.cwd()) + path = path.joinpath(self.general_config_tracker.model_name_sanitized) + path.mkdir(parents=True, exist_ok=True) + + file_results_samples = path.joinpath(f"{self.date_id}_samples_{task_name}.jsonl") + + for sample in samples: + # we first need to sanitize arguments and resps + # otherwise we won't be able to load the dataset + # using the datasets library + arguments = {} + for key, value in enumerate(sample["arguments"][1]): # update metadata into args + arguments[key] = value + + sample["input"] = sample["arguments"][0] + sample["resps"] = sanitize_list(sample["resps"]) + sample["filtered_resps"] = sanitize_list(sample["filtered_resps"]) + sample["arguments"] = arguments + sample["target"] = str(sample["target"]) + + sample_dump = ( + json.dumps( + sample, + default=handle_non_serializable, + ensure_ascii=False, + ) + + "\n" + ) + + with open(file_results_samples, "a", encoding="utf-8") as f: + f.write(sample_dump) + + if self.api and self.push_samples_to_hub: + repo_id = self.details_repo if self.public_repo else self.details_repo_private + self.api.create_repo( + repo_id=repo_id, + repo_type="dataset", + private=not self.public_repo, + exist_ok=True, + ) + try: + if self.gated_repo: + headers = build_hf_headers() + r = get_session().put( + url=f"https://huggingface.co/api/datasets/{repo_id}/settings", + headers=headers, + json={"gated": "auto"}, + ) + hf_raise_for_status(r) + except Exception as e: + eval_logger.warning("Could not gate the repository") + eval_logger.info(repr(e)) + self.api.upload_folder( + repo_id=repo_id, + folder_path=str(path), + path_in_repo=self.general_config_tracker.model_name_sanitized, + repo_type="dataset", + commit_message=f"Adding samples results for {task_name} to {self.general_config_tracker.model_name}", + ) + eval_logger.info(f"Successfully pushed sample results for task: {task_name} to the Hugging Face Hub. " f"You can find them at: {repo_id}") + + except Exception as e: + eval_logger.warning("Could not save sample results") + eval_logger.info(repr(e)) + else: + eval_logger.info("Output path not provided, skipping saving sample results") + + def recreate_metadata_card(self) -> None: + """ + Creates a metadata card for the evaluation results dataset and pushes it to the Hugging Face hub. + """ + + eval_logger.info("Recreating metadata card") + repo_id = self.details_repo if self.public_repo else self.details_repo_private + + files_in_repo = self.api.list_repo_files(repo_id=repo_id, repo_type="dataset") + results_files = get_results_filenames(files_in_repo) + sample_files = get_sample_results_filenames(files_in_repo) + + # Build a dictionary to store the latest evaluation datetime for: + # - Each tested model and its aggregated results + # - Each task and sample results, if existing + # i.e. 
{ + # "org__model_name__gsm8k": "2021-09-01T12:00:00", + # "org__model_name__ifeval": "2021-09-01T12:00:00", + # "org__model_name__results": "2021-09-01T12:00:00" + # } + latest_task_results_datetime = defaultdict(lambda: datetime.min.isoformat()) + + for file_path in sample_files: + file_path = Path(file_path) + filename = file_path.name + model_name = file_path.parent + task_name = get_file_task_name(filename) + results_datetime = get_file_datetime(filename) + task_name_sanitized = sanitize_task_name(task_name) + # Results and sample results for the same model and task will have the same datetime + samples_key = f"{model_name}__{task_name_sanitized}" + results_key = f"{model_name}__results" + latest_datetime = max( + latest_task_results_datetime[samples_key], + results_datetime, + ) + latest_task_results_datetime[samples_key] = latest_datetime + latest_task_results_datetime[results_key] = max( + latest_task_results_datetime[results_key], + latest_datetime, + ) + + # Create metadata card + card_metadata = MetadataConfigs() + + # Add the latest aggregated results to the metadata card for easy access + for file_path in results_files: + file_path = Path(file_path) + results_filename = file_path.name + model_name = file_path.parent + eval_date = get_file_datetime(results_filename) + eval_date_sanitized = re.sub(r"[^\w\.]", "_", eval_date) + results_filename = Path("**") / Path(results_filename).name + config_name = f"{model_name}__results" + sanitized_last_eval_date_results = re.sub(r"[^\w\.]", "_", latest_task_results_datetime[config_name]) + + if eval_date_sanitized == sanitized_last_eval_date_results: + # Ensure that all results files are listed in the metadata card + current_results = card_metadata.get(config_name, {"data_files": []}) + current_results["data_files"].append({"split": eval_date_sanitized, "path": [str(results_filename)]}) + card_metadata[config_name] = current_results + # If the results file is the newest, update the "latest" field in the metadata card + card_metadata[config_name]["data_files"].append({"split": "latest", "path": [str(results_filename)]}) + + # Add the tasks details configs + for file_path in sample_files: + file_path = Path(file_path) + filename = file_path.name + model_name = file_path.parent + task_name = get_file_task_name(filename) + eval_date = get_file_datetime(filename) + task_name_sanitized = sanitize_task_name(task_name) + eval_date_sanitized = re.sub(r"[^\w\.]", "_", eval_date) + results_filename = Path("**") / Path(filename).name + config_name = f"{model_name}__{task_name_sanitized}" + sanitized_last_eval_date_results = re.sub(r"[^\w\.]", "_", latest_task_results_datetime[config_name]) + if eval_date_sanitized == sanitized_last_eval_date_results: + # Ensure that all sample results files are listed in the metadata card + current_details_for_task = card_metadata.get(config_name, {"data_files": []}) + current_details_for_task["data_files"].append({"split": eval_date_sanitized, "path": [str(results_filename)]}) + card_metadata[config_name] = current_details_for_task + # If the samples results file is the newest, update the "latest" field in the metadata card + card_metadata[config_name]["data_files"].append({"split": "latest", "path": [str(results_filename)]}) + + # Get latest results and extract info to update metadata card examples + latest_datetime = max(latest_task_results_datetime.values()) + latest_model_name = max(latest_task_results_datetime, key=lambda k: latest_task_results_datetime[k]) + last_results_file = [f for f in results_files if 
latest_datetime.replace(":", "-") in f][0]
+        last_results_file_path = hf_hub_url(repo_id=repo_id, filename=last_results_file, repo_type="dataset")
+        latest_results_file = load_dataset("json", data_files=last_results_file_path, split="train")
+        results_dict = latest_results_file["results"][0]
+        new_dictionary = {"all": results_dict}
+        new_dictionary.update(results_dict)
+        results_string = json.dumps(new_dictionary, indent=4)
+
+        dataset_summary = "Dataset automatically created during the evaluation run of model "
+        if self.general_config_tracker.model_source == "hf":
+            dataset_summary += f"[{self.general_config_tracker.model_name}](https://huggingface.co/{self.general_config_tracker.model_name})\n"
+        else:
+            dataset_summary += f"{self.general_config_tracker.model_name}\n"
+        dataset_summary += (
+            f"The dataset is composed of {len(card_metadata)-1} configuration(s), each one corresponding to one of the evaluated tasks.\n\n"
+            f"The dataset has been created from {len(results_files)} run(s). Each run can be found as a specific split in each "
+            'configuration, the split being named using the timestamp of the run. The "latest" split always points to the latest results.\n\n'
+            'An additional configuration "results" stores all the aggregated results of the run.\n\n'
+            "To load the details from a run, you can for instance do the following:\n"
+        )
+        if self.general_config_tracker.model_source == "hf":
+            dataset_summary += "```python\nfrom datasets import load_dataset\n" f'data = load_dataset(\n\t"{repo_id}",\n\tname="{latest_model_name}",\n\tsplit="latest"\n)\n```\n\n'
+        dataset_summary += (
+            "## Latest results\n\n"
+            f'These are the [latest results from run {latest_datetime}]({last_results_file_path.replace("/resolve/", "/blob/")}) '
+            "(note that there might be results for other tasks in the repo if successive evals didn't cover the same tasks. "
+            'You can find each in the results and the "latest" split for each eval):\n\n'
+            f"```python\n{results_string}\n```"
+        )
+        card_data = DatasetCardData(
+            dataset_summary=dataset_summary,
+            repo_url=f"https://huggingface.co/{self.general_config_tracker.model_name}",
+            pretty_name=f"Evaluation run of {self.general_config_tracker.model_name}",
+            leaderboard_url=self.leaderboard_url,
+            point_of_contact=self.point_of_contact,
+        )
+        card_metadata.to_dataset_card_data(card_data)
+        card = DatasetCard.from_template(
+            card_data,
+            pretty_name=card_data.pretty_name,
+        )
+        card.push_to_hub(repo_id, repo_type="dataset")
diff --git a/lmms_eval/loggers/utils.py b/lmms_eval/loggers/utils.py
new file mode 100644
index 0000000..14257c7
--- /dev/null
+++ b/lmms_eval/loggers/utils.py
@@ -0,0 +1,127 @@
+import os
+import re
+import subprocess
+from pathlib import Path
+from typing import Any, Dict, Optional, Tuple, Union
+
+import numpy as np
+from loguru import logger
+from torch.utils.collect_env import get_pretty_env_info
+from transformers import __version__ as trans_version
+
+
+def remove_none_pattern(input_string: str) -> Tuple[str, bool]:
+    """Remove the ',none' substring from the input_string if it exists at the end.
+
+    Args:
+        input_string (str): The input string from which to remove the ',none' substring.
+
+    Returns:
+        Tuple[str, bool]: A tuple containing the modified input_string with the ',none' substring removed
+        and a boolean indicating whether the modification was made (True) or not (False).
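+    Example:
+        remove_none_pattern("acc,none")     -> ("acc", True)
+        remove_none_pattern("acc,flexible") -> ("acc,flexible", False)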
+ """ + # Define the pattern to match ',none' at the end of the string + pattern = re.compile(r",none$") + + # Use sub() to replace ',none' with an empty string + result = re.sub(pattern, "", input_string) + + # check if the input_string changed + removed = result != input_string + + return result, removed + + +def _handle_non_serializable(o: Any) -> Union[int, str, list]: + """Handle non-serializable objects by converting them to serializable types. + + Args: + o (Any): The object to be handled. + + Returns: + Union[int, str, list]: The converted object. If the object is of type np.int64 or np.int32, + it will be converted to int. If the object is of type set, it will be converted + to a list. Otherwise, it will be converted to str. + """ + if isinstance(o, np.int64) or isinstance(o, np.int32): + return int(o) + elif isinstance(o, set): + return list(o) + else: + return str(o) + + +def get_commit_from_path(repo_path: Union[Path, str]) -> Optional[str]: + try: + git_folder = Path(repo_path, ".git") + if git_folder.is_file(): + git_folder = Path( + git_folder.parent, + git_folder.read_text(encoding="utf-8").split("\n")[0].split(" ")[-1], + ) + if Path(git_folder, "HEAD").exists(): + head_name = Path(git_folder, "HEAD").read_text(encoding="utf-8").split("\n")[0].split(" ")[-1] + head_ref = Path(git_folder, head_name) + git_hash = head_ref.read_text(encoding="utf-8").replace("\n", "") + else: + git_hash = None + except Exception as err: + logger.debug(f"Failed to retrieve a Git commit hash from path: {str(repo_path)}. Error: {err}") + return None + return git_hash + + +def get_git_commit_hash(): + """ + Gets the git commit hash of your current repo (if it exists). + Source: https://github.com/EleutherAI/gpt-neox/blob/b608043be541602170bfcfb8ec9bf85e8a0799e0/megatron/neox_arguments/neox_args.py#L42 + """ + try: + git_hash = subprocess.check_output(["git", "describe", "--always"]).strip() + git_hash = git_hash.decode() + except (subprocess.CalledProcessError, FileNotFoundError): + # FileNotFoundError occurs when git not installed on system + git_hash = get_commit_from_path(os.getcwd()) # git hash of repo if exists + return git_hash + + +def add_env_info(storage: Dict[str, Any]): + try: + pretty_env_info = get_pretty_env_info() + except Exception as err: + pretty_env_info = str(err) + transformers_version = trans_version + upper_dir_commit = get_commit_from_path(Path(os.getcwd(), "..")) # git hash of upper repo if exists + added_info = { + "pretty_env_info": pretty_env_info, + "transformers_version": transformers_version, + "upper_git_hash": upper_dir_commit, # in case this repo is submodule + } + storage.update(added_info) + + +def add_tokenizer_info(storage: Dict[str, Any], lm): + if getattr(lm, "tokenizer", False): + try: + tokenizer_info = { + "tokenizer_pad_token": [ + lm.tokenizer.pad_token, + str(lm.tokenizer.pad_token_id), + ], + "tokenizer_eos_token": [ + lm.tokenizer.eos_token, + str(lm.tokenizer.eos_token_id), + ], + "tokenizer_bos_token": [ + lm.tokenizer.bos_token, + str(lm.tokenizer.bos_token_id), + ], + "eot_token_id": getattr(lm, "eot_token_id", None), + "max_length": getattr(lm, "max_length", None), + } + storage.update(tokenizer_info) + except Exception as err: + logger.debug(f"Logging detailed tokenizer info failed with {err}, skipping...") + # seems gguf and textsynth do not have tokenizer + else: + logger.debug("LM does not have a 'tokenizer' attribute, not logging tokenizer metadata to results.") diff --git a/lmms_eval/loggers/wandb_logger.py 
b/lmms_eval/loggers/wandb_logger.py new file mode 100644 index 0000000..b30b1ce --- /dev/null +++ b/lmms_eval/loggers/wandb_logger.py @@ -0,0 +1,315 @@ +import copy +import json +import logging +from typing import Any, Dict, List, Literal, Tuple + +import numpy as np +import pandas as pd +from loguru import logger +from packaging.version import Version + +from lmms_eval.loggers.utils import _handle_non_serializable, remove_none_pattern + + +def get_wandb_printer() -> Literal["Printer"]: + """Returns a wandb printer instance for pretty stdout.""" + from wandb.sdk.lib.printer import get_printer + from wandb.sdk.wandb_settings import Settings + + printer = get_printer(Settings()._jupyter) + return printer + + +class WandbLogger: + def __init__(self, **kwargs) -> None: + """Attaches to wandb logger if already initialized. Otherwise, passes kwargs to wandb.init() + + Args: + kwargs Optional[Any]: Arguments for configuration. + + Parse and log the results returned from evaluator.simple_evaluate() with: + wandb_logger.post_init(results) + wandb_logger.log_eval_result() + wandb_logger.log_eval_samples(results["samples"]) + """ + try: + import wandb + + assert Version(wandb.__version__) >= Version("0.13.6") + if Version(wandb.__version__) < Version("0.13.6"): + wandb.require("report-editing:v0") + except Exception as e: + logger.warning("To use the wandb reporting functionality please install wandb>=0.13.6.\n" "To install the latest version of wandb run `pip install wandb --upgrade`\n" f"{e}") + + self.wandb_args: Dict[str, Any] = kwargs + + # initialize a W&B run + if wandb.run is None: + self.run = wandb.init(**self.wandb_args) + else: + self.run = wandb.run + + self.printer = get_wandb_printer() + + def post_init(self, results: Dict[str, Any]) -> None: + self.results: Dict[str, Any] = copy.deepcopy(results) + self.task_names: List[str] = list(results.get("results", {}).keys()) + self.group_names: List[str] = list(results.get("groups", {}).keys()) + + def _get_config(self) -> Dict[str, Any]: + """Get configuration parameters.""" + self.task_configs = self.results.get("configs", {}) + cli_configs = self.results.get("config", {}) + configs = { + "task_configs": self.task_configs, + "cli_configs": cli_configs, + } + + return configs + + def _sanitize_results_dict(self) -> Tuple[Dict[str, str], Dict[str, Any]]: + """Sanitize the results dictionary.""" + _results = copy.deepcopy(self.results.get("results", dict())) + + # Remove None from the metric string name + tmp_results = copy.deepcopy(_results) + for task_name in self.task_names: + task_result = tmp_results.get(task_name, dict()) + for metric_name, metric_value in task_result.items(): + _metric_name, removed = remove_none_pattern(metric_name) + if removed: + _results[task_name][_metric_name] = metric_value + _results[task_name].pop(metric_name) + + # remove string valued keys from the results dict + wandb_summary = {} + for task in self.task_names: + task_result = _results.get(task, dict()) + for metric_name, metric_value in task_result.items(): + if isinstance(metric_value, str): + wandb_summary[f"{task}/{metric_name}"] = metric_value + + for summary_metric, summary_value in wandb_summary.items(): + _task, _summary_metric = summary_metric.split("/") + _results[_task].pop(_summary_metric) + + tmp_results = copy.deepcopy(_results) + for task_name, task_results in tmp_results.items(): + for metric_name, metric_value in task_results.items(): + _results[f"{task_name}/{metric_name}"] = metric_value + _results[task_name].pop(metric_name) + for task in 
self.task_names: + _results.pop(task) + + return wandb_summary, _results + + def _log_results_as_table(self) -> None: + """Generate and log evaluation results as a table to W&B.""" + columns = [ + "Version", + "Filter", + "num_fewshot", + "Metric", + "Value", + "Stderr", + ] + + def make_table(columns: List[str], key: str = "results"): + import wandb + + table = wandb.Table(columns=columns) + results = copy.deepcopy(self.results) + + for k, dic in results.get(key).items(): + if k in self.group_names and not key == "groups": + continue + version = results.get("versions").get(k) + if version == "N/A": + version = None + n = results.get("n-shot").get(k) + + for (mf), v in dic.items(): + m, _, f = mf.partition(",") + if m.endswith("_stderr"): + continue + if m == "alias": + continue + + if m + "_stderr" + "," + f in dic: + se = dic[m + "_stderr" + "," + f] + if se != "N/A": + se = "%.4f" % se + table.add_data(*[k, version, f, n, m, str(v), str(se)]) + else: + table.add_data(*[k, version, f, n, m, str(v), ""]) + + return table + + # log the complete eval result to W&B Table + table = make_table(["Tasks"] + columns, "results") + self.run.log({"evaluation/eval_results": table}) + + if "groups" in self.results.keys(): + table = make_table(["Groups"] + columns, "groups") + self.run.log({"evaluation/group_eval_results": table}) + + def _log_results_as_artifact(self) -> None: + """Log results as JSON artifact to W&B.""" + import wandb + + dumped = json.dumps(self.results, indent=2, default=_handle_non_serializable, ensure_ascii=False) + artifact = wandb.Artifact("results", type="eval_results") + with artifact.new_file("results.json", mode="w", encoding="utf-8") as f: + f.write(dumped) + self.run.log_artifact(artifact) + + def log_eval_result(self) -> None: + """Log evaluation results to W&B.""" + # Log configs to wandb + configs = self._get_config() + self.run.config.update(configs) + + wandb_summary, self.wandb_results = self._sanitize_results_dict() + # update wandb.run.summary with items that were removed + self.run.summary.update(wandb_summary) + # Log the evaluation metrics to wandb + self.run.log(self.wandb_results) + # Log the evaluation metrics as W&B Table + self._log_results_as_table() + # Log the results dict as json to W&B Artifacts + self._log_results_as_artifact() + + def _generate_dataset(self, data: List[Dict[str, Any]], config: Dict[str, Any]) -> pd.DataFrame: + """Generate a dataset from evaluation data. + + Args: + data (List[Dict[str, Any]]): The data to generate a dataset for. + config (Dict[str, Any]): The configuration of the task. + + Returns: + pd.DataFrame: A dataframe that is ready to be uploaded to W&B. 
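+        Example (illustrative; "video_mmmu" is a placeholder task name):
+            df = self._generate_dataset(samples["video_mmmu"], self.task_configs.get("video_mmmu"))
+            self.run.log({"video_mmmu_eval_results": df})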
+ """ + ids = [x["doc_id"] for x in data] + labels = [x["target"] for x in data] + instance = [""] * len(ids) + resps = [""] * len(ids) + filtered_resps = [""] * len(ids) + model_outputs = {} + + metrics_list = config["metric_list"] + metrics = {} + for metric in metrics_list: + metric = metric.get("metric") + if metric in ["word_perplexity", "byte_perplexity", "bits_per_byte"]: + metrics[f"{metric}_loglikelihood"] = [x[metric][0] for x in data] + if metric in ["byte_perplexity", "bits_per_byte"]: + metrics[f"{metric}_bytes"] = [x[metric][1] for x in data] + else: + metrics[f"{metric}_words"] = [x[metric][1] for x in data] + else: + metrics[metric] = [x[metric] for x in data] + + if config["output_type"] == "loglikelihood": + instance = [x["arguments"][0][0] for x in data] + labels = [x["arguments"][0][1] for x in data] + resps = [f'log probability of continuation is {x["resps"][0][0][0]} ' + "\n\n" + "continuation will {} generated with greedy sampling".format("not be" if not x["resps"][0][0][1] else "be") for x in data] + filtered_resps = [f'log probability of continuation is {x["filtered_resps"][0][0]} ' + "\n\n" + "continuation will {} generated with greedy sampling".format("not be" if not x["filtered_resps"][0][1] else "be") for x in data] + elif config["output_type"] == "multiple_choice": + instance = [x["arguments"][0][0] for x in data] + choices = ["\n".join([f"{idx}. {y[1]}" for idx, y in enumerate(x["arguments"])]) for x in data] + resps = [np.argmax([n[0][0] for n in x["resps"]]) for x in data] + filtered_resps = [np.argmax([n[0] for n in x["filtered_resps"]]) for x in data] + elif config["output_type"] == "loglikelihood_rolling": + instance = [x["arguments"][0][0] for x in data] + resps = [x["resps"][0][0] for x in data] + filtered_resps = [x["filtered_resps"][0] for x in data] + elif config["output_type"] == "generate_until": + instance = [x["arguments"][0][0] for x in data] + resps = [x["resps"][0][0] for x in data] + filtered_resps = [x["filtered_resps"][0] for x in data] + + model_outputs["raw_predictions"] = resps + model_outputs["filtered_predictions"] = filtered_resps + + df_data = { + "id": ids, + "data": instance, + } + if config["output_type"] == "multiple_choice": + df_data["choices"] = choices + + tmp_data = { + "input_len": [len(x) for x in instance], + "labels": labels, + "output_type": config["output_type"], + } + df_data.update(tmp_data) + df_data.update(model_outputs) + df_data.update(metrics) + + return pd.DataFrame(df_data) + + def _log_samples_as_artifact(self, data: List[Dict[str, Any]], task_name: str) -> None: + import wandb + + # log the samples as an artifact + dumped = json.dumps( + data, + indent=2, + default=_handle_non_serializable, + ensure_ascii=False, + ) + artifact = wandb.Artifact(f"{task_name}", type="samples_by_task") + with artifact.new_file(f"{task_name}_eval_samples.json", mode="w", encoding="utf-8") as f: + f.write(dumped) + self.run.log_artifact(artifact) + # artifact.wait() + + def log_eval_samples(self, samples: Dict[str, List[Dict[str, Any]]]) -> None: + """Log evaluation samples to W&B. + + Args: + samples (Dict[str, List[Dict[str, Any]]]): Evaluation samples for each task. 
+ """ + task_names: List[str] = [x for x in self.task_names if x not in self.group_names] + + ungrouped_tasks = [] + tasks_by_groups = {} + + for task_name in task_names: + group_names = self.task_configs[task_name].get("group", None) + if group_names: + if isinstance(group_names, str): + group_names = [group_names] + + for group_name in group_names: + if not tasks_by_groups.get(group_name): + tasks_by_groups[group_name] = [task_name] + else: + tasks_by_groups[group_name].append(task_name) + else: + ungrouped_tasks.append(task_name) + + for task_name in ungrouped_tasks: + eval_preds = samples[task_name] + + # log the samples as a W&B Table + df = self._generate_dataset(eval_preds, self.task_configs.get(task_name)) + self.run.log({f"{task_name}_eval_results": df}) + + # log the samples as a json file as W&B Artifact + self._log_samples_as_artifact(eval_preds, task_name) + + for group, grouped_tasks in tasks_by_groups.items(): + grouped_df = pd.DataFrame() + for task_name in grouped_tasks: + eval_preds = samples[task_name] + df = self._generate_dataset(eval_preds, self.task_configs.get(task_name)) + df["group"] = group + df["task"] = task_name + grouped_df = pd.concat([grouped_df, df], ignore_index=True) + + # log the samples as a json file as W&B Artifact + self._log_samples_as_artifact(eval_preds, task_name) + + self.run.log({f"{group}_eval_results": grouped_df}) diff --git a/lmms_eval/logging_utils.py b/lmms_eval/logging_utils.py new file mode 100755 index 0000000..27f2551 --- /dev/null +++ b/lmms_eval/logging_utils.py @@ -0,0 +1,368 @@ +# Code mostly from: https://github.com/EleutherAI/lm-evaluation-harness/pull/1339, credit to: https://github.com/ayulockin +import copy +import glob +import json +import os +import re +from datetime import datetime +from typing import Any, Dict, List, Literal, Tuple, Union + +import numpy as np +import pandas as pd +import tenacity +from loguru import logger +from packaging.version import Version + +from lmms_eval import utils + +try: + import wandb + + assert Version(wandb.__version__) >= Version("0.13.6") + if Version(wandb.__version__) < Version("0.13.6"): + wandb.require("report-editing:v0") +except Exception as e: + logger.warning("To use the wandb reporting functionality please install wandb>=0.13.6.\n" "To install the latest version of wandb run `pip install wandb --upgrade`\n" f"{e}") + + +def remove_none_pattern(input_string): + # Define the pattern to match ',none' at the end of the string + pattern = re.compile(r",none$") + + # Use sub() to replace ',none' with an empty string + result = re.sub(pattern, "", input_string) + + # check if the input_string changed + removed = result != input_string + + return result, removed + + +def _handle_non_serializable(o: Any) -> Union[int, str, list]: + """Handle non-serializable objects by converting them to serializable types. + + Args: + o (Any): The object to be handled. + + Returns: + Union[int, str, list]: The converted object. If the object is of type np.int64 or np.int32, + it will be converted to int. If the object is of type set, it will be converted + to a list. Otherwise, it will be converted to str. 
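+    Example:
+        _handle_non_serializable(np.int64(7)) -> 7
+        _handle_non_serializable({"a", "b"})  -> ["a", "b"]  (set order is not guaranteed)
+        _handle_non_serializable(object())    -> the object's str() representation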
+ """ + if isinstance(o, np.int64) or isinstance(o, np.int32): + return int(o) + elif isinstance(o, set): + return list(o) + else: + return str(o) + + +def get_wandb_printer() -> Literal["Printer"]: + """Returns a wandb printer instance for pretty stdout.""" + from wandb.sdk.lib.printer import get_printer + from wandb.sdk.wandb_settings import Settings + + printer = get_printer(Settings()._jupyter) + return printer + + +# class WandbLogger: +class WandbLogger: + def __init__(self, args): + self.wandb_args = utils.simple_parse_args_string(args.wandb_args) + self.args = args + self.all_args_dict = vars(args) + self.printer = get_wandb_printer() + try: + self.init_run() + except Exception as e: + logger.warning(f"Failed to initialize W&B run: {e}") + os.environ["WANDB_MODE"] = "offline" + self.init_run() + + def finish(self): + self.run.finish() + + @tenacity.retry(wait=tenacity.wait_fixed(5), stop=tenacity.stop_after_attempt(5)) + def init_run(self): + if "name" not in self.wandb_args: + if "config" in self.all_args_dict and self.all_args_dict["config"] != "": + self.wandb_args["name"] = self.all_args_dict["config"].split("/")[-1].replace(".yaml", "") + "/" + self.args.log_samples_suffix + else: + task_names = self.args.tasks.replace(",", "/") + self.wandb_args["name"] = f"{self.args.model}/<{task_names}>/{self.args.log_samples_suffix}" + if self.args.num_fewshot: + self.wandb_args["name"] += f"_{self.args.num_fewshot}shot" + if "project" not in self.wandb_args: + self.wandb_args["project"] = "lmms-eval" + # initialize a W&B run + self.run = wandb.init(**self.wandb_args) + + def post_init(self, results: Dict[str, Any]) -> None: + self.results: Dict[str, Any] = copy.deepcopy(results) + self.task_names: List[str] = list(results.get("results", {}).keys()) + self.group_names: List[str] = list(results.get("groups", {}).keys()) + + def _get_config(self) -> Dict[str, Any]: + """Get configuration parameters.""" + self.task_configs = self.results.get("configs", {}) + cli_configs = self.results.get("config", {}) + configs = { + "task_configs": self.task_configs, + "cli_configs": cli_configs, + } + + return configs + + def _sanitize_results_dict(self) -> Tuple[Dict[str, str], Dict[str, Any]]: + """Sanitize the results dictionary.""" + _results = copy.deepcopy(self.results.get("results", dict())) + _results["model_configs"] = self.results.get("model_configs", dict()) + + # Remove None from the metric string name + tmp_results = copy.deepcopy(_results) + for task_name in self.task_names: + task_result = tmp_results.get(task_name, dict()) + for metric_name, metric_value in task_result.items(): + _metric_name, removed = remove_none_pattern(metric_name) + if removed: + _results[task_name][_metric_name] = metric_value + _results[task_name].pop(metric_name) + + # remove string valued keys from the results dict + wandb_summary = {} + for task in self.task_names: + task_result = _results.get(task, dict()) + for metric_name, metric_value in task_result.items(): + if isinstance(metric_value, str): + wandb_summary[f"{task}/{metric_name}"] = metric_value + + wandb_summary["model_configs"] = self.results.get("model_configs", dict()) + for summary_metric, summary_value in wandb_summary.items(): + if summary_metric != "model_configs": + _task, _summary_metric = summary_metric.split("/") + _results[_task].pop(_summary_metric) + + tmp_results = copy.deepcopy(_results) + for task_name, task_results in tmp_results.items(): + if task_name != "model_configs": + for metric_name, metric_value in task_results.items(): + 
_results[f"{task_name}/{metric_name}"] = metric_value + _results[task_name].pop(metric_name) + for task in self.task_names: + _results.pop(task) + + return wandb_summary, _results + + def _log_results_as_table(self) -> None: + """Generate and log evaluation results as a table to W&B.""" + columns = [ + "Model", + "Args", + "Tasks", + "Version", + "Filter", + "num_fewshot", + "Metric", + "Value", + "Stderr", + ] + + def make_table(columns: List[str], key: str = "results"): + table = wandb.Table(columns=columns) + results = copy.deepcopy(self.results) + + model_name = results.get("model_configs").get("model") + model_args = results.get("model_configs").get("model_args") + + for k, dic in results.get(key).items(): + if k in self.group_names and not key == "groups": + continue + version = results.get("versions").get(k) + if version == "N/A": + version = None + n = results.get("n-shot").get(k) + + for (mf), v in dic.items(): + m, _, f = mf.partition(",") + if m.endswith("_stderr"): + continue + if m == "alias": + continue + + if m + "_stderr" + "," + f in dic: + se = dic[m + "_stderr" + "," + f] + if se != "N/A": + se = "%.4f" % se + data = [model_name, model_args, k, version, f, n, m, str(v), str(se)] + if key == "groups": + data = [self.group_names] + data + table.add_data(*data) + else: + data = [model_name, model_args, k, version, f, n, m, str(v), ""] + if key == "groups": + data = [self.group_names] + data + table.add_data(*data) + + return table + + # log the complete eval result to W&B Table + table = make_table(columns, "results") + self.run.log({"evaluation/eval_results": table}) + + if "groups" in self.results.keys(): + table = make_table(["Groups"] + columns, "groups") + self.run.log({"evaluation/group_eval_results": table}) + + def _log_results_as_artifact(self) -> None: + """Log results as JSON artifact to W&B.""" + dumped = json.dumps(self.results, indent=2, default=_handle_non_serializable, ensure_ascii=False) + artifact = wandb.Artifact("results", type="eval_results") + with artifact.new_file("results.json", mode="w", encoding="utf-8") as f: + f.write(dumped) + self.run.log_artifact(artifact) + + def log_eval_result(self) -> None: + """Log evaluation results to W&B.""" + # Log configs to wandb + configs = self._get_config() + self.run.config.update(configs, allow_val_change=True) + + wandb_summary, self.wandb_results = self._sanitize_results_dict() + # update wandb.run.summary with items that were removed + self.run.summary.update(wandb_summary) + # Log the evaluation metrics to wandb + self.run.log(self.wandb_results) + # Log the evaluation metrics as W&B Table + self._log_results_as_table() + # Log the results dict as json to W&B Artifacts + self._log_results_as_artifact() + + def _generate_dataset(self, data: List[Dict[str, Any]], config: Dict[str, Any]) -> pd.DataFrame: + """Generate a dataset from evaluation data. + + Args: + data (List[Dict[str, Any]]): The data to generate a dataset for. + config (Dict[str, Any]): The configuration of the task. + + Returns: + pd.DataFrame: A dataframe that is ready to be uploaded to W&B. 
+ """ + ids = [x["doc_id"] for x in data] + labels = [x["target"] for x in data] + instance = [""] * len(ids) + resps = [""] * len(ids) + filtered_resps = [""] * len(ids) + model_outputs = {} + + metrics_list = config["metric_list"] + metrics = {} + for metric in metrics_list: + metric = metric.get("metric") + if metric in ["word_perplexity", "byte_perplexity", "bits_per_byte"]: + metrics[f"{metric}_loglikelihood"] = [x[metric][0] for x in data] + if metric in ["byte_perplexity", "bits_per_byte"]: + metrics[f"{metric}_bytes"] = [x[metric][1] for x in data] + else: + metrics[f"{metric}_words"] = [x[metric][1] for x in data] + else: + metrics[metric] = [x[metric] for x in data] + + if config["output_type"] == "loglikelihood": + instance = [x["arguments"][0][0] for x in data] + labels = [x["arguments"][0][1] for x in data] + resps = [f'log probability of continuation is {x["resps"][0][0][0]} ' + "\n\n" + "continuation will {} generated with greedy sampling".format("not be" if not x["resps"][0][0][1] else "be") for x in data] + filtered_resps = [f'log probability of continuation is {x["filtered_resps"][0][0]} ' + "\n\n" + "continuation will {} generated with greedy sampling".format("not be" if not x["filtered_resps"][0][1] else "be") for x in data] + elif config["output_type"] == "multiple_choice": + instance = [x["arguments"][0][0] for x in data] + choices = ["\n".join([f"{idx}. {y[1]}" for idx, y in enumerate(x["arguments"])]) for x in data] + resps = [np.argmax([n[0][0] for n in x["resps"]]) for x in data] + filtered_resps = [np.argmax([n[0] for n in x["filtered_resps"]]) for x in data] + elif "generate_until" in config["output_type"]: + instance = [x["arguments"][0][0] for x in data] + resps = [x["resps"][0][0] for x in data] + filtered_resps = [x["filtered_resps"][0] for x in data] + + model_outputs["raw_predictions"] = resps + model_outputs["filtered_predictions"] = filtered_resps + + df_data = { + "id": ids, + "data": instance, + } + if config["output_type"] == "multiple_choice": + df_data["choices"] = choices + + tmp_data = { + "input_len": [len(x) for x in instance], + "labels": labels, + "output_type": config["output_type"], + } + df_data.update(tmp_data) + df_data.update(model_outputs) + df_data.update(metrics) + + return pd.DataFrame(df_data) + + def _log_samples_as_artifact(self, data: List[Dict[str, Any]], task_name: str) -> None: + # log the samples as an artifact + dumped = json.dumps( + data, + indent=2, + default=_handle_non_serializable, + ensure_ascii=False, + ) + artifact = wandb.Artifact(f"{task_name}", type="samples_by_task") + with artifact.new_file(f"{task_name}_eval_samples.json", mode="w", encoding="utf-8") as f: + f.write(dumped) + self.run.log_artifact(artifact) + # artifact.wait() + + def log_eval_samples(self, samples: Dict[str, List[Dict[str, Any]]]) -> None: + """Log evaluation samples to W&B. + + Args: + samples (Dict[str, List[Dict[str, Any]]]): Evaluation samples for each task. 
+ """ + task_names: List[str] = [x for x in self.task_names if x not in self.group_names] + + ungrouped_tasks = [] + tasks_by_groups = {} + + for task_name in task_names: + group_names = self.task_configs[task_name].get("group", None) + if group_names: + if isinstance(group_names, str): + group_names = [group_names] + + for group_name in group_names: + if not tasks_by_groups.get(group_name): + tasks_by_groups[group_name] = [task_name] + else: + tasks_by_groups[group_name].append(task_name) + else: + ungrouped_tasks.append(task_name) + + for task_name in ungrouped_tasks: + eval_preds = samples[task_name] + + # log the samples as a W&B Table + df = self._generate_dataset(eval_preds, self.task_configs.get(task_name)) + self.run.log({f"{task_name}_eval_results": df}) + + # log the samples as a json file as W&B Artifact + self._log_samples_as_artifact(eval_preds, task_name) + + for group, grouped_tasks in tasks_by_groups.items(): + grouped_df = pd.DataFrame() + for task_name in grouped_tasks: + eval_preds = samples[task_name] + df = self._generate_dataset(eval_preds, self.task_configs.get(task_name)) + df["group"] = group + df["task"] = task_name + grouped_df = pd.concat([grouped_df, df], ignore_index=True) + + # log the samples as a json file as W&B Artifact + self._log_samples_as_artifact(eval_preds, task_name) + + self.run.log({f"{group}_eval_results": grouped_df}) diff --git a/lmms_eval/models/__init__.py b/lmms_eval/models/__init__.py new file mode 100644 index 0000000..1fc9b25 --- /dev/null +++ b/lmms_eval/models/__init__.py @@ -0,0 +1,81 @@ +import importlib +import os +import sys + +import hf_transfer +from loguru import logger + +os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" + +logger.remove() +logger.add(sys.stdout, level="WARNING") + +AVAILABLE_MODELS = { + "auroracap": "AuroraCap", + "batch_gpt4": "BatchGPT4", + "claude": "Claude", + "cogvlm2": "CogVLM2", + "from_log": "FromLog", + "fuyu": "Fuyu", + "gemini_api": "GeminiAPI", + "gpt4v": "GPT4V", + "idefics2": "Idefics2", + "instructblip": "InstructBLIP", + "internvl": "InternVLChat", + "internvl2": "InternVL2", + "llama_vid": "LLaMAVid", + "llava": "Llava", + "llava_hf": "LlavaHf", + "llava_onevision": "Llava_OneVision", + "llava_onevision_moviechat": "Llava_OneVision_MovieChat", + "llava_sglang": "LlavaSglang", + "llava_vid": "LlavaVid", + "longva": "LongVA", + "mantis": "Mantis", + "minicpm_v": "MiniCPM_V", + "minimonkey": "MiniMonkey", + "moviechat": "MovieChat", + "mplug_owl_video": "mplug_Owl", + "phi3v": "Phi3v", + "qwen_vl": "Qwen_VL", + "qwen2_vl": "Qwen2_VL", + "qwen_vl_api": "Qwen_VL_API", + "reka": "Reka", + "srt_api": "SRT_API", + "tinyllava": "TinyLlava", + "videoChatGPT": "VideoChatGPT", + "video_llava": "VideoLLaVA", + "vila": "VILA", + "xcomposer2_4KHD": "XComposer2_4KHD", + "internvideo2": "InternVideo2", + "xcomposer2d5": "XComposer2D5", + "oryx": "Oryx", + "videochat2": "VideoChat2", + "llama_vision": "LlamaVision", + "aria": "Aria", +} + + +def get_model(model_name): + if model_name not in AVAILABLE_MODELS: + raise ValueError(f"Model {model_name} not found in available models.") + + model_class = AVAILABLE_MODELS[model_name] + if "." 
not in model_class: + model_class = f"lmms_eval.models.{model_name}.{model_class}" + + try: + model_module, model_class = model_class.rsplit(".", 1) + module = __import__(model_module, fromlist=[model_class]) + return getattr(module, model_class) + except Exception as e: + logger.error(f"Failed to import {model_class} from {model_name}: {e}") + raise + + +if os.environ.get("LMMS_EVAL_PLUGINS", None): + # Allow specifying other packages to import models from + for plugin in os.environ["LMMS_EVAL_PLUGINS"].split(","): + m = importlib.import_module(f"{plugin}.models") + for model_name, model_class in getattr(m, "AVAILABLE_MODELS").items(): + AVAILABLE_MODELS[model_name] = f"{plugin}.models.{model_name}.{model_class}" diff --git a/lmms_eval/models/aria.py b/lmms_eval/models/aria.py new file mode 100644 index 0000000..cc51314 --- /dev/null +++ b/lmms_eval/models/aria.py @@ -0,0 +1,338 @@ +import re +import warnings +from typing import List, Optional, Tuple, Union + +import numpy as np +import PIL +import torch +from accelerate import Accelerator, DistributedType +from accelerate.state import AcceleratorState +from decord import VideoReader, cpu +from PIL import Image +from tqdm import tqdm +from transformers import AutoModelForCausalLM, AutoProcessor + +from lmms_eval import utils +from lmms_eval.api.instance import Instance +from lmms_eval.api.model import lmms +from lmms_eval.api.registry import register_model +from lmms_eval.models.model_utils.load_video import read_video_pyav_pil + +warnings.filterwarnings("ignore") + +from loguru import logger as eval_logger + +DEFAULT_IMAGE_TOKEN = "" + + +@register_model("aria") +class Aria(lmms): + """ + Llava Model for Hugging Face Transformers: https://huggingface.co/docs/transformers/v4.39.3/en/model_doc/llava + + Adapted from the LLaVA-HF model in lmms_eval/models/llava_hf.py + + Example usage: + + accelerate launch --num_processes=8 --main_process_port 12345 -m lmms_eval \ + --model aria \ + --model_args pretrained=rhymes-ai/Aria \ + --tasks seedbench \ + --batch_size 1 \ + --output_path ./logs/ \ + --log_samples + """ + + def __init__( + self, + pretrained: str = "rhymes-ai/Aria", + revision: str = "main", + device: str = "cuda", + dtype: Optional[Union[str, torch.dtype]] = "auto", + batch_size: int = 1, + attn_implementation: Optional[str] = None, + device_map: str = "auto", + chat_template: Optional[str] = None, + use_cache: bool = True, + specified_eot_token_id: Optional[int] = None, + max_frames_num: Optional[int] = 64, + **kwargs, + ) -> None: + super().__init__() + # Do not use kwargs for now + assert kwargs == {}, f"Unexpected kwargs: {kwargs}" + + accelerator = Accelerator() + if accelerator.num_processes > 1 and device_map == "": + self._device = torch.device(f"cuda:{accelerator.local_process_index}") + self.device_map = f"cuda:{accelerator.local_process_index}" + else: + self._device = torch.device(device) + self.device_map = device_map + if isinstance(dtype, str) and dtype != "auto": + dtype = getattr(torch, dtype) + + self.max_frames_num = max_frames_num + self._model = AutoModelForCausalLM.from_pretrained(pretrained, revision=revision, device_map=self.device_map, torch_dtype=torch.bfloat16, trust_remote_code=True, attn_implementation=attn_implementation) + + self.pretrained = pretrained + self._image_processor = AutoProcessor.from_pretrained(pretrained, revision=revision, trust_remote_code=True) + self._tokenizer = self._image_processor.tokenizer + + self._config = self._model.config + self.batch_size_per_gpu = int(batch_size) + 
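# NOTE: generation below is effectively unbatched; generate_until() asserts
+        # batch_size_per_gpu == 1, so larger batch sizes are not supported yet.
+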
self.chat_template = chat_template
+        self.specified_eot_token_id = specified_eot_token_id
+        if accelerator.num_processes > 1 and device_map == "":
+            assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported."
+            # If you want to use DistributedType.DEEPSPEED, you have to run `accelerate config` before using the model.
+            # You also have to select zero stage 0 (equivalent to DDP) for the model preparation to work;
+            # setting other parameters in the kwargs to make the default zero stage 2 work did not succeed.
+            if accelerator.distributed_type == DistributedType.DEEPSPEED:
+                kwargs = {
+                    "train_micro_batch_size_per_gpu": self.batch_size_per_gpu,
+                    "train_batch_size": self.batch_size_per_gpu * accelerator.num_processes,
+                }
+                AcceleratorState().deepspeed_plugin.deepspeed_config_process(must_match=True, **kwargs)
+                eval_logger.info("Detected that you are using DistributedType.DEEPSPEED. Make sure you run `accelerate config` and set zero stage to 0")
+            if accelerator.distributed_type == DistributedType.FSDP or accelerator.distributed_type == DistributedType.DEEPSPEED:
+                self._model = accelerator.prepare(self.model)
+            else:
+                self._model = accelerator.prepare_model(self.model, evaluation_mode=True)
+            self.accelerator = accelerator
+            if self.accelerator.is_local_main_process:
+                eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism")
+            self._rank = self.accelerator.local_process_index
+            self._world_size = self.accelerator.num_processes
+        elif accelerator.num_processes == 1 and device_map == "auto":
+            eval_logger.info(f"Using {accelerator.num_processes} devices with pipeline parallelism")
+            self._rank = 0
+            self._world_size = 1
+        else:
+            eval_logger.info(f"Using single device: {self._device}")
+            self.model.to(self._device)
+            self._rank = 0
+            self._world_size = 1
+        self.accelerator = accelerator
+
+    @property
+    def config(self):
+        # return the associated transformers.AutoConfig for the given pretrained model.
+ return self._config + + @property + def tokenizer(self): + return self._tokenizer + + @property + def model(self): + # returns the model, unwrapping it if using Accelerate + if hasattr(self, "accelerator"): + return self.accelerator.unwrap_model(self._model) + else: + return self._model + + @property + def eot_token_id(self): + # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence* + return self.tokenizer.eos_token_id + + @property + def max_length(self): + return self._max_length + + @property + def batch_size(self): + return self.batch_size_per_gpu + + @property + def device(self): + return self._device + + @property + def rank(self): + return self._rank + + @property + def world_size(self): + return self._world_size + + def tok_encode(self, string: str, left_truncate_len=None, add_special_tokens=None) -> List[int]: + """ """ + add_special_tokens = False if add_special_tokens is None else add_special_tokens + encoding = self.tokenizer.encode(string, add_special_tokens=add_special_tokens) + # left-truncate the encoded context to be at most `left_truncate_len` tokens long + if left_truncate_len: + encoding = encoding[-left_truncate_len:] + return encoding + + def tok_decode(self, tokens): + return self.tokenizer.decode(tokens) + + def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: + raise NotImplementedError("Not implemented for Aria.") + + def flatten(self, input): + new_list = [] + for i in input: + for j in i: + new_list.append(j) + return new_list + + def load_video(self, video_path, max_frames_num): + if isinstance(video_path, list): + video_path = video_path[0] + return read_video_pyav_pil(video_path, num_frm=max_frames_num) + # if type(video_path) == str: + # vr = VideoReader(video_path, ctx=cpu(0)) + # else: + # vr = VideoReader(video_path[0], ctx=cpu(0)) + # total_frame_num = len(vr) + # uniform_sampled_frames = np.linspace(0, total_frame_num - 1, max_frames_num, dtype=int) + # frame_idx = uniform_sampled_frames.tolist() + # spare_frames = vr.get_batch(frame_idx).asnumpy() + # spare_frames = [Image.fromarray(x) for x in spare_frames] + # return spare_frames # (frames, height, width, channels) + + def generate_until(self, requests: List[Instance]) -> List[str]: + res = [] + + def _collate(x): + # the negative sign on len(toks) sorts descending - this has a few advantages: + # - time estimates will always be over not underestimates, which is more useful for planning + # - to know the size of a batch when going through the list, you know the first one is always the batch + # padded context length. this is useful to simplify the batching logic and more importantly to make + # automatic adaptive batches much much easier to implement + # - any OOMs will happen right away rather than near the end + toks = self.tok_encode(x[0]) + return -len(toks), x[0] + + # we group requests by their generation_kwargs, + # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling + # in the same batch. 
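+        # Requests are chunked by identical gen_kwargs; for each chunk the visuals
+        # decide the task type (no visuals -> text, PIL images -> image, file paths
+        # -> video), and video frames are preprocessed at max_image_size=490 while
+        # still images use 980.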
+ re_ords = utils.Collator([reg.args for reg in requests], _collate, grouping=True) + chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None) + num_iters = len(requests) // self.batch_size if len(requests) % self.batch_size == 0 else len(requests) // self.batch_size + 1 + pbar = tqdm(total=num_iters, disable=(self.rank != 0), desc="Model Responding") + for chunk in chunks: + contexts, all_gen_kwargs, doc_to_visual, doc_id, task, split = zip(*chunk) + task = task[0] + split = split[0] + visuals = [doc_to_visual[0](self.task_dict[task][split][ids]) for ids in doc_id] + visuals = self.flatten(visuals) + if len(visuals) == 0: + task_type = "text" + elif isinstance(visuals[0], PIL.Image.Image): + task_type = "image" + elif isinstance(visuals[0], str): + task_type = "video" + # we assume all gen kwargs in the batch are the same + # this is safe to assume because the `grouper` object ensures it. + gen_kwargs = all_gen_kwargs[0] + + assert self.batch_size_per_gpu == 1, "Do not support batch_size_per_gpu > 1 for now" + text_context = contexts[0] + text_context = text_context.replace("\n\n", "\n") + + context = [] + + if task_type == "video": + try: + visuals = self.load_video(visuals, self.max_frames_num) + except Exception as e: + res.append("") + eval_logger.info(f"Error {e} when loading video : {visuals}") + pbar.update(1) + + if DEFAULT_IMAGE_TOKEN not in context: + context += [{"text": None, "type": "image"}] * len(visuals) + context += [{"text": "\n" + text_context, "type": "text"}] + else: + assert text_context.count(DEFAULT_IMAGE_TOKEN) == len(visuals) + for i, text_chunk in enumerate(text_context.split(DEFAULT_IMAGE_TOKEN)): + context += [{"text": text_chunk, "type": "text"}] + if i < len(visuals): + context += [{"text": None, "type": "image"}] * len(visuals) + context += [{"text": "\n", "type": "text"}] + + # Apply chat template + messages = [{"role": "user", "content": context}] + + text = self._image_processor.apply_chat_template(messages, add_generation_prompt=True) + + # removing redundant placeholders + text = re.sub(r"", "", text) + text = re.sub(r"", "", text) + + eval_logger.debug("DEBUGGING FOR ARIA:" + text) + + if self.accelerator.is_main_process and doc_id[0] % 100 == 0: + eval_logger.debug(f"Prompt for doc ID {doc_id[0]}:\n\n{text}\n") + + if task_type == "video": + inputs = self._image_processor(images=visuals, text=text, return_tensors="pt", max_image_size=490) + else: + inputs = self._image_processor(images=visuals, text=text, return_tensors="pt", max_image_size=980) + + inputs["pixel_values"] = inputs["pixel_values"].to(self.model.dtype) + inputs = {k: v.to(self._device) for k, v in inputs.items()} + + if "temperature" not in gen_kwargs: + gen_kwargs["temperature"] = 0 + if "top_p" not in gen_kwargs: + gen_kwargs["top_p"] = None + if "num_beams" not in gen_kwargs: + gen_kwargs["num_beams"] = 1 + + gen_kwargs["do_sample"] = False + gen_kwargs["max_new_tokens"] = 1024 + + if "until" in gen_kwargs: + gen_kwargs.pop("until") + + eval_logger.debug(f"generate kwargs: {gen_kwargs}") + + try: + with torch.inference_mode(), torch.cuda.amp.autocast(dtype=torch.bfloat16): + output = self.model.generate( + **inputs, + stop_strings=["<|im_end|>"], + tokenizer=self._image_processor.tokenizer, + **gen_kwargs, + ) + output_ids = output[0][inputs["input_ids"].shape[1] :] + text_outputs = self._image_processor.decode(output_ids, skip_special_tokens=True).replace("<|im_end|>", "") + + ### Basic Model-wise Parsing for CoT-alike Outputs + """ + keywords = [ + "Answer:", + "answer is:", 
"choice is:", "option is:", + "Answer is:", "Choice is:", "Option is:", + "answer is", "choice is", "option is", + "Answer is", "Choice is", "Option is" + ] + + for keyword in keywords: + if keyword in text_outputs: + eval_logger.debug(f"ARIA Original generated output simplified by keyword [{keyword}]: {text_outputs}") + text_outputs = text_outputs.split(keyword, 1)[-1] + break + """ + eval_logger.debug(f"Generated output: {text_outputs}") + except Exception as ex: + eval_logger.debug(f"Generation failed: {ex}") + if self.accelerator.is_main_process and doc_id[0] % 100 == 0: + eval_logger.debug(f"Generated text for doc ID {doc_id[0]}:\n\n{text_outputs}\n") + + res.append(text_outputs) + self.cache_hook.add_partial("generate_until", (context, gen_kwargs), text_outputs) + pbar.update(1) + # reorder this group of results back to original unsorted form + res = re_ords.get_original(res) + + pbar.close() + return res + + def generate_until_multi_round(self, requests) -> List[str]: + raise NotImplementedError("TODO: Implement multi-round generation for LLaVAHF") diff --git a/lmms_eval/models/auroracap.py b/lmms_eval/models/auroracap.py new file mode 100644 index 0000000..48f8028 --- /dev/null +++ b/lmms_eval/models/auroracap.py @@ -0,0 +1,541 @@ +import copy +import json +import logging +import os +import os.path as osp +from typing import List, Optional, Tuple, Union + +import av +import numpy as np +import torch +from accelerate import Accelerator, DistributedType +from accelerate.state import AcceleratorState +from huggingface_hub import snapshot_download +from peft import PeftModel +from PIL import Image +from tqdm import tqdm +from transformers import ( + AutoModel, + AutoModelForCausalLM, + AutoTokenizer, + BitsAndBytesConfig, + CLIPImageProcessor, +) + +from lmms_eval import utils +from lmms_eval.api.instance import Instance +from lmms_eval.api.model import lmms +from lmms_eval.api.registry import register_model +from lmms_eval.models.model_utils.load_video import read_video_pyav +from lmms_eval.utils import stop_sequences_criteria + +try: + from lmms_eval.models.aurora_xtuner.model.aurora import ( + AuroraEncoder, + AuroraModel, + AuroraSigEncoder, + ) + from lmms_eval.models.aurora_xtuner.utils import PROMPT_TEMPLATE +except ImportError: + eval_logger.error("AuroraCap is not installed. Please install AuroraCap to use this model by `git clone https://github.com/rese1f/aurora.git` and link `src/xtuner/xtuner` to `lmms_eval/models/aurora_xtuner`") +import warnings + +warnings.filterwarnings("ignore") + +eval_logger = logging.getLogger("lmms-eval") + +try: + from llava.constants import ( + DEFAULT_IM_END_TOKEN, + DEFAULT_IM_START_TOKEN, + DEFAULT_IMAGE_TOKEN, + IGNORE_INDEX, + IMAGE_TOKEN_INDEX, + ) + from llava.conversation import SeparatorStyle, conv_templates + from llava.mm_utils import get_model_name_from_path, tokenizer_image_token +except ImportError: + eval_logger.error("LLaVA is not installed. 
Please install LLaVA to use this model.") + + +@register_model("auroracap") +class AuroraCap(lmms): + """ + auroracap Model + """ + + def __init__( + self, + pretrained_llm: str = "meta-llama/Meta-Llama-3-8B-Instruct", + pretrained_vit: str = "google/siglip-so400m-patch14-384", + pretrained: str = "model/PATH", + resolution: int = 378, + token_merge_ratio: float = 0.4, + device: Optional[str] = "cuda", + dtype: Optional[Union[str, torch.dtype]] = "auto", + batch_size: Optional[Union[int, str]] = 1, + conv_template="vicuna_v1", # vicuna_v1", + video_decode_backend: str = "pyav", + max_frames_num: int = 16, + slowfast: bool = False, + **kwargs, + ) -> None: + super().__init__() + # Do not use kwargs for now + assert kwargs == {}, f"Unexpected kwargs: {kwargs}" + + accelerator = Accelerator() + if accelerator.num_processes > 1: + self._device = torch.device(f"cuda:{accelerator.local_process_index}") + else: + self._device = device + + pretrained_pth = snapshot_download(repo_id=pretrained) if not osp.isdir(pretrained) else pretrained + pretrained_llm = pretrained_pth + pretrained_vit = osp.join(pretrained_pth, "visual_encoder") + + self._model = AuroraModel( + slowfast=slowfast, + llm=AutoModelForCausalLM.from_pretrained( + pretrained_model_name_or_path=pretrained_llm, + trust_remote_code=True, + torch_dtype=torch.float16, + ), + visual_encoder=AuroraEncoder.from_pretrained( + pretrained_model_name_or_path=pretrained_vit, + torch_dtype=torch.float16, + ), + ) + + projector_path = osp.join(pretrained_pth, "projector") + self.model.projector = AutoModel.from_pretrained(projector_path, torch_dtype=torch.float16, trust_remote_code=True) + + self._image_processor = CLIPImageProcessor.from_pretrained( + pretrained_model_name_or_path="laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", # use standard CLIP processor + trust_remote_code=True, + size=resolution, + crop_size=resolution, + ) + self._tokenizer = AutoTokenizer.from_pretrained( + pretrained_model_name_or_path=pretrained_llm, + trust_remote_code=True, + padding_side="right", + ) + # compute token merge ratio settings + self.patch_size = self._model.visual_encoder.config.patch_size + self.num_layers = self._model.visual_encoder.config.num_hidden_layers + self.token_merge_ratio = token_merge_ratio + + self._config = self._model.config + self.model.eval() + self.model.tie_weights() + self.batch_size_per_gpu = int(batch_size) + self.conv_template = conv_template + if accelerator.num_processes > 1: + assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported." + # If you want to use DistributedType.DEEPSPEED, you have to run accelerate config before using the model + # Also, you have to select zero stage 0 (equivalent to DDP) in order to make the prepare model works + # I tried to set different parameters in the kwargs to let default zero 2 stage works, but it didn't work. + if accelerator.distributed_type == DistributedType.DEEPSPEED: + kwargs = { + "train_micro_batch_size_per_gpu": self.batch_size_per_gpu, + "train_batch_size": self.batch_size_per_gpu * accelerator.num_processes, + } + AcceleratorState().deepspeed_plugin.deepspeed_config_process(must_match=True, **kwargs) + eval_logger.info("Detected that you are using DistributedType.DEEPSPEED. 
Make sure you run `accelerate config` and set zero stage to 0") + if accelerator.distributed_type == DistributedType.FSDP or accelerator.distributed_type == DistributedType.DEEPSPEED: + self._model = accelerator.prepare(self.model) + self._model.visual_encoder = accelerator.prepare(self.model.visual_encoder) + self._model.projector = accelerator.prepare(self.model.projector) + else: # DistributedType.MULTI_GPU + self._model = accelerator.prepare_model(self.model, evaluation_mode=True) + self._model.visual_encoder = accelerator.prepare_model(self.model.visual_encoder, evaluation_mode=True) + self._model.projector = accelerator.prepare_model(self.model.projector, evaluation_mode=True) + self.accelerator = accelerator + if self.accelerator.is_local_main_process: + eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism") + self._rank = self.accelerator.local_process_index + self._world_size = self.accelerator.num_processes + else: + self.model.to(self._device) + self._rank = 0 + self._word_size = 1 + + # For Video Caption + self.video_decode_backend = video_decode_backend + self.max_frames_num = int(max_frames_num) + + @property + def config(self): + # return the associated transformers.AutoConfig for the given pretrained model. + return self._config + + @property + def tokenizer(self): + return self._tokenizer + + @property + def model(self): + # returns the model, unwrapping it if using Accelerate + if hasattr(self, "accelerator"): + return self.accelerator.unwrap_model(self._model) + else: + return self._model + + @property + def eot_token_id(self): + # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence* + return self.tokenizer.eos_token_id + + @property + def max_length(self): + return self._max_length + + def pad_sequence(self, input_ids, batch_first, padding_value): + if self.tokenizer.padding_side == "left": + input_ids = [torch.flip(_input_ids, [0]) for _input_ids in input_ids] + input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=batch_first, padding_value=padding_value) + if self.tokenizer.padding_side == "left": + input_ids = torch.flip(input_ids, [1]) + return input_ids + + @property + def batch_size(self): + return self.batch_size_per_gpu + + @property + def device(self): + return self._device + + @property + def rank(self): + return self._rank + + @property + def world_size(self): + return self._world_size + + def process_images(self, images, image_processor, model_cfg): + image_aspect_ratio = getattr(model_cfg, "image_aspect_ratio", None) + new_images = [] + if image_aspect_ratio == "pad": + for image in images: + image = expand2square(image, tuple(int(x * 255) for x in image_processor.image_mean)) + image = image_processor.preprocess(image, return_tensors="pt")["pixel_values"][0] + new_images.append(image) + elif image_aspect_ratio == "anyres": + for image in images: + image = process_anyres_image(image, image_processor, model_cfg.image_grid_pinpoints) + new_images.append(image) + else: + return image_processor(images, return_tensors="pt")["pixel_values"] + if all(x.shape == new_images[0].shape for x in new_images): + new_images = torch.stack(new_images, dim=0) + return new_images + + def tok_encode(self, string: str, left_truncate_len=None, add_special_tokens=None) -> List[int]: + """ """ + add_special_tokens = False if add_special_tokens is None else add_special_tokens + encoding = self.tokenizer.encode(string, add_special_tokens=add_special_tokens) + # left-truncate the encoded context to be 
at most `left_truncate_len` tokens long + if left_truncate_len: + encoding = encoding[-left_truncate_len:] + return encoding + + def tok_decode(self, tokens): + return self.tokenizer.decode(tokens) + + def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: + res = [] + pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding") + + for contexts, doc_to_target, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]: + # encode, pad, and truncate contexts for this batch + if type(doc_to_target) == str: + continuation = doc_to_target + else: + continuation = doc_to_target(self.task_dict[task][split][doc_id]) + visuals = [doc_to_visual(self.task_dict[task][split][doc_id])] + visuals = self.flatten(visuals) + if visuals: + image = self.process_images(visuals, self._image_processor, self._config) + if type(image) is list: + image = [_image.to(dtype=torch.float16, device=self.device) for _image in image] + else: + image = image.to(dtype=torch.float16, device=self.device) + else: + image = None + + prompts_input = contexts[0] + + if image is not None and len(image) != 0 and DEFAULT_IMAGE_TOKEN not in prompts_input: + """ + Three senarios: + 1. No image, and there for, no image token should be added. + 2. image token is already specified in the context, so we don't need to add it. + 3. image token is not specified in the context and there is image inputs, so we need to add it. In this case, we add the image token at the beginning of the context and add a new line. + """ + image_tokens = [DEFAULT_IMAGE_TOKEN] * len(visuals) + image_tokens = " ".join(image_tokens) + prompts_input = image_tokens + "\n" + contexts[0] + conv = conv_templates[self.conv_template].copy() + conv.append_message(conv.roles[0], prompts_input) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + pad_token_id = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id + contxt_id = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self.device) + # Add the answer of the second role + conv.messages[1][1] = continuation + + prompt = conv.get_prompt() + input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self.device) + labels = input_ids.clone() + # Context part no need to calculate for loss + labels[0, : contxt_id.shape[1]] = -100 + with torch.inference_mode(): + data = dict() + data["pixel_values"] = image_tensor + data["input_ids"] = input_ids + data["attention_mask"] = attention_masks + self.model.visual_encoder.reset_tome_r(self.token_merge_ratio) + output = self.model(data, mode="tensor") + + loss = outputs["loss"] + # loss = torch.exp(loss) + logits = outputs["logits"] + greedy_tokens = logits.argmax(dim=-1) + cont_toks = input_ids[:, contxt_id.shape[1] :] # [1, seq] + greedy_tokens = greedy_tokens[:, contxt_id.shape[1] : input_ids.shape[1]] # [1, seq] + max_equal = (greedy_tokens == cont_toks).all() + res.append((float(loss.item()), bool(max_equal))) + pbar.update(1) + pbar.close() + return res + + def flatten(self, input): + new_list = [] + for i in input: + for j in i: + new_list.append(j) + return new_list + + def load_video(self, video_path, max_frames_num): + vr = VideoReader(video_path, ctx=cpu(0)) + total_frame_num = len(vr) + uniform_sampled_frames = np.linspace(0, total_frame_num - 1, max_frames_num, dtype=int) + frame_idx = uniform_sampled_frames.tolist() + spare_frames = 
vr.get_batch(frame_idx).asnumpy() + return spare_frames # (frames, height, width, channels) + + def extract_keyframes(self, video_path, keyframes): + container = av.open(video_path) + video_stream = container.streams.video[0] + fps = video_stream.average_rate + time_base = video_stream.time_base + frames = [] + + for keyframe in keyframes: + keyframe_time = float(keyframe) + frame_number = int(keyframe_time * fps) + container.seek(int(keyframe_time / time_base)) + found = False + for packet in container.demux(video=0): + for frame in packet.decode(): + if frame.index >= frame_number: + frames.append(frame) + found = True + break + if found: + break + + if not found: + container.seek(-1, any_frame=False) + for packet in container.demux(video=0): + for frame in packet.decode(): + pass + frames.append(frame) + + return np.stack([x.to_ndarray(format="rgb24") for x in frames]) + + def generate_until(self, requests: List[Instance]) -> List[str]: + res = [] + + def _collate(x): + # the negative sign on len(toks) sorts descending - this has a few advantages: + # - time estimates will always be over not underestimates, which is more useful for planning + # - to know the size of a batch when going through the list, you know the first one is always the batch + # padded context length. this is useful to simplify the batching logic and more importantly to make + # automatic adaptive batches much much easier to implement + # - any OOMs will happen right away rather than near the end + toks = self.tok_encode(x[0]) + return -len(toks), x[0] + + # we group requests by their generation_kwargs, + # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling + # in the same batch. + re_ords = utils.Collator([reg.args for reg in requests], _collate, grouping=True) + chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None) + num_iters = len(requests) // self.batch_size if len(requests) % self.batch_size == 0 else len(requests) // self.batch_size + 1 + pbar = tqdm(total=num_iters, disable=(self.rank != 0), desc="Model Responding") + + for chunk in chunks: + contexts, all_gen_kwargs, doc_to_visual, doc_id, task, split = zip(*chunk) + task = task[0] + split = split[0] + visuals = [doc_to_visual[0](self.task_dict[task][split][ids]) for ids in doc_id] # the length of visuals is 1, equal to batchsize + visuals = self.flatten(visuals) + # we assume all gen kwargs in the batch are the same + # this is safe to assume because the `grouper` object ensures it. 
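+            # What follows: `until` defaults to the EOS string, an optional
+            # image_aspect_ratio from gen_kwargs is copied onto the model config, and
+            # visuals are routed by type (keyframe dicts, PIL images, or mp4/mkv paths
+            # decoded with decord or pyav) before being packed into pixel_values.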
+ gen_kwargs = all_gen_kwargs[0] + + # Set default values for until and max_new_tokens + until = [self.tok_decode(self.eot_token_id)] + + # Update values from gen_kwargs if present + if "until" in gen_kwargs: + until = gen_kwargs.pop("until") + if isinstance(until, str): + until = [until] + elif not isinstance(until, list): + raise ValueError(f"Expected `gen_kwargs['until']` to be of type Union[str,list] but got {type(until)}") + + if "image_aspect_ratio" in gen_kwargs.keys() and "image_aspect_ratio" not in self._config.__dict__: + # here we should pop it out of gen_kwargs so that it doesn't get passed to the model for next step of generation + self._config.image_aspect_ratio = gen_kwargs.pop("image_aspect_ratio") + eval_logger.info(f"Setting image aspect ratio: {self._config.image_aspect_ratio}") + # encode, pad, and truncate contexts for this batch + if visuals: + if isinstance(visuals[0], dict): + video_path = visuals[0]["video_path"] + keyframe = visuals[0]["keyframe"] + video = self.extract_keyframes(video_path, keyframe) + image_tensor = self.process_images(video, self._image_processor, self._config).cuda() + elif isinstance(visuals, list): + print(visuals[0]) + if isinstance(visuals[0], Image.Image): + image_tensor = self.process_images(visuals, self._image_processor, self._config) + else: + if visuals[0].endswith("mp4"): + if self.video_decode_backend == "decord": + video = self.load_video(visuals[0], self.max_frames_num) + elif self.video_decode_backend == "pyav": + video = read_video_pyav(visuals[0], num_frm=self.max_frames_num) + image_tensor = self.process_images(video, self._image_processor, self._config).cuda() + elif visuals[0].endswith("mkv"): + assert self.video_decode_backend == "pyav", "we only tested this case, decord may not work" + video = read_video_pyav(visuals[0], num_frm=self.max_frames_num) + image_tensor = self.process_images(video, self._image_processor, self._config).cuda() + + if type(image_tensor) is list: + image_tensor = [_image.to(dtype=torch.float16, device=self.device) for _image in image_tensor] + else: + image_tensor = image_tensor.to(dtype=torch.float16, device=self.device) + + else: + image_tensor = None + + question_input = [] + + for visual, context in zip(visuals, contexts): + if image_tensor is not None and len(image_tensor) != 0 and DEFAULT_IMAGE_TOKEN not in context: + """ + Three senarios: + 1. No image, and there for, no image token should be added. + 2. image token is already specified in the context, so we don't need to add it. + 3. image token is not specified in the context and there is image inputs, so we need to add it. In this case, we add the image token at the beginning of the context and add a new line. + """ + if isinstance(visuals[0], dict): + image_tokens = [DEFAULT_IMAGE_TOKEN] * len(video) + elif isinstance(visuals, list): + if isinstance(visuals[0], Image.Image): + image_tokens = [DEFAULT_IMAGE_TOKEN] * len(visual) if isinstance(visual, list) else [DEFAULT_IMAGE_TOKEN] + else: + if visual.endswith("mp4") or visual.endswith("mkv"): + image_tokens = [DEFAULT_IMAGE_TOKEN] * len(video) + + image_tokens = " ".join(image_tokens) + question = image_tokens + "\n" + context + + else: + question = context + conv = conv_templates[self.conv_template].copy() + conv.append_message(conv.roles[0], question) + conv.append_message(conv.roles[1], None) + prompt_question = conv.get_prompt() + question_input.append(prompt_question) + + # The above for loop has bugs. When there is no visuals, e.g. 
pure text, + # there will be no for loop execute resulting in an empty question_input (because no visuals) + # Scenario 1 won't even be execute + if len(visuals) == 0: + for context in contexts: + question = context + conv = conv_templates[self.conv_template].copy() + conv.append_message(conv.roles[0], question) + conv.append_message(conv.roles[1], None) + prompt_question = conv.get_prompt() + question_input.append(prompt_question) + + # preconfigure gen_kwargs with defaults + if isinstance(visuals[0], dict): + gen_kwargs["image_sizes"] = [video[idx].size for idx in range(len(video))] + elif isinstance(visuals, list): + if isinstance(visuals[0], Image.Image): + gen_kwargs["image_sizes"] = [visuals[idx].size for idx in range(len(visuals))] + else: + if visuals[0].endswith("mp4"): + gen_kwargs["image_sizes"] = [video[idx].size for idx in range(len(video))] + + if "max_new_tokens" not in gen_kwargs: + gen_kwargs["max_new_tokens"] = 1024 + if "temperature" not in gen_kwargs: + gen_kwargs["temperature"] = 0 + if "top_p" not in gen_kwargs: + gen_kwargs["top_p"] = None + if "num_beams" not in gen_kwargs: + gen_kwargs["num_beams"] = 1 + + input_ids_list = [tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt") for prompt in question_input] + pad_token_ids = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id + input_ids = self.pad_sequence(input_ids_list, batch_first=True, padding_value=pad_token_ids).to(self.device) + attention_masks = input_ids.ne(pad_token_ids).to(self.device) + # These steps are not in LLaVA's original code, but are necessary for generation to work + try: + data = dict() + if isinstance(visuals[0], dict): + data["pixel_values"] = image_tensor.unsqueeze(0) + elif isinstance(visuals, list): + if isinstance(visuals[0], Image.Image): + data["pixel_values"] = image_tensor + else: + if visuals[0].endswith("mp4") or visuals[0].endswith("mkv"): + data["pixel_values"] = image_tensor.unsqueeze(0) + + data["input_ids"] = input_ids + data["attention_mask"] = attention_masks + self.model.visual_encoder.reset_tome_r(self.token_merge_ratio) + output = self.model(data, mode="inference") + cont = self.model.llm.generate( + **output, + do_sample=True if gen_kwargs["temperature"] > 0 else False, + temperature=gen_kwargs["temperature"], + top_p=gen_kwargs["top_p"], + num_beams=gen_kwargs["num_beams"], + max_new_tokens=gen_kwargs["max_new_tokens"], + ) + text_outputs = self.tokenizer.batch_decode(cont, skip_special_tokens=True) + except Exception as e: + eval_logger.error(f"Error {e} in generating") + cont = "" + text_outputs = [""] + + print(text_outputs) + + res.extend(text_outputs) + self.cache_hook.add_partial("generate_until", (context, gen_kwargs), text_outputs) + pbar.update(1) + # reorder this group of results back to original unsorted form + res = re_ords.get_original(res) + + pbar.close() + return res diff --git a/lmms_eval/models/batch_gpt4.py b/lmms_eval/models/batch_gpt4.py new file mode 100755 index 0000000..e624310 --- /dev/null +++ b/lmms_eval/models/batch_gpt4.py @@ -0,0 +1,207 @@ +# Standard library imports +import base64 +import json +import os +import time +from copy import deepcopy +from io import BytesIO + +import numpy as np +import requests as url_requests + +# Related third-party imports +from accelerate import Accelerator, DistributedType, InitProcessGroupKwargs +from accelerate.state import AcceleratorState +from loguru import logger as eval_logger +from openai import OpenAI +from PIL 
import Image +from tqdm import tqdm + +# Local application/library specific imports +from lmms_eval.api.instance import Instance +from lmms_eval.api.model import lmms +from lmms_eval.api.registry import register_model + +# Conditional imports +try: + from decord import VideoReader, cpu +except ImportError: + eval_logger.warning("Decord is not installed. Video input will not be supported.") + +# Constants and global configurations +API_TYPE = os.getenv("API_TYPE", "openai") +NUM_SECONDS_TO_SLEEP = 5 + +if API_TYPE == "openai": + API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions") + API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY") + headers = { + "Authorization": f"Bearer {API_KEY}", + "Content-Type": "application/json", + } +elif API_TYPE == "azure": + API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken") + API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY") + headers = { + "api-key": API_KEY, + "Content-Type": "application/json", + } +else: + API_URL = "YOUR_API_URL" + API_KEY = "YOUR_API_KEY" + + +@register_model("batch_gpt4") +class BatchGPT4(lmms): + def __init__( + self, + model_version: str = "gpt-4o", + api_key: str = API_KEY, + api_url: str = API_URL, + modality: str = "image", + max_frames_num: int = 10, + timeout: int = 120, + **kwargs, + ) -> None: + super().__init__() + # Manually set a image token for GPT4V so that we can search for it + # and split the text and image + # Here we just use the same token as llava for convenient + self.model_version = model_version + self.modality = modality + self.max_frames_num = max_frames_num + self.image_token = "" + self.timeout = timeout + + self.api_key = api_key + self.api_url = api_url + self.client = OpenAI(api_key=api_key) + + accelerator = Accelerator() + assert accelerator.state.local_process_index == 0, "BatchGPT4 does not support distributed inference." + assert accelerator.state.num_processes == 1, "BatchGPT4 does not support distributed inference." 
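+
+    # Rough flow of generate_until() below: every request is rendered into one
+    # /v1/chat/completions record, written to a JSONL batch-input file, uploaded,
+    # and then polled until the batch completes. Each JSONL line looks roughly like
+    # {"custom_id": "request-0", "method": "POST", "url": "/v1/chat/completions",
+    #  "body": {"model": ..., "messages": [...], "max_tokens": ...}}.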
+ + # Function to encode the image + def encode_image(self, image: Image): + output_buffer = BytesIO() + image.save(output_buffer, format="PNG") + byte_data = output_buffer.getvalue() + base64_str = base64.b64encode(byte_data).decode("utf-8") + return base64_str + + # Function to encode the video + def encode_video(self, video_path, for_get_frames_num): + vr = VideoReader(video_path, ctx=cpu(0)) + total_frame_num = len(vr) + uniform_sampled_frames = np.linspace(0, total_frame_num - 1, for_get_frames_num, dtype=int) + frame_idx = uniform_sampled_frames.tolist() + frames = vr.get_batch(frame_idx).asnumpy() + + base64_frames = [] + for frame in frames: + img = Image.fromarray(frame) + output_buffer = BytesIO() + img.save(output_buffer, format="PNG") + byte_data = output_buffer.getvalue() + base64_str = base64.b64encode(byte_data).decode("utf-8") + base64_frames.append(base64_str) + + return base64_frames + + def flatten(self, input): + new_list = [] + for i in input: + for j in i: + new_list.append(j) + return new_list + + def generate_until(self, requests): + # Prepare the batch requests data + requests_data = {} + pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Batch Preparing") + for idx, (contexts, gen_kwargs, doc_to_visual, doc_id, task, split) in enumerate([reg.args for reg in requests]): + visuals = [doc_to_visual(self.task_dict[task][split][doc_id])] + visuals = self.flatten(visuals) + imgs = [] + for visual in visuals: + if self.modality == "image": + img = self.encode_image(visual) + imgs.append(img) + elif self.modality == "video": + frames = self.encode_video(visual, self.max_frames_num) + imgs.extend(frames) + + messages = [] + if self.image_token not in contexts: + messages.append({"role": "user", "content": contexts}) + for img in imgs: + messages.append({"role": "user", "content": f"data:image/jpeg;base64,{img}"}) + else: + contexts_split = contexts.split(self.image_token) + for idx, context in enumerate(contexts_split): + if idx < len(imgs): + messages.append({"role": "user", "content": context}) + messages.append({"role": "user", "content": f"data:image/jpeg;base64,{imgs[idx]}"}) + if len(contexts_split) > len(imgs): + messages.append({"role": "user", "content": contexts_split[-1]}) + + requests_data[f"request-{idx}"] = {"model": self.model_version, "messages": messages, "max_tokens": gen_kwargs.get("max_new_tokens", 1024)} + pbar.update(1) + + file_path = os.getenv("HF_HOME", "~/.cache/huggingface") + f"/batchinput_{len(requests_data)}.jsonl" + file_path = self.create_batch_input_file(requests_data, file_path) + file_id = self.upload_input_file(file_path) + + batch_response = self.create_batch(file_id, metadata={"description": "Batch Processing for GPT-4"}) + batch_status = self.check_batch_status(batch_response.id) + while True: + batch_status = self.check_batch_status(batch_response.id) + if batch_status.status == "completed": + eval_logger.info("Batch processing completed.") + batch_results = self.retrieve_batch_results(batch_status.output_file_id) + res = [result["response"]["choices"][0]["message"]["content"] for result in json.loads(batch_results)] + return res + elif batch_status.status == "failed": + eval_logger.info("Batch processing failed.") + res = ["Batch failed"] * len(requests) + return res + else: + eval_logger.info(f"Batch status: {batch_status.status}. 
Retrying in {NUM_SECONDS_TO_SLEEP} seconds.") + time.sleep(NUM_SECONDS_TO_SLEEP) + + def loglikelihood(self, requests): + # TODO + assert False, "GPT4V not support" + + def create_batch_input_file(self, requests_data, file_path="batchinput.jsonl"): + with open(file_path, "w") as file: + for request_id, data in requests_data.items(): + json_record = json.dumps({"custom_id": request_id, "method": "POST", "url": "/v1/chat/completions", "body": data}) + file.write(json_record + "\n") + return file_path + + def upload_input_file(self, file_path): + with open(file_path, "rb") as file: + response = self.client.files.create(file=file, purpose="batch") + return response.id + + def create_batch(self, file_id, metadata=None): + if metadata is None: + metadata = {} + response = self.client.batches.create(input_file_id=file_id, endpoint="/v1/chat/completions", completion_window="24h", metadata=metadata) + return response + + def check_batch_status(self, batch_id): + return self.client.batches.retrieve(batch_id) + + def retrieve_batch_results(self, file_id): + return self.client.files.content(file_id) + + def cancel_batch(self, batch_id): + return self.client.batches.cancel(batch_id) + + def list_batches(self, limit=10): + return self.client.batches.list(limit=limit) + + def generate_until_multi_round(self, requests) -> List[str]: + raise NotImplementedError("TODO: Implement multi-round generation for BatchGPT4") diff --git a/lmms_eval/models/cambrian.py b/lmms_eval/models/cambrian.py new file mode 100644 index 0000000..855b87f --- /dev/null +++ b/lmms_eval/models/cambrian.py @@ -0,0 +1,313 @@ +# code is migrated from https://github.com/kangreen0210/LIME-M/blob/main/lmms_eval/models/cambrian.py +import os +import uuid +import warnings +from typing import List, Optional, Tuple, Union + +import torch +from accelerate import Accelerator, DistributedType +from PIL import Image +from tqdm import tqdm +from transformers import PreTrainedTokenizer + +from lmms_eval import utils +from lmms_eval.api.instance import Instance +from lmms_eval.api.model import lmms +from lmms_eval.api.registry import register_model + +warnings.simplefilter("ignore", category=DeprecationWarning) +warnings.filterwarnings("ignore") + +from loguru import logger as eval_logger + +try: + from cambrian.conversation import conv_templates + from cambrian.mm_utils import ( + get_model_name_from_path, + process_images, + tokenizer_image_token, + ) + from cambrian.model.builder import load_pretrained_model +except ImportError: + eval_logger.error("Cambrian is not installed. 
Please install it by running `pip install cambrian`.") + +# Model Constants +IMAGE_TOKEN_INDEX = -200 +DEFAULT_IMAGE_TOKEN = "" +DEFAULT_IM_START_TOKEN = "" +DEFAULT_IM_END_TOKEN = "" + + +def process(image, question, tokenizer, image_processor, model_config, conv_mode): + qs = question + + if model_config.mm_use_im_start_end: + qs = f"{DEFAULT_IM_START_TOKEN}{DEFAULT_IMAGE_TOKEN}{DEFAULT_IM_END_TOKEN}\n{qs}" + else: + qs = f"{DEFAULT_IMAGE_TOKEN}\n{qs}" + + conv = conv_templates[conv_mode].copy() + conv.append_message(conv.roles[0], qs) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + + image_size = [image.size] + image_tensor = process_images([image], image_processor, model_config) + + input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).cuda() + + return input_ids, image_tensor, image_size, prompt + + +def make_context( + tokenizer: PreTrainedTokenizer, + query: str, + history: List[Tuple[str, str]] = None, + system: str = "", + max_window_size: int = 6144, + chat_format: str = "chatml", +): + if history is None: + history = [] + + if chat_format == "chatml": + im_start, im_end = "<|im_start|>", "<|im_end|>" + im_start_tokens = [tokenizer.im_start_id] + im_end_tokens = [tokenizer.im_end_id] + nl_tokens = tokenizer.encode("\n") + + def _tokenize_str(role, content): + return f"{role}\n{content}", tokenizer.encode(role, allowed_special=set(tokenizer.IMAGE_ST)) + nl_tokens + tokenizer.encode(content, allowed_special=set(tokenizer.IMAGE_ST)) + + system_text, system_tokens_part = _tokenize_str("system", system) + system_tokens = im_start_tokens + system_tokens_part + im_end_tokens + + raw_text = "" + context_tokens = [] + + for turn_query, turn_response in reversed(history): + query_text, query_tokens_part = _tokenize_str("user", turn_query) + query_tokens = im_start_tokens + query_tokens_part + im_end_tokens + if turn_response is not None: + response_text, response_tokens_part = _tokenize_str("assistant", turn_response) + response_tokens = im_start_tokens + response_tokens_part + im_end_tokens + + next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens + prev_chat = f"\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}" + else: + next_context_tokens = nl_tokens + query_tokens + nl_tokens + prev_chat = f"\n{im_start}{query_text}{im_end}\n" + + current_context_size = len(system_tokens) + len(next_context_tokens) + len(context_tokens) + if current_context_size < max_window_size: + context_tokens = next_context_tokens + context_tokens + raw_text = prev_chat + raw_text + else: + break + + context_tokens = system_tokens + context_tokens + raw_text = f"{im_start}{system_text}{im_end}" + raw_text + context_tokens += nl_tokens + im_start_tokens + _tokenize_str("user", query)[1] + im_end_tokens + nl_tokens + im_start_tokens + tokenizer.encode("assistant") + nl_tokens + raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n" + + elif chat_format == "raw": + raw_text = query + context_tokens = tokenizer.encode(raw_text) + else: + raise NotImplementedError(f"Unknown chat format {chat_format!r}") + + return raw_text, context_tokens + + +@register_model("cambrian") +class Cambrian(lmms): + def __init__( + self, + pretrained: str = "nyu-visionx/cambrian-8b", + device: Optional[str] = "cuda", + device_map="auto", + batch_size: Optional[Union[int, str]] = 1, + trust_remote_code: Optional[bool] = True, + use_cache=True, + **kwargs, + ) -> None: + super().__init__() + assert not 
kwargs, f"Unexpected kwargs: {kwargs}" + + accelerator = Accelerator() + self._device = torch.device(f"cuda:{accelerator.local_process_index}") if accelerator.num_processes > 1 else device + + self.model_name = get_model_name_from_path(pretrained) + tokenizer, model, self.image_processor, context_len = load_pretrained_model(pretrained, None, self.model_name, device_map=self._device) + + self.conv_mode = {"cambrian-8b": "llama_3", "cambrian-13b": "vicuna_v1", "cambrian-34b": "chatml_direct"}.get(self.model_name) + + if not self.conv_mode: + raise ValueError(f"Unsupported model: {self.model_name}") + + self._model = model + self._tokenizer = tokenizer + self._model.eval() + self.batch_size_per_gpu = int(batch_size) + self.use_cache = use_cache + self._rank = 0 + self._world_size = 1 + + if accelerator.num_processes > 1: + assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU], "Unsupported distributed type. Only DDP and FSDP are supported." + self._model = accelerator.prepare(self.model) if accelerator.distributed_type == DistributedType.FSDP else accelerator.prepare_model(self.model, evaluation_mode=True) + self.accelerator = accelerator + if self.accelerator.is_local_main_process: + eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism") + self._rank = self.accelerator.local_process_index + self._world_size = self.accelerator.num_processes + else: + self.model.to(self._device) + + self.accelerator = accelerator + + @property + def model(self): + return self.accelerator.unwrap_model(self._model) if hasattr(self, "accelerator") else self._model + + @property + def eot_token_id(self): + return self.tokenizer.eos_token_id + + def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: + res = [] + pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding") + + for contexts, doc_to_target, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]: + continuation = doc_to_target if isinstance(doc_to_target, str) else doc_to_target(self.task_dict[task][split][doc_id]) + visuals = self.flatten([doc_to_visual(self.task_dict[task][split][doc_id])]) + + query = [] + visual_paths = [] + for visual in visuals: + name = uuid.uuid4().hex.upper()[0:6] + visual_path = f"/tmp/{name}.png" + visual.save(visual_path) + visual_paths.append(visual_path) + query.append({"image": visual_path}) + + context_query = query.copy() + context_query.append({"text": contexts}) + query.append({"text": contexts + continuation}) + + context_query = self.tokenizer.from_list_format(context_query) + query = self.tokenizer.from_list_format(query) + + _, context_tokens = make_context( + self.tokenizer, context_query, history=None, system="You are a helpful assistant", max_window_size=self.model.generation_config.max_window_size, chat_format=self.model.generation_config.chat_format + ) + context_tokens = torch.tensor([context_tokens]) + + _, continuation_tokens = make_context(self.tokenizer, query, history=None, system="You are a helpful assistant", max_window_size=self.model.generation_config.max_window_size, chat_format=self.model.generation_config.chat_format) + continuation_tokens = torch.tensor([continuation_tokens]).to(self.model.device) + attn_mask = torch.ones_like(continuation_tokens).to(self.model.device) + labels = continuation_tokens.clone().to(self.model.device) + labels[:, : context_tokens.shape[1]] = -100 + + with torch.inference_mode(): + outputs = self.model(input_ids=continuation_tokens, 
labels=labels, attention_mask=attn_mask) + + loss = outputs.loss + logits = outputs["logits"] + greedy_tokens = logits.argmax(dim=-1) + cont_toks = continuation_tokens[:, context_tokens.shape[1] :] + greedy_tokens = greedy_tokens[:, context_tokens.shape[1] : continuation_tokens.shape[1]] + max_equal = (greedy_tokens == cont_toks).all() + res.append((float(loss.item()), bool(max_equal))) + pbar.update(1) + + pbar.close() + return res + + @staticmethod + def flatten(input_list): + return [item for sublist in input_list for item in sublist] + + def generate_until(self, requests: List[Instance]) -> List[str]: + res = [] + + def _collate(x): + toks = self.tokenizer.encode(x[0]) + return -len(toks), x[0] + + pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding") + re_ords = utils.Collator([reg.args for reg in requests], _collate, grouping=True) + chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None) + + for chunk in chunks: + contexts, all_gen_kwargs, doc_to_visual, doc_id, task, split = zip(*chunk) + task = task[0] + split = split[0] + visuals = self.flatten([doc_to_visual[0](self.task_dict[task][split][ids]) for ids in doc_id]) + + visual_paths = [] + for visual in visuals: + name = uuid.uuid4().hex.upper()[0:6] + visual_path = f"/xpfs/public/gezhang/zk/lmms-eval/lmms_eval/tmp/{name}.png" + visual.save(visual_path) + visual_paths.append(visual_path) + + gen_kwargs = all_gen_kwargs[0] + until = [self.tokenizer.decode(self.eot_token_id)] + + if "until" in gen_kwargs: + until = gen_kwargs.pop("until") + if isinstance(until, str): + until = [until] + elif not isinstance(until, list): + raise ValueError(f"Expected `gen_kwargs['until']` to be of type Union[str,list] but got {type(until)}") + + gen_kwargs.setdefault("image_sizes", [visuals[0].size] if visuals else None) + gen_kwargs.setdefault("max_new_tokens", 1024) + gen_kwargs.setdefault("temperature", 0) + gen_kwargs.setdefault("top_p", None) + gen_kwargs.setdefault("num_beams", 1) + + until.append("<|eot_id|>") + + image = Image.open(visual_paths[0]).convert("RGB") + question = contexts[0] + + input_ids, image_tensor, image_sizes, prompt = process(image, question, self.tokenizer, self.image_processor, self.model.config, self.conv_mode) + input_ids = input_ids.to(device=self.model.device, non_blocking=True) + + with torch.inference_mode(): + output_ids = self.model.generate( + input_ids, + images=image_tensor, + image_sizes=image_sizes, + do_sample=gen_kwargs["temperature"] > 0, + temperature=gen_kwargs["temperature"], + num_beams=gen_kwargs["num_beams"], + max_new_tokens=gen_kwargs["max_new_tokens"], + use_cache=True, + ) + + text_outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() + + for term in until: + if term: + text_outputs = text_outputs.split(term)[0] + + print(text_outputs) + res.append(text_outputs) + + for visual_path in visual_paths: + try: + os.remove(visual_path) + except OSError: + pass + + pbar.update(1) + + res = re_ords.get_original(res) + pbar.close() + return res + + def generate_until_multi_round(self, requests) -> List[str]: + raise NotImplementedError("TODO: Implement multi-round generation for Cambrian") diff --git a/lmms_eval/models/claude.py b/lmms_eval/models/claude.py new file mode 100644 index 0000000..ce5ed6b --- /dev/null +++ b/lmms_eval/models/claude.py @@ -0,0 +1,276 @@ +import base64 +import json +import os +import time +from copy import deepcopy +from io import BytesIO +from typing import List, Tuple, Union + +from accelerate import 
Accelerator, DistributedType +from PIL import Image +from tqdm import tqdm + +from lmms_eval.api.instance import Instance +from lmms_eval.api.model import lmms +from lmms_eval.api.registry import register_model + +NUM_SECONDS_TO_SLEEP = 5 + +from loguru import logger + +from lmms_eval.models.model_utils.load_video import read_video_pyav_pil + +eval_logger = logger + +try: + import anthropic + import numpy as np + from decord import VideoReader, cpu +except Exception as e: + eval_logger.warning(f"Error importing claude: {e}") + +API_URL = os.getenv("ANTHROPIC_API_URL", "https://api.anthropic.com/v1/complete") +API_KEY = os.getenv("ANTHROPIC_API_KEY", "YOUR_API_KEY") + + +@register_model("claude") +class Claude(lmms): + def __init__( + self, + model_version: str = "claude-3-opus-20240229", + image_token: str = "", # Use to separate interleaved image and text + system_prompt: str = "", # Whether you want some special system prompt here + modality: str = "image", + max_frames_num: int = 10, + continual_mode: bool = False, + response_persistent_folder: str = None, + fps=None, + **kwargs, + ) -> None: + super().__init__() + self.model_version = model_version + self.image_token = image_token + self.system_prompt = system_prompt + self.modality = modality + self.max_frames_num = max_frames_num + self.fps = fps + + self.continual_mode = continual_mode + if self.continual_mode: + if response_persistent_folder is None: + raise ValueError("Continual mode requires a persistent path for the response. Please provide a valid path.") + + os.makedirs(response_persistent_folder, exist_ok=True) + self.response_persistent_folder = response_persistent_folder + self.response_persistent_file = os.path.join(self.response_persistent_folder, f"{self.model_version}_response.json") + + if os.path.exists(self.response_persistent_file): + with open(self.response_persistent_file, "r") as f: + self.response_cache = json.load(f) + self.cache_mode = "resume" + else: + self.response_cache = {} + self.cache_mode = "start" + + accelerator = Accelerator() + if accelerator.num_processes > 1: + assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported." 
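The continual-mode setup above implements a resume cache for API responses: answers are keyed by `task___split___doc_id` and persisted to a per-model JSON file, so an interrupted run can skip documents it has already answered. A minimal standalone sketch of that pattern (the `ResponseCache` name and layout are illustrative, not part of the patch):

```python
import json
import os


class ResponseCache:
    """Resume-style cache: one JSON file per model version, keyed by task/split/doc_id."""

    def __init__(self, folder: str, model_version: str):
        os.makedirs(folder, exist_ok=True)
        self.path = os.path.join(folder, f"{model_version}_response.json")
        if os.path.exists(self.path):
            with open(self.path, "r") as f:
                self.cache = json.load(f)
            self.mode = "resume"  # a previous run exists; cached answers can be reused
        else:
            self.cache = {}
            self.mode = "start"

    @staticmethod
    def key(task: str, split: str, doc_id) -> str:
        return f"{task}___{split}___{doc_id}"

    def get(self, task, split, doc_id):
        return self.cache.get(self.key(task, split, doc_id))

    def put(self, task, split, doc_id, response_text: str) -> None:
        self.cache[self.key(task, split, doc_id)] = response_text
        with open(self.path, "w") as f:
            json.dump(self.cache, f, indent=4)
```

Writing the file after every response is deliberate: an API run can die at any point, so the cache is always as fresh as the last completed document.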
+ self.accelerator = accelerator + if self.accelerator.is_local_main_process: + eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism") + self._rank = self.accelerator.local_process_index + self._world_size = self.accelerator.num_processes + else: + self.accelerator = accelerator + self._rank = self.accelerator.local_process_index + self._world_size = self.accelerator.num_processes + + self.device = self.accelerator.device + + def encode_image(self, image): + output_buffer = BytesIO() + image.save(output_buffer, format="JPEG") + byte_data = output_buffer.getvalue() + base64_str = base64.b64encode(byte_data).decode("utf-8") + return base64_str + + def flatten(self, input): + new_list = [] + for i in input: + for j in i: + new_list.append(j) + return new_list + + def get_image_size(self, image): + # Create a BytesIO object to store the image bytes + img_byte_array = BytesIO() + + # Save the image to the BytesIO object + image.save(img_byte_array, format="PNG") + + # Get the size of the BytesIO object + img_size = img_byte_array.tell() + + return img_size + + # The max file size is 5MB for claude + def shrink_image_to_file_size(self, img: Image, max_file_size=4838990) -> Image: + # Get the current size of the image + original_size = self.get_image_size(img) + + # If the image size is already smaller than the desired size, return + if original_size <= max_file_size: + return img + + # Calculate the ratio to shrink the image + # Somehow I found out sqrt ratio is not enough to shrink the image + # below threshold, so I guess we do more + shrink_ratio = min(0.9, max_file_size / original_size) + + # Resize the image with the calculated ratio + new_width = int(img.width * shrink_ratio) + new_height = int(img.height * shrink_ratio) + img = img.resize((new_width, new_height), Image.LANCZOS) + + return self.shrink_image_to_file_size(img, max_file_size) + + def encode_video(self, video_path): + # vr = VideoReader(video_path, ctx=cpu(0)) + # total_frame_num = len(vr) + # uniform_sampled_frames = np.linspace(0, total_frame_num - 1, self.max_frames_num, dtype=int) + # frame_idx = uniform_sampled_frames.tolist() + # frames = vr.get_batch(frame_idx).asnumpy() + frames = read_video_pyav_pil(video_path, num_frm=self.max_frames_num, fps=self.fps) + + base64_frames = [] + for img in frames: + output_buffer = BytesIO() + img.save(output_buffer, format="JPEG") + byte_data = output_buffer.getvalue() + base64_str = base64.b64encode(byte_data).decode("utf-8") + base64_frames.append(f"{base64_str}") + + return base64_frames + + def generate_until(self, requests) -> List[str]: + client = anthropic.Anthropic() + + res = [] + pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding") + + empty_image_block = { + "type": "image", + "source": { + "type": "base64", + "media_type": "image/jpeg", + }, + } + empty_text_block = {"type": "text"} + empty_messages = [ + { + "role": "user", + "content": [], + } + ] + + for contexts, gen_kwargs, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]: + ###################### CONTINUAL MODE ###################### + if self.continual_mode is True and self.cache_mode == "resume": + doc_uuid = f"{task}___{split}___{doc_id}" + if doc_uuid in self.response_cache: + response_text = self.response_cache[doc_uuid] + if response_text: + res.append(response_text) + pbar.update(1) + continue + + visuals = [doc_to_visual(self.task_dict[task][split][doc_id])] + visuals = self.flatten(visuals) + imgs = [] + for visual in visuals: + 
if isinstance(visual, str) and os.path.exists(visual):  # Assuming visual is a path to a video
+                    visual = self.encode_video(visual)
+                    for img in visual:
+                        imgs.append(img)
+                else:
+                    visual = self.shrink_image_to_file_size(visual)
+                    img = self.encode_image(visual)
+                    imgs.append(img)
+
+            messages = deepcopy(empty_messages)
+
+            if self.image_token not in contexts:
+                for img in imgs:
+                    image_block = deepcopy(empty_image_block)
+                    image_block["source"]["data"] = img
+                    messages[0]["content"].append(image_block)
+                text_block = deepcopy(empty_text_block)
+                text_block["text"] = contexts
+                messages[0]["content"].append(text_block)
+            else:
+                contexts = contexts.split(self.image_token)
+                for idx, img in enumerate(imgs):
+                    text_block = deepcopy(empty_text_block)
+                    image_block = deepcopy(empty_image_block)
+                    text_block["text"] = contexts[idx]
+                    messages[0]["content"].append(text_block)
+                    image_block["source"]["data"] = img
+                    messages[0]["content"].append(image_block)
+
+                # If n image tokens are in the contexts
+                # contexts will be split into n+1 chunks
+                # Manually add the last chunk into the messages
+                text_block = deepcopy(empty_text_block)
+                text_block["text"] = contexts[-1]
+                messages[0]["content"].append(text_block)
+
+            if "max_new_tokens" not in gen_kwargs:
+                gen_kwargs["max_new_tokens"] = 1024
+            if gen_kwargs["max_new_tokens"] > 4096:
+                gen_kwargs["max_new_tokens"] = 4096
+            if "temperature" not in gen_kwargs:
+                gen_kwargs["temperature"] = 0
+            if "top_p" not in gen_kwargs or gen_kwargs["top_p"] is None:
+                gen_kwargs["top_p"] = 1
+            if "num_beams" not in gen_kwargs:
+                gen_kwargs["num_beams"] = 1
+
+            response_text = ""
+
+            for attempt in range(5):
+                retry_flag = True
+                try:
+                    message = client.messages.create(model=self.model_version, max_tokens=gen_kwargs["max_new_tokens"], system=self.system_prompt, temperature=gen_kwargs["temperature"], top_p=gen_kwargs["top_p"], messages=messages)
+                    retry_flag = False
+                    response_text = message.content[0].text
+                except Exception as e:
+                    eval_logger.info(f"Attempt {attempt + 1} failed with error: {str(e)}")
+                    if attempt < 5 - 1:  # If we have retries left, sleep and then continue to next attempt
+                        time.sleep(NUM_SECONDS_TO_SLEEP)
+                    else:  # If this was the last attempt, log and return empty
+                        eval_logger.error(f"All 5 attempts failed. 
Last error message: {str(e)}") + pbar.update(1) + continue + if not retry_flag: + break + eval_logger.info("Retrying...") + + res.append(response_text) + pbar.update(1) + + ###################### CONTINUAL MODE ###################### + if self.continual_mode is True: # Cache the response + doc_uuid = f"{task}___{split}___{doc_id}" + self.response_cache[doc_uuid] = response_text + with open(self.response_persistent_file, "w") as f: + json.dump(self.response_cache, f, indent=4) + + pbar.close() + + return res + + def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: + assert False, "Not supported for claude" + + def generate_until_multi_round(self, requests) -> List[str]: + raise NotImplementedError("TODO: Implement multi-round generation for Claude") diff --git a/lmms_eval/models/cogvlm2.py b/lmms_eval/models/cogvlm2.py new file mode 100644 index 0000000..ece7d01 --- /dev/null +++ b/lmms_eval/models/cogvlm2.py @@ -0,0 +1,229 @@ +import warnings +from typing import List, Optional, Tuple, Union + +import torch +from accelerate import Accelerator, DistributedType +from accelerate.state import AcceleratorState +from tqdm import tqdm +from transformers import AutoModelForCausalLM, AutoTokenizer + +from lmms_eval import utils +from lmms_eval.api.instance import Instance +from lmms_eval.api.model import lmms +from lmms_eval.api.registry import register_model + +warnings.filterwarnings("ignore") + +from loguru import logger as eval_logger + + +@register_model("cogvlm2") +class CogVLM2(lmms): + """ + CogVLM2 Model + """ + + def __init__( + self, + pretrained: str = "THUDM/cogvlm2-llama3-chinese-chat-19B", + device: Optional[str] = "cuda", + dtype: Optional[Union[str, torch.dtype]] = torch.bfloat16, + batch_size: Optional[Union[int, str]] = 1, + trust_remote_code: Optional[bool] = True, + **kwargs, + ) -> None: + super().__init__() + # Do not use kwargs for now + assert kwargs == {}, f"Unexpected kwargs: {kwargs}" + + accelerator = Accelerator() + if accelerator.num_processes > 1: + self._device = torch.device(f"cuda:{accelerator.local_process_index}") + else: + self._device = device + self.dtype = dtype + self._model = AutoModelForCausalLM.from_pretrained(pretrained, trust_remote_code=trust_remote_code, torch_dtype=dtype, device_map=self._device) + self._tokenizer = AutoTokenizer.from_pretrained(pretrained, trust_remote_code=trust_remote_code) + self._config = self._model.config + self.model.eval() + self.model.tie_weights() + self.batch_size_per_gpu = int(batch_size) + if accelerator.num_processes > 1: + assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported." + # If you want to use DistributedType.DEEPSPEED, you have to run accelerate config before using the model + # Also, you have to select zero stage 0 (equivalent to DDP) in order to make the prepare model works + # I tried to set different parameters in the kwargs to let default zero 2 stage works, but it didn't work. + if accelerator.distributed_type == DistributedType.DEEPSPEED: + kwargs = { + "train_micro_batch_size_per_gpu": self.batch_size_per_gpu, + "train_batch_size": self.batch_size_per_gpu * accelerator.num_processes, + } + AcceleratorState().deepspeed_plugin.deepspeed_config_process(must_match=True, **kwargs) + eval_logger.info("Detected that you are using DistributedType.DEEPSPEED. 
Make sure you run `accelerate config` and set zero stage to 0") + if accelerator.distributed_type == DistributedType.FSDP or accelerator.distributed_type == DistributedType.DEEPSPEED: + self._model = accelerator.prepare(self.model) + else: + self._model = accelerator.prepare_model(self.model, evaluation_mode=True) + self.accelerator = accelerator + if self.accelerator.is_local_main_process: + eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism") + self._rank = self.accelerator.local_process_index + self._world_size = self.accelerator.num_processes + else: + self._rank = 0 + self._word_size = 1 + + @property + def config(self): + # return the associated transformers.AutoConfig for the given pretrained model. + return self._config + + @property + def tokenizer(self): + return self._tokenizer + + @property + def model(self): + # returns the model, unwrapping it if using Accelerate + if hasattr(self, "accelerator"): + return self.accelerator.unwrap_model(self._model) + else: + return self._model + + @property + def eot_token_id(self): + # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence* + return self.tokenizer.eos_token_id + + @property + def max_length(self): + return self._max_length + + @property + def batch_size(self): + return self.batch_size_per_gpu + + @property + def device(self): + return self._device + + @property + def rank(self): + return self._rank + + @property + def world_size(self): + return self._world_size + + def tok_encode(self, string: str, left_truncate_len=None, add_special_tokens=None) -> List[int]: + """ """ + add_special_tokens = False if add_special_tokens is None else add_special_tokens + encoding = self.tokenizer.encode(string, add_special_tokens=add_special_tokens) + # left-truncate the encoded context to be at most `left_truncate_len` tokens long + if left_truncate_len: + encoding = encoding[-left_truncate_len:] + return encoding + + def tok_decode(self, tokens): + return self.tokenizer.decode(tokens) + + def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: + # TODO + assert False, "We have not implemented this function for CogVLM2 yet" + + def flatten(self, input): + new_list = [] + for i in input: + for j in i: + new_list.append(j) + return new_list + + def generate_until(self, requests: List[Instance]) -> List[str]: + res = [] + + def _collate(x): + # the negative sign on len(toks) sorts descending - this has a few advantages: + # - time estimates will always be over not underestimates, which is more useful for planning + # - to know the size of a batch when going through the list, you know the first one is always the batch + # padded context length. this is useful to simplify the batching logic and more importantly to make + # automatic adaptive batches much much easier to implement + # - any OOMs will happen right away rather than near the end + toks = self.tok_encode(x[0]) + return -len(toks), x[0] + + # we group requests by their generation_kwargs, + # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling + # in the same batch. 
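The `_collate` helper above keys each request by the negative length of its tokenized context, and the `Collator` call that follows puts the grouping into practice. A toy sketch of why that ordering is used (requests grouped by their generation kwargs, longest contexts first, so padded batch sizes are known immediately and any OOM surfaces on the first batch); the `group_and_sort` function and the whitespace "tokenizer" below are illustrative only, not part of the patch:

```python
from itertools import groupby


def group_and_sort(requests, encode):
    # requests: (context, gen_kwargs) pairs; encode: any tokenizer-like callable.
    # Sort primarily by the kwargs (so each group shares one sampling setup),
    # then by descending token count within the group.
    ordered = sorted(requests, key=lambda r: (str(r[1]), -len(encode(r[0]))))
    return {k: list(g) for k, g in groupby(ordered, key=lambda r: str(r[1]))}


if __name__ == "__main__":
    reqs = [
        ("a much longer prompt with many more tokens", {"temperature": 0}),
        ("short prompt", {"temperature": 0}),
        ("sampled prompt", {"temperature": 0.8}),
    ]
    print(group_and_sort(reqs, lambda s: s.split()))
```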
+        re_ords = utils.Collator([reg.args for reg in requests], _collate, grouping=True)
+        chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None)
+        num_iters = len(requests) // self.batch_size if len(requests) % self.batch_size == 0 else len(requests) // self.batch_size + 1
+        pbar = tqdm(total=num_iters, disable=(self.rank != 0), desc="Model Responding")
+        for chunk in chunks:
+            contexts, all_gen_kwargs, doc_to_visual, doc_id, task, split = zip(*chunk)
+            task = task[0]
+            split = split[0]
+            visuals = [doc_to_visual[0](self.task_dict[task][split][ids]) for ids in doc_id]
+            visuals = self.flatten(visuals)
+            # we assume all gen kwargs in the batch are the same
+            # this is safe to assume because the `grouper` object ensures it.
+            gen_kwargs = all_gen_kwargs[0]
+
+            # Set default values for until and max_new_tokens
+            until = [self.tok_decode(self.eot_token_id)]
+
+            # Update values from gen_kwargs if present
+            if "until" in gen_kwargs:
+                until = gen_kwargs.pop("until")
+                if isinstance(until, str):
+                    until = [until]
+                elif not isinstance(until, list):
+                    raise ValueError(f"Expected `gen_kwargs['until']` to be of type Union[str,list] but got {type(until)}")
+            assert self.batch_size_per_gpu == 1, "Do not support batch_size_per_gpu > 1 for now"
+            assert len(visuals) == 1, "CogVLM2 interface does not support bn_image > 1 for now"
+            context = contexts[0]
+            if "<image>" in context:
+                context = context.replace("<image>", "")
+
+            if "max_new_tokens" not in gen_kwargs:
+                gen_kwargs["max_new_tokens"] = 1024
+            if "temperature" not in gen_kwargs:
+                gen_kwargs["temperature"] = 0
+            if "top_p" not in gen_kwargs:
+                gen_kwargs["top_p"] = None
+            if "num_beams" not in gen_kwargs:
+                gen_kwargs["num_beams"] = 1
+
+            image = visuals[0]
+            input_by_model = self.model.build_conversation_input_ids(self.tokenizer, query=context, history=[], images=[image])
+
+            inputs = {
+                "input_ids": input_by_model["input_ids"].unsqueeze(0).to(self.device),
+                "token_type_ids": input_by_model["token_type_ids"].unsqueeze(0).to(self.device),
+                "attention_mask": input_by_model["attention_mask"].unsqueeze(0).to(self.device),
+                "images": [[input_by_model["images"][0].to(self.device).to(self.dtype)]],
+            }
+            if "cross_images" in input_by_model and input_by_model["cross_images"]:
+                inputs["cross_images"] = [[input_by_model["cross_images"][0].to(self.device).to(self.dtype)]]
+
+            try:
+                outputs = self.model.generate(**inputs, **gen_kwargs)
+                outputs = outputs[:, inputs["input_ids"].shape[1] :]
+                response = self.tokenizer.decode(outputs[0])
+                response = response.split("</s>")[0]
+                response = response.split("<|end_of_text|>")[0]
+
+                context = [{"role": "user", "content": context}, {"role": "assistant", "content": response}]
+            except Exception as e:
+                eval_logger.error(f"Error {e} in generating")
+                response = ""
+            res.append(response)
+            self.cache_hook.add_partial("generate_until", (context, gen_kwargs), response)
+            pbar.update(1)
+        # reorder this group of results back to original unsorted form
+        res = re_ords.get_original(res)
+
+        pbar.close()
+        return res
+
+    def generate_until_multi_round(self, requests) -> List[str]:
+        raise NotImplementedError("TODO: Implement multi-round generation for CogVLM2")
diff --git a/lmms_eval/models/from_log.py b/lmms_eval/models/from_log.py
new file mode 100644
index 0000000..f9f88f9
--- /dev/null
+++ b/lmms_eval/models/from_log.py
@@ -0,0 +1,119 @@
+import json
+import os
+import re
+from datetime import datetime
+from typing import List, Tuple
+
+from accelerate import Accelerator, DistributedType
+from loguru import logger as eval_logger
+from tqdm import tqdm + +from lmms_eval.api.instance import Instance +from lmms_eval.api.model import lmms +from lmms_eval.api.registry import register_model + + +@register_model("from_log") +class FromLog(lmms): + def __init__( + self, + logs: str = "logs", + model_name: str = None, + model_args: str = None, + have_limits: bool = False, + **kwargs, + ) -> None: + super().__init__() + + self.logs = {} + + log_folders = logs.split(",") + + def matched_model(_model_args): + if model_name and model_name != _model_args["model"]: + return False + + if model_args: + _model_args_list = model_args.split(",") + + for _model_arg in _model_args_list: + if _model_arg not in _model_args["model_args"]: + return False + + if not have_limits and _model_args["limit"] is not None: + return False + + return True + + for log_folder in log_folders: + for root, dirs, files in os.walk(log_folder): + for file in files: + if file.endswith(".json"): + try: + log_file = os.path.join(root, file) + + with open(log_file, "r") as f: + log_data = json.load(f) + + # check if model is matched + _model_args = log_data["args"] + if not matched_model(_model_args): + raise Exception("Model not matched") + + # load logs + logs = {} + for data in log_data["logs"]: + id = data["doc_id"] + response = data["resps"][0] + logs[id] = response + + task = log_data["model_configs"]["task"] + + pattern = re.compile(r"\d{4}_\d{4}") + + if "time" in log_data: + log_time = log_data["time"] + elif pattern.search(os.path.abspath(log_file)): + log_time = pattern.findall(os.path.abspath(log_file))[-1] + else: + log_time = "unknown" + + if task not in self.logs or (self.logs[task]["time"] == "unknown" or datetime.strptime(log_time, "%m%d_%H%M") > datetime.strptime(self.logs[task]["time"], "%m%d_%H%M")): + self.logs[task] = {"time": log_time, "logs": logs} + + except Exception as e: + pass + + accelerator = Accelerator() + if accelerator.num_processes > 1: + assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported." 
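`from_log` replays responses recorded by earlier runs; when several logs exist for the same task it keeps the most recent one, using the `MMDD_HHMM` stamp embedded in the log path (or the log's `time` field) and treating `"unknown"` as oldest. A small sketch of that comparison, with hypothetical helper names:

```python
import re
from datetime import datetime

TIME_PATTERN = re.compile(r"\d{4}_\d{4}")  # e.g. "0214_0930" -> Feb 14, 09:30


def is_newer(candidate: str, current: str) -> bool:
    # "unknown" timestamps always lose to a dated log.
    if current == "unknown":
        return True
    if candidate == "unknown":
        return False
    return datetime.strptime(candidate, "%m%d_%H%M") > datetime.strptime(current, "%m%d_%H%M")


if __name__ == "__main__":
    print(TIME_PATTERN.findall("/logs/0214_0930_model/results.json"))  # ['0214_0930']
    print(is_newer("0214_0930", "0213_2359"))  # True
```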
+ self.accelerator = accelerator + if self.accelerator.is_local_main_process: + eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism") + self._rank = self.accelerator.local_process_index + self._world_size = self.accelerator.num_processes + else: + self.accelerator = accelerator + self._rank = self.accelerator.local_process_index + self._world_size = self.accelerator.num_processes + + self.device = self.accelerator.device + + def generate_until(self, requests) -> List[str]: + res = [] + pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding") + + for contexts, gen_kwargs, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]: + response = self.logs[task]["logs"][doc_id] + res.append(response[0]) + pbar.update(1) + + pbar.close() + return res + + def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: + # TODO + assert False, "not support" + + def generate_until_multi_round(self, requests) -> List[str]: + return generate_until(self, requests) diff --git a/lmms_eval/models/fuyu.py b/lmms_eval/models/fuyu.py new file mode 100755 index 0000000..6c9e0e0 --- /dev/null +++ b/lmms_eval/models/fuyu.py @@ -0,0 +1,269 @@ +import warnings + +warnings.simplefilter("ignore", category=DeprecationWarning) +warnings.filterwarnings("ignore") + +from typing import List, Optional, Tuple, Union + +import torch +from accelerate import Accelerator, DistributedType +from accelerate.state import AcceleratorState +from loguru import logger as eval_logger +from PIL import Image +from tqdm import tqdm +from transformers import ( + AutoTokenizer, + FuyuForCausalLM, + FuyuImageProcessor, + FuyuProcessor, +) + +from lmms_eval import utils +from lmms_eval.api.instance import Instance +from lmms_eval.api.model import lmms +from lmms_eval.api.registry import register_model + + +@register_model("fuyu") +class Fuyu(lmms): + """ + Fuyu Model + """ + + def __init__( + self, + pretrained: str = "adept/fuyu-8b", + device: Optional[str] = "cuda", + max_new_tokens: int = 256, + batch_size: Optional[Union[int, str]] = 1, + **kwargs, + ) -> None: + super().__init__() + # Do not use kwargs for now + assert kwargs == {}, f"Unexpected kwargs: {kwargs}" + + accelerator = Accelerator() + if accelerator.num_processes > 1: + self._device = torch.device(f"cuda:{accelerator.local_process_index}") + else: + self._device = device + + self._model = FuyuForCausalLM.from_pretrained(pretrained, torch_dtype=torch.bfloat16, device_map=self.device) + self.model.eval() + self.model.tie_weights() + self._tokenizer = AutoTokenizer.from_pretrained(pretrained) + self._config = self.model.config + + self.image_processor = FuyuImageProcessor() + self.processor = FuyuProcessor(image_processor=self.image_processor, tokenizer=self.tokenizer) + self.max_new_tokens = max_new_tokens + self.batch_size_per_gpu = int(batch_size) + accelerator = Accelerator() + if accelerator.num_processes > 1: + assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported." + # If you want to use DistributedType.DEEPSPEED, you have to run accelerate config before using the model + # Also, you have to select zero stage 0 (equivalent to DDP) in order to make the prepare model works + # I tried to set different parameters in the kwargs to let default zero 2 stage works, but it didn't work. 
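Like the other wrappers in this patch, Fuyu pins each Accelerate process to its own GPU before loading weights. A compact sketch of that device-selection logic (the `pick_device` helper name is illustrative, not part of the patch):

```python
import torch
from accelerate import Accelerator


def pick_device(accelerator: Accelerator, default: str = "cuda") -> torch.device:
    # With more than one process, each rank binds to its local GPU index;
    # otherwise the user-supplied device string is used unchanged.
    if accelerator.num_processes > 1:
        return torch.device(f"cuda:{accelerator.local_process_index}")
    return torch.device(default)
```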
+ if accelerator.distributed_type == DistributedType.DEEPSPEED: + kwargs = { + "train_micro_batch_size_per_gpu": self.batch_size_per_gpu, + "train_batch_size": self.batch_size_per_gpu * accelerator.num_processes, + } + AcceleratorState().deepspeed_plugin.deepspeed_config_process(must_match=True, **kwargs) + eval_logger.info("Detected that you are using DistributedType.DEEPSPEED. Make sure you run `accelerate config` and set zero stage to 0") + if accelerator.distributed_type == DistributedType.FSDP or accelerator.distributed_type == DistributedType.DEEPSPEED: + self._model = accelerator.prepare(self.model) + else: + self._model = accelerator.prepare_model(self.model, evaluation_mode=True) + self.accelerator = accelerator + if self.accelerator.is_local_main_process: + eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism") + self._rank = self.accelerator.local_process_index + self._world_size = self.accelerator.num_processes + else: + self.model.to(self._device) + self._rank = 0 + self._word_size = 1 + + """if accelerator.num_processes > 1: + assert accelerator.distributed_type in [ + DistributedType.FSDP, + DistributedType.MULTI_GPU, + ], "Unsupported distributed type provided. Only DDP and FSDP are supported." + if accelerator.distributed_type == DistributedType.FSDP: + self._model = accelerator.prepare(self.model) + else: + self._model = accelerator.prepare_model(self.model, evaluation_mode=True) + self.accelerator = accelerator + if self.accelerator.is_local_main_process: + eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism") + self._rank = self.accelerator.local_process_index + self._world_size = self.accelerator.num_processes""" + + @property + def config(self): + # return the associated transformers.AutoConfig for the given pretrained model. + return self._config + + @property + def tokenizer(self): + return self._tokenizer + + @property + def model(self): + # returns the model, unwrapping it if using Accelerate + if hasattr(self, "accelerator"): + return self.accelerator.unwrap_model(self._model) + else: + return self._model + + @property + def eot_token_id(self): + # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence* + return self.tokenizer.eos_token_id + + @property + def max_length(self): + # Assuming max_length is the sum of max context tokens and max new tokens + return self.tokenizer.model_max_length + + @property + def batch_size(self): + return self.batch_size_per_gpu + + @property + def device(self): + return self._device + + @property + def rank(self): + return self._rank + + @property + def world_size(self): + return self._world_size + + def flatten(self, input, only_get_first=False): + new_list = [] + for i in input: + for j in i: + new_list.append(j) + if only_get_first: + break + return new_list + + def generate_until(self, requests: List[Instance]) -> List[str]: + res = [] + + def _collate(x): + # the negative sign on len(toks) sorts descending - this has a few advantages: + # - time estimates will always be over not underestimates, which is more useful for planning + # - to know the size of a batch when going through the list, you know the first one is always the batch + # padded context length. 
this is useful to simplify the batching logic and more importantly to make + # automatic adaptive batches much much easier to implement + # - any OOMs will happen right away rather than near the end + toks = self.tok_encode(x[0]) + return -len(toks), x[0] + + re_ords = utils.Collator([reg.args for reg in requests], _collate, grouping=True) + chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None) + num_iters = len(requests) // self.batch_size if len(requests) % self.batch_size == 0 else len(requests) // self.batch_size + 1 + pbar = tqdm(total=num_iters, disable=(self.rank != 0), desc="Model Responding") + + for chunk in chunks: + contexts, all_gen_kwargs, doc_to_visual, doc_id, task, split = zip(*chunk) + task = task[0] + split = split[0] + visuals = [doc_to_visual[0](self.task_dict[task][split][ids]) for ids in doc_id] + visuals = self.flatten(visuals, only_get_first=True) + gen_kwargs = all_gen_kwargs[0] + + # if isinstance(visuals[0], list): + # visuals = [visuals[idx][0] for idx in range(len(visuals))] # get the first image in multi-image scenarios. + + # assert len(contexts) == self.batch_size_per_gpu, f"Expected contexts batch size {self.batch_size_per_gpu}, got {len(contexts)}" + # assert len(visuals) == self.batch_size_per_gpu, f"Expected visuals batch size {self.batch_size_per_gpu}, got {len(visuals)}" + formatted_contexts = [f"{context}\n" for context in contexts] + model_inputs = self.processor(text=formatted_contexts, images=visuals, device=self.device) + for k, v in model_inputs.items(): + model_inputs[k] = v.to(self.device, non_blocking=True) if isinstance(v, torch.Tensor) else [vv.to(self.device, non_blocking=True) for vv in v] + + for index in range(len(model_inputs["image_patches"])): + model_inputs["image_patches"][index] = model_inputs["image_patches"][index].to(dtype=next(self.model.parameters()).dtype) + + # preconfigure gen_kwargs with defaults + gen_kwargs["image_sizes"] = [visuals[idx].size for idx in range(len(visuals))] + if "max_new_tokens" not in gen_kwargs: + gen_kwargs["max_new_tokens"] = 256 + if "temperature" not in gen_kwargs: + gen_kwargs["temperature"] = 0 + if "top_p" not in gen_kwargs: + gen_kwargs["top_p"] = None + if "num_beams" not in gen_kwargs: + gen_kwargs["num_beams"] = 1 + # generation_output = self.model.generate( + # **model_inputs, temperature=gen_kwargs["temperature"], max_new_tokens=gen_kwargs["max_new_tokens"], top_p=gen_kwargs["top_p"], num_beams=gen_kwargs["num_beams"], pad_token_id=self.tokenizer.eos_token_id + # ) + generation_output = self.model.generate(**model_inputs, max_new_tokens=gen_kwargs["max_new_tokens"], pad_token_id=self.tokenizer.eos_token_id) + generation_texts = self.processor.batch_decode(generation_output, skip_special_tokens=True) + response = [gen_text.split("\x04")[1].strip(" ").strip("\n") for gen_text in generation_texts] + res.extend(response) + pbar.update(1) + + pbar.close() + return res + + def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: + # TODO + res = [] + pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding") + + for contexts, doc_to_target, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]: + # encode, pad, and truncate contexts for this batch + if type(doc_to_target) == str: + continuation = doc_to_target + else: + continuation = doc_to_target(self.task_dict[task][split][doc_id]) + visuals = [doc_to_visual(self.task_dict[task][split][doc_id])] + visuals = self.flatten(visuals) + formatted_contexts = [f"{contexts}\n"] + 
formatted_continuation = [f"{contexts}\n{continuation}"] + model_inputs = self.processor(text=formatted_continuation, images=visuals, device=self.device) + for k, v in model_inputs.items(): + model_inputs[k] = v.to(self.device, non_blocking=True) if isinstance(v, torch.Tensor) else [vv.to(self.device, non_blocking=True) for vv in v] + + for index in range(len(model_inputs["image_patches"])): + model_inputs["image_patches"][index] = model_inputs["image_patches"][index].to(dtype=next(self.model.parameters()).dtype) + + labels = model_inputs["input_ids"].clone() + contxt_id = self.processor(text=formatted_contexts, return_tensors="pt")["input_ids"] + labels[: len(contxt_id)] = -100 + with torch.inference_mode(): + outputs = self.model(**model_inputs, labels=labels) + loss = outputs["loss"] + # loss = torch.exp(loss) + logits = outputs["logits"] + greedy_tokens = logits.argmax(dim=-1) + cont_toks = model_inputs["input_ids"][:, contxt_id.shape[1] :] # [1, seq] + greedy_tokens = greedy_tokens[:, contxt_id.shape[1] : model_inputs["input_ids"].shape[1]] # [1, seq] + max_equal = (greedy_tokens == cont_toks).all() + res.append((float(loss.item()), bool(max_equal))) + pbar.update(1) + + pbar.close() + return res + + def tok_encode(self, string: str, left_truncate_len=None, add_special_tokens=None) -> List[int]: + """ """ + add_special_tokens = False if add_special_tokens is None else add_special_tokens + encoding = self.tokenizer.encode(string, add_special_tokens=add_special_tokens) + # left-truncate the encoded context to be at most `left_truncate_len` tokens long + if left_truncate_len: + encoding = encoding[-left_truncate_len:] + return encoding + + def tok_decode(self, tokens): + return self.tokenizer.decode(tokens) + + def generate_until_multi_round(self, requests) -> List[str]: + raise NotImplementedError("TODO: Implement multi-round generation for Fuyu") diff --git a/lmms_eval/models/gemini_api.py b/lmms_eval/models/gemini_api.py new file mode 100644 index 0000000..72892e7 --- /dev/null +++ b/lmms_eval/models/gemini_api.py @@ -0,0 +1,201 @@ +import io +import json +import os +import time +from typing import List, Tuple + +from accelerate import Accelerator, DistributedType +from loguru import logger as eval_logger +from PIL import Image +from tqdm import tqdm + +from lmms_eval.api.instance import Instance +from lmms_eval.api.model import lmms +from lmms_eval.api.registry import register_model + +try: + import google.generativeai as genai + from google.generativeai.types import HarmBlockThreshold, HarmCategory + + NUM_SECONDS_TO_SLEEP = 30 + GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY") + genai.configure(api_key=GOOGLE_API_KEY) + +except Exception as e: + eval_logger.error(f"Error importing generativeai: {str(e)}") + genai = None + + +@register_model("gemini_api") +class GeminiAPI(lmms): + def __init__( + self, + model_version: str = "gemini-1.5-pro", + modality: str = "image", + timeout: int = 120, + continual_mode: bool = False, + response_persistent_folder: str = None, # We will cache the Gemini API response in this path and use it for future requests + **kwargs, + ) -> None: + super().__init__() + self.model_version = model_version + self.timeout = timeout + self.model = genai.GenerativeModel(model_version) + self.continual_mode = continual_mode + if self.continual_mode and response_persistent_folder is None: + raise ValueError("Continual mode requires a persistent path for the response. We will cache the Gemini API response in this path and use it for future requests. 
Please provide a valid path.") + self.response_persistent_folder = response_persistent_folder + if not os.path.exists(self.response_persistent_folder): + os.makedirs(self.response_persistent_folder) + self.response_persistent_file = os.path.join(self.response_persistent_folder, f"{self.model_version}_response.json") + + if os.path.exists(self.response_persistent_file): + with open(self.response_persistent_file, "r") as f: + self.response_cache = json.load(f) + self.cache_mode = "resume" + else: + self.response_cache = {} + self.cache_mode = "start" + + accelerator = Accelerator() + if accelerator.num_processes > 1: + assert self.continual_mode is False, "Continual mode is not supported with distributed inference." + assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported." + self.accelerator = accelerator + if self.accelerator.is_local_main_process: + eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism") + self._rank = self.accelerator.local_process_index + self._world_size = self.accelerator.num_processes + else: + self.accelerator = accelerator + self._rank = self.accelerator.local_process_index + self._world_size = self.accelerator.num_processes + + self.device = self.accelerator.device + + self.modality = modality + + self.video_pool = [] + + def free_video(self): + for video in self.video_pool: + video.delete() + self.video_pool = [] + + def flatten(self, input): + new_list = [] + for i in input: + for j in i: + new_list.append(j) + return new_list + + def get_image_size(self, image): + # Create a BytesIO object to store the image bytes + img_byte_array = io.BytesIO() + + # Save the image to the BytesIO object + image.save(img_byte_array, format="PNG") + + # Get the size of the BytesIO object + img_size = img_byte_array.tell() + + return img_size + + def encode_video(self, video_path): + uploaded_obj = genai.upload_file(path=video_path) + time.sleep(5) + self.video_pool.append(uploaded_obj) + return uploaded_obj + + def convert_video(self, images): + for idx, img in enumerate(images): + if self.modality == "video" and isinstance(img, str): + try: + images[idx] = self.encode_video(img) + except Exception as e: + eval_logger.error(f"Error converting video: {str(e)}") + return images + + def generate_until(self, requests) -> List[str]: + res = [] + pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding") + + def get_uuid(task, split, doc_id): + return f"{task}___{split}___{doc_id}" + + for contexts, gen_kwargs, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]: + if self.continual_mode and self.cache_mode == "resume": + doc_uuid = get_uuid(task, split, doc_id) + if doc_uuid in self.response_cache: + content = self.response_cache[doc_uuid] + if content: + res.append(content) + pbar.update(1) + continue + + if "max_new_tokens" not in gen_kwargs: + gen_kwargs["max_new_tokens"] = 1024 + if "temperature" not in gen_kwargs: + gen_kwargs["temperature"] = 0 + + config = genai.GenerationConfig( + max_output_tokens=gen_kwargs["max_new_tokens"], + temperature=gen_kwargs["temperature"], + ) + + visuals = [doc_to_visual(self.task_dict[task][split][doc_id])] + visuals = self.flatten(visuals) + visuals = self.convert_video(visuals) + + message = [contexts] + visuals + + content = "" + + for attempt in range(5): + try: + content = self.model.generate_content( + message, + generation_config=config, 
+ safety_settings={ + HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE, + HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE, + HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE, + HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, + }, + ) + content = content.text + break + except Exception as e: + eval_logger.info(f"Attempt {attempt + 1} failed with error: {str(e)}") + if isinstance(e, ValueError): + try: + eval_logger.info(f"Prompt feed_back: {content.prompt_feedback}") + content = "" + break + except Exception: + pass + if attempt < 4 - 1: # If we have retries left, sleep and then continue to next attempt + time.sleep(NUM_SECONDS_TO_SLEEP) + else: # If this was the last attempt, log and return empty + eval_logger.error(f"All 5 attempts failed. Last error message: {str(e)}") + content = "" + res.append(content) + pbar.update(1) + + self.free_video() + + if self.continual_mode is True: # Cache the response + doc_uuid = get_uuid(task, split, doc_id) + self.response_cache[doc_uuid] = content + with open(self.response_persistent_file, "w") as f: + json.dump(self.response_cache, f) + + pbar.close() + return res + + def generate_until_multi_round(self, requests) -> List[str]: + raise NotImplementedError("TODO: Implement multi-round generation for Gemini API") + + def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: + # TODO + assert False, "Gemini API not support" diff --git a/lmms_eval/models/gpt4v.py b/lmms_eval/models/gpt4v.py new file mode 100755 index 0000000..261b139 --- /dev/null +++ b/lmms_eval/models/gpt4v.py @@ -0,0 +1,227 @@ +import base64 +import json +import os +import time +from copy import deepcopy +from io import BytesIO +from typing import List, Tuple + +import numpy as np +import requests as url_requests +from accelerate import Accelerator, DistributedType +from tqdm import tqdm + +from lmms_eval.api.instance import Instance +from lmms_eval.api.model import lmms +from lmms_eval.api.registry import register_model +from lmms_eval.models.model_utils.load_video import read_video_pyav_base64 + +try: + from decord import VideoReader, cpu +except ImportError: + pass + +from PIL import Image + +API_TYPE = os.getenv("API_TYPE", "openai") +# API_TYPE = "azure" +NUM_SECONDS_TO_SLEEP = 30 +from loguru import logger as eval_logger + +if API_TYPE == "openai": + API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions") + API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY") + headers = { + "Authorization": f"Bearer {API_KEY}", + "Content-Type": "application/json", + } +elif API_TYPE == "azure": + API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken") + API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY") + headers = { + "api-key": API_KEY, + "Content-Type": "application/json", + } + + +@register_model("gpt4v") +class GPT4V(lmms): + def __init__( + self, + # model_version: str = "gpt-4-vision-preview", + modality: str = "video", + max_frames_num: int = 32, + fps: float = None, + timeout: int = 120, + continual_mode: bool = False, + response_persistent_folder: str = None, + **kwargs, + ) -> None: + super().__init__() + # Manually set a image token for GPT4V so that we can search for it + # and split the text and image + # Here we just use the same token as llava for convenient + # self.model_version = model_version + self.modality = modality + self.max_frames_num = max_frames_num + self.image_token = "" + 
self.timeout = timeout + self.continual_mode = continual_mode + self.fps = fps + if self.continual_mode: + if response_persistent_folder is None: + raise ValueError("Continual mode requires a persistent path for the response. Please provide a valid path.") + + os.makedirs(response_persistent_folder, exist_ok=True) + self.response_persistent_folder = response_persistent_folder + self.response_persistent_file = os.path.join(self.response_persistent_folder, f"{self.model_version}_response.json") + + if os.path.exists(self.response_persistent_file): + with open(self.response_persistent_file, "r") as f: + self.response_cache = json.load(f) + self.cache_mode = "resume" + else: + self.response_cache = {} + self.cache_mode = "start" + + accelerator = Accelerator() + # assert self.batch_size_per_gpu == 1, "Llava currently does not support batched generation. See https://github.com/haotian-liu/LLaVA/issues/754. HF Llava also has this issue." + if accelerator.num_processes > 1: + assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported." + self.accelerator = accelerator + if self.accelerator.is_local_main_process: + eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism") + self._rank = self.accelerator.local_process_index + self._world_size = self.accelerator.num_processes + else: + self.accelerator = accelerator + self._rank = self.accelerator.local_process_index + self._world_size = self.accelerator.num_processes + + self.device = self.accelerator.device + + # Function to encode the image + def encode_image(self, image: Image): + output_buffer = BytesIO() + image.save(output_buffer, format="PNG") + byte_data = output_buffer.getvalue() + base64_str = base64.b64encode(byte_data).decode("utf-8") + return base64_str + + def flatten(self, input): + new_list = [] + for i in input: + for j in i: + new_list.append(j) + return new_list + + def generate_until(self, requests) -> List[str]: + res = [] + pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding") + + for contexts, gen_kwargs, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]: + if self.continual_mode is True and self.cache_mode == "resume": + doc_uuid = f"{task}___{split}___{doc_id}" + if doc_uuid in self.response_cache: + response_text = self.response_cache[doc_uuid] + if response_text: + res.append(response_text) + pbar.update(1) + continue + + visuals = [doc_to_visual(self.task_dict[task][split][doc_id])] + visuals = self.flatten(visuals) + imgs = [] # multiple images or frames for video + for visual in visuals: + if self.modality == "image": + img = self.encode_image(visual) + imgs.append(img) + elif self.modality == "video": + # frames = self.encode_video(visual, self.max_frames_num) + # imgs.extend(frames) + try: + frames = read_video_pyav_base64(visual, num_frm=self.max_frames_num, fps=self.fps) + imgs.extend(frames) + except Exception as e: + # Log the error and skip to the next visual + eval_logger.error(f"Error {e} in encoding video for {visual}") + continue # Skip this visual and continue with the others + + payload = {"messages": []} + if API_TYPE == "openai": + payload["model"] = self.model_version + + response_json = {"role": "user", "content": []} + # When there is no image token in the context, append the image to the text + if self.image_token not in contexts: + payload["messages"].append(deepcopy(response_json)) + 
payload["messages"][0]["content"].append({"type": "text", "text": contexts}) + for img in imgs: + payload["messages"][0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}}) + else: + contexts = contexts.split(self.image_token) + for idx, img in enumerate(imgs): + payload["messages"].append(deepcopy(response_json)) + payload["messages"][idx]["content"].append({"type": "text", "text": contexts[idx]}) + payload["messages"][idx]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}}) + + # If n image tokens are in the contexts + # contexts will be splitted into n+1 chunks + # Manually add it into the payload + payload["messages"].append(deepcopy(response_json)) + payload["messages"][-1]["content"].append({"type": "text", "text": contexts[-1]}) + + if "max_new_tokens" not in gen_kwargs: + gen_kwargs["max_new_tokens"] = 4096 + if gen_kwargs["max_new_tokens"] > 4096: + gen_kwargs["max_new_tokens"] = 4096 + if "temperature" not in gen_kwargs: + gen_kwargs["temperature"] = 0 + if "top_p" not in gen_kwargs: + gen_kwargs["top_p"] = None + if "num_beams" not in gen_kwargs: + gen_kwargs["num_beams"] = 1 + + payload["max_tokens"] = gen_kwargs["max_new_tokens"] + payload["temperature"] = gen_kwargs["temperature"] + + NUM_ATTEMPTS = 5 + + for attempt in range(NUM_ATTEMPTS): + try: + response = url_requests.post(API_URL, headers=headers, json=payload, timeout=self.timeout) + response_data = response.json() + + response_text = response_data["choices"][0]["message"]["content"].strip() + break # If successful, break out of the loop + + except Exception as e: + try: + error_msg = response.json() + except: + error_msg = "" + + eval_logger.info(f"Attempt {attempt + 1} failed with error: {str(e)}.\nReponse: {error_msg}") + if attempt < NUM_ATTEMPTS - 1: + time.sleep(NUM_SECONDS_TO_SLEEP) + else: # If this was the last attempt, log and return empty string + eval_logger.error(f"All {NUM_ATTEMPTS} attempts failed. 
Last error message: {str(e)}.\nResponse: {response.json()}") + response_text = "" + res.append(response_text) + pbar.update(1) + + if self.continual_mode is True: # Cache the response + doc_uuid = f"{task}___{split}___{doc_id}" + self.response_cache[doc_uuid] = response_text + with open(self.response_persistent_file, "w") as f: + json.dump(self.response_cache, f) + + pbar.close() + return res + + def generate_until_multi_round(self, requests) -> List[str]: + raise NotImplementedError("TODO: Implement multi-round generation for GPT4V") + + def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: + # TODO + assert False, "GPT4V not support" diff --git a/lmms_eval/models/idefics2.py b/lmms_eval/models/idefics2.py new file mode 100644 index 0000000..eac3499 --- /dev/null +++ b/lmms_eval/models/idefics2.py @@ -0,0 +1,234 @@ +import warnings +from typing import List, Optional, Tuple, Union + +import torch +from accelerate import Accelerator, DistributedType +from accelerate.state import AcceleratorState +from tqdm import tqdm +from transformers import AutoProcessor, Idefics2ForConditionalGeneration + +from lmms_eval import utils +from lmms_eval.api.instance import Instance +from lmms_eval.api.model import lmms +from lmms_eval.api.registry import register_model + +warnings.filterwarnings("ignore") + +from loguru import logger as eval_logger + +DEFAULT_IMAGE_TOKEN = "" +try: + import flash_attn + + best_fit_attn_implementation = "flash_attention_2" +except ImportError: + best_fit_attn_implementation = "eager" + + +@register_model("idefics2") +class Idefics2(lmms): + """ + Idefics2 Model for Hugging Face Transformers: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics2/modeling_idefics2.py + + Example usage: + + accelerate launch --num_processes=8 -m lmms_eval \ + --model idefics2 \ + --model_args pretrained=HuggingFaceM4/idefics2-8b \ + --tasks mme \ + --batch_size 1 \ + --output_path ./logs/ \ + --log_samples + """ + + def __init__( + self, + pretrained: str = "HuggingFaceM4/idefics2-8b", + revision: str = "main", + device: str = "cuda", + dtype: Optional[Union[str, torch.dtype]] = "float16", + batch_size: int = 1, + trust_remote_code: Optional[bool] = False, + attn_implementation: Optional[str] = best_fit_attn_implementation, + device_map: str = "", + use_cache: bool = True, + do_image_splitting: bool = False, + **kwargs, + ) -> None: + super().__init__() + # Do not use kwargs for now + assert kwargs == {}, f"Unexpected kwargs: {kwargs}" + + accelerator = Accelerator() + if accelerator.num_processes > 1 and device_map == "": + self._device = torch.device(f"cuda:{accelerator.local_process_index}") + self.device_map = f"cuda:{accelerator.local_process_index}" + else: + self._device = torch.device(device) + self.device_map = device_map + if isinstance(dtype, str) and dtype != "auto": + dtype = getattr(torch, dtype) + self._model = Idefics2ForConditionalGeneration.from_pretrained(pretrained, revision=revision, torch_dtype=dtype, device_map=self.device_map, trust_remote_code=trust_remote_code, attn_implementation=attn_implementation) + self._processor = AutoProcessor.from_pretrained(pretrained, do_image_splitting=do_image_splitting, revision=revision, trust_remote_code=trust_remote_code) + + self._tokenizer = self._processor.tokenizer + self._config = self._model.config + self.batch_size_per_gpu = int(batch_size) + self.use_cache = use_cache + if accelerator.num_processes > 1 and device_map == "": + assert accelerator.distributed_type in 
[DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported."
+            # If you want to use DistributedType.DEEPSPEED, you have to run accelerate config before using the model
+            # Also, you have to select zero stage 0 (equivalent to DDP) in order to make the prepare model works
+            # I tried to set different parameters in the kwargs to let default zero 2 stage works, but it didn't work.
+            if accelerator.distributed_type == DistributedType.DEEPSPEED:
+                kwargs = {
+                    "train_micro_batch_size_per_gpu": self.batch_size_per_gpu,
+                    "train_batch_size": self.batch_size_per_gpu * accelerator.num_processes,
+                }
+                AcceleratorState().deepspeed_plugin.deepspeed_config_process(must_match=True, **kwargs)
+                eval_logger.info("Detected that you are using DistributedType.DEEPSPEED. Make sure you run `accelerate config` and set zero stage to 0")
+            if accelerator.distributed_type == DistributedType.FSDP or accelerator.distributed_type == DistributedType.DEEPSPEED:
+                self._model = accelerator.prepare(self.model)
+            else:
+                self._model = accelerator.prepare_model(self.model, evaluation_mode=True)
+            self.accelerator = accelerator
+            if self.accelerator.is_local_main_process:
+                eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism")
+            self._rank = self.accelerator.local_process_index
+            self._world_size = self.accelerator.num_processes
+        elif accelerator.num_processes == 1 and device_map == "auto":
+            eval_logger.info(f"Using {accelerator.num_processes} devices with pipeline parallelism")
+            self._rank = 0
+            self._world_size = 1
+        else:
+            eval_logger.info(f"Using single device: {self._device}")
+            self.model.to(self._device)
+            self._rank = 0
+            self._world_size = 1
+
+    @property
+    def config(self):
+        # return the associated transformers.AutoConfig for the given pretrained model.
+ return self._config + + @property + def tokenizer(self): + return self._tokenizer + + @property + def model(self): + # returns the model, unwrapping it if using Accelerate + if hasattr(self, "accelerator"): + return self.accelerator.unwrap_model(self._model) + else: + return self._model + + @property + def eot_token_id(self): + # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence* + return self.tokenizer.eos_token_id + + @property + def max_length(self): + return self._max_length + + @property + def batch_size(self): + return self.batch_size_per_gpu + + @property + def device(self): + return self._device + + @property + def rank(self): + return self._rank + + @property + def world_size(self): + return self._world_size + + def tok_encode(self, string: str, left_truncate_len=None, add_special_tokens=None) -> List[int]: + """ """ + add_special_tokens = False if add_special_tokens is None else add_special_tokens + encoding = self.tokenizer.encode(string, add_special_tokens=add_special_tokens) + # left-truncate the encoded context to be at most `left_truncate_len` tokens long + if left_truncate_len: + encoding = encoding[-left_truncate_len:] + return encoding + + def tok_decode(self, tokens): + return self.tokenizer.decode(tokens) + + def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: + raise NotImplementedError("Loglikelihood is not implemented for Idefics2 model") + + def flatten(self, input): + new_list = [] + for i in input: + for j in i: + new_list.append(j) + return new_list + + def generate_until(self, requests: List[Instance]) -> List[str]: + res = [] + + def _collate(x): + # the negative sign on len(toks) sorts descending - this has a few advantages: + # - time estimates will always be over not underestimates, which is more useful for planning + # - to know the size of a batch when going through the list, you know the first one is always the batch + # padded context length. this is useful to simplify the batching logic and more importantly to make + # automatic adaptive batches much much easier to implement + # - any OOMs will happen right away rather than near the end + toks = self.tok_encode(x[0]) + return -len(toks), x[0] + + # we group requests by their generation_kwargs, + # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling + # in the same batch. + re_ords = utils.Collator([reg.args for reg in requests], _collate, grouping=True) + chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None) + num_iters = len(requests) // self.batch_size if len(requests) % self.batch_size == 0 else len(requests) // self.batch_size + 1 + pbar = tqdm(total=num_iters, disable=(self.rank != 0), desc="Model Responding") + for chunk in chunks: + contexts, all_gen_kwargs, doc_to_visuals, doc_id, tasks, splits = zip(*chunk) + visuals = [doc_to_visual(self.task_dict[task][split][ids]) for ids, task, split, doc_to_visual in zip(doc_id, tasks, splits, doc_to_visuals)] + # we assume all gen kwargs in the batch are the same + # this is safe to assume because the `grouper` object ensures it. 
+ gen_kwargs = all_gen_kwargs[0] + # + until = gen_kwargs.pop("until", None) + image_aspect_ratio = gen_kwargs.pop("image_aspect_ratio", None) + if "max_new_tokens" not in gen_kwargs: + gen_kwargs["max_new_tokens"] = 1024 + if "temperature" not in gen_kwargs: + gen_kwargs["temperature"] = 0 + + prompts = [] + for context, visual in zip(contexts, visuals): + content = [] + if DEFAULT_IMAGE_TOKEN not in context: + for image in visual: + content.append({"type": "image"}) + content.append({"type": "text", "text": context}) + message = [{"role": "user", "content": content}] + prompt = self._processor.apply_chat_template(message, add_generation_prompt=True) + prompts.append(prompt) + inputs = self._processor(text=prompts, images=visuals, padding=True, return_tensors="pt") + inputs = {k: v.to(self.device) for k, v in inputs.items()} + output_ids = self.model.generate(**inputs, **gen_kwargs) + # only retain the generated text + for output_id, input_id in zip(output_ids, inputs["input_ids"]): + generated_id = output_id[len(input_id) :] + generated_text = self.tokenizer.decode(generated_id, skip_special_tokens=True) + + res.append(generated_text) + pbar.update(1) + # reorder this group of results back to original unsorted form + res = re_ords.get_original(res) + + pbar.close() + return res + + def generate_until_multi_round(self, requests) -> List[str]: + raise NotImplementedError("TODO: Implement multi-round generation for Idefics2") diff --git a/lmms_eval/models/instructblip.py b/lmms_eval/models/instructblip.py new file mode 100755 index 0000000..dc3b092 --- /dev/null +++ b/lmms_eval/models/instructblip.py @@ -0,0 +1,230 @@ +import copy +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import transformers +from accelerate import Accelerator, DistributedType +from accelerate.state import AcceleratorState +from tqdm import tqdm +from transformers import InstructBlipForConditionalGeneration, InstructBlipProcessor + +from lmms_eval import utils +from lmms_eval.api.instance import Instance +from lmms_eval.api.model import lmms +from lmms_eval.api.registry import register_model +from lmms_eval.tasks.mmmu.utils_group_img import process_images +from lmms_eval.utils import stop_sequences_criteria + +warnings.filterwarnings("ignore") + +from loguru import logger as eval_logger + + +@register_model("instructblip") +class InstructBLIP(lmms): + """ + InstructBLIP Model + """ + + def __init__( + self, + pretrained: str = "Salesforce/instructblip-vicuna-7b", + device: Optional[str] = "cuda", + dtype: Optional[Union[str, torch.dtype]] = "auto", + batch_size: Optional[Union[int, str]] = 1, + **kwargs, + ) -> None: + super().__init__() + # Do not use kwargs for now + assert kwargs == {}, f"Unexpected kwargs: {kwargs}" + + accelerator = Accelerator() + if accelerator.num_processes > 1: + self._device = torch.device(f"cuda:{accelerator.local_process_index}") + else: + self._device = device + self._model = InstructBlipForConditionalGeneration.from_pretrained(pretrained, device_map=self._device) + self._image_processor = InstructBlipProcessor.from_pretrained(pretrained) + self._tokenizer = self._image_processor.tokenizer + self._config = self._model.config + self.model.eval() + self.model.tie_weights() + self.batch_size_per_gpu = int(batch_size) + if accelerator.num_processes > 1: + assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported." 
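Idefics2's decode step above slices the prompt tokens off each generated sequence before decoding, since `generate` returns the prompt and the continuation together. The same pattern, factored into a standalone helper purely for illustration (the `strip_prompt_and_decode` name is not part of the patch):

```python
def strip_prompt_and_decode(output_ids, input_ids, tokenizer):
    # output_ids/input_ids: parallel sequences of token ids; only the tokens
    # produced after the prompt are kept and decoded.
    texts = []
    for output_id, input_id in zip(output_ids, input_ids):
        generated_id = output_id[len(input_id):]
        texts.append(tokenizer.decode(generated_id, skip_special_tokens=True))
    return texts
```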
+ # If you want to use DistributedType.DEEPSPEED, you have to run accelerate config before using the model + # Also, you have to select zero stage 0 (equivalent to DDP) in order to make the prepare model works + # I tried to set different parameters in the kwargs to let default zero 2 stage works, but it didn't work. + if accelerator.distributed_type == DistributedType.DEEPSPEED: + kwargs = { + "train_micro_batch_size_per_gpu": self.batch_size_per_gpu, + "train_batch_size": self.batch_size_per_gpu * accelerator.num_processes, + } + AcceleratorState().deepspeed_plugin.deepspeed_config_process(must_match=True, **kwargs) + eval_logger.info("Detected that you are using DistributedType.DEEPSPEED. Make sure you run `accelerate config` and set zero stage to 0") + if accelerator.distributed_type == DistributedType.FSDP or accelerator.distributed_type == DistributedType.DEEPSPEED: + self._model = accelerator.prepare(self.model) + else: + self._model = accelerator.prepare_model(self.model, evaluation_mode=True) + self.accelerator = accelerator + if self.accelerator.is_local_main_process: + eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism") + self._rank = self.accelerator.local_process_index + self._world_size = self.accelerator.num_processes + else: + self.model.to(self._device) + self._rank = 0 + self._word_size = 1 + + @property + def config(self): + # return the associated transformers.AutoConfig for the given pretrained model. + return self._config + + @property + def tokenizer(self): + return self._tokenizer + + @property + def model(self): + # returns the model, unwrapping it if using Accelerate + if hasattr(self, "accelerator"): + return self.accelerator.unwrap_model(self._model) + else: + return self._model + + @property + def eot_token_id(self): + # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence* + return self.tokenizer.eos_token_id + + @property + def max_length(self): + return self._max_length + + @property + def batch_size(self): + return self.batch_size_per_gpu + + @property + def device(self): + return self._device + + @property + def rank(self): + return self._rank + + @property + def world_size(self): + return self._world_size + + def tok_encode(self, string: str, left_truncate_len=None, add_special_tokens=None) -> List[int]: + """ """ + add_special_tokens = False if add_special_tokens is None else add_special_tokens + encoding = self.tokenizer.encode(string, add_special_tokens=add_special_tokens) + # left-truncate the encoded context to be at most `left_truncate_len` tokens long + if left_truncate_len: + encoding = encoding[-left_truncate_len:] + return encoding + + def tok_decode(self, tokens): + return self.tokenizer.decode(tokens) + + def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: + # TODO + assert False, "We have not implemented this function for InstructBLIP yet" + + def flatten(self, input): + new_list = [] + for i in input: + for j in i: + new_list.append(j) + return new_list + + def generate_until(self, requests: List[Instance]) -> List[str]: + res = [] + + def _collate(x): + # the negative sign on len(toks) sorts descending - this has a few advantages: + # - time estimates will always be over not underestimates, which is more useful for planning + # - to know the size of a batch when going through the list, you know the first one is always the batch + # padded context length. 
this is useful to simplify the batching logic and more importantly to make + # automatic adaptive batches much much easier to implement + # - any OOMs will happen right away rather than near the end + toks = self.tok_encode(x[0]) + return -len(toks), x[0] + + # we group requests by their generation_kwargs, + # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling + # in the same batch. + re_ords = utils.Collator([reg.args for reg in requests], _collate, grouping=True) + chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None) + num_iters = len(requests) // self.batch_size if len(requests) % self.batch_size == 0 else len(requests) // self.batch_size + 1 + pbar = tqdm(total=num_iters, disable=(self.rank != 0), desc="Model Responding") + for chunk in chunks: + contexts, all_gen_kwargs, doc_to_visual, doc_id, task, split = zip(*chunk) + task = task[0] + split = split[0] + visuals = [doc_to_visual[0](self.task_dict[task][split][ids]) for ids in doc_id] + visuals = self.flatten(visuals) + # we assume all gen kwargs in the batch are the same + # this is safe to assume because the `grouper` object ensures it. + gen_kwargs = all_gen_kwargs[0] + + # Set default values for until and max_new_tokens + until = [self.tok_decode(self.eot_token_id)] + + # Update values from gen_kwargs if present + if "until" in gen_kwargs: + until = gen_kwargs.pop("until") + if isinstance(until, str): + until = [until] + elif not isinstance(until, list): + raise ValueError(f"Expected `gen_kwargs['until']` to be of type Union[str,list] but got {type(until)}") + assert self.batch_size_per_gpu == 1, "Do not support batch_size_per_gpu > 1 for now" + context = contexts[0] + if "" in context: + # instruct blip does not expect the tag + context = context.replace("", "") + # Set trunction equals true here, the max length for qformer tokenizer is 512 + # if not truncate, some questions will cause size mismatch + # The transformer implementation can't handle multi images for blip + # Concat it into one image + if len(visuals) > 1: + visuals = [process_images(visuals)] + inputs = self._image_processor(images=visuals, text=context, return_tensors="pt", truncation=True).to(self.device) + + gen_kwargs["image_sizes"] = [visuals[idx].size for idx in range(len(visuals))] + if "max_new_tokens" not in gen_kwargs: + gen_kwargs["max_new_tokens"] = 1024 + if "temperature" not in gen_kwargs: + gen_kwargs["temperature"] = 0 + if "top_p" not in gen_kwargs: + gen_kwargs["top_p"] = None + if "num_beams" not in gen_kwargs: + gen_kwargs["num_beams"] = 1 + try: + cont = self.model.generate( + **inputs, + do_sample=True if gen_kwargs["temperature"] > 0 else False, + temperature=gen_kwargs["temperature"], + top_p=gen_kwargs["top_p"], + num_beams=gen_kwargs["num_beams"], + max_new_tokens=gen_kwargs["max_new_tokens"], + ) + except Exception as e: + eval_logger.error(f"Error {e} in generating") + cont = "" + text_outputs = self.tokenizer.batch_decode(cont, skip_special_tokens=True)[0].strip() + res.append(text_outputs) + self.cache_hook.add_partial("generate_until", (context, gen_kwargs), text_outputs) + pbar.update(1) + # reorder this group of results back to original unsorted form + res = re_ords.get_original(res) + + pbar.close() + return res + + def generate_until_multi_round(self, requests) -> List[str]: + raise NotImplementedError("TODO: Implement multi-round generation for InstructBlip") diff --git a/lmms_eval/models/internvideo2.py b/lmms_eval/models/internvideo2.py new file mode 100644 index 0000000..3d2bbca --- /dev/null 
+++ b/lmms_eval/models/internvideo2.py @@ -0,0 +1,366 @@ +import logging +import os +from typing import List, Tuple + +import decord +import numpy as np +import torch +import torchvision.transforms as T +from accelerate import Accelerator, DistributedType +from decord import VideoReader, cpu + +decord.bridge.set_bridge("torch") +import torch.nn.functional as F +from PIL import Image +from tqdm import tqdm +from transformers import AutoModel, AutoTokenizer + +from lmms_eval.api.instance import Instance +from lmms_eval.api.model import lmms +from lmms_eval.api.registry import register_model + +eval_logger = logging.getLogger("eval_logger") + + +from datetime import timedelta + +from accelerate.state import AcceleratorState +from accelerate.utils import InitProcessGroupKwargs + +DEFAULT_GEN_KWARGS = dict( + num_beams=1, + max_new_tokens=1024, + do_sample=False, +) + +# def get_index(num_frames, num_segments): +# seg_size = float(num_frames - 1) / num_segments +# start = int(seg_size / 2) +# offsets = np.array([ +# start + int(np.round(seg_size * idx)) for idx in range(num_segments) +# ]) +# return offsets + + +def get_index(max_frame, num_segments, fps, first_idx=0, bound=None): + if bound: + start, end = bound[0], bound[1] + if start is None: + start, end = -100000, 100000 + else: + start, end = -100000, 100000 + start_idx = max(first_idx, round(start * fps)) + end_idx = min(round(end * fps), max_frame) + seg_size = float(end_idx - start_idx) / num_segments + frame_indices = np.array([int(start_idx + (seg_size / 2) + np.round(seg_size * idx)) for idx in range(num_segments)]) + return frame_indices + + +def load_image(image_path, resolution=224, hd_num=6): + image = Image.open(image_path).convert("RGB") + image_tensor = T.PILToTensor()(image).unsqueeze(0) + image_tensor = HD_transform_no_padding(image_tensor.float(), image_size=resolution, hd_num=hd_num) + T_, C, H, W = image_tensor.shape + + mean = (0.485, 0.456, 0.406) + std = (0.229, 0.224, 0.225) + + transform = T.Compose([T.Lambda(lambda x: x.float().div(255.0)), T.Normalize(mean, std)]) + image_tensor = transform(image_tensor).cuda() + + sub_img = image_tensor.reshape(1, T_, 3, H // resolution, resolution, W // resolution, resolution).permute(0, 3, 5, 1, 2, 4, 6).reshape(-1, T_, 3, resolution, resolution).contiguous() + + glb_img = F.interpolate(image_tensor.float(), size=(resolution, resolution), mode="bicubic", align_corners=False).to(sub_img.dtype).unsqueeze(0) + + image_tensor = torch.cat([sub_img, glb_img]) # .unsqueeze(0) + return image_tensor + + +def load_video(video_path, num_segments=16, return_msg=False, resolution=224, hd_num=6, padding=False): + vr = VideoReader(video_path, ctx=cpu(0), num_threads=1) + num_frames = len(vr) - 1 + + frame_indices = get_index(max_frame=num_frames, num_segments=num_segments, fps=float(vr.get_avg_fps()), first_idx=0, bound=None) + mean = (0.485, 0.456, 0.406) + std = (0.229, 0.224, 0.225) + + transform = T.Compose([T.Lambda(lambda x: x.float().div(255.0)), T.Normalize(mean, std)]) + + frames = vr.get_batch(frame_indices) + frames = frames.permute(0, 3, 1, 2) + + if padding: + frames = HD_transform_padding(frames.float(), image_size=resolution, hd_num=hd_num) + else: + frames = HD_transform_no_padding(frames.float(), image_size=resolution, hd_num=hd_num) + + frames = transform(frames) + T_, C, H, W = frames.shape + + sub_img = frames.reshape(1, T_, 3, H // resolution, resolution, W // resolution, resolution).permute(0, 3, 5, 1, 2, 4, 6).reshape(-1, T_, 3, resolution, resolution).contiguous() + + 
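(Aside: `get_index` above picks one frame from the middle of each of `num_segments` equal chunks of the clip. A standalone sketch of that sampling rule, ignoring the fps conversion and bound clamping the real helper also applies:)

```python
# Simplified frame sampling: take the midpoint of each of num_segments equal chunks.
import numpy as np

def sample_frame_indices(num_frames: int, num_segments: int) -> np.ndarray:
    seg_size = float(num_frames - 1) / num_segments
    return np.array([int(seg_size / 2 + np.round(seg_size * i)) for i in range(num_segments)])

print(sample_frame_indices(num_frames=300, num_segments=8))
# eight roughly evenly spaced indices across the clip
```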
glb_img = F.interpolate(frames.float(), size=(resolution, resolution), mode="bicubic", align_corners=False).to(sub_img.dtype).unsqueeze(0) + + frames = torch.cat([sub_img, glb_img]).unsqueeze(0) + + if return_msg: + fps = float(vr.get_avg_fps()) + sec = ", ".join([str(round(f / fps, 1)) for f in frame_indices]) + # " " should be added in the start and end + msg = f"The video contains {len(frame_indices)} frames sampled at {sec} seconds." + return frames, msg + else: + return frames + + +def HD_transform_padding(frames, image_size=224, hd_num=6): + def _padding_224(frames): + _, _, H, W = frames.shape + tar = int(np.ceil(H / 224) * 224) + top_padding = (tar - H) // 2 + bottom_padding = tar - H - top_padding + left_padding = 0 + right_padding = 0 + + padded_frames = F.pad(frames, pad=[left_padding, right_padding, top_padding, bottom_padding], mode="constant", value=255) + return padded_frames + + _, _, H, W = frames.shape + trans = False + if W < H: + frames = frames.flip(-2, -1) + trans = True + width, height = H, W + else: + width, height = W, H + + ratio = width / height + scale = 1 + while scale * np.ceil(scale / ratio) <= hd_num: + scale += 1 + scale -= 1 + new_w = int(scale * image_size) + new_h = int(new_w / ratio) + + resized_frames = F.interpolate(frames, size=(new_h, new_w), mode="bicubic", align_corners=False) + padded_frames = _padding_224(resized_frames) + + if trans: + padded_frames = padded_frames.flip(-2, -1) + + return padded_frames + + +def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size): + best_ratio_diff = float("inf") + best_ratio = (1, 1) + area = width * height + for ratio in target_ratios: + target_aspect_ratio = ratio[0] / ratio[1] + ratio_diff = abs(aspect_ratio - target_aspect_ratio) + if ratio_diff < best_ratio_diff: + best_ratio_diff = ratio_diff + best_ratio = ratio + elif ratio_diff == best_ratio_diff: + if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]: + best_ratio = ratio + return best_ratio + + +def HD_transform_no_padding(frames, image_size=224, hd_num=6, fix_ratio=(2, 1)): + min_num = 1 + max_num = hd_num + _, _, orig_height, orig_width = frames.shape + aspect_ratio = orig_width / orig_height + + # calculate the existing video aspect ratio + target_ratios = set((i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if i * j <= max_num and i * j >= min_num) + target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) + + # find the closest aspect ratio to the target + if fix_ratio: + target_aspect_ratio = fix_ratio + else: + target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio, target_ratios, orig_width, orig_height, image_size) + + # calculate the target width and height + target_width = image_size * target_aspect_ratio[0] + target_height = image_size * target_aspect_ratio[1] + blocks = target_aspect_ratio[0] * target_aspect_ratio[1] + + # resize the frames + resized_frame = F.interpolate(frames, size=(target_height, target_width), mode="bicubic", align_corners=False) + return resized_frame + + +@register_model("InternVideo2") +class InternVideo2(lmms): + def __init__( + self, + pretrained: str = "OpenGVLab/InternVideo2_chat_8B_HD", + modality: str = "video", + device: str = "cuda:0", + device_map: str = "cuda:0", + batch_size: str = "1", + num_segments: str = "8", + hd_num: str = "6", + **kwargs, + ): + super().__init__() + self.path = pretrained + self.instruction = "Carefully watch the video and pay attention to the cause and sequence of events, the 
detail and movement of objects, and the action and pose of persons.\n" + + self._tokenizer = AutoTokenizer.from_pretrained(self.path, trust_remote_code=True, use_fast=False) + self._model = AutoModel.from_pretrained(self.path, torch_dtype=torch.bfloat16, trust_remote_code=True).eval().cuda() + batch_size = int(batch_size) + self.num_segments = int(num_segments) + self.hd_num = int(hd_num) + assert batch_size == 1, f"Batch size should be 1 for InternVideo2, but got {batch_size}." + self.batch_size_per_gpu = batch_size + accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52)) + accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs]) + self.accelerator = accelerator + if accelerator.num_processes > 1: + self._device = torch.device(f"cuda:{accelerator.local_process_index}") + self.device_map = f"cuda:{accelerator.local_process_index}" + elif accelerator.num_processes == 1 and device_map == "auto": + self._device = torch.device(device) + self.device_map = device_map + else: + self._device = torch.device(f"cuda:{accelerator.local_process_index}") + self.device_map = f"cuda:{accelerator.local_process_index}" + + if accelerator.num_processes > 1: + assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported." + # If you want to use DistributedType.DEEPSPEED, you have to run accelerate config before using the model + # Also, you have to select zero stage 0 (equivalent to DDP) in order to make the prepare model works + # I tried to set different parameters in the kwargs to let default zero 2 stage works, but it didn't work. + if accelerator.distributed_type == DistributedType.DEEPSPEED: + kwargs = { + "train_micro_batch_size_per_gpu": self.batch_size_per_gpu, + "train_batch_size": self.batch_size_per_gpu * accelerator.num_processes, + } + AcceleratorState().deepspeed_plugin.deepspeed_config_process(must_match=True, **kwargs) + eval_logger.info("Detected that you are using DistributedType.DEEPSPEED. Make sure you run `accelerate config` and set zero stage to 0") + + if accelerator.distributed_type == DistributedType.FSDP or accelerator.distributed_type == DistributedType.DEEPSPEED: + self._model = accelerator.prepare(self.model) + else: + self._model = accelerator.prepare_model(self.model, evaluation_mode=True) + self.accelerator = accelerator + if self.accelerator.is_local_main_process: + eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism") + self._rank = self.accelerator.local_process_index + self._world_size = self.accelerator.num_processes + elif accelerator.num_processes == 1 and device_map == "auto": + eval_logger.info(f"Using {accelerator.num_processes} devices with tensor parallelism") + self._rank = 0 + self._word_size = 1 + else: + eval_logger.info(f"Using single device: {self._device}") + self.model.to(self._device) + self._rank = 0 + self._world_size = 1 + + self.modality = modality + + @property + def config(self): + # return the associated transformers.AutoConfig for the given pretrained model. 
+ return self._config + + @property + def tokenizer(self): + return self._tokenizer + + @property + def model(self): + # returns the model, unwrapping it if using Accelerate + if hasattr(self, "accelerator"): + return self.accelerator.unwrap_model(self._model) + else: + return self._model + + @property + def batch_size(self): + return self.batch_size_per_gpu + + @property + def device(self): + return self._device + + @property + def rank(self): + return self._rank + + @property + def world_size(self): + return self._world_size + + def flatten(self, input): + new_list = [] + for i in input: + for j in i: + new_list.append(j) + return new_list + + def generate_until(self, requests) -> List[str]: + res = [] + pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding") + + for contexts, gen_kwargs, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]: + if "until" in gen_kwargs: + gen_kwargs.pop("until") + for k, v in DEFAULT_GEN_KWARGS.items(): + if k not in gen_kwargs: + gen_kwargs[k] = v + + pop_keys = [] + for k, v in gen_kwargs.items(): + if k not in DEFAULT_GEN_KWARGS: + pop_keys.append(k) + + for k in pop_keys: + gen_kwargs.pop(k) + + visuals = [doc_to_visual(self.task_dict[task][split][doc_id])] + visuals = self.flatten(visuals) + if self.modality == "image": + image_path = visuals[0] + pixel_values = load_image(image_path, resolution=224, hd_num=self.hd_num) + pixel_values = pixel_values.to(torch.bfloat16).cuda() + question = contexts + response, history = self.model.chat(self.tokenizer, msg="", user_prompt=question, media_type="image", media_tensor=pixel_values, instruction=None, chat_history=[], return_history=True, **gen_kwargs) + elif self.modality == "video": + assert len(visuals) == 1, f"Only one video is supported, but got {len(visuals)} videos. [META-INFO]{visuals}" + video_path = visuals[0] + if "mvbench" in task: + answer_prompt = "Best Option:(" + else: + answer_prompt = None + pixel_values = load_video(video_path, num_segments=self.num_segments, return_msg=False, resolution=224, hd_num=self.hd_num) + pixel_values = pixel_values.to(torch.bfloat16).cuda() + question = self.instruction + contexts + response, history = self.model.chat( + self.tokenizer, + msg="", + user_prompt=question, + media_type="video", + media_tensor=pixel_values, + instruction=self.instruction, + chat_history=[], + return_history=True, + generation_config=gen_kwargs, + answer_prompt=answer_prompt, + debug_conv=False, + ) + res.append(response) + pbar.update(1) + pbar.close() + return res + + def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: + assert False, "Not implemented yet." 
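(Aside: `generate_until` above first strips every key that `model.chat` would not understand and then back-fills `DEFAULT_GEN_KWARGS`. The same whitelist-and-default pattern in isolation:)

```python
# Sketch of the kwargs sanitising used above: keep only whitelisted keys,
# then fill in defaults for anything the task config did not set.
DEFAULT_GEN_KWARGS = dict(num_beams=1, max_new_tokens=1024, do_sample=False)

def sanitise_gen_kwargs(gen_kwargs: dict) -> dict:
    cleaned = {k: v for k, v in gen_kwargs.items() if k in DEFAULT_GEN_KWARGS}
    for k, v in DEFAULT_GEN_KWARGS.items():
        cleaned.setdefault(k, v)
    return cleaned

print(sanitise_gen_kwargs({"until": ["\n"], "max_new_tokens": 64, "temperature": 0}))
# -> {'max_new_tokens': 64, 'num_beams': 1, 'do_sample': False}
```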
+ + def generate_until_multi_round(self, requests) -> List[str]: + raise NotImplementedError("TODO: Implement multi-round generation for InternVideo2") diff --git a/lmms_eval/models/internvl.py b/lmms_eval/models/internvl.py new file mode 100644 index 0000000..093aa60 --- /dev/null +++ b/lmms_eval/models/internvl.py @@ -0,0 +1,496 @@ +import math +import os +import subprocess +from datetime import timedelta +from pathlib import Path +from typing import List, Optional, Tuple, Union + +import numpy as np +import requests +import torch +from accelerate import Accelerator, DistributedType, InitProcessGroupKwargs +from accelerate.state import AcceleratorState +from huggingface_hub import snapshot_download +from PIL import Image +from tqdm import tqdm +from transformers import AutoConfig + +from lmms_eval import utils +from lmms_eval.api.instance import Instance +from lmms_eval.api.model import lmms +from lmms_eval.api.registry import register_model +from lmms_eval.utils import stop_sequences_criteria + +wd = Path(__file__).parent.parent.parent.resolve() +import sys + +sys.path.append(os.path.join(str(wd), "InternVL", "internvl_chat")) +from loguru import logger as eval_logger + +if not hasattr(eval_logger, "internvl_warning_logged"): + eval_logger.internvl_warning_logged = False + +try: + from internvl.model.internlm2.modeling_internlm2 import InternLM2ForCausalLM + from internvl.model.internvl_chat import InternVLChatModel + from internvl.model.internvl_chat.configuration_internvl_chat import ( + InternVLChatConfig, + ) + from internvl.model.internvl_chat.modeling_intern_vit import InternVisionModel + from internvl.train.dataset import build_transform, dynamic_preprocess +except ImportError: + eval_logger.debug("InternVL is not installed. Please install InternVL to use this model.") + if not eval_logger.internvl_warning_logged: + eval_logger.debug("InternVL is not installed. Please install InternVL to use this model.") + eval_logger.internvl_warning_logged = True + +import re +import warnings +from typing import Any, List, Optional, Tuple, Union + +import torch.utils.checkpoint +from huggingface_hub import snapshot_download +from peft import LoraConfig, get_peft_model +from torch import nn +from torch.nn import CrossEntropyLoss +from transformers import ( + AutoModel, + AutoTokenizer, + GenerationConfig, + LlamaForCausalLM, + LlamaTokenizer, +) +from transformers.modeling_outputs import CausalLMOutputWithPast +from transformers.modeling_utils import PreTrainedModel + + +@register_model("internvl") +class InternVLChat(lmms): + # config_class = InternVLChatConfig + main_input_name = "pixel_values" + _no_split_modules = ["InternVisionEncoderLayer", "LlamaDecoderLayer"] + + """ + 0. Install lmms-eval + cd lmms-eval + pip install -e . + + How to Install InternVL: + 1. Clone the InternVL repository: + git clone https://github.com/OpenGVLab/InternVL.git + + 2. Install the requirements: + pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118 + + 3. Install flash-attn==2.3.6: + pip install flash-attn==2.3.6 --no-build-isolation + """ + + """ + How to download the pretrained model: + 1. Download the pretrained model from hugginface: + cd pretrained/ + # pip install -U huggingface_hub + huggingface-cli download --resume-download --local-dir-use-symlinks False OpenGVLab/InternVL-Chat-V1-5 --local-dir InternVL-Chat-V1-5 + + 2. 
the pretrained model should be in the following directory: + pretrained + ā””ā”€ā”€ InternVL-Chat-V1-5 + """ + + # + # The above steps can be optional, I add snapshot download, so now can just use hf repo_id + # model_args pretrained=OpenGVLab/InternVL-Chat-V1-5 + # + + """ + InternVL-Chat-V1-5 Model for OpenGVLab https://github.com/OpenGVLab/InternVL/blob/main/internvl_chat/internvl/model/internvl_chat/modeling_internvl_chat.py + Example usage: + + accelerate launch --num_processes=8 --main_process_port 12345 -m lmms_eval \ + --model internvl \ + --model_args pretrained=OpenGVLab/InternVL-Chat-V1-5 \ + --tasks llava_wilder_small \ + --batch_size 1 \ + --output_path ./logs/ \ + --log_samples + """ + + def __init__( + self, + config=None, + pretrained: str = "OpenGVLab/InternVL-Chat-V1-5", + truncation: Optional[bool] = True, + device: Optional[str] = "cuda:0", + dtype: Optional[Union[str, torch.dtype]] = "auto", + batch_size: Optional[Union[int, str]] = 1, + trust_remote_code: Optional[bool] = False, + revision=None, + device_map="cuda:0", + conv_template="vicuna_v1", + use_cache=True, + truncate_context=False, # whether to truncate the context in generation, set it False for LLaVA-1.6 + customized_config=None, # ends in json + dynamic=True, + load_in_8bit=False, + vision_model=None, + language_model=None, + max_num=12, + **kwargs, + ) -> None: + super().__init__() + + assert kwargs == {}, f"Unexpected kwargs: {kwargs}" + + accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52)) + accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs]) + if accelerator.num_processes > 1: + self._device = torch.device(f"cuda:{accelerator.local_process_index}") + self.device_map = f"cuda:{accelerator.local_process_index}" + elif accelerator.num_processes == 1 and device_map == "auto": + self._device = torch.device(device) + self.device_map = device_map + else: + self._device = torch.device(f"cuda:{accelerator.local_process_index}") + self.device_map = f"cuda:{accelerator.local_process_index}" + + self.dynamic = dynamic # dynamic image_size + self.max_num = max_num + if accelerator.is_main_process: + cache_dir = snapshot_download(repo_id=pretrained, cache_dir="cache_dir", local_dir="cache_dir", local_dir_use_symlinks=False) + accelerator.wait_for_everyone() + # So what I did is that I let main process to download the repo, and then + # other process can just simply read from this repo + cache_dir = snapshot_download(repo_id=pretrained, cache_dir="cache_dir", local_dir="cache_dir", local_dir_use_symlinks=False) + config = InternVLChatConfig.from_pretrained(cache_dir) + tokenizer = AutoTokenizer.from_pretrained(cache_dir, trust_remote_code=True, use_fast=False) + model = InternVLChatModel.from_pretrained(cache_dir, low_cpu_mem_usage=True, config=config, torch_dtype=torch.bfloat16, load_in_8bit=load_in_8bit).eval() + if not load_in_8bit: + model = model.cuda() + # self.model=model + # self.device=self._device + self._tokenizer = tokenizer + # self.tokenizer=tokenizer + self._model = model + self._config = self._model.config + self.use_thumbnail = self.model.config.use_thumbnail + self.model.eval() + self.model.tie_weights() + self.truncation = truncation + self.batch_size_per_gpu = int(batch_size) + self.conv_template = conv_template + self.use_cache = use_cache + self.truncate_context = truncate_context + if accelerator.num_processes > 1: + assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type 
provided. Only DDP and FSDP are supported." + # If you want to use DistributedType.DEEPSPEED, you have to run accelerate config before using the model + # Also, you have to select zero stage 0 (equivalent to DDP) in order to make the prepare model works + # I tried to set different parameters in the kwargs to let default zero 2 stage works, but it didn't work. + if accelerator.distributed_type == DistributedType.DEEPSPEED: + kwargs = { + "train_micro_batch_size_per_gpu": self.batch_size_per_gpu, + "train_batch_size": self.batch_size_per_gpu * accelerator.num_processes, + } + AcceleratorState().deepspeed_plugin.deepspeed_config_process(must_match=True, **kwargs) + eval_logger.info("Detected that you are using DistributedType.DEEPSPEED. Make sure you run `accelerate config` and set zero stage to 0") + + if accelerator.distributed_type == DistributedType.FSDP or accelerator.distributed_type == DistributedType.DEEPSPEED: + self._model = accelerator.prepare(self.model) + else: + self._model = accelerator.prepare_model(self.model, evaluation_mode=True) + self.accelerator = accelerator + if self.accelerator.is_local_main_process: + eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism") + self._rank = self.accelerator.local_process_index + self._world_size = self.accelerator.num_processes + elif accelerator.num_processes == 1 and device_map == "auto": + eval_logger.info(f"Using {accelerator.num_processes} devices with tensor parallelism") + self._rank = 0 + self._word_size = 1 + else: + eval_logger.info(f"Using single device: {self._device}") + self.model.to(self._device) + self._rank = 0 + self._world_size = 1 + + # from internvl model + + self.image_size = config.force_image_size or config.vision_config.image_size + + def wrap_backbone_lora(self, r=128, lora_alpha=256, lora_dropout=0.05): + lora_config = LoraConfig( + r=r, + target_modules=["attn.qkv", "attn.proj", "mlp.fc1", "mlp.fc2"], + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + ) + self.vision_model = get_peft_model(self.vision_model, lora_config) + self.vision_model.print_trainable_parameters() + + def wrap_llm_lora(self, r=128, lora_alpha=256, lora_dropout=0.05): + lora_config = LoraConfig( + r=r, target_modules=["self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj", "self_attn.o_proj", "mlp.gate_proj", "mlp.down_proj", "mlp.up_proj"], lora_alpha=lora_alpha, lora_dropout=lora_dropout, task_type="CAUSAL_LM" + ) + self.language_model = get_peft_model(self.language_model, lora_config) + self.language_model.enable_input_require_grads() + self.language_model.print_trainable_parameters() + + def pixel_shuffle(self, x, scale_factor=0.5): + n, w, h, c = x.size() + # N, W, H, C --> N, W, H * scale, C // scale + x = x.view(n, w, int(h * scale_factor), int(c / scale_factor)) + # N, W, H * scale, C // scale --> N, H * scale, W, C // scale + x = x.permute(0, 2, 1, 3).contiguous() + # N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2) + x = x.view(n, int(h * scale_factor), int(w * scale_factor), int(c / (scale_factor * scale_factor))) + if self.ps_version == "v1": + warnings.warn("In ps_version 'v1', the height and width have not been swapped back, " "which results in a transposed image.") + else: + x = x.permute(0, 2, 1, 3).contiguous() + return x + + def noised_embed(self, vit_embeds, noise_alpha=5): + dims = torch.tensor(vit_embeds.size(1) * vit_embeds.size(2)) + mag_norm = noise_alpha / torch.sqrt(dims) + noise = torch.zeros_like(vit_embeds).uniform_(-mag_norm, mag_norm) + return 
vit_embeds + noise + + def extract_feature(self, pixel_values): + if self.select_layer == -1: + vit_embeds = self.vision_model(pixel_values=pixel_values, output_hidden_states=False, return_dict=True).last_hidden_state + else: + vit_embeds = self.vision_model(pixel_values=pixel_values, output_hidden_states=True, return_dict=True).hidden_states[self.select_layer] + vit_embeds = vit_embeds[:, 1:, :] + + if self.training and self.neftune_alpha is not None: + vit_embeds = self.noised_embed(vit_embeds, self.neftune_alpha) + + h = w = int(vit_embeds.shape[1] ** 0.5) + vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1) + vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio) + vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1]) + vit_embeds = self.mlp1(vit_embeds) # .to(pixel_values.device) + return vit_embeds + + def multi_image_chat(self, tokenizer, pixel_values, image_counts, question, generation_config, history=None, return_history=False, IMG_START_TOKEN="", IMG_END_TOKEN="", IMG_CONTEXT_TOKEN=""): + img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN) + self.img_context_token_id = img_context_token_id + if tokenizer.convert_tokens_to_ids("<|im_end|>") != 0: + eos_token_id = tokenizer.convert_tokens_to_ids("<|im_end|>") # 92542, InternLM2 + else: + eos_token_id = tokenizer.eos_token_id + + from internvl.conversation import get_conv_template + + template = get_conv_template(self.template) + + if history is None: + history = [] + image_tokens = "" + image_bs = pixel_values.shape[0] + # print(f"dynamic ViT batch size: {image_bs}, image_counts: {image_counts}") + for idx, image_count in enumerate(image_counts): + image_tokens += f" (图{idx+1}):" + IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * image_count + IMG_END_TOKEN + question = image_tokens + "\n" + question + else: + for old_question, old_answer in history: + template.append_message(template.roles[0], old_question) + template.append_message(template.roles[1], old_answer) + template.append_message(template.roles[0], question) + template.append_message(template.roles[1], None) + query = template.get_prompt() + model_inputs = tokenizer(query, return_tensors="pt") + input_ids = model_inputs["input_ids"].cuda() + attention_mask = model_inputs["attention_mask"].cuda() + generation_config["eos_token_id"] = eos_token_id + + generation_output = self.generate(pixel_values=pixel_values, input_ids=input_ids, attention_mask=attention_mask, **generation_config) + response = tokenizer.batch_decode(generation_output, skip_special_tokens=True)[0] + response = response.split("<|im_end|>")[0].strip() # for InternLM2 + history.append((question, response)) + if return_history: + return response, history + else: + query_to_print = query.replace(image_tokens, "") + # print(query_to_print, response) + return response + return response + + @property + def tokenizer(self): + return self._tokenizer + + @property + def model(self): + # returns the model, unwrapping it if using Accelerate + if hasattr(self, "accelerator"): + return self.accelerator.unwrap_model(self._model) + else: + return self._model + + @property + def batch_size(self): + return self.batch_size_per_gpu + + @property + def device(self): + return self._device + + @property + def rank(self): + return self._rank + + @property + def world_size(self): + return self._world_size + + def tok_encode(self, string: str, left_truncate_len=None, add_special_tokens=None) -> List[int]: + """ """ + add_special_tokens = 
False if add_special_tokens is None else add_special_tokens + encoding = self.tokenizer.encode(string, add_special_tokens=add_special_tokens) + # left-truncate the encoded context to be at most `left_truncate_len` tokens long + if left_truncate_len: + encoding = encoding[-left_truncate_len:] + return encoding + + def tok_decode(self, tokens): + try: + return self.tokenizer.decode(tokens) + except: + return self.tokenizer.decode([tokens]) + + def post_processing(self, response): + response = response.replace("\n", "").replace("äøę˜Æ", "No").replace("ę˜Æ", "Yes").replace("否", "No") + response = response.lower().replace("true", "yes").replace("false", "no") + pattern = re.compile(r"[\u4e00-\u9fa5]") + response = re.sub(pattern, "", response) + return response + + @torch.no_grad() + def generate( + self, + pixel_values: Optional[torch.FloatTensor] = None, + input_ids: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + visual_features: Optional[torch.FloatTensor] = None, + generation_config: Optional[GenerationConfig] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **generate_kwargs, + ) -> torch.LongTensor: + assert self.img_context_token_id is not None + if pixel_values is not None: + if visual_features is not None: + vit_embeds = visual_features + else: + vit_embeds = self.extract_feature(pixel_values) + + input_embeds = self.language_model.get_input_embeddings()(input_ids) + B, N, C = input_embeds.shape + input_embeds = input_embeds.reshape(B * N, C) + + input_ids = input_ids.reshape(B * N) + selected = input_ids == self.img_context_token_id + assert selected.sum() != 0 + input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device) + + input_embeds = input_embeds.reshape(B, N, C) + else: + input_embeds = self.language_model.get_input_embeddings()(input_ids) + + outputs = self.language_model.generate( + inputs_embeds=input_embeds, + attention_mask=attention_mask, + generation_config=generation_config, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + use_cache=True, + **generate_kwargs, + ) + + return outputs + + def flatten(self, input): + new_list = [] + for i in input: + for j in i: + new_list.append(j) + return new_list + + def load_image(self, flattened_visuals, input_size=224): + assert flattened_visuals[0].mode == "RGB" + image = flattened_visuals[0].convert("RGB") + transform = build_transform(is_train=False, input_size=input_size) + if self.dynamic: + images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=self.use_thumbnail, max_num=self.max_num) + else: + images = [image] + pixel_values = [transform(image) for image in images] + pixel_values = torch.stack(pixel_values) + return pixel_values + + def generate_until(self, requests: List[Instance]) -> List[str]: + res = [] + + def _collate(x): + # the negative sign on len(toks) sorts descending - this has a few advantages: + # - time estimates will always be over not underestimates, which is more useful for planning + # - to know the size of a batch when going through the list, you know the first one is always the batch + # padded context length. this is useful to simplify the batching logic and more importantly to make + # automatic adaptive batches much much easier to implement + # - any OOMs will happen right away rather than near the end + toks = self.tok_encode(x[0]) + return -len(toks), x[0] + + # we group requests by their generation_kwargs, + # so that we don't try to execute e.g. 
greedy sampling and temp=0.8 sampling + # in the same batch. + re_ords = utils.Collator([reg.args for reg in requests], _collate, grouping=True) + chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None) + num_iters = len(requests) // self.batch_size if len(requests) % self.batch_size == 0 else len(requests) // self.batch_size + 1 + pbar = tqdm(total=num_iters, disable=(self.rank != 0), desc="Model Responding") + for chunk in chunks: + contexts, all_gen_kwargs, doc_to_visual, doc_id, task, split = zip(*chunk) + task = task[0] + split = split[0] + batched_visuals = [doc_to_visual[0](self.task_dict[task][split][ids]) for ids in doc_id] # [B, N] + flattened_visuals = self.flatten(batched_visuals) + try: + pixel_values = self.load_image(flattened_visuals, self.image_size).cuda().to(torch.bfloat16) + except IndexError: + pixel_values = None + gen_kwargs = all_gen_kwargs[0] + + if "max_new_tokens" not in gen_kwargs: + gen_kwargs["max_new_tokens"] = 1024 + if "temperature" not in gen_kwargs: + gen_kwargs["temperature"] = 0 + if "top_p" not in gen_kwargs: + gen_kwargs["top_p"] = None + if "num_beams" not in gen_kwargs: + gen_kwargs["num_beams"] = 1 + + generation_config = dict( + do_sample=False, + top_k=50, + top_p=gen_kwargs["top_p"], + num_beams=gen_kwargs["num_beams"], + max_new_tokens=gen_kwargs["max_new_tokens"], + eos_token_id=self.tokenizer.eos_token_id, + ) + question = contexts[0] + response = self.model.chat(tokenizer=self.tokenizer, pixel_values=pixel_values, question=question, generation_config=generation_config) + # TODO(choiszt) try batch_chat for multiple inputs + response = self.post_processing(response) + res.append(response) + self.cache_hook.add_partial("generate_until", (question, gen_kwargs), response) + pbar.update(1) + res = re_ords.get_original(res) + return res + # print(chunk) + + def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: + pass + + def generate_until_multi_round(self, requests) -> List[str]: + raise NotImplementedError("TODO: Implement multi-round generation for InternVL") diff --git a/lmms_eval/models/internvl2.py b/lmms_eval/models/internvl2.py new file mode 100644 index 0000000..c21df9b --- /dev/null +++ b/lmms_eval/models/internvl2.py @@ -0,0 +1,334 @@ +import logging +from typing import List, Tuple + +import numpy as np +import torch +import torchvision.transforms as T +from accelerate import Accelerator, DistributedType +from decord import VideoReader, cpu +from PIL import Image +from torchvision.transforms.functional import InterpolationMode +from tqdm import tqdm +from transformers import AutoModel, AutoTokenizer + +from lmms_eval.api.instance import Instance +from lmms_eval.api.model import lmms +from lmms_eval.api.registry import register_model + +eval_logger = logging.getLogger("eval_logger") + +IMAGENET_MEAN = (0.485, 0.456, 0.406) +IMAGENET_STD = (0.229, 0.224, 0.225) + +DEFAULT_GEN_KWARGS = dict( + num_beams=1, + max_new_tokens=1024, + do_sample=False, +) + + +def build_transform(input_size): + MEAN, STD = IMAGENET_MEAN, IMAGENET_STD + transform = T.Compose([T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img), T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC), T.ToTensor(), T.Normalize(mean=MEAN, std=STD)]) + return transform + + +def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size): + best_ratio_diff = float("inf") + best_ratio = (1, 1) + area = width * height + for ratio in target_ratios: + target_aspect_ratio = ratio[0] / ratio[1] + 
ratio_diff = abs(aspect_ratio - target_aspect_ratio) + if ratio_diff < best_ratio_diff: + best_ratio_diff = ratio_diff + best_ratio = ratio + elif ratio_diff == best_ratio_diff: + if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]: + best_ratio = ratio + return best_ratio + + +def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False): + orig_width, orig_height = image.size + aspect_ratio = orig_width / orig_height + + # calculate the existing image aspect ratio + target_ratios = set((i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if i * j <= max_num and i * j >= min_num) + target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) + + # find the closest aspect ratio to the target + target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio, target_ratios, orig_width, orig_height, image_size) + + # calculate the target width and height + target_width = image_size * target_aspect_ratio[0] + target_height = image_size * target_aspect_ratio[1] + blocks = target_aspect_ratio[0] * target_aspect_ratio[1] + + # resize the image + resized_img = image.resize((target_width, target_height)) + processed_images = [] + for i in range(blocks): + box = ((i % (target_width // image_size)) * image_size, (i // (target_width // image_size)) * image_size, ((i % (target_width // image_size)) + 1) * image_size, ((i // (target_width // image_size)) + 1) * image_size) + # split the image + split_img = resized_img.crop(box) + processed_images.append(split_img) + assert len(processed_images) == blocks + if use_thumbnail and len(processed_images) != 1: + thumbnail_img = image.resize((image_size, image_size)) + processed_images.append(thumbnail_img) + return processed_images + + +def load_image(image, input_size=448, max_num=6): + transform = build_transform(input_size=input_size) + images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num) + pixel_values = [transform(image) for image in images] + pixel_values = torch.stack(pixel_values) + return pixel_values + + +def get_index(bound, fps, max_frame, first_idx=0, num_segments=32): + if bound: + start, end = bound[0], bound[1] + else: + start, end = -100000, 100000 + start_idx = max(first_idx, round(start * fps)) + end_idx = min(round(end * fps), max_frame) + seg_size = float(end_idx - start_idx) / num_segments + frame_indices = np.array([int(start_idx + (seg_size / 2) + np.round(seg_size * idx)) for idx in range(num_segments)]) + return frame_indices + + +def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32): + vr = VideoReader(video_path, ctx=cpu(0), num_threads=1) + max_frame = len(vr) - 1 + fps = float(vr.get_avg_fps()) + + pixel_values_list, num_patches_list = [], [] + transform = build_transform(input_size=input_size) + frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments) + for frame_index in frame_indices: + img = Image.fromarray(vr[frame_index].asnumpy()).convert("RGB") + img = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num) + pixel_values = [transform(tile) for tile in img] + pixel_values = torch.stack(pixel_values) + num_patches_list.append(pixel_values.shape[0]) + pixel_values_list.append(pixel_values) + pixel_values = torch.cat(pixel_values_list) + return pixel_values, num_patches_list + + +import math +from datetime import timedelta + +from accelerate.state import AcceleratorState +from accelerate.utils import 
InitProcessGroupKwargs + + +# The reason for writing the code this way is to avoid errors that occur during multi-GPU inference due to tensors not being on the same device. By ensuring that the first and last layers of the large language model (LLM) are on the same device, we prevent such errors. +def split_model(model_name, num_layers=None): + device_map = {} + world_size = torch.cuda.device_count() + if num_layers is None: + num_layers = { + "InternVL2_5-1B": 24, + "InternVL2_5-2B": 24, + "InternVL2_5-4B": 36, + "InternVL2_5-8B": 32, + "InternVL2_5-26B": 48, + "InternVL2_5-38B": 64, + "InternVL2_5-78B": 80, + "InternVL2-1B": 24, + "InternVL2-2B": 24, + "InternVL2-4B": 32, + "InternVL2-8B": 32, + "InternVL2-26B": 48, + "InternVL2-40B": 60, + "InternVL2-Llama3-76B": 80, + }[model_name] + # Since the first GPU will be used for ViT, treat it as half a GPU. + num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5)) + num_layers_per_gpu = [num_layers_per_gpu] * world_size + num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5) + layer_cnt = 0 + for i, num_layer in enumerate(num_layers_per_gpu): + for j in range(num_layer): + device_map[f"language_model.model.layers.{layer_cnt}"] = i + layer_cnt += 1 + device_map["vision_model"] = 0 + device_map["mlp1"] = 0 + device_map["language_model.model.tok_embeddings"] = 0 + device_map["language_model.model.embed_tokens"] = 0 + device_map["language_model.output"] = 0 + device_map["language_model.model.norm"] = 0 + device_map["language_model.lm_head"] = 0 + device_map[f"language_model.model.layers.{num_layers - 1}"] = 0 + + return device_map + + +@register_model("internvl2") +class InternVL2(lmms): + def __init__( + self, + pretrained: str = "OpenGVLab/InternVL2-2B", + modality: str = "image", + device: str = "cuda:0", + device_map: str = "cuda:0", + batch_size: str = "1", + num_frame: int = 32, + num_layers=None, + **kwargs, + ): + super().__init__() + + self.path = pretrained + self.num_frame = num_frame + + batch_size = int(batch_size) + assert batch_size == 1, f"Batch size should be 1 for InternVL2, but got {batch_size}." + self.batch_size_per_gpu = batch_size + + accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52)) + accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs]) + self.accelerator = accelerator + if accelerator.num_processes > 1: + self._device = torch.device(f"cuda:{accelerator.local_process_index}") + self.device_map = f"cuda:{accelerator.local_process_index}" + elif accelerator.num_processes == 1 and device_map == "auto": + self._device = torch.device(device) + device_map = split_model(pretrained.split("/")[-1], num_layers=num_layers) + self.device_map = device_map + else: + self._device = torch.device(f"cuda:{accelerator.local_process_index}") + self.device_map = f"cuda:{accelerator.local_process_index}" + + self._model = AutoModel.from_pretrained(self.path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True, trust_remote_code=True, device_map=device_map).eval() + self._tokenizer = AutoTokenizer.from_pretrained(self.path, trust_remote_code=True, device_map=device_map) + + if accelerator.num_processes > 1: + assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported." 
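(Aside: `split_model` above builds an explicit `device_map` so the vision tower, the embeddings and the last decoder layer all sit on GPU 0, which avoids cross-device tensor errors during multi-GPU inference. Below is a toy re-derivation of that layout; the module names follow the InternVL2 tree used in the patch, but the exact assignment is a sketch, not the shipped mapping.)

```python
# Toy layer-to-GPU layout in the spirit of split_model(): spread decoder layers
# across GPUs, but keep shared modules and the final layer pinned to GPU 0.
import math

def toy_device_map(num_layers: int, world_size: int) -> dict:
    # GPU 0 also hosts the vision tower, so treat it as half a GPU for LLM layers
    per_gpu = math.ceil(num_layers / (world_size - 0.5))
    counts = [per_gpu] * world_size
    counts[0] = math.ceil(per_gpu * 0.5)
    device_map, layer = {}, 0
    for gpu, n in enumerate(counts):
        for _ in range(n):
            if layer < num_layers:
                device_map[f"language_model.model.layers.{layer}"] = gpu
                layer += 1
    # pin shared modules and the last decoder layer to GPU 0
    for name in ("vision_model", "mlp1", "language_model.model.embed_tokens",
                 "language_model.model.norm", "language_model.lm_head"):
        device_map[name] = 0
    device_map[f"language_model.model.layers.{num_layers - 1}"] = 0
    return device_map

print(toy_device_map(num_layers=32, world_size=4))
```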
+ # If you want to use DistributedType.DEEPSPEED, you have to run accelerate config before using the model + # Also, you have to select zero stage 0 (equivalent to DDP) in order to make the prepare model works + # I tried to set different parameters in the kwargs to let default zero 2 stage works, but it didn't work. + if accelerator.distributed_type == DistributedType.DEEPSPEED: + kwargs = { + "train_micro_batch_size_per_gpu": self.batch_size_per_gpu, + "train_batch_size": self.batch_size_per_gpu * accelerator.num_processes, + } + AcceleratorState().deepspeed_plugin.deepspeed_config_process(must_match=True, **kwargs) + eval_logger.info("Detected that you are using DistributedType.DEEPSPEED. Make sure you run `accelerate config` and set zero stage to 0") + + if accelerator.distributed_type == DistributedType.FSDP or accelerator.distributed_type == DistributedType.DEEPSPEED: + self._model = accelerator.prepare(self.model) + else: + self._model = accelerator.prepare_model(self.model, evaluation_mode=True) + self.accelerator = accelerator + if self.accelerator.is_local_main_process: + eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism") + self._rank = self.accelerator.local_process_index + self._world_size = self.accelerator.num_processes + elif accelerator.num_processes == 1 and device_map == "auto": + eval_logger.info(f"Using {accelerator.num_processes} devices with tensor parallelism") + self._rank = 0 + self._word_size = 1 + else: + eval_logger.info(f"Using single device: {self._device}") + self.model.to(self._device) + self._rank = 0 + self._world_size = 1 + + self.modality = modality + + @property + def config(self): + # return the associated transformers.AutoConfig for the given pretrained model. + return self._config + + @property + def tokenizer(self): + return self._tokenizer + + @property + def model(self): + # returns the model, unwrapping it if using Accelerate + if hasattr(self, "accelerator"): + return self.accelerator.unwrap_model(self._model) + else: + return self._model + + @property + def batch_size(self): + return self.batch_size_per_gpu + + @property + def device(self): + return self._device + + @property + def rank(self): + return self._rank + + @property + def world_size(self): + return self._world_size + + def flatten(self, input): + new_list = [] + for i in input: + for j in i: + new_list.append(j) + return new_list + + def generate_until(self, requests) -> List[str]: + res = [] + pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding") + + for contexts, gen_kwargs, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]: + if "until" in gen_kwargs: + gen_kwargs.pop("until") + for k, v in DEFAULT_GEN_KWARGS.items(): + if k not in gen_kwargs: + gen_kwargs[k] = v + + pop_keys = [] + for k, v in gen_kwargs.items(): + if k not in DEFAULT_GEN_KWARGS: + pop_keys.append(k) + + for k in pop_keys: + gen_kwargs.pop(k) + + visuals = [doc_to_visual(self.task_dict[task][split][doc_id])] + visuals = self.flatten(visuals) + if self.modality == "image": + if visuals: + visuals = [load_image(visual).to(torch.bfloat16).cuda() for visual in visuals] + pixel_values = torch.cat(visuals, dim=0) + num_patches_list = [visual.size(0) for visual in visuals] + image_tokens = [""] * len(visuals) + image_tokens = " ".join(image_tokens) + contexts = image_tokens + "\n" + contexts + else: + pixel_values = None + num_patch_list = None + response, history = self.model.chat(self.tokenizer, pixel_values, contexts, gen_kwargs, 
num_patches_list=num_patches_list, history=None, return_history=True) + elif self.modality == "video": + assert len(visuals) == 1, f"Only one video is supported, but got {len(visuals)} videos." + video_path = visuals[0] + pixel_values, num_patches_list = load_video(video_path, num_segments=self.num_frame) + pixel_values = pixel_values.to(torch.bfloat16).cuda() + video_prefix = "".join([f"Frame{i+1}: \n" for i in range(len(num_patches_list))]) + question = video_prefix + contexts + response, history = self.model.chat(self.tokenizer, pixel_values, question, gen_kwargs, num_patches_list=num_patches_list, history=None, return_history=True) + res.append(response) + pbar.update(1) + pbar.close() + return res + + def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: + assert False, "Not implemented yet." + + def generate_until_multi_round(self, requests) -> List[str]: + raise NotImplementedError("TODO: Implement multi-round generation for InternVL2") diff --git a/lmms_eval/models/llama_vid.py b/lmms_eval/models/llama_vid.py new file mode 100644 index 0000000..6094518 --- /dev/null +++ b/lmms_eval/models/llama_vid.py @@ -0,0 +1,282 @@ +import math +import os +import subprocess +from datetime import timedelta +from typing import List, Optional, Tuple, Union + +import numpy as np +import requests +import torch +from accelerate import Accelerator, DistributedType, InitProcessGroupKwargs +from accelerate.state import AcceleratorState +from decord import VideoReader, cpu +from huggingface_hub import snapshot_download +from loguru import logger as eval_logger +from tqdm import tqdm +from transformers import AutoConfig + +from lmms_eval import utils +from lmms_eval.api.instance import Instance +from lmms_eval.api.model import lmms +from lmms_eval.api.registry import register_model +from lmms_eval.models.model_utils.load_video import read_video_pyav +from lmms_eval.utils import stop_sequences_criteria + +try: + from llamavid.constants import ( + DEFAULT_IM_END_TOKEN, + DEFAULT_IM_START_TOKEN, + DEFAULT_IMAGE_TOKEN, + IMAGE_TOKEN_INDEX, + ) + from llamavid.conversation import SeparatorStyle, conv_templates + from llamavid.model.builder import load_pretrained_model + from llava.mm_utils import ( + KeywordsStoppingCriteria, + get_model_name_from_path, + tokenizer_image_token, + ) +except ImportError: + eval_logger.debug("LLaMA-Video is not installed. Please install LLaMA-Video to use this model.") + + +@register_model("llama_vid") +class LLaMAVid(lmms): + def __init__( + self, + pretrained: str = "YanweiLi/llama-vid-7b-full-224-video-fps-1", + truncation: Optional[bool] = True, + device: Optional[str] = "cuda:0", + dtype: Optional[Union[str, torch.dtype]] = "auto", + batch_size: Optional[Union[int, str]] = 1, + trust_remote_code: Optional[bool] = False, + revision=None, + attn_implementation=( + "sdpa" if torch.__version__ > "2.1.2" else "eager" + ), # inference implementation for attention, can be "sdpa", "eager", "flash_attention_2". 
Seems FA2 is not effective during inference: https://discuss.huggingface.co/t/flash-attention-has-no-effect-on-inference/73453/5 + device_map="cuda:0", + conv_template="vicuna_v1", + use_cache=True, + truncate_context=False, + num_frames: int = 100, + **kwargs, + ) -> None: + super().__init__() + + accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52)) + accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs]) + if accelerator.num_processes > 1: + self._device = torch.device(f"cuda:{accelerator.local_process_index}") + self.device_map = f"cuda:{accelerator.local_process_index}" + elif accelerator.num_processes == 1 and device_map == "auto": + self._device = torch.device(device) + self.device_map = device_map + else: + self._device = torch.device(f"cuda:{accelerator.local_process_index}") + self.device_map = f"cuda:{accelerator.local_process_index}" + + self.pretrained = pretrained + self.model_path = snapshot_download(self.pretrained) + self.model_name = get_model_name_from_path(pretrained) + self.num_frames = num_frames + if not os.path.exists("./model_zoo/LAVIS/eva_vit_g.pth") and accelerator.is_main_process: + eval_logger.info("\n\n Eva Encoder is not found for LLaMA-VID. Download automatically to the folder ./model_zoo/LAVIS") + cache_path = "model_zoo/LAVIS" + os.makedirs(cache_path, exist_ok=True) + subprocess.run(["wget https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/eva_vit_g.pth -O ./model_zoo/LAVIS/eva_vit_g.pth"], shell=True) + + accelerator.wait_for_everyone() + self._tokenizer, self._model, self.image_processor, self._max_length = load_pretrained_model( + self.model_path, + None, + self.model_name, + device_map=self.device_map, + ) + + self._config = self._model.config + self.model.eval() + self.model.tie_weights() + self.truncation = truncation + self.batch_size_per_gpu = int(batch_size) + self.conv_template = conv_template + self.use_cache = use_cache + self.truncate_context = truncate_context + # assert self.batch_size_per_gpu == 1, "Llava currently does not support batched generation. See https://github.com/haotian-liu/LLaVA/issues/754. HF Llava also has this issue." + if accelerator.num_processes > 1: + assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported." + # If you want to use DistributedType.DEEPSPEED, you have to run accelerate config before using the model + # Also, you have to select zero stage 0 (equivalent to DDP) in order to make the prepare model works + # I tried to set different parameters in the kwargs to let default zero 2 stage works, but it didn't work. + if accelerator.distributed_type == DistributedType.DEEPSPEED: + kwargs = { + "train_micro_batch_size_per_gpu": self.batch_size_per_gpu, + "train_batch_size": self.batch_size_per_gpu * accelerator.num_processes, + } + AcceleratorState().deepspeed_plugin.deepspeed_config_process(must_match=True, **kwargs) + eval_logger.info("Detected that you are using DistributedType.DEEPSPEED. 
Make sure you run `accelerate config` and set zero stage to 0") + if accelerator.distributed_type == DistributedType.FSDP or accelerator.distributed_type == DistributedType.DEEPSPEED: + self._model = accelerator.prepare(self.model) + else: + self._model = accelerator.prepare_model(self.model, evaluation_mode=True) + self.accelerator = accelerator + if self.accelerator.is_local_main_process: + eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism") + self._rank = self.accelerator.local_process_index + self._world_size = self.accelerator.num_processes + elif accelerator.num_processes == 1 and device_map == "auto": + eval_logger.info(f"Using {accelerator.num_processes} devices with tensor parallelism") + self._rank = 0 + self._word_size = 1 + else: + eval_logger.info(f"Using single device: {self._device}") + self.model.to(self._device) + self._rank = 0 + self._world_size = 1 + + def download_file(self, url, folder_path): + # Create the folder if it doesn't exist + if not os.path.exists(folder_path): + os.makedirs(folder_path) + + # Extract filename from URL + filename = url.split("/")[-1] + + # Define path to save the file + file_path = os.path.join(folder_path, filename) + + # Send a GET request to the URL + response = requests.get(url) + + # Check if request was successful (status code 200) + if response.status_code == 200: + # Save the file to the specified folder + with open(file_path, "wb") as f: + f.write(response.content) + print(f"File downloaded successfully to {file_path}") + else: + print(f"Failed to download file. Status code: {response.status_code}") + + @property + def config(self): + # return the associated transformers.AutoConfig for the given pretrained model. + return self._config + + @property + def tokenizer(self): + return self._tokenizer + + @property + def model(self): + # returns the model, unwrapping it if using Accelerate + if hasattr(self, "accelerator"): + return self.accelerator.unwrap_model(self._model) + else: + return self._model + + @property + def eot_token_id(self): + # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence* + return self.tokenizer.eos_token_id + + @property + def max_length(self): + return self._max_length + + def tok_encode(self, string: str, left_truncate_len=None, add_special_tokens=None) -> List[int]: + """ """ + add_special_tokens = False if add_special_tokens is None else add_special_tokens + encoding = self.tokenizer.encode(string, add_special_tokens=add_special_tokens) + # left-truncate the encoded context to be at most `left_truncate_len` tokens long + if left_truncate_len: + encoding = encoding[-left_truncate_len:] + return encoding + + def tok_decode(self, tokens): + return self.tokenizer.decode(tokens) + + def load_video(self, video_path): + vr = VideoReader(video_path, ctx=cpu(0)) + total_frame_num = len(vr) + fps = round(vr.get_avg_fps()) + frame_idx = [i for i in range(0, len(vr), fps)] + spare_frames = vr.get_batch(frame_idx).asnumpy() + return spare_frames + + def flatten(self, input): + new_list = [] + for i in input: + for j in i: + new_list.append(j) + return new_list + + def generate_until(self, requests) -> List[str]: + res = [] + pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding") + + for contexts, gen_kwargs, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]: + # encode, pad, and truncate contexts for this batch + visuals = [doc_to_visual(self.task_dict[task][split][doc_id])] + visuals = 
self.flatten(visuals) + videos = [] + for visual in visuals: + video = read_video_pyav(visual, num_frm=self.num_frames) + video = self.image_processor.preprocess(video, return_tensors="pt")["pixel_values"].half().cuda() + video = [video] + videos += video + qs = contexts + if self.model.config.mm_use_im_start_end: + qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + "\n" + qs + else: + qs = DEFAULT_IMAGE_TOKEN + "\n" + qs + + conv = conv_templates[self.conv_template].copy() + conv.append_message(conv.roles[0], qs) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + + input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).cuda() + + stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 + keywords = [stop_str] + stopping_criteria = KeywordsStoppingCriteria(keywords, self.tokenizer, input_ids) + + cur_prompt = contexts + with torch.inference_mode(): + self.model.update_prompt([[cur_prompt]]) + output_ids = self.model.generate(input_ids, images=video, do_sample=True, temperature=0.2, max_new_tokens=1024, use_cache=True, stopping_criteria=[stopping_criteria]) + + input_token_len = input_ids.shape[1] + n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() + if n_diff_input_output > 0: + print(f"[Warning] {n_diff_input_output} output_ids are not the same as the input_ids") + outputs = self.tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] + outputs = outputs.strip() + if outputs.endswith(stop_str): + outputs = outputs[: -len(stop_str)] + outputs = outputs.strip() + pbar.update(1) + res.append(outputs) + + return res + + def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: + return super().loglikelihood(requests) + + @property + def batch_size(self): + return self.batch_size_per_gpu + + @property + def device(self): + return self._device + + @property + def rank(self): + return self._rank + + @property + def world_size(self): + return self._world_size + + def generate_until_multi_round(self, requests) -> List[str]: + raise NotImplementedError("TODO: Implement multi-round generation for LLaMAVid") diff --git a/lmms_eval/models/llama_vision.py b/lmms_eval/models/llama_vision.py new file mode 100644 index 0000000..c67f796 --- /dev/null +++ b/lmms_eval/models/llama_vision.py @@ -0,0 +1,225 @@ +import warnings +from typing import List, Optional, Tuple, Union + +import numpy as np +import PIL +import torch +from accelerate import Accelerator, DistributedType +from accelerate.state import AcceleratorState +from decord import VideoReader, cpu +from torchvision.transforms.functional import to_pil_image +from tqdm import tqdm +from transformers import AutoConfig, AutoProcessor, MllamaForConditionalGeneration + +from lmms_eval import utils +from lmms_eval.api.instance import Instance +from lmms_eval.api.model import lmms +from lmms_eval.api.registry import register_model +from lmms_eval.models.model_utils.load_video import read_video_pyav_pil + +warnings.filterwarnings("ignore") + +from loguru import logger as eval_logger + +DEFAULT_IMAGE_TOKEN = "<|image|>" + + +@register_model("llama_vision") +class LlamaVision(lmms): + def __init__( + self, + pretrained: str = "meta-llama/Llama-3.2-11B-Vision", + revision: str = "main", + device: str = "cuda", + dtype: Optional[Union[str, torch.dtype]] = "auto", + batch_size: int = 1, + trust_remote_code: Optional[bool] = True, + attn_implementation: Optional[str] 
= None, + device_map: str = "", + max_frames_num: Optional[int] = 32, + fps: Optional[int] = None, + max_image_size: Optional[int] = None, + **kwargs, + ) -> None: + super().__init__() + # Do not use kwargs for now + assert kwargs == {}, f"Unexpected kwargs: {kwargs}" + + accelerator = Accelerator() + if accelerator.num_processes > 1 and device_map == "": + self._device = torch.device(f"cuda:{accelerator.local_process_index}") + self.device_map = f"cuda:{accelerator.local_process_index}" + else: + self._device = torch.device(device) + self.device_map = device_map + if isinstance(dtype, str) and dtype != "auto": + dtype = getattr(torch, dtype) + + self.fps = fps + self.max_frames_num = max_frames_num + self.max_image_size = max_image_size + self._model = MllamaForConditionalGeneration.from_pretrained(pretrained, revision=revision, torch_dtype=dtype, device_map=self.device_map, trust_remote_code=trust_remote_code, attn_implementation=attn_implementation) + self.model.eval() + self.processor = AutoProcessor.from_pretrained(pretrained) + if accelerator.num_processes > 1 and device_map == "": + assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported." + # If you want to use DistributedType.DEEPSPEED, you have to run accelerate config before using the model + # Also, you have to select zero stage 0 (equivalent to DDP) in order to make the prepare model works + # I tried to set different parameters in the kwargs to let default zero 2 stage works, but it didn't work. + if accelerator.distributed_type == DistributedType.DEEPSPEED: + kwargs = { + "train_micro_batch_size_per_gpu": self.batch_size_per_gpu, + "train_batch_size": self.batch_size_per_gpu * accelerator.num_processes, + } + AcceleratorState().deepspeed_plugin.deepspeed_config_process(must_match=True, **kwargs) + eval_logger.info("Detected that you are using DistributedType.DEEPSPEED. Make sure you run `accelerate config` and set zero stage to 0") + if accelerator.distributed_type == DistributedType.FSDP or accelerator.distributed_type == DistributedType.DEEPSPEED: + self._model = accelerator.prepare(self.model) + else: + self._model = accelerator.prepare_model(self.model, evaluation_mode=True) + self.accelerator = accelerator + if self.accelerator.is_local_main_process: + eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism") + self._rank = self.accelerator.local_process_index + self._world_size = self.accelerator.num_processes + elif accelerator.num_processes == 1 and device_map == "auto": + eval_logger.info(f"Using {accelerator.num_processes} devices with pipeline parallelism") + self._rank = 0 + self._word_size = 1 + else: + eval_logger.info(f"Using single device: {self._device}") + self.model.to(self._device) + self._rank = 0 + self._word_size = 1 + self.accelerator = accelerator + + @property + def config(self): + # return the associated transformers.AutoConfig for the given pretrained model. 
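+ # NOTE: as this diff stands, __init__ above never assigns self._config or self._tokenizer
+ # (nor self._max_length / self.batch_size_per_gpu used by the properties below), so these
+ # accessors raise AttributeError if called. A minimal sketch of a fix, assuming the HF
+ # processor exposes its tokenizer, would be to add after the model is loaded:
+ #     self._config = self._model.config
+ #     self._tokenizer = self.processor.tokenizer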
+ return self._config + + @property + def tokenizer(self): + return self._tokenizer + + @property + def model(self): + # returns the model, unwrapping it if using Accelerate + if hasattr(self, "accelerator"): + return self.accelerator.unwrap_model(self._model) + else: + return self._model + + @property + def eot_token_id(self): + # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence* + return self.tokenizer.eos_token_id + + @property + def max_length(self): + return self._max_length + + @property + def batch_size(self): + return self.batch_size_per_gpu + + @property + def device(self): + return self._device + + @property + def rank(self): + return self._rank + + @property + def world_size(self): + return self._world_size + + def tok_encode(self, string: str, left_truncate_len=None, add_special_tokens=None) -> List[int]: + """ """ + add_special_tokens = False if add_special_tokens is None else add_special_tokens + encoding = self.tokenizer.encode(string, add_special_tokens=add_special_tokens) + # left-truncate the encoded context to be at most `left_truncate_len` tokens long + if left_truncate_len: + encoding = encoding[-left_truncate_len:] + return encoding + + def tok_decode(self, tokens): + return self.tokenizer.decode(tokens) + + def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]: + assert False, "Not implemented" + + def flatten(self, input): + new_list = [] + for i in input: + for j in i: + new_list.append(j) + return new_list + + def load_video(self, video_path, max_frames_num): + if type(video_path) == str: + vr = VideoReader(video_path, ctx=cpu(0)) + else: + vr = VideoReader(video_path[0], ctx=cpu(0)) + total_frame_num = len(vr) + uniform_sampled_frames = np.linspace(0, total_frame_num - 1, max_frames_num, dtype=int) + frame_idx = uniform_sampled_frames.tolist() + spare_frames = vr.get_batch(frame_idx).asnumpy() + return spare_frames # (frames, height, width, channels) + + def generate_until(self, requests: List[Instance]) -> List[str]: + res = [] + + pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding") + + for contexts, gen_kwargs, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]: + visuals = [doc_to_visual(self.task_dict[task][split][doc_id])] + visuals = self.flatten(visuals) + + messages = [{"role": "user", "content": []}] + images = [] + + for visual in visuals: + if isinstance(visual, str): + frames = read_video_pyav_pil(visual, num_frm=self.max_frames_num, fps=self.fps, max_image_size=self.max_image_size) + images.extend(frames) + # frames = self.load_video(visual, self.max_frames_num) + # frames = torch.from_numpy(frames).permute(0, 3, 1, 2) + # images.extend([to_pil_image(frame) for frame in frames]) + elif isinstance(visual, PIL.Image.Image): + images.append(visual) + + for _ in range(len(images)): + messages[-1]["content"].append({"type": "image"}) + messages[-1]["content"].append({"type": "text", "text": contexts}) + prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True) + inputs = self.processor(images, prompt, return_tensors="pt").to(self.model.device) + + if "max_new_tokens" not in gen_kwargs: + gen_kwargs["max_new_tokens"] = 1024 + if "temperature" not in gen_kwargs: + gen_kwargs["temperature"] = 0 + if "top_p" not in gen_kwargs: + gen_kwargs["top_p"] = None + if "num_beams" not in gen_kwargs: + gen_kwargs["num_beams"] = 1 + if "do_sample" not in gen_kwargs: + gen_kwargs["do_sample"] = False + + with torch.no_grad(): + 
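+ # NOTE: only max_new_tokens, temperature and do_sample from gen_kwargs are forwarded to
+ # generate() below; the top_p and num_beams defaults filled in above are currently unused.
+ # If they are meant to take effect, they would need to be passed explicitly, e.g.
+ # top_p=gen_kwargs["top_p"], num_beams=gen_kwargs["num_beams"] (a suggested addition,
+ # not part of the original call).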
output = self.model.generate( + **inputs, + max_new_tokens=gen_kwargs["max_new_tokens"], + temperature=gen_kwargs["temperature"], + do_sample=gen_kwargs["do_sample"], + ) + output = output[:, inputs["input_ids"].shape[-1] :] + res.append(self.processor.decode(output[0])) + + pbar.update(1) + pbar.close() + return res + + def generate_until_multi_round(self, requests) -> List[str]: + raise NotImplementedError("TODO: Implement multi-round generation for LLaVAHF") diff --git a/lmms_eval/models/llava.py b/lmms_eval/models/llava.py new file mode 100755 index 0000000..2de73cf --- /dev/null +++ b/lmms_eval/models/llava.py @@ -0,0 +1,430 @@ +import torch + +torch.backends.cuda.matmul.allow_tf32 = True + + +import copy +import warnings +from datetime import timedelta +from typing import List, Optional, Tuple, Union + +from accelerate import Accelerator, DistributedType, InitProcessGroupKwargs +from accelerate.state import AcceleratorState +from packaging import version +from tqdm import tqdm + +from lmms_eval import utils +from lmms_eval.api.instance import Instance +from lmms_eval.api.model import lmms +from lmms_eval.api.registry import register_model +from lmms_eval.utils import stop_sequences_criteria + +warnings.filterwarnings("ignore") + +from loguru import logger as eval_logger + +try: + from llava.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX + from llava.conversation import conv_templates + from llava.mm_utils import ( + get_model_name_from_path, + process_images, + tokenizer_image_token, + ) + from llava.model.builder import load_pretrained_model +except Exception as e: + eval_logger.debug("LLaVA is not installed. Please install LLaVA to use this model.\nError: %s" % e) + +# inference implementation for attention, can be "sdpa", "eager", "flash_attention_2". 
Seems FA2 is not effective during inference: https://discuss.huggingface.co/t/flash-attention-has-no-effect-on-inference/73453/5 +# if is_flash_attn_2_available: +# best_fit_attn_implementation = "flash_attention_2" # flash_attn has a bug that says: ERROR Error query and key must have the same dtype in generating + +if version.parse(torch.__version__) >= version.parse("2.1.2"): + best_fit_attn_implementation = "sdpa" +else: + best_fit_attn_implementation = "eager" + + +@register_model("llava") +class Llava(lmms): + """ + Llava Model + """ + + def __init__( + self, + pretrained: str = "liuhaotian/llava-v1.5-7b", + truncation: Optional[bool] = True, + device: Optional[str] = "cuda:0", + batch_size: Optional[Union[int, str]] = 1, + model_name=None, + attn_implementation=best_fit_attn_implementation, + device_map="cuda:0", + conv_template="vicuna_v1", + use_cache=True, + tie_weights: bool = True, + truncate_context=False, # whether to truncate the context in generation, set it False for LLaVA-1.6 + customized_config=None, # ends in json + **kwargs, + ) -> None: + super().__init__() + # Do not use kwargs for now + assert kwargs == {}, f"Unexpected kwargs: {kwargs}" + + accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52)) + accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs]) + self.accelerator = accelerator + if accelerator.num_processes > 1: + self._device = torch.device(f"cuda:{accelerator.local_process_index}") + self.device_map = f"cuda:{accelerator.local_process_index}" + elif accelerator.num_processes == 1 and device_map == "auto": + self._device = torch.device(device) + self.device_map = device_map + else: + self._device = torch.device(f"cuda:{accelerator.local_process_index}") + self.device_map = f"cuda:{accelerator.local_process_index}" + + llava_model_args = { + "multimodal": True, + } + if customized_config is not None: + llava_model_args["customized_config"] = customized_config + if attn_implementation is not None: + llava_model_args["attn_implementation"] = attn_implementation + if "use_flash_attention_2" in kwargs: + llava_model_args["use_flash_attention_2"] = kwargs["use_flash_attention_2"] + model_name = model_name if model_name is not None else get_model_name_from_path(pretrained) + try: + # Try to load the model with the multimodal argument + self._tokenizer, self._model, self._image_processor, self._max_length = load_pretrained_model(pretrained, None, model_name, device_map=self.device_map, **llava_model_args) + except TypeError: + # for older versions of LLaVA that don't have multimodal argument + llava_model_args.pop("multimodal", None) + self._tokenizer, self._model, self._image_processor, self._max_length = load_pretrained_model(pretrained, None, model_name, device_map=self.device_map, **llava_model_args) + self._config = self._model.config + self.model.eval() + if tie_weights: + self.model.tie_weights() + + self.truncation = truncation + self.batch_size_per_gpu = int(batch_size) + self.conv_template = conv_template + self.use_cache = use_cache + self.truncate_context = truncate_context + # assert self.batch_size_per_gpu == 1, "Llava currently does not support batched generation. See https://github.com/haotian-liu/LLaVA/issues/754. HF Llava also has this issue." + if accelerator.num_processes > 1: + assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported." 
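+ # NOTE: the assertion above also accepts DistributedType.DEEPSPEED, so its error message
+ # ("Only DDP and FSDP are supported") understates the supported types; as the comments
+ # below explain, DeepSpeed is only expected to work with zero stage 0.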
+ # If you want to use DistributedType.DEEPSPEED, you have to run accelerate config before using the model + # Also, you have to select zero stage 0 (equivalent to DDP) in order to make the prepare model works + # I tried to set different parameters in the kwargs to let default zero 2 stage works, but it didn't work. + if accelerator.distributed_type == DistributedType.DEEPSPEED: + kwargs = { + "train_micro_batch_size_per_gpu": self.batch_size_per_gpu, + "train_batch_size": self.batch_size_per_gpu * accelerator.num_processes, + } + AcceleratorState().deepspeed_plugin.deepspeed_config_process(must_match=True, **kwargs) + eval_logger.info("Detected that you are using DistributedType.DEEPSPEED. Make sure you run `accelerate config` and set zero stage to 0") + + if accelerator.distributed_type == DistributedType.FSDP or accelerator.distributed_type == DistributedType.DEEPSPEED: + self._model = accelerator.prepare(self.model) + else: + self._model = accelerator.prepare_model(self.model, evaluation_mode=True) + self.accelerator = accelerator + if self.accelerator.is_local_main_process: + eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism") + self._rank = self.accelerator.local_process_index + self._world_size = self.accelerator.num_processes + elif accelerator.num_processes == 1 and device_map == "auto": + eval_logger.info(f"Using {accelerator.num_processes} devices with tensor parallelism") + self._rank = 0 + self._word_size = 1 + else: + eval_logger.info(f"Using single device: {self._device}") + self.model.to(self._device) + self._rank = 0 + self._world_size = 1 + + @property + def config(self): + # return the associated transformers.AutoConfig for the given pretrained model. + return self._config + + @property + def tokenizer(self): + return self._tokenizer + + @property + def model(self): + # returns the model, unwrapping it if using Accelerate + if hasattr(self, "accelerator"): + return self.accelerator.unwrap_model(self._model) + else: + return self._model + + @property + def eot_token_id(self): + # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence* + return self.tokenizer.eos_token_id + + @property + def max_length(self): + return self._max_length + + def pad_sequence(self, input_ids, batch_first, padding_value): + if self.tokenizer.padding_side == "left": + input_ids = [torch.flip(_input_ids, [0]) for _input_ids in input_ids] + input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=batch_first, padding_value=padding_value) + if self.tokenizer.padding_side == "left": + input_ids = torch.flip(input_ids, [1]) + return input_ids + + @property + def batch_size(self): + return self.batch_size_per_gpu + + @property + def device(self): + return self._device + + @property + def rank(self): + return self._rank + + @property + def world_size(self): + return self._world_size + + def tok_encode(self, string: str, left_truncate_len=None, add_special_tokens=None) -> List[int]: + """ """ + add_special_tokens = False if add_special_tokens is None else add_special_tokens + encoding = self.tokenizer.encode(string, add_special_tokens=add_special_tokens) + # left-truncate the encoded context to be at most `left_truncate_len` tokens long + if left_truncate_len: + encoding = encoding[-left_truncate_len:] + return encoding + + def tok_decode(self, tokens): + try: + return self.tokenizer.decode(tokens) + except: + return self.tokenizer.decode([tokens]) + + def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, 
bool]]: + # TODO + res = [] + pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding") + + for contexts, doc_to_target, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]: + # encode, pad, and truncate contexts for this batch + if type(doc_to_target) == str: + continuation = doc_to_target + else: + continuation = doc_to_target(self.task_dict[task][split][doc_id]) + visuals = [doc_to_visual(self.task_dict[task][split][doc_id])] + visuals = self.flatten(visuals) + image_sizes = [[visual.size[0], visual.size[1]] for visual in visuals] + if visuals: + image = process_images(visuals, self._image_processor, self._config) + if type(image) is list: + image = [_image.to(dtype=torch.float16, device=self.device) for _image in image] + else: + image = image.to(dtype=torch.float16, device=self.device) + else: + image = None + + prompts_input = contexts[0] if isinstance(contexts, list) else contexts + + if image is not None and len(image) != 0 and DEFAULT_IMAGE_TOKEN not in prompts_input: + """ + Three senarios: + 1. No image, and there for, no image token should be added. + 2. image token is already specified in the context, so we don't need to add it. + 3. image token is not specified in the context and there is image inputs, so we need to add it. In this case, we add the image token at the beginning of the context and add a new line. + """ + image_tokens = [DEFAULT_IMAGE_TOKEN] * len(visuals) + image_tokens = " ".join(image_tokens) + prompts_input = image_tokens + "\n" + (contexts[0] if isinstance(contexts, list) else contexts) + + # This is much safer for llama3, as we now have some object type in it + if "llama_3" in self.conv_template: + conv = copy.deepcopy(conv_templates[self.conv_template]) + else: + conv = conv_templates[self.conv_template].copy() + conv.append_message(conv.roles[0], prompts_input) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + pad_token_id = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id + contxt_id = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self.device) + # Add the answer of the second role + conv.messages[1][1] = continuation + + prompt = conv.get_prompt() + input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self.device) + labels = input_ids.clone() + # Context part no need to calculate for loss + labels[0, : contxt_id.shape[1]] = -100 + with torch.inference_mode(): + outputs = self.model(input_ids=input_ids, labels=labels, images=image, use_cache=True, image_sizes=image_sizes) + loss = outputs["loss"] + # loss = torch.exp(loss) + logits = outputs["logits"] + greedy_tokens = logits.argmax(dim=-1) + cont_toks = input_ids[:, contxt_id.shape[1] :] # [1, seq] + greedy_tokens = greedy_tokens[:, contxt_id.shape[1] : input_ids.shape[1]] # [1, seq] + max_equal = (greedy_tokens == cont_toks).all() + res.append((float(loss.item()), bool(max_equal))) + pbar.update(1) + pbar.close() + return res + + def flatten(self, input): + new_list = [] + for i in input: + for j in i: + new_list.append(j) + return new_list + + def generate_until(self, requests: List[Instance]) -> List[str]: + res = [] + + def _collate(x): + # the negative sign on len(toks) sorts descending - this has a few advantages: + # - time estimates will always be over not underestimates, which is more useful for planning + # - to know the size of a batch when going through the 
list, you know the first one is always the batch + # padded context length. this is useful to simplify the batching logic and more importantly to make + # automatic adaptive batches much much easier to implement + # - any OOMs will happen right away rather than near the end + toks = self.tok_encode(x[0]) + return -len(toks), x[0] + + # we group requests by their generation_kwargs, + # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling + # in the same batch. + re_ords = utils.Collator([reg.args for reg in requests], _collate, grouping=True) + chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None) + num_iters = len(requests) // self.batch_size if len(requests) % self.batch_size == 0 else len(requests) // self.batch_size + 1 + pbar = tqdm(total=num_iters, disable=(self.rank != 0), desc="Model Responding") + for chunk in chunks: + contexts, all_gen_kwargs, doc_to_visual, doc_id, task, split = zip(*chunk) + task = task[0] + split = split[0] + batched_visuals = [doc_to_visual[0](self.task_dict[task][split][ids]) for ids in doc_id] # [B, N] + flattened_visuals = self.flatten(batched_visuals) # [B*N] + # we assume all gen kwargs in the batch are the same + # this is safe to assume because the `grouper` object ensures it. + gen_kwargs = all_gen_kwargs[0] + + # Set default values for until and max_new_tokens + until = [self.tok_decode(self.eot_token_id)] + + # Update values from gen_kwargs if present + if "until" in gen_kwargs: + until = gen_kwargs.pop("until") + if isinstance(until, str): + until = [until] + elif not isinstance(until, list): + raise ValueError(f"Expected `gen_kwargs['until']` to be of type Union[str,list] but got {type(until)}") + + if "image_aspect_ratio" in gen_kwargs.keys() and "image_aspect_ratio" not in self._config.__dict__: + # here we should pop it out of gen_kwargs so that it doesn't get passed to the model for next step of generation + self._config.image_aspect_ratio = gen_kwargs.pop("image_aspect_ratio") + eval_logger.info(f"Setting image aspect ratio: {self._config.image_aspect_ratio}") + # encode, pad, and truncate contexts for this batch + if flattened_visuals: + image_tensor = process_images(flattened_visuals, self._image_processor, self._config) + if type(image_tensor) is list: + image_tensor = [_image.to(dtype=torch.float16, device=self.device) for _image in image_tensor] + else: + image_tensor = image_tensor.to(dtype=torch.float16, device=self.device) + else: + image_tensor = None + + # prompts_input = contexts[0] + + question_input = [] + + for visual, context in zip(batched_visuals, contexts): + if image_tensor is not None and len(image_tensor) != 0 and DEFAULT_IMAGE_TOKEN not in context: + """ + Three senarios: + 1. No image, and there for, no image token should be added. + 2. image token is already specified in the context, so we don't need to add it. + 3. image token is not specified in the context and there is image inputs, so we need to add it. In this case, we add the image token at the beginning of the context and add a new line. 
+ """ + image_tokens = [DEFAULT_IMAGE_TOKEN] * len(visual) if isinstance(visual, list) else [DEFAULT_IMAGE_TOKEN] + image_tokens = " ".join(image_tokens) + question = image_tokens + "\n" + context + else: + question = context + # This is much safer for llama3, as we now have some object type in it + if "llama_3" in self.conv_template: + conv = copy.deepcopy(conv_templates[self.conv_template]) + else: + conv = conv_templates[self.conv_template].copy() + conv.append_message(conv.roles[0], question) + conv.append_message(conv.roles[1], None) + prompt_question = conv.get_prompt() + question_input.append(prompt_question) + + # input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self.device) + # preconfigure gen_kwargs with defaults + gen_kwargs["image_sizes"] = [flattened_visuals[idx].size for idx in range(len(flattened_visuals))] + if "max_new_tokens" not in gen_kwargs: + gen_kwargs["max_new_tokens"] = 1024 + if "temperature" not in gen_kwargs: + gen_kwargs["temperature"] = 0 + if "top_p" not in gen_kwargs: + gen_kwargs["top_p"] = None + if "num_beams" not in gen_kwargs: + gen_kwargs["num_beams"] = 1 + + input_ids_list = [tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt") for prompt in question_input] + pad_token_ids = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id + input_ids = self.pad_sequence(input_ids_list, batch_first=True, padding_value=pad_token_ids).to(self.device) + attention_masks = input_ids.ne(pad_token_ids).to(self.device) + # These steps are not in LLaVA's original code, but are necessary for generation to work + # TODO: attention to this major generation step... + try: + cont = self.model.generate( + input_ids, + attention_mask=attention_masks, + pad_token_id=pad_token_ids, + images=image_tensor, + image_sizes=gen_kwargs["image_sizes"], + do_sample=True if gen_kwargs["temperature"] > 0 else False, + temperature=gen_kwargs["temperature"], + top_p=gen_kwargs["top_p"], + num_beams=gen_kwargs["num_beams"], + max_new_tokens=gen_kwargs["max_new_tokens"], + use_cache=self.use_cache, + ) + text_outputs = self.tokenizer.batch_decode(cont, skip_special_tokens=True) + except Exception as e: + raise e + eval_logger.error(f"Error {e} in generating") + cont = "" + text_outputs = [""] + + # cont_toks_list = cont.tolist() + # for cont_toks, context in zip(cont_toks_list, contexts): + # discard context + left-padding toks if using causal decoder-only LMM + # if self.truncate_context: + # cont_toks = cont_toks[input_ids.shape[1] :] + # use secondary stop seqs to cut off should-have-been-stopped content post-hoc + # if self.truncate_context: + # for term in until: + # if len(term) > 0: + # # ignore '' separator, + # # for seq2seq case where self.tok_decode(self.eot_token_id) = '' + # text_outputs = text_outputs.split(term)[0] + res.extend(text_outputs) + self.cache_hook.add_partial("generate_until", (context, gen_kwargs), text_outputs) + pbar.update(1) + # reorder this group of results back to original unsorted form + res = re_ords.get_original(res) + + pbar.close() + return res + + def generate_until_multi_round(self, requests) -> List[str]: + raise NotImplementedError("TODO: Implement multi-round generation for LLaVA") diff --git a/lmms_eval/models/llava_hf.py b/lmms_eval/models/llava_hf.py new file mode 100644 index 0000000..9c8bdfb --- /dev/null +++ b/lmms_eval/models/llava_hf.py @@ -0,0 +1,390 @@ +import warnings +from typing import List, 
Optional, Tuple, Union + +import numpy as np +import PIL +import torch +from accelerate import Accelerator, DistributedType +from accelerate.state import AcceleratorState +from decord import VideoReader, cpu +from tqdm import tqdm +from transformers import ( + AutoConfig, + AutoProcessor, + LlavaForConditionalGeneration, + LlavaNextForConditionalGeneration, +) + +from lmms_eval import utils +from lmms_eval.api.instance import Instance +from lmms_eval.api.model import lmms +from lmms_eval.api.registry import register_model + +warnings.filterwarnings("ignore") + +from loguru import logger as eval_logger + +DEFAULT_IMAGE_TOKEN = "<image>" +DEFAULT_VIDEO_TOKEN = "