From bdf6bf200402b83b723c652c5f1a87d5fed57c97 Mon Sep 17 00:00:00 2001
From: Michael Wyatt
Date: Thu, 25 Jul 2024 13:29:13 -0700
Subject: [PATCH] Update model support (#429)

Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
Co-authored-by: Ammar Ahmad Awan
Co-authored-by: Logan Adams
---
 README.md                         | 18 ++++-----
 requirements/requirements-dev.txt |  3 ++
 tests/test_model_support.py       | 63 ++++++++++++-------------------
 3 files changed, 36 insertions(+), 48 deletions(-)

diff --git a/README.md b/README.md
index f666762a..476c7cc1 100644
--- a/README.md
+++ b/README.md
@@ -85,18 +85,18 @@ Under-the-hood MII is powered by [DeepSpeed-Inference](https://github.com/micros
 
 # Supported Models
 
-MII currently supports over 20,000 models across eight popular model architectures. We plan to add additional models in the near term, if there are specific model architectures you would like supported please [file an issue](https://github.com/microsoft/DeepSpeed-MII/issues) and let us know. All current models leverage Hugging Face in our backend to provide both the model weights and the model's corresponding tokenizer. For our current release we support the following model architectures:
+MII currently supports over 37,000 models across eight popular model architectures. We plan to add additional models in the near term; if there are specific model architectures you would like supported, please [file an issue](https://github.com/microsoft/DeepSpeed-MII/issues) and let us know. All current models leverage Hugging Face in our backend to provide both the model weights and the model's corresponding tokenizer. For our current release we support the following model architectures:
 
 model family | size range | ~model count
 ------ | ------ | ------
-[falcon](https://huggingface.co/models?other=falcon) | 7B - 180B | 300
-[llama](https://huggingface.co/models?other=llama) | 7B - 65B | 19,000
-[llama-2](https://huggingface.co/models?other=llama-2) | 7B - 70B | 900
-[mistral](https://huggingface.co/models?other=mistral) | 7B | 6,000
-[mixtral (MoE)](https://huggingface.co/models?other=mixtral) | 8x7B | 1,100
-[opt](https://huggingface.co/models?other=opt) | 0.1B - 66B | 1,300
-[phi-2](https://huggingface.co/models?other=phi) | 2.7B | 200
-[qwen](https://huggingface.co/models?other=qwen) | 7B - 72B | 200
+[falcon](https://huggingface.co/models?other=falcon) | 7B - 180B | 500
+[llama](https://huggingface.co/models?other=llama) | 7B - 65B | 52,000
+[llama-2](https://huggingface.co/models?other=llama-2) | 7B - 70B | 1,200
+[mistral](https://huggingface.co/models?other=mistral) | 7B | 23,000
+[mixtral (MoE)](https://huggingface.co/models?other=mixtral) | 8x7B | 2,900
+[opt](https://huggingface.co/models?other=opt) | 0.1B - 66B | 2,100
+[phi-2](https://huggingface.co/models?other=phi) | 2.7B | 1,500
+[qwen](https://huggingface.co/models?other=qwen) | 7B - 72B | 500
 
 ## MII Legacy Model Support
 
diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt
index 1d69f875..4b7bb770 100644
--- a/requirements/requirements-dev.txt
+++ b/requirements/requirements-dev.txt
@@ -1,5 +1,8 @@
 clang-format==16.0.2
+einops
 pre-commit>=2.20.0
 pytest
 pytest-forked
 sentencepiece
+tiktoken
+transformers-stream-generator
diff --git a/tests/test_model_support.py b/tests/test_model_support.py
index be49044a..fb554206 100644
--- a/tests/test_model_support.py
+++ b/tests/test_model_support.py
@@ -11,25 +11,16 @@
     CheckpointEngineBase,
     HuggingFaceCheckpointEngine,
 )
-from transformers import AutoConfig, AutoModel, GenerationConfig
+from transformers import AutoConfig, AutoModelForCausalLM, GenerationConfig
 from typing import Iterable, Tuple
 
 
-class RandomWeightsCheckpointEngine(CheckpointEngineBase):
-
-    # When using AutoModel.from_config() to load the model, the layer names are
-    # often missing a prefix. We default to adding "model." as the prefix, but
-    # others can be specified here.
-    layer_prefix_map = {"falcon": "transformer."}
-
-    # When using AutoModel.from_config() to load the model, the lm_head layer is
-    # not generated. We default to populating this with the
-    # "embed_tokens.weight" layer, but others can be specified here.
-    lm_head_layer_map = {"falcon": "word_embeddings.weight"}
-
+class ZeroWeightsCheckpointEngine(CheckpointEngineBase):
+    """ Generates weights with all zeros for a given model, for testing purposes. """
     def __init__(self, model_name_or_path: str, auth_token: str = None) -> None:
         self.model_name_or_path = model_name_or_path
-        self.model_config = AutoConfig.from_pretrained(self.model_name_or_path)
+        self.model_config = AutoConfig.from_pretrained(self.model_name_or_path,
+                                                       trust_remote_code=True)
         if hasattr(self.model_config, "max_position_embeddings"):
             self.model_config.max_seq_length = self.model_config.max_position_embeddings
         else:
@@ -40,37 +31,21 @@ def __init__(self, model_name_or_path: str, auth_token: str = None) -> None:
         except OSError:
             self.model_config.max_seq_length = 2048
 
-    def _get_layer_prefix(self) -> str:
-        for model_type, prefix in self.layer_prefix_map.items():
-            if model_type in self.model_name_or_path.lower():
-                return prefix
-        return "model."
-
-    def _get_lm_head_layer(self) -> str:
-        for model_type, layer in self.lm_head_layer_map.items():
-            if model_type in self.model_name_or_path.lower():
-                return layer
-        return "embed_tokens.weight"
-
     def parameters(self) -> Iterable[Tuple[str, torch.Tensor]]:
-        layer_prefix = self._get_layer_prefix()
-        lm_head_layer = self._get_lm_head_layer()
-
         # Load with meta device is faster
         with deepspeed.OnDevice(dtype=torch.float16, device="meta"):
-            model = AutoModel.from_config(self.model_config)
+            model = AutoModelForCausalLM.from_config(self.model_config,
+                                                     trust_remote_code=True)
 
         for param_name, param in model.state_dict().items():
-            yield layer_prefix + param_name, torch.zeros(param.shape)
-            if param_name == lm_head_layer:
-                yield "lm_head.weight", torch.zeros(param.shape)
+            yield param_name, torch.zeros(param.shape)
 
 
 @pytest.fixture(scope="module", autouse=True)
 def inject_checkpoint_engine():
     # Inject the random weihts checkpoint engine
     deepspeed.inference.v2.engine_factory.HuggingFaceCheckpointEngine = (
-        RandomWeightsCheckpointEngine)
+        ZeroWeightsCheckpointEngine)
     yield None
     # Restore the original checkpoint engine
     deepspeed.inference.v2.engine_factory.HuggingFaceCheckpointEngine = (
@@ -81,16 +56,26 @@
     "model_name",
     [
         "tiiuae/falcon-7b",
+        "huggyllama/llama-7b",
         "NousResearch/Llama-2-7b-hf",
         "NousResearch/Hermes-2-Pro-Mistral-7B",
         "cloudyu/Mixtral_11Bx2_MoE_19B",
         "facebook/opt-125m",
+        "microsoft/phi-2",
+        "Qwen/Qwen-7B-Chat",
+        "Qwen/Qwen1.5-0.5B",
+    ],
+    ids=[
+        "falcon",
+        "llama",
+        "llama-2",
+        "mistral",
+        "mixtral",
+        "opt",
+        "phi-2",
+        "qwen",
+        "qwen-2"
     ],
-    ids=["falcon",
-         "llama",
-         "mistral",
-         "mixtral",
-         "opt"],
 )
 def test_model(pipeline, query):
     outputs = pipeline(query, max_new_tokens=16)
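
For reference, the zero-weight approach used by the updated test can also be exercised on its own. The short sketch below mirrors what ZeroWeightsCheckpointEngine does in the patch: it builds a model architecture on the meta device (so no real weights are downloaded or allocated) and yields zero-filled tensors for every parameter. It is a minimal sketch, not part of the patch; it assumes torch, transformers, and deepspeed are installed, and the model ID facebook/opt-125m is only an illustrative choice.

# Minimal sketch (not part of the patch): enumerate a model's parameters as
# all-zero tensors without downloading checkpoint weights, mirroring the
# ZeroWeightsCheckpointEngine added above. The model ID is illustrative.
import deepspeed
import torch
from transformers import AutoConfig, AutoModelForCausalLM

model_id = "facebook/opt-125m"
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)

# Building the model on the meta device allocates no real weight memory.
with deepspeed.OnDevice(dtype=torch.float16, device="meta"):
    model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)

for name, param in model.state_dict().items():
    zeros = torch.zeros(param.shape)  # stand-in weights, as in the test engine
    print(name, tuple(zeros.shape))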