
Optimized DeepSeek-v2 on Gaudi #1677

Merged · 16 commits · Jan 28, 2025
Changes from all commits

README.md · 2 changes: 1 addition & 1 deletion
@@ -279,7 +279,7 @@ The following model architectures, tasks and device distributions have been validated
| Mllama | <li>LoRA</li> | :heavy_check_mark: | <li>[image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)</li> |
| MiniCPM3 | | <li>Single card</li> | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
| Baichuan2 | <li>DeepSpeed</li> | <li>Single card</li> | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
| DeepSeek-V2 | | :heavy_check_mark: | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
| DeepSeek-V2 | :heavy_check_mark: | :heavy_check_mark: | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
| ChatGLM | <li>DeepSpeed</li> | <li>Single card</li> | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |


docs/source/index.mdx · 2 changes: 1 addition & 1 deletion
@@ -107,7 +107,7 @@ In the tables below, ✅ means single-card, multi-card and DeepSpeed have all been validated
| Mllama | <div style="text-align:left"><li>LoRA</li></div> |✅ | <li>[image to text](https://github.com/huggingface/optimum-habana/tree/main/examples/image-to-text)</li> |
| MiniCPM3 | | <div style="text-align:left"><li>Single card</li></div> | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
| Baichuan2 | <div style="text-align:left"><li>DeepSpeed</li></div> | <div style="text-align:left"><li>Single card</li></div> | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
| DeepSeek-V2 | | ✅ | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
| DeepSeek-V2 | ✅ | ✅ | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
| ChatGLM | <div style="text-align:left"><li>DeepSpeed</li></div> | <div style="text-align:left"><li>Single card</li></div> | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |

- Diffusers
examples/language-modeling/README.md · 30 changes: 30 additions & 0 deletions
@@ -184,6 +184,36 @@ python ../gaudi_spawn.py \
--logging_steps 20
```

### Multi-card Training with Deepspeed (DeepSeek-V2-Lite)
```bash
python ../gaudi_spawn.py --world_size 8 --use_deepspeed run_clm.py \
    --config_name deepseek-ai/DeepSeek-V2-Lite \
    --tokenizer_name deepseek-ai/DeepSeek-V2-Lite \
    --dataset_name tatsu-lab/alpaca \
    --block_size 4096 \
    --do_train \
    --num_train_epochs 1 \
    --max_steps 10 \
    --per_device_train_batch_size 1 \
    --gradient_accumulation_steps 1 \
    --use_flash_attention True \
    --attn_softmax_bf16 False \
    --gradient_checkpointing \
    --learning_rate 2.4e-4 \
    --gaudi_config_name Habana/gpt2 \
    --bf16 \
    --save_strategy no \
    --no_save_last_ckpt \
    --output_dir /root/deepseek-v2-lite \
    --overwrite_output_dir \
    --logging_strategy steps \
    --logging_dir /root/deepseek-v2-lite/log \
    --logging_steps 1 \
    --evaluation_strategy no \
    --use_habana \
    --use_lazy_mode \
    --deepspeed llama2_ds_zero3_config.json
```
Comment on lines +187 to +216

Collaborator:

Is there any specific arg here that is specific to this model? Otherwise let's just try to keep the README concise.

@gyou2021 (Contributor, Author), Jan 28, 2025:

Certain arguments, such as `--use_flash_attention`, enable optimizations during training. Some of these argument values were derived from our experiments. The command has been verified on both a local server and DevCloud. If any arguments are missing, such as `--deepspeed llama2_ds_zero3_config.json`, training may fail. To ensure successful execution, we have included these details in the README.
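The reply above leans on `--deepspeed llama2_ds_zero3_config.json`, whose contents are not shown in this diff. For readers who want a sense of what such a ZeRO-3 file contains, here is a minimal sketch written in Python; the keys are standard DeepSpeed options, the `"auto"` values are filled in by the Hugging Face Trainer, and the actual `llama2_ds_zero3_config.json` shipped with the examples may differ:

```python
import json

# Illustrative ZeRO-3 configuration only; not the file referenced by the command above.
zero3_config = {
    "bf16": {"enabled": True},
    "zero_optimization": {
        "stage": 3,  # partition optimizer states, gradients and parameters across workers
        "contiguous_gradients": True,
        "stage3_gather_16bit_weights_on_model_save": True,
    },
    # "auto" lets the Trainer propagate its own batch-size and accumulation settings.
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "train_batch_size": "auto",
    "gradient_clipping": 1.0,
    "steps_per_print": 10,
}

with open("zero3_config_example.json", "w") as f:
    json.dump(zero3_config, f, indent=2)
```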


## Multi-Node Training with Deepspeed (GPT-NeoX)

examples/language-modeling/run_clm.py · 34 changes: 34 additions & 0 deletions
@@ -156,6 +156,32 @@ class ModelArguments:
            )
        },
    )
    attn_softmax_bf16: bool = field(
        default=False,
        metadata={"help": ("Whether to run attention softmax layer in bf16 precision for fine-tuning.")},
    )
    use_flash_attention: bool = field(
        default=False,
        metadata={"help": ("Whether to use Habana flash attention for fine-tuning.")},
    )
    flash_attention_recompute: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to enable recompute in Habana flash attention for fine-tuning."
                " It is applicable only when use_flash_attention is True."
            )
        },
    )
    flash_attention_causal_mask: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether to enable causal mask in Habana flash attention for fine-tuning."
                " It is applicable only when use_flash_attention is True."
            )
        },
    )
    low_cpu_mem_usage: bool = field(
        default=False,
        metadata={
@@ -482,6 +508,14 @@ def main():
    if len(tokenizer) > embedding_size:
        model.resize_token_embeddings(len(tokenizer))

    # We need to add these fused kernel configs
    if model_args.attn_softmax_bf16:
        model.generation_config.attn_softmax_bf16 = True
    if model_args.use_flash_attention:
        model.generation_config.use_flash_attention = True
        model.generation_config.flash_attention_recompute = model_args.flash_attention_recompute
        model.generation_config.flash_attention_causal_mask = model_args.flash_attention_causal_mask

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
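For context on how the fields added above surface as command-line flags, here is a minimal sketch assuming the standard `HfArgumentParser` flow that `run_clm.py` already uses; the standalone `FlashAttentionArguments` dataclass and the example values are illustrative, not part of the PR:

```python
from dataclasses import dataclass, field

from transformers import HfArgumentParser


@dataclass
class FlashAttentionArguments:
    # Mirrors the fields added to ModelArguments in the diff above.
    attn_softmax_bf16: bool = field(default=False)
    use_flash_attention: bool = field(default=False)
    flash_attention_recompute: bool = field(default=False)
    flash_attention_causal_mask: bool = field(default=False)


# HfArgumentParser turns every dataclass field into a CLI flag, which is why
# `--use_flash_attention True` works in the README command above.
(args,) = HfArgumentParser(FlashAttentionArguments).parse_args_into_dataclasses(
    ["--use_flash_attention", "True", "--flash_attention_recompute", "True"]
)
print(args.use_flash_attention, args.flash_attention_recompute)  # True True
```

Inside `run_clm.py`, the parsed values are then copied onto `model.generation_config`, as the second hunk above shows, so the Gaudi flash-attention kernels are picked up during fine-tuning.
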
optimum/habana/transformers/generation/utils.py · 4 changes: 3 additions & 1 deletion · file mode 100644 → 100755
@@ -1092,8 +1092,9 @@ def generate(
                "gemma2",
                "baichuan",
                "chatglm",
                "deepseek_v2",
            ], (
                "reuse_cache only supported by llama, mistral, falcon, mixtral, phi, qwen2, qwen2_moe, gemma, gemma2, starcoder2, baichuan and chatglm at the moment"
                "reuse_cache only supported by llama, mistral, falcon, mixtral, phi, qwen2, qwen2_moe, gemma, gemma2, starcoder2, baichuan, chatglm and deepseek_v2 at the moment"
            )
            if not generation_config.bucket_internal:
                assert generation_config.bucket_size <= 0, (
@@ -1300,6 +1301,7 @@ def generate(
                "gemma2",
                "qwen2_moe",
                "baichuan",
                "deepseek_v2",
            ]:
                if (
                    hasattr(self.config, "max_position_embeddings")
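The practical effect of adding `"deepseek_v2"` to the allow-list in the hunks above can be summarized with a small standalone check that mirrors the logic of the assertion in `generate()`; the helper below is illustrative, not code from the PR:

```python
# Model types for which generate() accepts reuse_cache, as listed in the
# assertion above (now including deepseek_v2).
REUSE_CACHE_MODEL_TYPES = {
    "llama", "mistral", "falcon", "mixtral", "phi", "qwen2", "qwen2_moe",
    "gemma", "gemma2", "starcoder2", "baichuan", "chatglm", "deepseek_v2",
}


def check_reuse_cache(model_type: str, reuse_cache: bool) -> None:
    # Mirrors the guard above: reuse_cache is rejected for any model type
    # outside the validated list.
    if reuse_cache and model_type not in REUSE_CACHE_MODEL_TYPES:
        raise AssertionError(f"reuse_cache is not supported for model_type={model_type!r}")


check_reuse_cache("deepseek_v2", reuse_cache=True)  # passes after this PR
check_reuse_cache("gpt2", reuse_cache=False)        # also fine: the flag is not requested
```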