Added Mamba model using a custom kernel to improve performance #1689

Open · wants to merge 2 commits into base: main

1 change: 1 addition & 0 deletions README.md
@@ -258,6 +258,7 @@ The following model architectures, tasks and device distributions have been validated
| Baichuan2 | <div style="text-align:left"><li>DeepSpeed</li></div> | <div style="text-align:left"><li>Single card</li></div> | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
| DeepSeek-V2 | | :heavy_check_mark: | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
| ChatGLM | <div style="text-align:left"><li>DeepSpeed</li></div> | <div style="text-align:left"><li>Single card</li></div> | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
| Mamba | | <div style="text-align:left"><li>Single card</li></div> | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
</div>

- Diffusers:
1 change: 1 addition & 0 deletions docs/source/index.mdx
@@ -109,6 +109,7 @@ In the tables below, ✅ means single-card, multi-card and DeepSpeed have all been validated
| Baichuan2 | <div style="text-align:left"><li>DeepSpeed</li></div> | <div style="text-align:left"><li>Single card</li></div> | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
| DeepSeek-V2 | | ✅ | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
| ChatGLM | <div style="text-align:left"><li>DeepSpeed</li></div> | <div style="text-align:left"><li>Single card</li></div> | <li>[language modeling](https://github.com/huggingface/optimum-habana/tree/main/examples/language-modeling)</li><li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |
| Mamba | | <div style="text-align:left"><li>Single card</li></div> | <li>[text generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)</li> |

- Diffusers

2 changes: 1 addition & 1 deletion examples/image-to-text/run_pipeline.py
@@ -355,7 +355,7 @@ def preprocess(self, image, prompt=None, timeout=None):
throughput = total_new_tokens_generated / duration
logger.info(f"result = {result}")
logger.info(
f"time = {(end-start) * 1000 / args.n_iterations }ms, Throughput (including tokenization) = {throughput} tokens/second"
f"time = {(end - start) * 1000 / args.n_iterations}ms, Throughput (including tokenization) = {throughput} tokens/second"
)

# Store results if necessary
2 changes: 1 addition & 1 deletion examples/language-modeling/run_clm.py
@@ -472,7 +472,7 @@ def main():
else:
model = AutoModelForCausalLM.from_config(config, trust_remote_code=model_args.trust_remote_code)
n_params = sum({p.data_ptr(): p.numel() for p in model.parameters()}.values())
logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params")
logger.info(f"Training new model from scratch - Total size={n_params / 2**20:.2f}M params")

# We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch
# on a small vocab and want a smaller embedding size, remove this test.
4 changes: 2 additions & 2 deletions examples/pytorch-image-models/train_hpu_graph.py
@@ -1092,7 +1092,7 @@ def main():

if utils.is_primary(args):
_logger.info(
f'Scheduled epochs: {num_epochs}. LR stepped per {"epoch" if lr_scheduler.t_in_epochs else "update"}.'
f"Scheduled epochs: {num_epochs}. LR stepped per {'epoch' if lr_scheduler.t_in_epochs else 'update'}."
)

results = []
@@ -1324,7 +1324,7 @@ def _backward(_loss):
if utils.is_primary(args):
_logger.info(
f"Train: {epoch} [{update_idx:>4d}/{updates_per_epoch} "
f"({100. * (update_idx + 1) / updates_per_epoch:>3.0f}%)] "
f"({100.0 * (update_idx + 1) / updates_per_epoch:>3.0f}%)] "
f"Loss: {losses_m.val:#.3g} ({losses_m.avg:#.3g}) "
f"Time: {update_time_m.val:.3f}s, {update_sample_count / update_time_m.val:>7.2f}/s "
f"({update_time_m.avg:.3f}s, {update_sample_count / update_time_m.avg:>7.2f}/s) "
4 changes: 2 additions & 2 deletions examples/pytorch-image-models/train_hpu_lazy.py
@@ -1091,7 +1091,7 @@ def main():

if utils.is_primary(args):
_logger.info(
f'Scheduled epochs: {num_epochs}. LR stepped per {"epoch" if lr_scheduler.t_in_epochs else "update"}.'
f"Scheduled epochs: {num_epochs}. LR stepped per {'epoch' if lr_scheduler.t_in_epochs else 'update'}."
)

results = []
@@ -1325,7 +1325,7 @@ def _backward(_loss):
if utils.is_primary(args):
_logger.info(
f"Train: {epoch} [{update_idx:>4d}/{updates_per_epoch} "
f"({100. * (update_idx + 1) / updates_per_epoch:>3.0f}%)] "
f"({100.0 * (update_idx + 1) / updates_per_epoch:>3.0f}%)] "
f"Loss: {losses_m.val:#.3g} ({losses_m.avg:#.3g}) "
f"Time: {update_time_m.val:.3f}s, {update_sample_count / update_time_m.val:>7.2f}/s "
f"({update_time_m.avg:.3f}s, {update_sample_count / update_time_m.avg:>7.2f}/s) "
@@ -504,7 +504,7 @@ def main():
# E.g. characters, such as `,` and `.` do not really have an acoustic characteristic
# that could be easily picked up by the model
chars_to_ignore_regex = (
f'[{"".join(data_args.chars_to_ignore).replace(" ", "")}]' if data_args.chars_to_ignore is not None else None
f"[{''.join(data_args.chars_to_ignore).replace(' ', '')}]" if data_args.chars_to_ignore is not None else None
)
text_column_name = data_args.text_column_name

6 changes: 3 additions & 3 deletions examples/stable-diffusion/image_to_image_generation.py
@@ -370,12 +370,12 @@ def main():
logger.info(f"Saving images in {image_save_dir.resolve()}...")
if args.ldm3d:
for i, rgb in enumerate(outputs.rgb):
rgb.save(image_save_dir / f"rgb_{i+1}.png")
rgb.save(image_save_dir / f"rgb_{i + 1}.png")
for i, depth in enumerate(outputs.depth):
depth.save(image_save_dir / f"depth_{i+1}.png")
depth.save(image_save_dir / f"depth_{i + 1}.png")
else:
for i, image in enumerate(outputs.images):
image.save(image_save_dir / f"image_{i+1}.png")
image.save(image_save_dir / f"image_{i + 1}.png")
else:
logger.warning("--output_type should be equal to 'pil' to save images in --image_save_dir.")

6 changes: 3 additions & 3 deletions examples/stable-diffusion/text_to_image_generation.py
@@ -687,12 +687,12 @@ def main():
logger.info(f"Saving images in {image_save_dir.resolve()}...")
if args.ldm3d:
for i, rgb in enumerate(outputs.rgb):
rgb.save(image_save_dir / f"rgb_{i+1}.png")
rgb.save(image_save_dir / f"rgb_{i + 1}.png")
for i, depth in enumerate(outputs.depth):
depth.save(image_save_dir / f"depth_{i+1}.png")
depth.save(image_save_dir / f"depth_{i + 1}.png")
else:
for i, image in enumerate(outputs.images):
image.save(image_save_dir / f"image_{i+1}.png")
image.save(image_save_dir / f"image_{i + 1}.png")
else:
logger.warning("--output_type should be equal to 'pil' to save images in --image_save_dir.")

@@ -784,7 +784,7 @@ def load_model_hook(models, input_dir):
lora_state_dict = FluxPipeline.lora_state_dict(input_dir)

transformer_state_dict = {
f'{k.replace("transformer.", "")}': v for k, v in lora_state_dict.items() if k.startswith("transformer.")
f"{k.replace('transformer.', '')}": v for k, v in lora_state_dict.items() if k.startswith("transformer.")
}
transformer_state_dict = convert_unet_state_dict_to_peft(transformer_state_dict)
incompatible_keys = set_peft_model_state_dict(transformer_, transformer_state_dict, adapter_name="default")
@@ -94,7 +94,7 @@ def save_model_card(
for i, image in enumerate(images):
image.save(os.path.join(repo_folder, f"image_{i}.png"))
img_str += f"""
- text: '{validation_prompt if validation_prompt else ' ' }'
- text: '{validation_prompt if validation_prompt else " "}'
output:
url:
"image_{i}.png"
@@ -1083,7 +1083,7 @@ def load_model_hook(models, input_dir):

lora_state_dict, network_alphas = LoraLoaderMixin.lora_state_dict(input_dir)

unet_state_dict = {f'{k.replace("unet.", "")}': v for k, v in lora_state_dict.items() if k.startswith("unet.")}
unet_state_dict = {f"{k.replace('unet.', '')}": v for k, v in lora_state_dict.items() if k.startswith("unet.")}
unet_state_dict = convert_unet_state_dict_to_peft(unet_state_dict)
incompatible_keys = set_peft_model_state_dict(unet_, unet_state_dict, adapter_name="default")
if incompatible_keys is not None:
@@ -884,9 +884,9 @@ def main(args):
# download the dataset.
if args.dataset_name is not None:
if len(args.mediapipe) > 0:
assert (
args.resolution == args.crop_resolution
), f"To use hardware pipe, --resolution ({args.resolution}) must equal --crop_resolution ({args.crop_resolution})"
assert args.resolution == args.crop_resolution, (
f"To use hardware pipe, --resolution ({args.resolution}) must equal --crop_resolution ({args.crop_resolution})"
)
if args.local_rank == 0:
if not os.path.exists(args.mediapipe):
os.mkdir(args.mediapipe)
@@ -1532,7 +1532,7 @@ def compute_time_ids(original_size, crops_coords_top_left):
image_save_dir.mkdir(parents=True, exist_ok=True)
logger.info(f"Saving images in {image_save_dir.resolve()}...")
for i, image in enumerate(images):
image.save(image_save_dir / f"image_{epoch}_{i+1}.png")
image.save(image_save_dir / f"image_{epoch}_{i + 1}.png")
else:
logger.warning("--output_type should be equal to 'pil' to save images in --image_save_dir.")

6 changes: 3 additions & 3 deletions examples/summarization/run_summarization.py
@@ -559,9 +559,9 @@ def main():
return

if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)):
assert (
data_args.lang is not None
), f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --lang argument"
assert data_args.lang is not None, (
f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --lang argument"
)

tokenizer.src_lang = data_args.lang
tokenizer.tgt_lang = data_args.lang
12 changes: 6 additions & 6 deletions examples/text-classification/run_glue.py
@@ -168,9 +168,9 @@ def __post_init__(self):
train_extension = self.train_file.split(".")[-1]
assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file."
validation_extension = self.validation_file.split(".")[-1]
assert (
validation_extension == train_extension
), "`validation_file` should have the same extension (csv or json) as `train_file`."
assert validation_extension == train_extension, (
"`validation_file` should have the same extension (csv or json) as `train_file`."
)


@dataclass
@@ -338,9 +338,9 @@ def main():
if data_args.test_file is not None:
train_extension = data_args.train_file.split(".")[-1]
test_extension = data_args.test_file.split(".")[-1]
assert (
test_extension == train_extension
), "`test_file` should have the same extension (csv or json) as `train_file`."
assert test_extension == train_extension, (
"`test_file` should have the same extension (csv or json) as `train_file`."
)
data_files["test"] = data_args.test_file
else:
raise ValueError("Need either a GLUE task or a test file for `do_predict`.")
12 changes: 12 additions & 0 deletions examples/text-generation/README.md
@@ -219,6 +219,18 @@ python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \
> --sdp_on_bf16
> ```

To run Mamba-130m inference on a single Gaudi2 card, use the following command. The example assumes the custom kernel is at the default path `/root/.cache/huggingface/hub/models--Habana--mamba/blobs/libcustom_tpc_perf_lib.so`; if `libcustom_tpc_perf_lib.so` is located in a different folder, set `GC_KERNEL_PATH` accordingly:
```bash
GC_KERNEL_PATH=/root/.cache/huggingface/hub/models--Habana--mamba/blobs/libcustom_tpc_perf_lib.so:$GC_KERNEL_PATH python run_generation.py \
--model_name_or_path state-spaces/mamba-130m-hf \
--max_input_tokens 128 \
--max_new_tokens 128 \
--bf16 \
--use_hpu_graphs \
--use_kv_cache \
--batch_size 1024
```
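
If you are unsure where the Hugging Face Hub cached the custom kernel library, you can locate it first and then export `GC_KERNEL_PATH` before launching the script. The snippet below is a minimal sketch, assuming the library was downloaded into the default `~/.cache/huggingface/hub` cache; adjust the search root if your cache lives elsewhere:

```bash
# Locate the custom Mamba TPC kernel library in the Hugging Face cache
# (the exact blob path may differ on your system).
KERNEL_LIB=$(find ~/.cache/huggingface/hub -name libcustom_tpc_perf_lib.so 2>/dev/null | head -n 1)

# Prepend it to GC_KERNEL_PATH so the Gaudi graph compiler can pick it up,
# then run the generation command shown above.
export GC_KERNEL_PATH=${KERNEL_LIB}:${GC_KERNEL_PATH}
```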

### Use any dataset from the Hugging Face Hub

You can also provide the name of a dataset from the Hugging Face Hub to perform generation on it with the argument `--dataset_name`.
18 changes: 9 additions & 9 deletions examples/text-generation/run_generation.py
@@ -526,7 +526,7 @@ def compute_valid_sequence_lengths_tensor(input_tokens):
profiling_record_shapes=args.profiling_record_shapes,
).cpu()
first_token_time = iteration_times[0] + encode_duration
logger.info(f"Time to first token = {first_token_time*1000}ms")
logger.info(f"Time to first token = {first_token_time * 1000}ms")
return tokenizer.batch_decode(outputs, skip_special_tokens=True)

from optimum.habana.utils import HabanaProfile
@@ -541,10 +541,10 @@ def compute_valid_sequence_lengths_tensor(input_tokens):
if dyn_prompt_lens is None or len(set(dyn_prompt_lens)) == 1:
for i in range(args.warmup):
if dyn_prompt_lens is None:
print(f"Warming up iteration {i+1}/{args.warmup}", flush=True)
print(f"Warming up iteration {i + 1}/{args.warmup}", flush=True)
generate(None, args.reduce_recompile)
else:
print(f"Warming up for shape {dyn_prompt_lens[0]} iteration {i+1}/{args.warmup}", flush=True)
print(f"Warming up for shape {dyn_prompt_lens[0]} iteration {i + 1}/{args.warmup}", flush=True)
generate(dyn_prompt_lens[0], args.reduce_recompile)
else:
if args.bucket_size > 0:
@@ -559,7 +559,7 @@ def rounder(x):
for i in range(args.warmup):
lst = list(range(min_prompt_len, max_sentence_len + 1, args.bucket_size))
for sz in lst:
print(f"Warming up for shape {sz - 1} iteration {i+1}/{args.warmup}", flush=True)
print(f"Warming up for shape {sz - 1} iteration {i + 1}/{args.warmup}", flush=True)
generate(sz - 1, args.reduce_recompile)
torch_hpu.synchronize()
compilation_duration = time.perf_counter() - t0
@@ -586,12 +586,12 @@ def rounder(x):
all_inputs = []
all_outputs = []
for i, input_sentence in enumerate(zip(input_sentences)):
print(f"input {i+1}: {input_sentence}")
print(f"input {i + 1}: {input_sentence}")
all_inputs.append(input_sentence)
for j, output in enumerate(
zip(generated[args.num_return_sequences * i : args.num_return_sequences * (i + 1)])
):
print(f"output {i+1}.{j+1}: {output}")
print(f"output {i + 1}.{j + 1}: {output}")
all_outputs.append(output)
print()

@@ -747,10 +747,10 @@ def generate_dataset(batch):
duration += time.perf_counter() - t0
total_new_tokens_generated += args.batch_size * args.max_new_tokens
print(separator)
print(f"Batch n°{i+1}")
print(f"Input: {prompt[:args.batch_size]}")
print(f"Batch n°{i + 1}")
print(f"Input: {prompt[: args.batch_size]}")
print(
f"Output: {tokenizer.batch_decode(outputs, skip_special_tokens=True)[:args.batch_size*args.num_return_sequences]}"
f"Output: {tokenizer.batch_decode(outputs, skip_special_tokens=True)[: args.batch_size * args.num_return_sequences]}"
)
print(separator)
if args.run_partial_dataset and args.n_iterations == i + 1:
@@ -45,14 +45,14 @@ def main():

duration = 0
for iteration in range(args.n_iterations):
logger.info(f"Running inference iteration {iteration+1}...")
logger.info(f"Running inference iteration {iteration + 1}...")
t0 = time.perf_counter()
output = pipe(input_sentences)
duration += time.perf_counter() - t0

for i, (input_sentence, generated_text) in enumerate(zip(input_sentences, output)):
print(f"Prompt[{iteration+1}][{i+1}]: {input_sentence}")
print(f"Generated Text[{iteration+1}][{i+1}]: {repr(generated_text)}\n")
print(f"Prompt[{iteration + 1}][{i + 1}]: {input_sentence}")
print(f"Generated Text[{iteration + 1}][{i + 1}]: {repr(generated_text)}\n")

throughput = args.n_iterations * args.batch_size * args.max_new_tokens / duration
print(f"Inference Duration (for {args.n_iterations} iterations): {duration} seconds")
@@ -87,8 +87,8 @@ def main():
duration += time.perf_counter() - t0

for i, (question, answer) in enumerate(zip(input_questions, responses)):
print(f"Question[{iteration+1}][{i+1}]: {question['question']}")
print(f"Response[{iteration+1}][{i+1}]: {answer}\n")
print(f"Question[{iteration + 1}][{i + 1}]: {question['question']}")
print(f"Response[{iteration + 1}][{i + 1}]: {answer}\n")

throughput = args.n_iterations * args.batch_size * args.max_new_tokens / duration
print(f"Inference Duration (for {args.n_iterations} iterations): {duration} seconds")
2 changes: 1 addition & 1 deletion examples/text-to-speech/run_pipeline.py
@@ -129,7 +129,7 @@ def main():
text, batch_size=args.batch_size, forward_params=forward_params, generate_kwargs=generate_kwargs
)
end = time.time()
logger.info(f"speech = {speech} time = {(end-start) * 1000 / args.n_iterations }ms")
logger.info(f"speech = {speech} time = {(end - start) * 1000 / args.n_iterations}ms")
sf.write("speech.wav", speech[0]["audio"].squeeze(), samplerate=speech[0]["sampling_rate"])


2 changes: 1 addition & 1 deletion examples/visual-question-answering/run_pipeline.py
@@ -135,7 +135,7 @@ def main():
with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=autocast_enable):
result = generator(model_input, batch_size=args.batch_size, topk=args.topk)
end = time.time()
logger.info(f"result = {result}, time = {(end-start) * 1000/args.n_iterations}ms")
logger.info(f"result = {result}, time = {(end - start) * 1000 / args.n_iterations}ms")


if __name__ == "__main__":
6 changes: 3 additions & 3 deletions optimum/habana/accelerate/accelerator.py
@@ -197,9 +197,9 @@ def __init__(

if kwargs_handlers is not None:
for handler in kwargs_handlers:
assert isinstance(
handler, KwargsHandler
), f"Unsupported kwargs handler passed: {handler}, must be one that inherits `accelerate.utils.KwargsHandler`."
assert isinstance(handler, KwargsHandler), (
f"Unsupported kwargs handler passed: {handler}, must be one that inherits `accelerate.utils.KwargsHandler`."
)
if isinstance(handler, DistributedDataParallelKwargs):
if self.ddp_handler is not None:
raise ValueError("You can only pass one `DistributedDataParallelKwargs` in `kwargs_handler`.")