Add test for deepseek_coder #1309

Merged (1 commit, Feb 28, 2025)

@@ -0,0 +1,63 @@
# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
#
# SPDX-License-Identifier: Apache-2.0
import pytest

import forge
from forge.verify.verify import verify

from test.models.pytorch.multimodal.deepseek_coder.utils.model_utils import (
DeepSeekWrapper,
download_model_and_tokenizer,
generate_no_cache,
pad_inputs,
)
from test.models.utils import Framework, Source, Task, build_module_name


@pytest.mark.nightly
@pytest.mark.parametrize("variant", ["deepseek-coder-1.3b-instruct"])
def test_deepseek_inference_no_cache(record_forge_property, variant):

# Build Module Name
module_name = build_module_name(
framework=Framework.PYTORCH, model="deepseek", variant=variant, task=Task.QA, source=Source.HUGGINGFACE
)

# Record Forge Property
record_forge_property("model_name", module_name)

# Load Model and Tokenizer
model_name = f"deepseek-ai/{variant}"
model, tokenizer, inputs = download_model_and_tokenizer(model_name)
framework_model = DeepSeekWrapper(model)
framework_model.eval()

padded_inputs, seq_len = pad_inputs(inputs)

# Forge compile framework model
compiled_model = forge.compile(framework_model, sample_inputs=[padded_inputs], module_name=module_name)

# Model Verification
verify([padded_inputs], framework_model, compiled_model)

generated_text = generate_no_cache(
max_new_tokens=512, model=compiled_model, inputs=padded_inputs, seq_len=seq_len, tokenizer=tokenizer
)
print(generated_text)


@pytest.mark.parametrize("variant", ["deepseek-coder-1.3b-instruct"])
def test_deepseek_inference_no_cache_cpu(variant):
model_name = f"deepseek-ai/{variant}"
model, tokenizer, inputs = download_model_and_tokenizer(model_name)

framework_model = DeepSeekWrapper(model)
framework_model.eval()

padded_inputs, seq_len = pad_inputs(inputs)

generated_text = generate_no_cache(
max_new_tokens=512, model=framework_model, inputs=padded_inputs, seq_len=seq_len, tokenizer=tokenizer
)
print(generated_text)
@@ -0,0 +1,80 @@
# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC

# SPDX-License-Identifier: Apache-2.0
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask


def generate_no_cache(max_new_tokens, model, inputs, seq_len, tokenizer):
"""
Generates text autoregressively without using a KV cache, iteratively predicting one token at a time.
Generation stops when the maximum number of new tokens is reached or an end-of-sequence (EOS) token is generated.

Args:
max_new_tokens (int): The maximum number of new tokens to generate.
model (torch.nn.Module): The language model used for token generation.
inputs (torch.Tensor): Input tensor of shape (batch_size, seq_len), representing tokenized text.
seq_len (int): The current sequence length before generation starts.
tokenizer: The tokenizer used to decode token IDs into text.

Returns:
str: The generated text after decoding the new tokens.
"""
current_pos = seq_len

for _ in range(max_new_tokens):
logits = model(inputs)

# Get only the logits corresponding to the last valid token
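        # The model may return its outputs as a list (e.g. the compiled model); unwrap to the logits tensor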
if isinstance(logits, list):
logits = logits[0]
next_token_logits = logits[:, current_pos - 1, :]
next_token_id = torch.argmax(next_token_logits, dim=-1)
# Stop if EOS token is encountered
if next_token_id.item() == tokenizer.eos_token_id:
break

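        # Write the predicted token into the pre-allocated padded buffer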
inputs[:, current_pos] = next_token_id

current_pos += 1 # Move to next position

# Decode valid tokens
valid_tokens = inputs[:, seq_len:current_pos].view(-1).tolist()
answer = tokenizer.decode(valid_tokens, skip_special_tokens=True)

return answer


def pad_inputs(inputs, max_new_tokens=512):
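    # Pre-allocate a zero-filled buffer sized for the prompt plus max_new_tokens so generation can write tokens in place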
batch_size, seq_len = inputs.shape
max_seq_len = seq_len + max_new_tokens
padded_inputs = torch.zeros((batch_size, max_seq_len), dtype=inputs.dtype, device=inputs.device)
padded_inputs[:, :seq_len] = inputs
return padded_inputs, seq_len


def download_model_and_tokenizer(model_name, **kwargs):
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

# Prepare input sentence
messages = [{"role": "user", "content": "write a bubble sort algorithm in python."}]
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)

return model, tokenizer, inputs


class DeepSeekWrapper(torch.nn.Module):
def __init__(self, model):
super().__init__()
self.model = model
self.embed_tokens = model.model.embed_tokens

def forward(self, input_tensor, attention_mask=None, past_key_values=None):
inputs_embeds = self.embed_tokens(input_tensor)
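        # Build an explicit 4D causal mask; the embeddings are only needed here to give the mask its dtype and device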
past_key_values_length = past_key_values[0][0].shape[-2] if past_key_values is not None else 0
causal_attention_mask = _prepare_4d_causal_attention_mask(
attention_mask, input_tensor.shape, inputs_embeds, past_key_values_length
)
return self.model(input_ids=input_tensor, attention_mask=causal_attention_mask).logits
@@ -5,7 +5,7 @@

import forge

from test.models.pytorch.multimodal.deepseek.utils.model import (
from test.models.pytorch.multimodal.deepseek_math.utils.model_utils import (
DeepSeekWrapper,
download_model_and_tokenizer,
generation,
@@ -19,6 +19,7 @@ def test_deepseek_inference_no_cache_cpu(variant):
model, tokenizer, input_ids = download_model_and_tokenizer(model_name)

framework_model = DeepSeekWrapper(model)
framework_model.eval()

generated_text = generation(
max_new_tokens=200, compiled_model=framework_model, input_ids=input_ids, tokenizer=tokenizer
@@ -39,6 +40,7 @@ def test_deepseek_inference(record_forge_property, variant):
model_name = f"deepseek-ai/{variant}"
model, tokenizer, input_ids = download_model_and_tokenizer(model_name)
framework_model = DeepSeekWrapper(model)
framework_model.eval()

compiled_model = forge.compile(framework_model, sample_inputs=[input_ids], module_name=module_name)
generated_text = generation(
@@ -7,7 +7,7 @@
import forge
from forge.verify.compare import compare_with_golden

from test.models.pytorch.multimodal.deepseek.utils.model import (
from test.models.pytorch.multimodal.deepseek_math.utils.model_utils import (
DeepSeekWrapper_decoder,
download_model_and_tokenizer,
)
@@ -67,6 +67,7 @@ def test_deepseek_prefil_on_device_decode_on_cpu(variant):
# This is the part of the model needed for prefill; model without the last Linear layer (lm_head)
model_decoder = model.get_decoder()
model_decoder = DeepSeekWrapper_decoder(model_decoder)
model_decoder.eval()
compiled_decoder = forge.compile(model_decoder, sample_inputs=input_ids)

# Prefill Phase - Process the initial prompt on device