Fixed to use only useful logits in SequenceGroup and Sampler
AsyaPronina committed Jan 10, 2025
1 parent 7b1a495 commit cecbce9
Showing 1 changed file with 22 additions and 5 deletions.
27 changes: 22 additions & 5 deletions src/cpp/src/llm_pipeline_static.cpp
@@ -722,7 +722,6 @@ std::shared_ptr<ov::CompiledModel> StatefulLLMPipeline::setupAndCompileModel(
const uint32_t kMaxPromptLen = pop_int_and_cast(pipeline_config, "MAX_PROMPT_LEN").value_or(1024u);
const uint32_t kMinResponseLen = pop_int_and_cast(pipeline_config, "MIN_RESPONSE_LEN").value_or(128u);
m_kvcache_total = kMaxPromptLen + kMinResponseLen;
std::string generate_hint = pop_or_default<std::string>(pipeline_config, "GENERATE_HINT", "FAST_COMPILE");

update_config(pipeline_config, {"NPU_USE_NPUW", "YES"});
update_config(pipeline_config, {"NPUW_LLM", "YES"});
@@ -733,7 +732,6 @@ std::shared_ptr<ov::CompiledModel> StatefulLLMPipeline::setupAndCompileModel(

update_config(pipeline_config, {"NPUW_LLM_MAX_PROMPT_LEN", kMaxPromptLen});
update_config(pipeline_config, {"NPUW_LLM_MIN_RESPONSE_LEN", kMinResponseLen});
update_config(pipeline_config, {"NPUW_LLM_GENERATE_HINT", generate_hint});

// NB: Try to apply opt transpose only for Llama-2-7b-chat-hf model
if ( model_desc.name_or_path == "meta-llama/Llama-2-7b-chat-hf" ||
@@ -743,6 +741,7 @@ std::shared_ptr<ov::CompiledModel> StatefulLLMPipeline::setupAndCompileModel(

rename_key(pipeline_config, "PREFILL_CONFIG", "NPUW_LLM_PREFILL_CONFIG");
rename_key(pipeline_config, "GENERATE_CONFIG", "NPUW_LLM_GENERATE_CONFIG");
rename_key(pipeline_config, "GENERATE_HINT", "NPUW_LLM_GENERATE_HINT");

// Replace CACHE_DIR option if NPUW is enabled
set_npuw_cache_dir(pipeline_config);
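
The hunks above replace the explicit pop of GENERATE_HINT (plus its update_config call) with a single rename_key, so a user-supplied GENERATE_HINT now travels to NPUW as NPUW_LLM_GENERATE_HINT unchanged. Below is a minimal sketch, not part of this diff, of how such a key rename over an ov::AnyMap could look; this rename_key is a reconstruction for illustration (the repository's helper may differ), and the "BEST_PERF" value is only an example.

#include <openvino/openvino.hpp>
#include <string>
#include <utility>

// Hypothetical reconstruction of a rename_key helper: if the old key is present,
// move its value under the new key and drop the old entry.
static void rename_key(ov::AnyMap& config, const std::string& old_key, const std::string& new_key) {
    auto it = config.find(old_key);
    if (it != config.end()) {
        config[new_key] = std::move(it->second);
        config.erase(it);
    }
}

int main() {
    ov::AnyMap pipeline_config = {{"GENERATE_HINT", std::string("BEST_PERF")}};
    // After the rename the hint reaches NPUW as-is; unlike the removed
    // pop_or_default(..., "FAST_COMPILE") path, this sketch injects no default when the key is absent.
    rename_key(pipeline_config, "GENERATE_HINT", "NPUW_LLM_GENERATE_HINT");
    return 0;
}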
@@ -864,20 +863,31 @@ EncodedResults StatefulLLMPipeline::generate(

m_request.infer();

auto logits = m_request.get_tensor("logits");
auto padded_logits = m_request.get_tensor("logits");
// FIXME: This is a workaround to keep only the useful portion of the returned logits.
// If SliceOut is applied, only 1 useful logit is returned and nothing is required here.
// Otherwise, the model returns logits for the full context length, because the prefill
// model is internally reshaped to do so. The proper fix should be made on the OpenVINO
// side, so that the model returns only the useful logits of the input prompt length,
// dropping the implementation-related padding ones.
auto padded_sequence_len = padded_logits.get_shape()[1];
auto logits = make_tensor_slice(padded_logits, 1, padded_sequence_len - input_ids.get_size(), padded_sequence_len);
int64_t output_sequence_len = logits.get_shape().at(1);
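
The workaround above keeps only the trailing input_ids.get_size() positions of the padded logits along the sequence axis. A minimal sketch, not part of this diff, of how such a slice can be expressed with OpenVINO's ROI tensor constructor; the make_tensor_slice helper used in this file may well be implemented differently, so treat slice_seq_dim as an illustration only.

#include <openvino/runtime/tensor.hpp>

// Illustrative slice along the sequence axis (dim 1) of a [batch, seq, vocab] logits
// tensor, keeping positions [start, end). The ROI constructor shares memory with the
// parent tensor instead of copying it.
static ov::Tensor slice_seq_dim(const ov::Tensor& logits, size_t start, size_t end) {
    const ov::Shape shape = logits.get_shape();   // e.g. {1, padded_seq_len, vocab_size}
    const ov::Coordinate begin{0, start, 0};
    const ov::Coordinate stop{shape[0], end, shape[2]};
    return ov::Tensor(logits, begin, stop);
}

// Usage mirroring the code above:
//   auto useful = slice_seq_dim(padded_logits, padded_seq_len - prompt_len, padded_seq_len);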

// TODO: Use get_max_new_tokens() instead of max_new_tokens
auto sequence_group = std::make_shared<SequenceGroup>(
0 /* request_id */, input_ids, config, 1 /* block_size */);
sequence_group->update_processed_tokens_num(input_ids.get_size());

sequence_group->update_processed_tokens_num(sequence_group->get_prompt_len() - output_sequence_len);
sequence_group->schedule_tokens(output_sequence_len);

// NB: Controls which tokens are ready to be pushed into the streamer
// TODO: Set max_new_tokens here via get_max_new_tokens(prompt)
GenerationHandle handle = std::make_shared<GenerationHandleImpl>(
sequence_group->get_generation_stream(), sequence_group->get_sampling_parameters());

// NB: Pass only the useful logits to the sampler
SamplerOutput sampler_output = m_sampler.sample({sequence_group}, logits);
stream_generated_tokens(streamer_ptr, handle);
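
The bookkeeping above is what keeps the sliced logits and the sampler in step: prompt_len - output_sequence_len tokens are marked as already processed and the remaining output_sequence_len tokens are scheduled, so Sampler::sample() sees exactly one logit row per scheduled token. A tiny worked example, not part of this diff and using made-up numbers:

#include <cassert>
#include <cstdint>

int main() {
    const int64_t prompt_len = 6;   // made-up 6-token prompt

    // Case 1: SliceOut is not applied, the prefill returns a logit row per prompt position.
    int64_t output_sequence_len = 6;
    assert(prompt_len - output_sequence_len == 0);   // tokens marked as processed
    // schedule_tokens(6): all 6 logit rows are consumed by the sampler.

    // Case 2: SliceOut keeps only the last logit row.
    output_sequence_len = 1;
    assert(prompt_len - output_sequence_len == 5);   // tokens marked as processed
    // schedule_tokens(1): a single logit row is consumed by the sampler.
    return 0;
}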

@@ -1340,6 +1350,8 @@ EncodedResults StatelessLLMPipeline::generate(
auto logits = m_prefill_request.get_tensor("logits");
int64_t output_sequence_len = logits.get_shape().at(1);

// TODO: Pass input_ids here to signal that there is room for generation,
// but currently max_prompt_size == input_ids.size()
auto sequence_group = std::make_shared<SequenceGroup>(
0 /* request_id */, padded_input_ids, config, 1 /* block_size */);
sequence_group->update_processed_tokens_num(m_kvcache_desc.max_prompt_size - output_sequence_len);
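
In the stateless path the SequenceGroup wraps padded_input_ids, so the already-processed count is taken against m_kvcache_desc.max_prompt_size rather than the real prompt length: the padding positions are simply counted as processed, and the sampler only ever sees the logit rows the prefill actually returned. A short sketch, not part of this diff and with made-up numbers:

#include <cassert>
#include <cstdint>

int main() {
    const int64_t max_prompt_size = 1024;    // made-up prompt window of the reshaped prefill model
    const int64_t output_sequence_len = 7;   // however many useful logit rows came back
    const int64_t processed = max_prompt_size - output_sequence_len;
    assert(processed == 1017);               // padding plus prompt positions already accounted for
    return 0;
}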
@@ -1351,6 +1363,11 @@

SamplerOutput sampler_output = m_sampler.sample({sequence_group}, logits);
stream_generated_tokens(streamer_ptr, handle);
// std::cout << "LOGITS" << std::endl;
// for (auto i = 0; i < output_sequence_len; ++i) {
// std::cout << logits.data<float>()[i] << ",";
// }
// std::cout << std::endl;

// Outputs: logits, ...
const auto kStartOutputKVCacheLayers = 1u;
