Fixed to use only useful logits in SequenceGroup and Sampler
AsyaPronina committed Jan 10, 2025
1 parent 7b1a495 commit cecbce9
Showing 1 changed file with 22 additions and 5 deletions.
27 changes: 22 additions & 5 deletions src/cpp/src/llm_pipeline_static.cpp
@@ -722,7 +722,6 @@ std::shared_ptr<ov::CompiledModel> StatefulLLMPipeline::setupAndCompileModel(
const uint32_t kMaxPromptLen = pop_int_and_cast(pipeline_config, "MAX_PROMPT_LEN").value_or(1024u);
const uint32_t kMinResponseLen = pop_int_and_cast(pipeline_config, "MIN_RESPONSE_LEN").value_or(128u);
m_kvcache_total = kMaxPromptLen + kMinResponseLen;
std::string generate_hint = pop_or_default<std::string>(pipeline_config, "GENERATE_HINT", "FAST_COMPILE");

update_config(pipeline_config, {"NPU_USE_NPUW", "YES"});
update_config(pipeline_config, {"NPUW_LLM", "YES"});
@@ -733,7 +732,6 @@ std::shared_ptr<ov::CompiledModel> StatefulLLMPipeline::setupAndCompileModel(

update_config(pipeline_config, {"NPUW_LLM_MAX_PROMPT_LEN", kMaxPromptLen});
update_config(pipeline_config, {"NPUW_LLM_MIN_RESPONSE_LEN", kMinResponseLen});
update_config(pipeline_config, {"NPUW_LLM_GENERATE_HINT", generate_hint});

// NB: Try to apply opt transpose only for Llama-2-7b-chat-hf model
if ( model_desc.name_or_path == "meta-llama/Llama-2-7b-chat-hf" ||
@@ -743,6 +741,7 @@ std::shared_ptr<ov::CompiledModel> StatefulLLMPipeline::setupAndCompileModel(

rename_key(pipeline_config, "PREFILL_CONFIG", "NPUW_LLM_PREFILL_CONFIG");
rename_key(pipeline_config, "GENERATE_CONFIG", "NPUW_LLM_GENERATE_CONFIG");
rename_key(pipeline_config, "GENERATE_HINT", "NPUW_LLM_GENERATE_HINT");

// Replace CACHE_DIR option if NPUW is enabled
set_npuw_cache_dir(pipeline_config);
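
The hunks above replace the explicit pop of GENERATE_HINT (plus its update_config call) with a single rename_key, so a user-supplied GENERATE_HINT now travels to NPUW as NPUW_LLM_GENERATE_HINT unchanged. Below is a minimal sketch, not part of this diff, of how such a key rename over an ov::AnyMap could look; this rename_key is a reconstruction for illustration (the repository's helper may differ), and the "BEST_PERF" value is only an example.

#include <openvino/openvino.hpp>
#include <string>
#include <utility>

// Hypothetical reconstruction of a rename_key helper: if the old key is present,
// move its value under the new key and drop the old entry.
static void rename_key(ov::AnyMap& config, const std::string& old_key, const std::string& new_key) {
    auto it = config.find(old_key);
    if (it != config.end()) {
        config[new_key] = std::move(it->second);
        config.erase(it);
    }
}

int main() {
    ov::AnyMap pipeline_config = {{"GENERATE_HINT", std::string("BEST_PERF")}};
    // After the rename the hint reaches NPUW as-is; unlike the removed
    // pop_or_default(..., "FAST_COMPILE") path, this sketch injects no default when the key is absent.
    rename_key(pipeline_config, "GENERATE_HINT", "NPUW_LLM_GENERATE_HINT");
    return 0;
}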
@@ -864,20 +863,31 @@ EncodedResults StatefulLLMPipeline::generate(

m_request.infer();

auto logits = m_request.get_tensor("logits");
auto padded_logits = m_request.get_tensor("logits");
// FIXME: This is a workaround to keep only the useful portion of the returned logits.
// If SliceOut is applied, only 1 useful logit is returned and nothing is required here.
// Otherwise, the model returns logits for the full context length, because the prefill
// model is internally reshaped to do so. The proper fix should be made on the OpenVINO
// side, so that the model returns only the useful logits of the input prompt length,
// dropping the implementation-related padding ones.
auto padded_sequence_len = padded_logits.get_shape()[1];
auto logits = make_tensor_slice(padded_logits, 1, padded_sequence_len - input_ids.get_size(), padded_sequence_len);
int64_t output_sequence_len = logits.get_shape().at(1);
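
The workaround above keeps only the trailing input_ids.get_size() positions of the padded logits along the sequence axis. A minimal sketch, not part of this diff, of how such a slice can be expressed with OpenVINO's ROI tensor constructor; the make_tensor_slice helper used in this file may well be implemented differently, so treat slice_seq_dim as an illustration only.

#include <openvino/runtime/tensor.hpp>

// Illustrative slice along the sequence axis (dim 1) of a [batch, seq, vocab] logits
// tensor, keeping positions [start, end). The ROI constructor shares memory with the
// parent tensor instead of copying it.
static ov::Tensor slice_seq_dim(const ov::Tensor& logits, size_t start, size_t end) {
    const ov::Shape shape = logits.get_shape();   // e.g. {1, padded_seq_len, vocab_size}
    const ov::Coordinate begin{0, start, 0};
    const ov::Coordinate stop{shape[0], end, shape[2]};
    return ov::Tensor(logits, begin, stop);
}

// Usage mirroring the code above:
//   auto useful = slice_seq_dim(padded_logits, padded_seq_len - prompt_len, padded_seq_len);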

// TODO: Use get_max_new_tokens() instead of max_new_tokens
auto sequence_group = std::make_shared<SequenceGroup>(
0 /* request_id */, input_ids, config, 1 /* block_size */);
sequence_group->update_processed_tokens_num(input_ids.get_size());

sequence_group->update_processed_tokens_num(sequence_group->get_prompt_len() - output_sequence_len);
sequence_group->schedule_tokens(output_sequence_len);

// NB: Controls which tokens are ready to be pushed into the streamer
// TODO: Set max_new_tokens here via get_max_new_tokens(prompt)
GenerationHandle handle = std::make_shared<GenerationHandleImpl>(
sequence_group->get_generation_stream(), sequence_group->get_sampling_parameters());

// NB: Pass only the useful logits to the sampler
SamplerOutput sampler_output = m_sampler.sample({sequence_group}, logits);
stream_generated_tokens(streamer_ptr, handle);
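
The bookkeeping above is what keeps the sliced logits and the sampler in step: prompt_len - output_sequence_len tokens are marked as already processed and the remaining output_sequence_len tokens are scheduled, so Sampler::sample() sees exactly one logit row per scheduled token. A tiny worked example, not part of this diff and using made-up numbers:

#include <cassert>
#include <cstdint>

int main() {
    const int64_t prompt_len = 6;   // made-up 6-token prompt

    // Case 1: SliceOut is not applied, the prefill returns a logit row per prompt position.
    int64_t output_sequence_len = 6;
    assert(prompt_len - output_sequence_len == 0);   // tokens marked as processed
    // schedule_tokens(6): all 6 logit rows are consumed by the sampler.

    // Case 2: SliceOut keeps only the last logit row.
    output_sequence_len = 1;
    assert(prompt_len - output_sequence_len == 5);   // tokens marked as processed
    // schedule_tokens(1): a single logit row is consumed by the sampler.
    return 0;
}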

@@ -1340,6 +1350,8 @@ EncodedResults StatelessLLMPipeline::generate(
auto logits = m_prefill_request.get_tensor("logits");
int64_t output_sequence_len = logits.get_shape().at(1);

// TODO: Pass input_ids here to signal that there is room for generation,
// but currently max_prompt_size == input_ids.size()
auto sequence_group = std::make_shared<SequenceGroup>(
0 /* request_id */, padded_input_ids, config, 1 /* block_size */);
sequence_group->update_processed_tokens_num(m_kvcache_desc.max_prompt_size - output_sequence_len);
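
In the stateless path the SequenceGroup wraps padded_input_ids, so the already-processed count is taken against m_kvcache_desc.max_prompt_size rather than the real prompt length: the padding positions are simply counted as processed, and the sampler only ever sees the logit rows the prefill actually returned. A short sketch, not part of this diff and with made-up numbers:

#include <cassert>
#include <cstdint>

int main() {
    const int64_t max_prompt_size = 1024;    // made-up prompt window of the reshaped prefill model
    const int64_t output_sequence_len = 7;   // however many useful logit rows came back
    const int64_t processed = max_prompt_size - output_sequence_len;
    assert(processed == 1017);               // padding plus prompt positions already accounted for
    return 0;
}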
@@ -1351,6 +1363,11 @@

SamplerOutput sampler_output = m_sampler.sample({sequence_group}, logits);
stream_generated_tokens(streamer_ptr, handle);
// std::cout << "LOGITS" << std::endl;
// for (auto i = 0; i < output_sequence_len; ++i) {
// std::cout << logits.data<float>()[i] << ",";
// }
// std::cout << std::endl;

// Outputs: logits, ...
const auto kStartOutputKVCacheLayers = 1u;
