triton-inference-server · Tabrizian · Jan 14, 2024 · Jan 11, 2024 · Jan 11, 2024 · Jan 12, 2024
diff --git a/src/model.py b/src/model.py
@@ -264,12 +264,21 @@ async def generate(self, request):
                     self.logger.log_info("[vllm] Successfully cancelled the request")
                     break
                 if stream:
-                    response_sender.send(self.create_response(output))
+                    if output.finished:
+                        response_sender.send(
+                            self.create_response(output),
+                            flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL,
+                        )
+                    else:
+                        response_sender.send(self.create_response(output))
                 else:
                     last_output = output
 
             if not stream:
-                response_sender.send(self.create_response(last_output))
+                response_sender.send(
+                    self.create_response(last_output),
+                    flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL,
+                )
 
         except Exception as e:
             self.logger.log_info(f"[vllm] Error generating stream: {e}")
@@ -280,10 +289,11 @@ async def generate(self, request):
             response = pb_utils.InferenceResponse(
                 output_tensors=[triton_output_tensor], error=error
             )
-            response_sender.send(response)
+            response_sender.send(
+                response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL
+            )
             raise e
         finally:
-            response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
             self.ongoing_request_count -= 1
 
     def execute(self, requests):