From 31280656f7721c926085e4ada3d92c54354a199e Mon Sep 17 00:00:00 2001
From: Iman Tabrizian
Date: Thu, 11 Jan 2024 14:51:56 -0500
Subject: [PATCH 1/3] Piggyback final flag as a part of final response

---
 src/model.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/model.py b/src/model.py
index f21f2a35..67e8784d 100644
--- a/src/model.py
+++ b/src/model.py
@@ -264,7 +264,10 @@ async def generate(self, request):
                     self.logger.log_info("[vllm] Successfully cancelled the request")
                     break
                 if stream:
-                    response_sender.send(self.create_response(output))
+                    if output.finished:
+                        response_sender.send(self.create_response(output), flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
+                    else:
+                        response_sender.send(self.create_response(output))
                 else:
                     last_output = output
 
@@ -280,10 +283,9 @@ async def generate(self, request):
             response = pb_utils.InferenceResponse(
                 output_tensors=[triton_output_tensor], error=error
             )
-            response_sender.send(response)
+            response_sender.send(response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
             raise e
         finally:
-            response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
             self.ongoing_request_count -= 1
 
     def execute(self, requests):

From e81cfd93ce7b6c4b80ea788d769b7f242134fe43 Mon Sep 17 00:00:00 2001
From: Iman Tabrizian
Date: Thu, 11 Jan 2024 14:59:38 -0500
Subject: [PATCH 2/3] format

---
 src/model.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/model.py b/src/model.py
index 67e8784d..868b4e91 100644
--- a/src/model.py
+++ b/src/model.py
@@ -265,7 +265,10 @@ async def generate(self, request):
                     break
                 if stream:
                     if output.finished:
-                        response_sender.send(self.create_response(output), flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
+                        response_sender.send(
+                            self.create_response(output),
+                            flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL,
+                        )
                     else:
                         response_sender.send(self.create_response(output))
                 else:
@@ -283,7 +286,9 @@ async def generate(self, request):
             response = pb_utils.InferenceResponse(
                 output_tensors=[triton_output_tensor], error=error
             )
-            response_sender.send(response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
+            response_sender.send(
+                response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL
+            )
             raise e
         finally:
             self.ongoing_request_count -= 1

From 1e9a11f3a75487be3d2f1a9371f545a9c0d1b3ca Mon Sep 17 00:00:00 2001
From: Iman Tabrizian
Date: Fri, 12 Jan 2024 11:09:56 -0500
Subject: [PATCH 3/3] Fix non streaming mode

---
 src/model.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/model.py b/src/model.py
index 868b4e91..80f51320 100644
--- a/src/model.py
+++ b/src/model.py
@@ -275,7 +275,10 @@ async def generate(self, request):
                 last_output = output
 
             if not stream:
-                response_sender.send(self.create_response(last_output))
+                response_sender.send(
+                    self.create_response(last_output),
+                    flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL,
+                )
 
         except Exception as e:
             self.logger.log_info(f"[vllm] Error generating stream: {e}")
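
Note (commentary, not part of the series): taken together, the three patches converge on one pattern. The TRITONSERVER_RESPONSE_COMPLETE_FINAL flag now rides on the last data-carrying response (the output.finished response when streaming, the last_output response otherwise, and the error response on failure) instead of being sent as a standalone, empty response from the finally block. Below is a minimal sketch of that pattern, assuming the Triton Python backend's pb_utils module; send_outputs and make_response are hypothetical helpers standing in for the model's own loop and self.create_response().

    import triton_python_backend_utils as pb_utils

    def send_outputs(response_sender, outputs, make_response):
        # Sketch: stream each output, and attach the final-complete flag to
        # the last response itself rather than sending a separate,
        # flags-only message afterwards. 0 is the default "no flags" value.
        last_index = len(outputs) - 1
        for i, output in enumerate(outputs):
            flags = (
                pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL
                if i == last_index
                else 0
            )
            response_sender.send(make_response(output), flags=flags)

Piggybacking the flag this way plausibly saves one empty response per request and ensures the client observes the final flag on a response that actually carries data.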