From 31280656f7721c926085e4ada3d92c54354a199e Mon Sep 17 00:00:00 2001
From: Iman Tabrizian
Date: Thu, 11 Jan 2024 14:51:56 -0500
Subject: [PATCH 1/3] Piggyback final flag as a part of final response

---
 src/model.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/model.py b/src/model.py
index f21f2a35..67e8784d 100644
--- a/src/model.py
+++ b/src/model.py
@@ -264,7 +264,10 @@ async def generate(self, request):
                     self.logger.log_info("[vllm] Successfully cancelled the request")
                     break
                 if stream:
-                    response_sender.send(self.create_response(output))
+                    if output.finished:
+                        response_sender.send(self.create_response(output), flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
+                    else:
+                        response_sender.send(self.create_response(output))
                 else:
                     last_output = output
 
@@ -280,10 +283,9 @@ async def generate(self, request):
             response = pb_utils.InferenceResponse(
                 output_tensors=[triton_output_tensor], error=error
             )
-            response_sender.send(response)
+            response_sender.send(response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
             raise e
         finally:
-            response_sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
             self.ongoing_request_count -= 1
 
     def execute(self, requests):

From e81cfd93ce7b6c4b80ea788d769b7f242134fe43 Mon Sep 17 00:00:00 2001
From: Iman Tabrizian
Date: Thu, 11 Jan 2024 14:59:38 -0500
Subject: [PATCH 2/3] format

---
 src/model.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/model.py b/src/model.py
index 67e8784d..868b4e91 100644
--- a/src/model.py
+++ b/src/model.py
@@ -265,7 +265,10 @@ async def generate(self, request):
                     break
                 if stream:
                     if output.finished:
-                        response_sender.send(self.create_response(output), flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
+                        response_sender.send(
+                            self.create_response(output),
+                            flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL,
+                        )
                     else:
                         response_sender.send(self.create_response(output))
                 else:
@@ -283,7 +286,9 @@ async def generate(self, request):
             response = pb_utils.InferenceResponse(
                 output_tensors=[triton_output_tensor], error=error
             )
-            response_sender.send(response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
+            response_sender.send(
+                response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL
+            )
             raise e
         finally:
             self.ongoing_request_count -= 1

From 1e9a11f3a75487be3d2f1a9371f545a9c0d1b3ca Mon Sep 17 00:00:00 2001
From: Iman Tabrizian
Date: Fri, 12 Jan 2024 11:09:56 -0500
Subject: [PATCH 3/3] Fix non streaming mode

---
 src/model.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/model.py b/src/model.py
index 868b4e91..80f51320 100644
--- a/src/model.py
+++ b/src/model.py
@@ -275,7 +275,10 @@ async def generate(self, request):
                 last_output = output
 
             if not stream:
-                response_sender.send(self.create_response(last_output))
+                response_sender.send(
+                    self.create_response(last_output),
+                    flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL,
+                )
 
         except Exception as e:
             self.logger.log_info(f"[vllm] Error generating stream: {e}")
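
Note (commentary, not part of the series): taken together, the three patches converge on one pattern. The TRITONSERVER_RESPONSE_COMPLETE_FINAL flag now rides on the last data-carrying response (the output.finished response when streaming, the last_output response otherwise, and the error response on failure) instead of being sent as a standalone, empty response from the finally block. Below is a minimal sketch of that pattern, assuming the Triton Python backend's pb_utils module; send_outputs and make_response are hypothetical helpers standing in for the model's own loop and self.create_response().

    import triton_python_backend_utils as pb_utils

    def send_outputs(response_sender, outputs, make_response):
        # Sketch: stream each output, and attach the final-complete flag to
        # the last response itself rather than sending a separate,
        # flags-only message afterwards. 0 is the default "no flags" value.
        last_index = len(outputs) - 1
        for i, output in enumerate(outputs):
            flags = (
                pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL
                if i == last_index
                else 0
            )
            response_sender.send(make_response(output), flags=flags)

Piggybacking the flag this way plausibly saves one empty response per request and ensures the client observes the final flag on a response that actually carries data.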