From 9a67e253ab4a20d6b1465b6ea1132dc568fd7b0c Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Wed, 15 Jan 2025 07:23:24 -0800 Subject: [PATCH 01/50] updated with skeleton / straw man structure --- examples/hello_world/README.md | 0 examples/hello_world/deploy/.gitkeep | 0 examples/hello_world/docs/.gitkeep | 0 examples/hello_world/models/.gitkeep | 0 examples/hello_world/operators/.gitkeep | 0 examples/hello_world/scripts/.gitkeep | 0 6 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 examples/hello_world/README.md create mode 100644 examples/hello_world/deploy/.gitkeep create mode 100644 examples/hello_world/docs/.gitkeep create mode 100644 examples/hello_world/models/.gitkeep create mode 100644 examples/hello_world/operators/.gitkeep create mode 100644 examples/hello_world/scripts/.gitkeep diff --git a/examples/hello_world/README.md b/examples/hello_world/README.md new file mode 100644 index 00000000..e69de29b diff --git a/examples/hello_world/deploy/.gitkeep b/examples/hello_world/deploy/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/examples/hello_world/docs/.gitkeep b/examples/hello_world/docs/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/examples/hello_world/models/.gitkeep b/examples/hello_world/models/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/examples/hello_world/operators/.gitkeep b/examples/hello_world/operators/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/examples/hello_world/scripts/.gitkeep b/examples/hello_world/scripts/.gitkeep new file mode 100644 index 00000000..e69de29b From 44304db34f7a16a09bd1aab836d8215316684e96 Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Wed, 15 Jan 2025 07:33:37 -0800 Subject: [PATCH 02/50] updated --- examples/hello_world/api_server/.gitkeep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 examples/hello_world/api_server/.gitkeep diff --git a/examples/hello_world/api_server/.gitkeep b/examples/hello_world/api_server/.gitkeep new file mode 100644 index 00000000..e69de29b From f4a1ef0de2c42c4e435a807c68eaced4c4e1ef94 Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Wed, 15 Jan 2025 07:34:19 -0800 Subject: [PATCH 03/50] updated --- examples/hello_world/router/.gitkeep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 examples/hello_world/router/.gitkeep diff --git a/examples/hello_world/router/.gitkeep b/examples/hello_world/router/.gitkeep new file mode 100644 index 00000000..e69de29b From 62b20c5d02d09aa8e4925bd7a401c5073b59f61c Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Wed, 15 Jan 2025 15:48:46 -0800 Subject: [PATCH 04/50] moving under operators --- examples/hello_world/models/.gitkeep | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 examples/hello_world/models/.gitkeep diff --git a/examples/hello_world/models/.gitkeep b/examples/hello_world/models/.gitkeep deleted file mode 100644 index e69de29b..00000000 From 7dd83c119b6dace0daf1b705a1de6c6265bf62fe Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Wed, 15 Jan 2025 16:14:23 -0800 Subject: [PATCH 05/50] updated skeleton --- examples/hello_world/client/.gitkeep | 0 examples/hello_world/operators/triton_model_repo/.gitkeep | 0 examples/hello_world/tests/.gitkeep | 0 3 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 examples/hello_world/client/.gitkeep create mode 100644 examples/hello_world/operators/triton_model_repo/.gitkeep create mode 100644 examples/hello_world/tests/.gitkeep diff --git 
a/examples/hello_world/client/.gitkeep b/examples/hello_world/client/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/examples/hello_world/operators/triton_model_repo/.gitkeep b/examples/hello_world/operators/triton_model_repo/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/examples/hello_world/tests/.gitkeep b/examples/hello_world/tests/.gitkeep new file mode 100644 index 00000000..e69de29b From cad9d3f4209abb64c7c612187281c2be42bd6f6a Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Wed, 15 Jan 2025 17:55:02 -0800 Subject: [PATCH 06/50] updated with minimal example - not yet tested --- examples/hello_world/deploy/__main__.py | 195 ++++++++++++++++++ .../.gitkeep | 0 .../triton_core_models/decoder/1/model.py | 105 ++++++++++ .../triton_core_models/decoder/config.pbtxt | 20 ++ .../triton_core_models/encoder/1/model.py | 130 ++++++++++++ .../triton_core_models/encoder/config.pbtxt | 37 ++++ 6 files changed, 487 insertions(+) create mode 100644 examples/hello_world/deploy/__main__.py rename examples/hello_world/operators/{triton_model_repo => triton_core_models}/.gitkeep (100%) create mode 100644 examples/hello_world/operators/triton_core_models/decoder/1/model.py create mode 100644 examples/hello_world/operators/triton_core_models/decoder/config.pbtxt create mode 100644 examples/hello_world/operators/triton_core_models/encoder/1/model.py create mode 100644 examples/hello_world/operators/triton_core_models/encoder/config.pbtxt diff --git a/examples/hello_world/deploy/__main__.py b/examples/hello_world/deploy/__main__.py new file mode 100644 index 00000000..b2c37b51 --- /dev/null +++ b/examples/hello_world/deploy/__main__.py @@ -0,0 +1,195 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import asyncio +import time + +import cupy +import numpy +from triton_distributed.icp.nats_request_plane import NatsRequestPlane, NatsServer +from triton_distributed.icp.ucp_data_plane import UcpDataPlane +from triton_distributed.tritonserver import MemoryType +from triton_distributed.worker import WorkerConfig +from triton_distributed.worker.operator import Operator, OperatorConfig +from triton_distributed.worker.remote_operator import RemoteOperator +from triton_distributed.worker.remote_request import RemoteInferenceRequest +from triton_distributed.worker.triton_core_operator import TritonCoreOperator + + +class EncodeDecodeOperator(Operator): + def __init__( + self, + name, + version, + triton_core, + request_plane, + data_plane, + parameters, + repository, + logger, + ): + self._encoder = RemoteOperator("encoder", 1, request_plane, data_plane) + self._decoder = RemoteOperator("decoder", 1, request_plane, data_plane) + + async def execute(self, requests: list[RemoteInferenceRequest]): + for request in requests: + encoded_responses = await self._encoder.async_infer( + inputs={"input": request.inputs["input"]} + ) + + async for encoded_response in encoded_responses: + input_copies = int( + numpy.from_dlpack(encoded_response.outputs["input_copies"]) + ) + decoded_responses = await self._decoder.async_infer( + inputs={"input": encoded_response.outputs["output"]}, + parameters={"input_copies": input_copies}, + ) + + async for decoded_response in decoded_responses: + await request.response_sender().send( + final=True, + outputs={"output": decoded_response.outputs["output"]}, + ) + del decoded_response + + +async def send_requests(nats_server_url): + request_plane = NatsRequestPlane(nats_server_url) + data_plane = UcpDataPlane() + await request_plane.connect() + data_plane.connect() + + remote_operator: RemoteOperator = RemoteOperator( + "encoder_decoder", 1, request_plane, data_plane + ) + + inputs = [ + numpy.array(numpy.random.randint(0, 100, 10000)).astype("int64") + for _ in range(100) + ] + + requests = [ + await remote_operator.async_infer( + inputs={"input": inputs[index]}, request_id=str(index) + ) + for index in range(100) + ] + + for request in requests: + async for response in request: + for output_name, output_value in response.outputs.items(): + if output_value.memory_type == MemoryType.CPU: + output = numpy.from_dlpack(output_value) + numpy.testing.assert_array_equal( + output, inputs[int(response.request_id)] + ) + else: + output = cupy.from_dlpack(output_value) + cupy.testing.assert_array_equal( + output, inputs[int(response.request_id)] + ) + del output_value + print(f"Finished Request: {response.request_id}") + print(response.error) + del response + + await request_plane.close() + data_plane.close() + + +async def main(): + nats_server = NatsServer() + time.sleep(1) + + encoder_op = OperatorConfig( + name="encoder", + repository="/workspace/examples/hello_world/operators/models", + implementation=TritonCoreOperator, + max_inflight_requests=1, + parameters={ + "config": { + "instance_group": [{"count": 1, "kind": "KIND_CPU"}], + "parameters": {"delay": {"string_value": "0"}}, + } + }, + ) + + decoder_op = OperatorConfig( + name="decoder", + repository="/workspace/examples/hello_world/operators/models", + implementation=TritonCoreOperator, + max_inflight_requests=1, + parameters={ + "config": { + "instance_group": [{"count": 1, "kind": "KIND_GPU"}], + "parameters": {"delay": {"string_value": "0"}}, + } + }, + ) + + encoder_decoder_op = OperatorConfig( + 
name="encoder_decoder", + implementation=EncodeDecodeOperator, + max_inflight_requests=100, + ) + + encoder = WorkerConfig( + request_plane_args=([nats_server.url], {}), + log_level=6, + operators=[encoder_op], + name="encoder", + metrics_port=8060, + log_dir="logs", + ) + + decoder = WorkerConfig( + request_plane_args=([nats_server.url], {}), + log_level=6, + operators=[decoder_op], + name="decoder", + metrics_port=8061, + log_dir="logs", + ) + + encoder_decoder = WorkerConfig( + request_plane_args=([nats_server.url], {}), + log_level=6, + operators=[encoder_decoder_op], + name="encoder_decoder", + metrics_port=8062, + log_dir="logs", + ) + + print("Starting Workers") + + processes = [process.start() for process in [encoder, decoder, encoder_decoder]] + + print("Sending Requests") + + await send_requests(nats_server.url) + + print("Stopping Workers") + + for process in reversed(processes): + print("shutting down", process) + process.terminate() + print("waiting", process) + process.join() + print("done", process) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/hello_world/operators/triton_model_repo/.gitkeep b/examples/hello_world/operators/triton_core_models/.gitkeep similarity index 100% rename from examples/hello_world/operators/triton_model_repo/.gitkeep rename to examples/hello_world/operators/triton_core_models/.gitkeep diff --git a/examples/hello_world/operators/triton_core_models/decoder/1/model.py b/examples/hello_world/operators/triton_core_models/decoder/1/model.py new file mode 100644 index 00000000..8187835d --- /dev/null +++ b/examples/hello_world/operators/triton_core_models/decoder/1/model.py @@ -0,0 +1,105 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import json +import time + +import numpy +import triton_python_backend_utils as pb_utils + +try: + import cupy +except Exception: + cupy = None + + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + """Auto Completes Model Config + + Model has one input and one output + both of type int64 + + Parameters + ---------- + auto_complete_model_config : config + Enables reading and updating config.pbtxt + + + """ + + input_config = { + "name": "input", + "data_type": "TYPE_INT64", + "dims": [-1], + "optional": False, + } + + output_config = { + "name": "output", + "data_type": "TYPE_INT64", + "dims": [-1], + } + + auto_complete_model_config.add_input(input_config) + auto_complete_model_config.add_output(output_config) + auto_complete_model_config.set_max_batch_size(0) + auto_complete_model_config.set_model_transaction_policy({"decoupled": False}) + + return auto_complete_model_config + + def initialize(self, args): + self._model_config = json.loads(args["model_config"]) + self._model_instance_kind = args["model_instance_kind"] + self._model_instance_device_id = int(args["model_instance_device_id"]) + self._config_parameters = self._model_config.get("parameters", {}) + self._input_copies = int( + self._config_parameters.get("input_copies", {"string_value": "5"})[ + "string_value" + ] + ) + self._delay = float( + self._config_parameters.get("delay", {"string_value": "0"})["string_value"] + ) + + def execute(self, requests): + responses = [] + input_copies = self._input_copies + delay = self._delay + for request in requests: + output_tensors = [] + parameters = json.loads(request.parameters()) + if parameters: + input_copies = int(parameters.get("input_copies", self._input_copies)) + delay = float(parameters.get("delay", self._delay)) + for input_tensor in request.inputs(): + input_value = input_tensor.as_numpy() + output_value = [] + if self._model_instance_kind == "GPU": + with cupy.cuda.Device(self._model_instance_device_id): + input_value = cupy.array(input_value) + output_value = cupy.invert(input_value) + output_value = output_value[::input_copies] + output_tensor = pb_utils.Tensor.from_dlpack( + "output", output_value + ) + else: + output_value = numpy.invert(input_value) + output_value = output_value[::input_copies] + output_tensor = pb_utils.Tensor("output", output_value) + output_tensors.append(output_tensor) + time.sleep(len(output_value) * delay) + responses.append(pb_utils.InferenceResponse(output_tensors=output_tensors)) + return responses diff --git a/examples/hello_world/operators/triton_core_models/decoder/config.pbtxt b/examples/hello_world/operators/triton_core_models/decoder/config.pbtxt new file mode 100644 index 00000000..5581461c --- /dev/null +++ b/examples/hello_world/operators/triton_core_models/decoder/config.pbtxt @@ -0,0 +1,20 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +## Model Instance and Kind are filled in by configuration when launched +## All other values are filled in by auto_complete in model.py + +backend: "python" + diff --git a/examples/hello_world/operators/triton_core_models/encoder/1/model.py b/examples/hello_world/operators/triton_core_models/encoder/1/model.py new file mode 100644 index 00000000..36a2dbf9 --- /dev/null +++ b/examples/hello_world/operators/triton_core_models/encoder/1/model.py @@ -0,0 +1,130 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import json +import time + +import numpy +import triton_python_backend_utils as pb_utils + +try: + import cupy +except Exception: + cupy = None + + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + """Auto Completes Model Config + + Model has one input and one output + both of type int64 + + Parameters + ---------- + auto_complete_model_config : config + Enables reading and updating config.pbtxt + + + """ + + input_config = { + "name": "input", + "data_type": "TYPE_INT64", + "dims": [-1], + "optional": False, + } + + output_config = { + "name": "output", + "data_type": "TYPE_INT64", + "dims": [-1], + } + + copies_config = { + "name": "input_copies", + "data_type": "TYPE_INT64", + "dims": [1], + } + + auto_complete_model_config.add_input(input_config) + auto_complete_model_config.add_output(output_config) + auto_complete_model_config.add_output(copies_config) + auto_complete_model_config.set_max_batch_size(0) + auto_complete_model_config.set_model_transaction_policy({"decoupled": False}) + + return auto_complete_model_config + + def initialize(self, args): + self._model_config = json.loads(args["model_config"]) + self._model_instance_kind = args["model_instance_kind"] + self._model_instance_device_id = int(args["model_instance_device_id"]) + self._config_parameters = self._model_config.get("parameters", {}) + self._input_copies = int( + self._config_parameters.get("input_copies", {"string_value": "5"})[ + "string_value" + ] + ) + self._delay = float( + self._config_parameters.get("delay", {"string_value": "0"})["string_value"] + ) + + def execute(self, requests): + responses = [] + input_copies = self._input_copies + delay = self._delay + for request in requests: + output_tensors = [] + parameters = json.loads(request.parameters()) + if parameters: + input_copies = int(parameters.get("input_copies", self._input_copies)) + delay = float(parameters.get("delay", self._delay)) + for input_tensor in request.inputs(): + input_value = input_tensor.as_numpy() + output_value = [] + if self._model_instance_kind == "GPU": + with cupy.cuda.Device(self._model_instance_device_id): + input_value = cupy.array(input_value) + output_value = cupy.tile(input_value, input_copies) + output_value = cupy.invert(output_value) + output_tensor = pb_utils.Tensor.from_dlpack( + "output", output_value + ) + else: + output_value = numpy.tile(input_value, input_copies) + output_value = numpy.invert(output_value) + output_tensor = pb_utils.Tensor("output", output_value) + output_tensors.append(output_tensor) + output_tensors.append( + pb_utils.Tensor( + "input_copies", numpy.array(input_copies).astype("int64") + ) + ) + time.sleep(len(output_value) * delay) + + responses.append(pb_utils.InferenceResponse(output_tensors=output_tensors)) + return responses diff --git a/examples/hello_world/operators/triton_core_models/encoder/config.pbtxt b/examples/hello_world/operators/triton_core_models/encoder/config.pbtxt new file mode 100644 index 00000000..05968c5a --- /dev/null +++ b/examples/hello_world/operators/triton_core_models/encoder/config.pbtxt @@ -0,0 +1,37 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +## Model Instance and Kind are filled in by configuration when launched +## All other values are filled in by auto_complete in model.py + +backend: "python" + +# instance_group [ +# { count: {MODEL_INSTANCE_COUNT} +# kind: {MODEL_INSTANCE_KIND} +# } +# ] +# \ No newline at end of file From 97c752dc9a4d22cdba07e384c86e861e4b4d2d5c Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Wed, 15 Jan 2025 23:45:24 -0800 Subject: [PATCH 07/50] updated to use deployment --- examples/hello_world/deploy/__main__.py | 33 ++++++++++--------- .../triton_distributed/worker/__init__.py | 3 ++ .../triton_distributed/worker/deployment.py | 4 +++ .../triton_distributed/worker/worker.py | 1 + 4 files changed, 25 insertions(+), 16 deletions(-) diff --git a/examples/hello_world/deploy/__main__.py b/examples/hello_world/deploy/__main__.py index b2c37b51..37d8df38 100644 --- a/examples/hello_world/deploy/__main__.py +++ b/examples/hello_world/deploy/__main__.py @@ -20,12 +20,16 @@ import numpy from triton_distributed.icp.nats_request_plane import NatsRequestPlane, NatsServer from triton_distributed.icp.ucp_data_plane import UcpDataPlane -from triton_distributed.tritonserver import MemoryType -from triton_distributed.worker import WorkerConfig -from triton_distributed.worker.operator import Operator, OperatorConfig -from triton_distributed.worker.remote_operator import RemoteOperator -from triton_distributed.worker.remote_request import RemoteInferenceRequest -from triton_distributed.worker.triton_core_operator import TritonCoreOperator +from triton_distributed.worker import ( + Deployment, + Operator, + OperatorConfig, + RemoteInferenceRequest, + RemoteOperator, + TritonCoreOperator, + WorkerConfig, +) +from tritonserver import MemoryType class EncodeDecodeOperator(Operator): @@ -116,7 +120,7 @@ async def main(): encoder_op = OperatorConfig( name="encoder", - repository="/workspace/examples/hello_world/operators/models", + repository="/workspace/examples/hello_world/operators/triton_core_models", implementation=TritonCoreOperator, max_inflight_requests=1, parameters={ @@ -129,7 +133,7 @@ async def main(): decoder_op = OperatorConfig( name="decoder", - repository="/workspace/examples/hello_world/operators/models", + repository="/workspace/examples/hello_world/operators/triton_core_models", implementation=TritonCoreOperator, max_inflight_requests=1, 
parameters={ @@ -142,7 +146,7 @@ async def main(): encoder_decoder_op = OperatorConfig( name="encoder_decoder", - implementation=EncodeDecodeOperator, + implementation="/workspace/examples/hello_world/deploy/__main__:EncodeDecodeOperator", max_inflight_requests=100, ) @@ -175,7 +179,9 @@ async def main(): print("Starting Workers") - processes = [process.start() for process in [encoder, decoder, encoder_decoder]] + deployment = Deployment([encoder, decoder, encoder_decoder]) + + deployment.start() print("Sending Requests") @@ -183,12 +189,7 @@ async def main(): print("Stopping Workers") - for process in reversed(processes): - print("shutting down", process) - process.terminate() - print("waiting", process) - process.join() - print("done", process) + deployment.stop() if __name__ == "__main__": diff --git a/worker/src/python/triton_distributed/worker/__init__.py b/worker/src/python/triton_distributed/worker/__init__.py index 365c827e..e681f755 100644 --- a/worker/src/python/triton_distributed/worker/__init__.py +++ b/worker/src/python/triton_distributed/worker/__init__.py @@ -22,5 +22,8 @@ from triton_distributed.worker.remote_response import ( RemoteInferenceResponse as RemoteInferenceResponse, ) +from triton_distributed.worker.triton_core_operator import ( + TritonCoreOperator as TritonCoreOperator, +) from triton_distributed.worker.worker import Worker as Worker from triton_distributed.worker.worker import WorkerConfig as WorkerConfig diff --git a/worker/src/python/triton_distributed/worker/deployment.py b/worker/src/python/triton_distributed/worker/deployment.py index 9fc0b67e..68f5645e 100644 --- a/worker/src/python/triton_distributed/worker/deployment.py +++ b/worker/src/python/triton_distributed/worker/deployment.py @@ -36,6 +36,10 @@ def start(self): args=[worker_config], ) ) + self._workers[-1].start() + + def stop(self): + self.shutdown() def shutdown(self, join=True, timeout=10): for worker in self._workers: diff --git a/worker/src/python/triton_distributed/worker/worker.py b/worker/src/python/triton_distributed/worker/worker.py index 9f3646f7..acfa3853 100644 --- a/worker/src/python/triton_distributed/worker/worker.py +++ b/worker/src/python/triton_distributed/worker/worker.py @@ -111,6 +111,7 @@ def _import_operators(self): sys.path.append(str(module_path.parent.absolute())) try: module = importlib.import_module(module_path.name) + print(dir(module)) class_ = getattr(module, class_name) except Exception as e: logger.exception( From b0f3464a4f3bf37650ac53120b48c1f5145c567a Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Thu, 16 Jan 2025 08:07:43 -0800 Subject: [PATCH 08/50] updating with single_file separate from main --- examples/hello_world/deploy/__main__.py | 7 +- examples/hello_world/single_file.py | 203 ++++++++++++++++++ .../triton_distributed/worker/deployment.py | 38 +++- .../triton_distributed/worker/worker.py | 12 +- 4 files changed, 247 insertions(+), 13 deletions(-) create mode 100644 examples/hello_world/single_file.py diff --git a/examples/hello_world/deploy/__main__.py b/examples/hello_world/deploy/__main__.py index 37d8df38..9ecb9f96 100644 --- a/examples/hello_world/deploy/__main__.py +++ b/examples/hello_world/deploy/__main__.py @@ -46,9 +46,11 @@ def __init__( ): self._encoder = RemoteOperator("encoder", 1, request_plane, data_plane) self._decoder = RemoteOperator("decoder", 1, request_plane, data_plane) + self._logger = logger async def execute(self, requests: list[RemoteInferenceRequest]): for request in requests: + self._logger.info("got request!") 
encoded_responses = await self._encoder.async_infer( inputs={"input": request.inputs["input"]} ) @@ -146,8 +148,9 @@ async def main(): encoder_decoder_op = OperatorConfig( name="encoder_decoder", - implementation="/workspace/examples/hello_world/deploy/__main__:EncodeDecodeOperator", + implementation="EncodeDecodeOperator", max_inflight_requests=100, + repository="/workspace/examples/hello_world/operators", ) encoder = WorkerConfig( @@ -179,7 +182,7 @@ async def main(): print("Starting Workers") - deployment = Deployment([encoder, decoder, encoder_decoder]) + deployment = Deployment([(encoder, 5), decoder, (encoder_decoder, 6)]) deployment.start() diff --git a/examples/hello_world/single_file.py b/examples/hello_world/single_file.py new file mode 100644 index 00000000..531c0abb --- /dev/null +++ b/examples/hello_world/single_file.py @@ -0,0 +1,203 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import shutil + +import cupy +import numpy +from tqdm import tqdm +from triton_distributed.icp.nats_request_plane import NatsRequestPlane, NatsServer +from triton_distributed.icp.ucp_data_plane import UcpDataPlane +from triton_distributed.worker import ( + Deployment, + Operator, + OperatorConfig, + RemoteInferenceRequest, + RemoteOperator, + TritonCoreOperator, + WorkerConfig, +) +from tritonserver import MemoryType + + +class EncodeDecodeOperator(Operator): + def __init__( + self, + name, + version, + triton_core, + request_plane, + data_plane, + parameters, + repository, + logger, + ): + self._encoder = RemoteOperator("encoder", 1, request_plane, data_plane) + self._decoder = RemoteOperator("decoder", 1, request_plane, data_plane) + self._logger = logger + + async def execute(self, requests: list[RemoteInferenceRequest]): + for request in requests: + self._logger.info("got request!") + encoded_responses = await self._encoder.async_infer( + inputs={"input": request.inputs["input"]} + ) + + async for encoded_response in encoded_responses: + input_copies = int( + numpy.from_dlpack(encoded_response.outputs["input_copies"]) + ) + decoded_responses = await self._decoder.async_infer( + inputs={"input": encoded_response.outputs["output"]}, + parameters={"input_copies": input_copies}, + ) + + async for decoded_response in decoded_responses: + await request.response_sender().send( + final=True, + outputs={"output": decoded_response.outputs["output"]}, + ) + del decoded_response + + +async def send_requests(nats_server_url, request_count=100): + request_plane = NatsRequestPlane(nats_server_url) + data_plane = UcpDataPlane() + await request_plane.connect() + data_plane.connect() + + remote_operator: RemoteOperator = RemoteOperator( + "encoder_decoder", 1, request_plane, data_plane + ) + + inputs = [ + numpy.array(numpy.random.randint(0, 100, 10000)).astype("int64") + for _ in range(request_count) + ] + + with tqdm(total=request_count, desc="Sending 
Requests", unit="request") as pbar: + requests = [ + await remote_operator.async_infer( + inputs={"input": inputs[index]}, request_id=str(index) + ) + for index in range(request_count) + ] + + for request in requests: + async for response in request: + for output_name, output_value in response.outputs.items(): + if output_value.memory_type == MemoryType.CPU: + output = numpy.from_dlpack(output_value) + numpy.testing.assert_array_equal( + output, inputs[int(response.request_id)] + ) + else: + output = cupy.from_dlpack(output_value) + cupy.testing.assert_array_equal( + output, inputs[int(response.request_id)] + ) + del output_value + print( + f"Finished Request: {response.request_id} Response From: {response.component_id} Error: {response.error}" + ) + pbar.update(1) + del response + + await request_plane.close() + data_plane.close() + + +async def main(): + shutil.rmtree("logs") + + nats_server = NatsServer() + + encoder_op = OperatorConfig( + name="encoder", + repository="/workspace/examples/hello_world/operators/triton_core_models", + implementation=TritonCoreOperator, + max_inflight_requests=1, + parameters={ + "config": { + "instance_group": [{"count": 1, "kind": "KIND_CPU"}], + "parameters": {"delay": {"string_value": "0"}}, + } + }, + ) + + decoder_op = OperatorConfig( + name="decoder", + repository="/workspace/examples/hello_world/operators/triton_core_models", + implementation=TritonCoreOperator, + max_inflight_requests=1, + parameters={ + "config": { + "instance_group": [{"count": 1, "kind": "KIND_GPU"}], + "parameters": {"delay": {"string_value": "0"}}, + } + }, + ) + + encoder_decoder_op = OperatorConfig( + name="encoder_decoder", + implementation=EncodeDecodeOperator, + max_inflight_requests=100, + ) + + encoder = WorkerConfig( + request_plane_args=([nats_server.url], {}), + log_level=6, + operators=[encoder_op], + name="encoder", + metrics_port=50000, + log_dir="logs", + ) + + decoder = WorkerConfig( + request_plane_args=([nats_server.url], {}), + log_level=6, + operators=[decoder_op], + name="decoder", + metrics_port=50100, + log_dir="logs", + ) + + encoder_decoder = WorkerConfig( + request_plane_args=([nats_server.url], {}), + log_level=6, + operators=[encoder_decoder_op], + name="encoder_decoder", + metrics_port=50200, + log_dir="logs", + ) + + print("Starting Workers") + + deployment = Deployment([encoder, (decoder, 500), (encoder_decoder, 100)]) + + deployment.start() + + print("Sending Requests") + + await send_requests(nats_server.url) + + print("Stopping Workers") + + deployment.stop() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/worker/src/python/triton_distributed/worker/deployment.py b/worker/src/python/triton_distributed/worker/deployment.py index 68f5645e..75650f57 100644 --- a/worker/src/python/triton_distributed/worker/deployment.py +++ b/worker/src/python/triton_distributed/worker/deployment.py @@ -13,15 +13,22 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import multiprocessing +from pprint import pformat +from triton_distributed.worker.log_formatter import setup_logger from triton_distributed.worker.worker import Worker, WorkerConfig +LOGGER_NAME = __name__ + class Deployment: - def __init__(self, worker_configs: list[WorkerConfig]): + def __init__( + self, worker_configs: list[WorkerConfig | tuple[WorkerConfig, int]], log_level=3 + ): self._process_context = multiprocessing.get_context("spawn") self._worker_configs = worker_configs self._workers: list[multiprocessing.context.SpawnProcess] = [] + self._logger = setup_logger(log_level, LOGGER_NAME) @staticmethod def _start_worker(worker_config): @@ -29,14 +36,29 @@ def _start_worker(worker_config): def start(self): for worker_config in self._worker_configs: - self._workers.append( - self._process_context.Process( - target=Deployment._start_worker, - name=worker_config.name, - args=[worker_config], + worker_instances = 1 + if isinstance(worker_config, tuple): + worker_instances = worker_config[1] + worker_config = worker_config[0] + worker_config.log_level = 6 + base_name = worker_config.name + base_port = worker_config.metrics_port + for index in range(worker_instances): + worker_config.name = f"{base_name}.{index}" + worker_config.metrics_port = base_port + index + self._workers.append( + self._process_context.Process( + target=Deployment._start_worker, + name=worker_config.name, + args=[worker_config], + ) + ) + self._logger.info( + "\n\nStarting Worker:\n\n\tConfig:\n\t%s\n\t%s\n", + pformat(worker_config), + self._workers[-1], ) - ) - self._workers[-1].start() + self._workers[-1].start() def stop(self): self.shutdown() diff --git a/worker/src/python/triton_distributed/worker/worker.py b/worker/src/python/triton_distributed/worker/worker.py index acfa3853..50c432b2 100644 --- a/worker/src/python/triton_distributed/worker/worker.py +++ b/worker/src/python/triton_distributed/worker/worker.py @@ -87,6 +87,7 @@ def __init__( self._operators: dict[tuple[str, int], Operator] = {} self._metrics_port = config.metrics_port self._metrics_server: Optional[uvicorn.Server] = None + self._component_id = self._request_plane.component_id def _import_operators(self): for operator_config in self._operator_configs: @@ -328,12 +329,17 @@ async def _wait_for_tasks(self, loop): def start(self): if self._log_dir: + pid = os.getpid() os.makedirs(self._log_dir, exist_ok=True) - stdout_path = os.path.join(self._log_dir, f"{self._name}.stdout.log") - stderr_path = os.path.join(self._log_dir, f"{self._name}.stderr.log") + stdout_path = os.path.join( + self._log_dir, f"{self._name}.{self._component_id}.{pid}.stdout.log" + ) + stderr_path = os.path.join( + self._log_dir, f"{self._name}.{self._component_id}.{pid}.stderr.log" + ) if not self._triton_log_path: self._triton_log_path = os.path.join( - self._log_dir, f"{self._name}.triton.log" + self._log_dir, f"{self._name}.{self._component_id}.{pid}.triton.log" ) sys.stdout = open(stdout_path, "w", buffering=1) sys.stderr = open(stderr_path, "w", buffering=1) From 991358ba333b53dbed2f2fce67d6f1ab7382630e Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Thu, 16 Jan 2025 08:13:41 -0800 Subject: [PATCH 09/50] updating --- examples/hello_world/single_file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/hello_world/single_file.py b/examples/hello_world/single_file.py index 531c0abb..3ad786f0 100644 --- a/examples/hello_world/single_file.py +++ b/examples/hello_world/single_file.py @@ -186,7 +186,7 @@ async def main(): print("Starting Workers") - 
deployment = Deployment([encoder, (decoder, 500), (encoder_decoder, 100)]) + deployment = Deployment([encoder, (decoder, 10), (encoder_decoder, 10)]) deployment.start() From bd57e10be7860134a6a054de07b3ef331f5e1cee Mon Sep 17 00:00:00 2001 From: Piotr Marcinkiewicz Date: Thu, 16 Jan 2025 19:23:39 +0100 Subject: [PATCH 10/50] docs: Add descriptions for example and main folder --- README.md | 72 ++++++++++++++++++++- examples/hello_world/README.md | 114 +++++++++++++++++++++++++++++++++ 2 files changed, 185 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e0b361dd..1a3a6b22 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,6 @@ See the License for the specific language governing permissions and limitations under the License. --> - # Triton Distributed

A Datacenter Scale Distributed Inference Serving Framework

@@ -86,6 +85,77 @@ Please see the instructions in the corresponding example for specific deployment instructions. + +## 1. Big Picture +Triton Distributed extends the standard [NVIDIA Triton Inference Server](https://github.com/triton-inference-server/server) model-serving paradigm with additional “planes” that distribute data and requests across multiple processes or machines. Conceptually, you still write a Triton **Model**, but your inference requests and data transfers can be routed through: + +1. **Request Plane**: Sends model-inference requests among nodes or processes. +2. **Data Plane**: Moves tensor data and references between processes or GPUs. + +This architecture allows you to build large, multi-process or multi-node solutions for AI model inference without manually managing transport and synchronization. You can share GPU memory references and move them among distributed processes. + +--- + +## 2. Key Components +The repo has four major logical layers: + +1. **ICP (Inter-Component Protocol)**: + - Python modules under `triton_distributed/icp/…` + - Defines how requests and data are serialized and transported. + - Implements **NatsRequestPlane** and **UcpDataPlane**, which are concrete transport/connection classes for requests/data. + - **NatsRequestPlane** uses NATS for distributing requests. + - **UcpDataPlane** uses UCX (libucp) for transferring tensor data, possibly GPU-to-GPU. + +2. **Worker**: + - Python modules under `triton_distributed/worker/…` + - Exposes the concept of an **Operator** (a processing node that can serve one or more Triton models or custom logic). + - Runs the main loop that pulls requests from a Request Plane, processes them, and returns responses. + - Contains a Python “mini” server (the `Worker`) that spawns or manages multiple Operators. + +3. **Integration Tests & Examples**: + - A directory structure with unit tests and integration tests showing how to compose multiple workers. + - The “hello world” example is under `examples/hello_world/`. + +4. **Triton Python Models**: + - Under various directories like `.../operators/triton_core_models/...` or `icp/src/python/triton_distributed/icp/...` + - Typical Triton `model.py` files that define custom Python logic behind each “model.” + + +## 3. ICP Planes & Worker Internals + +### 3.1 Request Plane (NATS) +`NatsRequestPlane` handles distributing requests among processes. Under the hood, it: + +- Connects to a NATS server (which might run in a local Docker container or on a remote host). +- Creates “streams” in NATS for each operator or for direct routing. +- On the “client” side (where you call `post_request`), it publishes request messages to the right NATS subjects. +- On the “server” side (the Worker), it “pulls” requests from NATS subscriptions. + +### 3.2 Data Plane (UCX) +`UcpDataPlane` uses UCX-Py (libucp) to exchange the actual tensor data. By default: + +- When you “put” a tensor, the data plane either: + 1. Embeds small data directly in the message (the “contents” approach), or + 2. If large, stores a reference (GPU or CPU memory) in the local `_tensor_store`, then sends a small “URI” like `ucp://hostname:port/` to the remote side. +- The remote side can do “get_tensor” by connecting to `ucp://hostname:port` and pulling the data. + +This allows distributed GPU memory references with minimal overhead. + +### 3.3 Worker +A `Worker` runs in a separate process. It: + +- Starts or registers Triton model(s). +- Connects to the chosen request plane (NATS) and data plane (UCX). +- Enters a loop: + 1.
`pull_requests` from NATS, + 2. routes them to the correct Operator, + 3. gets the results, + 4. returns them to the request plane. + +In the “hello world,” you see three Worker processes—each hosting either the encoder, decoder, or aggregator operator. + + + + + ``` examples/ └── hello_world From 2d0f1db06b43aafafc228780a829d4fa655f62cd Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Fri, 17 Jan 2025 07:41:49 -0800 Subject: [PATCH 35/50] updating copyright --- .../mock_disaggregated_serving/1/model.py | 39 +++++++------------ .../mock_disaggregated_serving/config.pbtxt | 39 +++++++------------ 2 files changed, 28 insertions(+), 50 deletions(-) diff --git a/worker/tests/python/integration/api_server/models/mock_disaggregated_serving/1/model.py b/worker/tests/python/integration/api_server/models/mock_disaggregated_serving/1/model.py index 247913b6..f6c6a95a 100644 --- a/worker/tests/python/integration/api_server/models/mock_disaggregated_serving/1/model.py +++ b/worker/tests/python/integration/api_server/models/mock_disaggregated_serving/1/model.py @@ -1,28 +1,17 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
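To make the plane descriptions above concrete, here is a minimal client sketch condensed from the `send_requests` helper that appears in patches 06 and 08 of this series. The NATS URL is a placeholder, and the outputs are assumed to land in CPU memory; the full client in the series also handles GPU-resident outputs with cupy.

```python
import asyncio

import numpy
from triton_distributed.icp.nats_request_plane import NatsRequestPlane
from triton_distributed.icp.ucp_data_plane import UcpDataPlane
from triton_distributed.worker import RemoteOperator


async def infer_once(nats_url="nats://localhost:4222"):  # placeholder URL
    # Connect to the request plane (NATS) and the data plane (UCX).
    request_plane = NatsRequestPlane(nats_url)
    data_plane = UcpDataPlane()
    await request_plane.connect()
    data_plane.connect()

    # A RemoteOperator is a client-side handle: requests posted through it
    # are pulled by whichever worker hosts the "encoder_decoder" operator.
    encoder_decoder = RemoteOperator("encoder_decoder", 1, request_plane, data_plane)

    responses = await encoder_decoder.async_infer(
        inputs={"input": numpy.random.randint(0, 100, 10000).astype("int64")},
        request_id="0",
    )
    async for response in responses:
        # Per the data-plane description above, small tensors ride along in
        # the message and large ones are pulled over UCX when accessed.
        for name, value in response.outputs.items():
            print(name, numpy.from_dlpack(value))  # assumes CPU outputs

    await request_plane.close()
    data_plane.close()


if __name__ == "__main__":
    asyncio.run(infer_once())
```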
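On the worker side of that loop, pulled requests are dispatched to an Operator. The skeleton below copies the constructor signature from the `EncodeDecodeOperator` defined earlier in this series; the `EchoOperator` class itself is hypothetical and simply returns its input unchanged.

```python
from triton_distributed.worker import Operator, RemoteInferenceRequest


class EchoOperator(Operator):
    # Workers construct operators with this fixed argument list.
    def __init__(
        self, name, version, triton_core, request_plane,
        data_plane, parameters, repository, logger,
    ):
        self._logger = logger

    async def execute(self, requests: list[RemoteInferenceRequest]):
        for request in requests:
            # Assumes each request carries a tensor named "input", as the
            # example models in this series do; final=True ends the stream.
            await request.response_sender().send(
                final=True, outputs={"output": request.inputs["input"]}
            )
```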
import asyncio import gc diff --git a/worker/tests/python/integration/api_server/models/mock_disaggregated_serving/config.pbtxt b/worker/tests/python/integration/api_server/models/mock_disaggregated_serving/config.pbtxt index 1fc2916e..5c68b52e 100644 --- a/worker/tests/python/integration/api_server/models/mock_disaggregated_serving/config.pbtxt +++ b/worker/tests/python/integration/api_server/models/mock_disaggregated_serving/config.pbtxt @@ -1,28 +1,17 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. name: "mock_disaggregated_serving" backend: "python" From 4a5df201e6a6abbe4797f927226936390e3d0fa8 Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Fri, 17 Jan 2025 07:45:39 -0800 Subject: [PATCH 36/50] moving new additions to comments to allow for functional merge --- README.md | 4 +--- examples/hello_world/README.md | 2 ++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index aa0b9aad..539f7a43 100644 --- a/README.md +++ b/README.md @@ -84,7 +84,7 @@ HF_TOKEN```) and mounts common directories such as ```/tmp:/tmp```, Please see the instructions in the corresponding example for specific deployment instructions. 
- + + From 5a4b97f9361d880b3461e272790e19dc5769d68f Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Fri, 17 Jan 2025 07:55:08 -0800 Subject: [PATCH 37/50] updating copyright and license --- .../triton_core_models/encoder/1/model.py | 35 +++++++------------ 1 file changed, 12 insertions(+), 23 deletions(-) diff --git a/examples/hello_world/operators/triton_core_models/encoder/1/model.py b/examples/hello_world/operators/triton_core_models/encoder/1/model.py index 36a2dbf9..f7b458a6 100644 --- a/examples/hello_world/operators/triton_core_models/encoder/1/model.py +++ b/examples/hello_world/operators/triton_core_models/encoder/1/model.py @@ -1,28 +1,17 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import json import time From 13d5fee4d0657910cc919530227b442786bc7f14 Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Fri, 17 Jan 2025 07:56:25 -0800 Subject: [PATCH 38/50] updated copyright --- .../triton_core_models/encoder/config.pbtxt | 41 ++++++------------- 1 file changed, 12 insertions(+), 29 deletions(-) diff --git a/examples/hello_world/operators/triton_core_models/encoder/config.pbtxt b/examples/hello_world/operators/triton_core_models/encoder/config.pbtxt index 05968c5a..5581461c 100644 --- a/examples/hello_world/operators/triton_core_models/encoder/config.pbtxt +++ b/examples/hello_world/operators/triton_core_models/encoder/config.pbtxt @@ -1,37 +1,20 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
## Model Instance and Kind are filled in by configuration when launched ## All other values are filled in by auto_complete in model.py backend: "python" -# instance_group [ -# { count: {MODEL_INSTANCE_COUNT} -# kind: {MODEL_INSTANCE_KIND} -# } -# ] -# \ No newline at end of file From 4eb9b3c2e1824d7425aa85ca6227285fd41c1c5e Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Fri, 17 Jan 2025 09:33:37 -0800 Subject: [PATCH 39/50] WIP --- README.md | 14 +++ examples/hello_world/README.md | 151 +++++++++++++++++++++++++++++++++ 2 files changed, 165 insertions(+) diff --git a/README.md b/README.md index 539f7a43..7d963e34 100644 --- a/README.md +++ b/README.md @@ -84,6 +84,20 @@ HF_TOKEN```) and mounts common directories such as ```/tmp:/tmp```, Please see the instructions in the corresponding example for specific deployment instructions. +## Hello World + +A basic example demonstrating the new interfaces and concepts of +triton distributed. In the hello world example, you can deploy a set +of simple workers to load balance requests from a local work queue. + +The example demonstrates: + +1. How to incorporate an existing Triton Core Model into a triton distributed worker. +1. How to incorporate a standalone python class into a triton distributed worker. +1. How deploy a set of workers +1. How to send requests to the triton distributed deployment + + +# Hello World + +A basic example demonstrating the new interfaces and concepts of +triton distributed. In the hello world example, you can deploy a set +of simple workers to load balance requests from a local work queue. + +The example demonstrates: + +1. How to incorporate an existing Triton Core Model into a triton distributed worker. +1. How to incorporate a standalone python class into a triton distributed worker. +1. How deploy a set of workers +1. How to send requests to the triton distributed deployment + + +## Building the Hello World Environment + +The hello world example is designed to be deployed in a containerized +environment and to work with and without GPU support. + +To get started build the "STANDARD" triton distributed development +environment. 
+
+Note: "STANDARD" is the default framework.
+
+```
+./containers/build.sh
+```
+
+
+## Starting the Deployment
+
+```
+./containers/run.sh -it -- python3 -m hello_world.deploy --initialize-request-plane
+```
+
+#### Expected Output
+
+
+```
+Starting Workers
+17:17:09 deployment.py:115[triton_distributed.worker.deployment] INFO:
+
+Starting Worker:
+
+    Config:
+    WorkerConfig(request_plane=,
+                 data_plane=,
+                 request_plane_args=([], {}),
+                 data_plane_args=([], {}),
+                 log_level=1,
+                 operators=[OperatorConfig(name='encoder',
+                                           implementation=,
+                                           repository='/workspace/examples/hello_world/operators/triton_core_models',
+                                           version=1,
+                                           max_inflight_requests=1,
+                                           parameters={'config': {'instance_group': [{'count': 1,
+                                                                                      'kind': 'KIND_CPU'}],
+                                                                  'parameters': {'delay': {'string_value': '0'},
+                                                                                 'input_copies': {'string_value': '1'}}}},
+                                           log_level=None)],
+                 triton_log_path=None,
+                 name='encoder.0',
+                 log_dir='/workspace/examples/hello_world/logs',
+                 metrics_port=50000)
+
+
+17:17:09 deployment.py:115[triton_distributed.worker.deployment] INFO:
+
+Starting Worker:
+
+    Config:
+    WorkerConfig(request_plane=,
+                 data_plane=,
+                 request_plane_args=([], {}),
+                 data_plane_args=([], {}),
+                 log_level=1,
+                 operators=[OperatorConfig(name='decoder',
+                                           implementation=,
+                                           repository='/workspace/examples/hello_world/operators/triton_core_models',
+                                           version=1,
+                                           max_inflight_requests=1,
+                                           parameters={'config': {'instance_group': [{'count': 1,
+                                                                                      'kind': 'KIND_CPU'}],
+                                                                  'parameters': {'delay': {'string_value': '0'},
+                                                                                 'input_copies': {'string_value': '1'}}}},
+                                           log_level=None)],
+                 triton_log_path=None,
+                 name='decoder.0',
+                 log_dir='/workspace/examples/hello_world/logs',
+                 metrics_port=50001)
+
+
+17:17:09 deployment.py:115[triton_distributed.worker.deployment] INFO:
+
+Starting Worker:
+
+    Config:
+    WorkerConfig(request_plane=,
+                 data_plane=,
+                 request_plane_args=([], {}),
+                 data_plane_args=([], {}),
+                 log_level=1,
+                 operators=[OperatorConfig(name='encoder_decoder',
+                                           implementation='EncodeDecodeOperator',
+                                           repository='/workspace/examples/hello_world/operators',
+                                           version=1,
+                                           max_inflight_requests=1,
+                                           parameters={},
+                                           log_level=None)],
+                 triton_log_path=None,
+                 name='encoder_decoder.0',
+                 log_dir='/workspace/examples/hello_world/logs',
+                 metrics_port=50002)
+
+
+Workers started ... press Ctrl-C to Exit
+```
+
+## Sending Requests
+
+From a separate terminal, run the sample client.
+
+```
+./containers/run.sh -it -- python3 -m hello_world.client
+```
+
+#### Expected Output
+
+```
+
+Client: 0 Received Response: 42 From: 39491f06-d4f7-11ef-be96-047bcba9020e Error: None: 43%|███████▋  | 43/100 [00:04<00:05,  9.83request/s]
+
+Throughput: 9.10294484748811 Total Time: 10.985455989837646
+Clients Stopped Exit Code 0
+
+
+```
+
+## Behind the Scenes
+
+The hello world example is designed to demonstrate and allow
+experimenting with different mixtures of compute and memory loads and
+different numbers of workers for different parts of the hello world
+pipeline.
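+
+For illustration, here is a minimal sketch of how the knobs visible in
+the `OperatorConfig` output above (the `instance_group` count and the
+`delay` and `input_copies` parameters) could be varied to simulate
+heavier compute and memory loads. The import path is an assumption;
+see `deploy/__main__.py` for how the example actually constructs its
+configuration.
+
+```
+# Hypothetical sketch: field names are taken from the OperatorConfig
+# log output above; the import path is an assumption.
+from triton_distributed.worker import OperatorConfig
+
+# A heavier encoder: two CPU instances, an artificial delay, and four
+# input copies to increase the memory moved per request. The
+# `implementation` field (blank in the log output above) is omitted.
+encoder_op = OperatorConfig(
+    name="encoder",
+    repository="/workspace/examples/hello_world/operators/triton_core_models",
+    version=1,
+    max_inflight_requests=1,
+    parameters={
+        "config": {
+            "instance_group": [{"count": 2, "kind": "KIND_CPU"}],
+            "parameters": {
+                "delay": {"string_value": "0.1"},
+                "input_copies": {"string_value": "4"},
+            },
+        }
+    },
+)
+```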
+
+### Hello World Pipeline
+
+The hello world pipeline is a simple two-stage pipeline with an
+encoding stage and a decoding stage plus a
+
+

diff --git a/examples/hello_world/README.md b/examples/hello_world/README.md
index a3bd89e0..96a3d99c 100644
--- a/examples/hello_world/README.md
+++ b/examples/hello_world/README.md
@@ -158,128 +158,114 @@ Clients Stopped Exit Code 0
 
 The hello world example is designed to demonstrate and allow
 experimenting with different mixtures of compute and memory loads and
 different numbers of workers for different parts of the hello world
-pipeline.
+workflow.
 
-### Hello World Pipeline
+### Hello World Workflow
 
-The hello world pipeline is a simple two-stage pipeline with an
-encoding stage and a decoding stage plus a
-
-
-

From 097250055e650fe3a7943230753e6696a7093aee Mon Sep 17 00:00:00 2001
From: nnshah1
Date: Fri, 17 Jan 2025 11:39:18 -0800
Subject: [PATCH 41/50] updated

---
 README.md                      | 4 ++++
 examples/hello_world/README.md | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ad496f36..0a180225 100644
--- a/README.md
+++ b/README.md
@@ -86,6 +86,8 @@ deployment instructions.
 
 ## Hello World
 
+[Hello World](./examples/hello_world)
+
 A basic example demonstrating the new interfaces and concepts of
 triton distributed. In the hello world example, you can deploy a set
 of simple workers to load balance requests from a local work queue.
@@ -96,4 +98,6 @@ The example demonstrates:
 1. How to incorporate a standalone python class into a triton distributed worker.
 1. How to deploy a set of workers
 1. How to send requests to the triton distributed deployment
+1. Requests over the Request Plane and Data movement over the Data
+   Plane.
 
diff --git a/examples/hello_world/README.md b/examples/hello_world/README.md
index 96a3d99c..7a187c12 100644
--- a/examples/hello_world/README.md
+++ b/examples/hello_world/README.md
@@ -27,7 +27,8 @@ The example demonstrates:
 1. How to incorporate a standalone python class into a triton distributed worker.
 1. How to deploy a set of workers
 1. How to send requests to the triton distributed deployment
-
+1. Requests over the Request Plane and Data movement over the Data
+   Plane.
 
 ## Building the Hello World Environment

From 78e9eb4888287e105526cc8b2da02f263cd8eaec Mon Sep 17 00:00:00 2001
From: nnshah1
Date: Fri, 17 Jan 2025 12:17:39 -0800
Subject: [PATCH 42/50] update

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 0a180225..c8703e08 100644
--- a/README.md
+++ b/README.md
@@ -49,7 +49,7 @@ We provide 3 types of builds:
 
 For example, if you want to build a container for the `VLLM` backend you can run
 
-`./container/build.sh --framework VLLM`
+`./container/build.sh`
 
 Please see the instructions in the corresponding example for specific
 build instructions.

From e28d58602ecac3c6d8482da736141cace9e20183 Mon Sep 17 00:00:00 2001
From: Neelay Shah
Date: Fri, 17 Jan 2025 13:09:37 -0800
Subject: [PATCH 43/50] Update examples/hello_world/README.md

Co-authored-by: Tanmay Verma
---
 examples/hello_world/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/hello_world/README.md b/examples/hello_world/README.md
index 7a187c12..5d406034 100644
--- a/examples/hello_world/README.md
+++ b/examples/hello_world/README.md
@@ -41,7 +41,7 @@ environment.
 
 Note: "STANDARD" is the default framework.
 
 ```
-./containers/build.sh
+./container/build.sh
 ```
 

From 6e7eee2bd87f91baf7ff8a56c1290e75512c72b7 Mon Sep 17 00:00:00 2001
From: Neelay Shah
Date: Fri, 17 Jan 2025 13:10:00 -0800
Subject: [PATCH 44/50] Update examples/hello_world/README.md

Co-authored-by: Tanmay Verma
---
 examples/hello_world/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/hello_world/README.md b/examples/hello_world/README.md
index 5d406034..6ca03a23 100644
--- a/examples/hello_world/README.md
+++ b/examples/hello_world/README.md
@@ -48,7 +48,7 @@ Note: "STANDARD" is the default framework.
 ## Starting the Deployment
 
 ```
-./containers/run.sh -it -- python3 -m hello_world.deploy --initialize-request-plane
+./container/run.sh -it -- python3 -m hello_world.deploy --initialize-request-plane
 ```
 
 #### Expected Output

From c447cc5aa843025b5d36c9842f089bcc2752742b Mon Sep 17 00:00:00 2001
From: Neelay Shah
Date: Fri, 17 Jan 2025 13:10:14 -0800
Subject: [PATCH 45/50] Update examples/hello_world/README.md

Co-authored-by: Tanmay Verma
---
 examples/hello_world/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/hello_world/README.md b/examples/hello_world/README.md
index 6ca03a23..494d101d 100644
--- a/examples/hello_world/README.md
+++ b/examples/hello_world/README.md
@@ -139,7 +139,7 @@ Workers started ... press Ctrl-C to Exit
 From a separate terminal, run the sample client.
 
 ```
-./containers/run.sh -it -- python3 -m hello_world.client
+./container/run.sh -it -- python3 -m hello_world.client
 ```
 
 #### Expected Output

From 8ce4d95d5cf459cdcb1a6d25799b422cf79874e4 Mon Sep 17 00:00:00 2001
From: Neelay Shah
Date: Fri, 17 Jan 2025 13:11:37 -0800
Subject: [PATCH 46/50] Update examples/hello_world/README.md

Co-authored-by: Ryan McCormick
---
 examples/hello_world/README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/hello_world/README.md b/examples/hello_world/README.md
index 494d101d..1ce74c2c 100644
--- a/examples/hello_world/README.md
+++ b/examples/hello_world/README.md
@@ -24,10 +24,10 @@ of simple workers to load balance requests from a local work queue.
 The example demonstrates:
 
 1. How to incorporate an existing Triton Core Model into a triton distributed worker.
-1. How to incorporate a standalone python class into a triton distributed worker.
-1. How to deploy a set of workers
-1. How to send requests to the triton distributed deployment
-1. Requests over the Request Plane and Data movement over the Data
+2. How to incorporate a standalone python class into a triton distributed worker.
+3. How to deploy a set of workers
+4. How to send requests to the triton distributed deployment
+5. Requests over the Request Plane and Data movement over the Data
    Plane.
 
 ## Building the Hello World Environment

From 68a335d19791ec2ccd6632ae167397aa859f5433 Mon Sep 17 00:00:00 2001
From: nnshah1
Date: Fri, 17 Jan 2025 13:15:28 -0800
Subject: [PATCH 47/50] updated

---
 README.md | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/README.md b/README.md
index c8703e08..cd97382a 100644
--- a/README.md
+++ b/README.md
@@ -92,12 +92,3 @@ A basic example demonstrating the new interfaces and concepts of
 triton distributed. In the hello world example, you can deploy a set
 of simple workers to load balance requests from a local work queue.
 
-The example demonstrates:
-
-1. How to incorporate an existing Triton Core Model into a triton distributed worker.
-1. How to incorporate a standalone python class into a triton distributed worker.
-1. How to deploy a set of workers
-1. How to send requests to the triton distributed deployment
-1. Requests over the Request Plane and Data movement over the Data
-   Plane.
-

From d53bde56bdf02979872f89e782c642c6e82adb29 Mon Sep 17 00:00:00 2001
From: nnshah1
Date: Fri, 17 Jan 2025 13:18:51 -0800
Subject: [PATCH 48/50] updated

---
 examples/hello_world/README.md | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/examples/hello_world/README.md b/examples/hello_world/README.md
index 1ce74c2c..e03adc78 100644
--- a/examples/hello_world/README.md
+++ b/examples/hello_world/README.md
@@ -170,7 +170,7 @@ orchestrate the overall workflow.
 
 ```
 client <-> encoder_decoder <-> encoder
        |
-       -----<-> decoder
+        -----<-> decoder
 ```
 
@@ -207,17 +207,19 @@ by a common Worker class.
 
 #### Triton Core Operator
 
-The triton core operator makes a triton model (following the standard
-model repo and backend structure of the tritonserver) available on the
-request plane. Both the encoder and decoder are implemented as triton
-python backend models.
+The triton core operator makes a triton model (following the [standard
+model
+repo](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_repository.md)
+and backend structure of the tritonserver) available on the request
+plane. Both the encoder and decoder are implemented as triton python
+backend models.
 
-#### Standalone Operator
+#### Generic Operator
 
-The encoder-decoder operator is a standalone python class that
-implements the Operator interface. Internally it makes remote requests
-to other workers. Generally a standalone operator can make use of
-other operators for its work but isn't required to.
+The encoder-decoder operator is a python class that implements the
+Operator interface. Internally it makes remote requests to other
+workers. Generally an operator can make use of other operators for its
+work but isn't required to.
 
 ### Workers

From 566a7a6cc38ed9d351b4f7c440cf524b2bd849a6 Mon Sep 17 00:00:00 2001
From: nnshah1
Date: Fri, 17 Jan 2025 13:20:29 -0800
Subject: [PATCH 49/50] updating default to standard

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index cd97382a..3e689bc4 100644
--- a/README.md
+++ b/README.md
@@ -47,7 +47,7 @@ We provide 3 types of builds:
 2. `TENSORRTLLM` which includes our TRT-LLM backend
 3. `VLLM` which includes our VLLM backend
 
-For example, if you want to build a container for the `VLLM` backend you can run
+For example, if you want to build a container for the `STANDARD` backends you can run
 
 `./container/build.sh`
 

From aa78878642932d0708b446909e69246532d08bd4 Mon Sep 17 00:00:00 2001
From: nnshah1
Date: Fri, 17 Jan 2025 13:27:07 -0800
Subject: [PATCH 50/50] updating

---
 README.md | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/README.md b/README.md
index 3e689bc4..5a4a3fa2 100644
--- a/README.md
+++ b/README.md
@@ -92,3 +92,14 @@ A basic example demonstrating the new interfaces and concepts of
 triton distributed. In the hello world example, you can deploy a set
 of simple workers to load balance requests from a local work queue.
 
+# Disclaimers
+
+> [!NOTE]
+> This project is currently in the alpha / experimental /
+> rapid-prototyping stage and we will be adding new features incrementally.
+
+1. The `TENSORRTLLM` and `VLLM` containers are WIP and not expected to
+   work out of the box.
+
+2. Testing has primarily been on single-node systems with processes
+   launched within a single container.
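
As a closing illustration of the "Generic Operator" concept introduced
in PATCH 48, the sketch below shows the general shape of an operator
such as the `EncodeDecodeOperator` named in the deployment output
above. The `Operator` base interface and every method and parameter
name here are assumptions for illustration only; the README establishes
just the behavior: the operator implements the Operator interface and
internally makes remote requests to the encoder and decoder workers.

```
# Hypothetical sketch only: the Operator interface's real method names
# are not shown in this patch series, so everything below is assumed.
class EncodeDecodeOperator:
    def __init__(self, request_plane, data_plane):
        # Handles for reaching other workers (request plane) and for
        # moving tensor contents between workers (data plane).
        self._request_plane = request_plane
        self._data_plane = data_plane

    async def execute(self, requests):
        # Forward each input to the encoder worker, then feed the
        # encoded result to the decoder worker, mirroring the
        # client <-> encoder_decoder <-> encoder/decoder flow above.
        for request in requests:
            encoded = await self._call_remote("encoder", request.inputs)
            decoded = await self._call_remote("decoder", encoded)
            await request.respond(decoded, final=True)

    async def _call_remote(self, operator_name, inputs):
        # Placeholder for a request-plane round trip to another worker;
        # the actual API lives in the triton_distributed packages.
        ...
```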