From 9a67e253ab4a20d6b1465b6ea1132dc568fd7b0c Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Wed, 15 Jan 2025 07:23:24 -0800 Subject: [PATCH 01/50] updated with skeleton / straw man structure --- examples/hello_world/README.md | 0 examples/hello_world/deploy/.gitkeep | 0 examples/hello_world/docs/.gitkeep | 0 examples/hello_world/models/.gitkeep | 0 examples/hello_world/operators/.gitkeep | 0 examples/hello_world/scripts/.gitkeep | 0 6 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 examples/hello_world/README.md create mode 100644 examples/hello_world/deploy/.gitkeep create mode 100644 examples/hello_world/docs/.gitkeep create mode 100644 examples/hello_world/models/.gitkeep create mode 100644 examples/hello_world/operators/.gitkeep create mode 100644 examples/hello_world/scripts/.gitkeep diff --git a/examples/hello_world/README.md b/examples/hello_world/README.md new file mode 100644 index 00000000..e69de29b diff --git a/examples/hello_world/deploy/.gitkeep b/examples/hello_world/deploy/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/examples/hello_world/docs/.gitkeep b/examples/hello_world/docs/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/examples/hello_world/models/.gitkeep b/examples/hello_world/models/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/examples/hello_world/operators/.gitkeep b/examples/hello_world/operators/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/examples/hello_world/scripts/.gitkeep b/examples/hello_world/scripts/.gitkeep new file mode 100644 index 00000000..e69de29b From 44304db34f7a16a09bd1aab836d8215316684e96 Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Wed, 15 Jan 2025 07:33:37 -0800 Subject: [PATCH 02/50] updated --- examples/hello_world/api_server/.gitkeep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 examples/hello_world/api_server/.gitkeep diff --git a/examples/hello_world/api_server/.gitkeep b/examples/hello_world/api_server/.gitkeep new file mode 100644 index 00000000..e69de29b From f4a1ef0de2c42c4e435a807c68eaced4c4e1ef94 Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Wed, 15 Jan 2025 07:34:19 -0800 Subject: [PATCH 03/50] updated --- examples/hello_world/router/.gitkeep | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 examples/hello_world/router/.gitkeep diff --git a/examples/hello_world/router/.gitkeep b/examples/hello_world/router/.gitkeep new file mode 100644 index 00000000..e69de29b From 62b20c5d02d09aa8e4925bd7a401c5073b59f61c Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Wed, 15 Jan 2025 15:48:46 -0800 Subject: [PATCH 04/50] moving under operators --- examples/hello_world/models/.gitkeep | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 examples/hello_world/models/.gitkeep diff --git a/examples/hello_world/models/.gitkeep b/examples/hello_world/models/.gitkeep deleted file mode 100644 index e69de29b..00000000 From 7dd83c119b6dace0daf1b705a1de6c6265bf62fe Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Wed, 15 Jan 2025 16:14:23 -0800 Subject: [PATCH 05/50] updated skeleton --- examples/hello_world/client/.gitkeep | 0 examples/hello_world/operators/triton_model_repo/.gitkeep | 0 examples/hello_world/tests/.gitkeep | 0 3 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 examples/hello_world/client/.gitkeep create mode 100644 examples/hello_world/operators/triton_model_repo/.gitkeep create mode 100644 examples/hello_world/tests/.gitkeep diff --git 
a/examples/hello_world/client/.gitkeep b/examples/hello_world/client/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/examples/hello_world/operators/triton_model_repo/.gitkeep b/examples/hello_world/operators/triton_model_repo/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/examples/hello_world/tests/.gitkeep b/examples/hello_world/tests/.gitkeep new file mode 100644 index 00000000..e69de29b From cad9d3f4209abb64c7c612187281c2be42bd6f6a Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Wed, 15 Jan 2025 17:55:02 -0800 Subject: [PATCH 06/50] updated with minimal example - not yet tested --- examples/hello_world/deploy/__main__.py | 195 ++++++++++++++++++ .../.gitkeep | 0 .../triton_core_models/decoder/1/model.py | 105 ++++++++++ .../triton_core_models/decoder/config.pbtxt | 20 ++ .../triton_core_models/encoder/1/model.py | 130 ++++++++++++ .../triton_core_models/encoder/config.pbtxt | 37 ++++ 6 files changed, 487 insertions(+) create mode 100644 examples/hello_world/deploy/__main__.py rename examples/hello_world/operators/{triton_model_repo => triton_core_models}/.gitkeep (100%) create mode 100644 examples/hello_world/operators/triton_core_models/decoder/1/model.py create mode 100644 examples/hello_world/operators/triton_core_models/decoder/config.pbtxt create mode 100644 examples/hello_world/operators/triton_core_models/encoder/1/model.py create mode 100644 examples/hello_world/operators/triton_core_models/encoder/config.pbtxt diff --git a/examples/hello_world/deploy/__main__.py b/examples/hello_world/deploy/__main__.py new file mode 100644 index 00000000..b2c37b51 --- /dev/null +++ b/examples/hello_world/deploy/__main__.py @@ -0,0 +1,195 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import asyncio +import time + +import cupy +import numpy +from triton_distributed.icp.nats_request_plane import NatsRequestPlane, NatsServer +from triton_distributed.icp.ucp_data_plane import UcpDataPlane +from triton_distributed.tritonserver import MemoryType +from triton_distributed.worker import WorkerConfig +from triton_distributed.worker.operator import Operator, OperatorConfig +from triton_distributed.worker.remote_operator import RemoteOperator +from triton_distributed.worker.remote_request import RemoteInferenceRequest +from triton_distributed.worker.triton_core_operator import TritonCoreOperator + + +class EncodeDecodeOperator(Operator): + def __init__( + self, + name, + version, + triton_core, + request_plane, + data_plane, + parameters, + repository, + logger, + ): + self._encoder = RemoteOperator("encoder", 1, request_plane, data_plane) + self._decoder = RemoteOperator("decoder", 1, request_plane, data_plane) + + async def execute(self, requests: list[RemoteInferenceRequest]): + for request in requests: + encoded_responses = await self._encoder.async_infer( + inputs={"input": request.inputs["input"]} + ) + + async for encoded_response in encoded_responses: + input_copies = int( + numpy.from_dlpack(encoded_response.outputs["input_copies"]) + ) + decoded_responses = await self._decoder.async_infer( + inputs={"input": encoded_response.outputs["output"]}, + parameters={"input_copies": input_copies}, + ) + + async for decoded_response in decoded_responses: + await request.response_sender().send( + final=True, + outputs={"output": decoded_response.outputs["output"]}, + ) + del decoded_response + + +async def send_requests(nats_server_url): + request_plane = NatsRequestPlane(nats_server_url) + data_plane = UcpDataPlane() + await request_plane.connect() + data_plane.connect() + + remote_operator: RemoteOperator = RemoteOperator( + "encoder_decoder", 1, request_plane, data_plane + ) + + inputs = [ + numpy.array(numpy.random.randint(0, 100, 10000)).astype("int64") + for _ in range(100) + ] + + requests = [ + await remote_operator.async_infer( + inputs={"input": inputs[index]}, request_id=str(index) + ) + for index in range(100) + ] + + for request in requests: + async for response in request: + for output_name, output_value in response.outputs.items(): + if output_value.memory_type == MemoryType.CPU: + output = numpy.from_dlpack(output_value) + numpy.testing.assert_array_equal( + output, inputs[int(response.request_id)] + ) + else: + output = cupy.from_dlpack(output_value) + cupy.testing.assert_array_equal( + output, inputs[int(response.request_id)] + ) + del output_value + print(f"Finished Request: {response.request_id}") + print(response.error) + del response + + await request_plane.close() + data_plane.close() + + +async def main(): + nats_server = NatsServer() + time.sleep(1) + + encoder_op = OperatorConfig( + name="encoder", + repository="/workspace/examples/hello_world/operators/models", + implementation=TritonCoreOperator, + max_inflight_requests=1, + parameters={ + "config": { + "instance_group": [{"count": 1, "kind": "KIND_CPU"}], + "parameters": {"delay": {"string_value": "0"}}, + } + }, + ) + + decoder_op = OperatorConfig( + name="decoder", + repository="/workspace/examples/hello_world/operators/models", + implementation=TritonCoreOperator, + max_inflight_requests=1, + parameters={ + "config": { + "instance_group": [{"count": 1, "kind": "KIND_GPU"}], + "parameters": {"delay": {"string_value": "0"}}, + } + }, + ) + + encoder_decoder_op = OperatorConfig( + 
name="encoder_decoder", + implementation=EncodeDecodeOperator, + max_inflight_requests=100, + ) + + encoder = WorkerConfig( + request_plane_args=([nats_server.url], {}), + log_level=6, + operators=[encoder_op], + name="encoder", + metrics_port=8060, + log_dir="logs", + ) + + decoder = WorkerConfig( + request_plane_args=([nats_server.url], {}), + log_level=6, + operators=[decoder_op], + name="decoder", + metrics_port=8061, + log_dir="logs", + ) + + encoder_decoder = WorkerConfig( + request_plane_args=([nats_server.url], {}), + log_level=6, + operators=[encoder_decoder_op], + name="encoder_decoder", + metrics_port=8062, + log_dir="logs", + ) + + print("Starting Workers") + + processes = [process.start() for process in [encoder, decoder, encoder_decoder]] + + print("Sending Requests") + + await send_requests(nats_server.url) + + print("Stopping Workers") + + for process in reversed(processes): + print("shutting down", process) + process.terminate() + print("waiting", process) + process.join() + print("done", process) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/hello_world/operators/triton_model_repo/.gitkeep b/examples/hello_world/operators/triton_core_models/.gitkeep similarity index 100% rename from examples/hello_world/operators/triton_model_repo/.gitkeep rename to examples/hello_world/operators/triton_core_models/.gitkeep diff --git a/examples/hello_world/operators/triton_core_models/decoder/1/model.py b/examples/hello_world/operators/triton_core_models/decoder/1/model.py new file mode 100644 index 00000000..8187835d --- /dev/null +++ b/examples/hello_world/operators/triton_core_models/decoder/1/model.py @@ -0,0 +1,105 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import json +import time + +import numpy +import triton_python_backend_utils as pb_utils + +try: + import cupy +except Exception: + cupy = None + + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + """Auto Completes Model Config + + Model has one input and one output + both of type int64 + + Parameters + ---------- + auto_complete_model_config : config + Enables reading and updating config.pbtxt + + + """ + + input_config = { + "name": "input", + "data_type": "TYPE_INT64", + "dims": [-1], + "optional": False, + } + + output_config = { + "name": "output", + "data_type": "TYPE_INT64", + "dims": [-1], + } + + auto_complete_model_config.add_input(input_config) + auto_complete_model_config.add_output(output_config) + auto_complete_model_config.set_max_batch_size(0) + auto_complete_model_config.set_model_transaction_policy({"decoupled": False}) + + return auto_complete_model_config + + def initialize(self, args): + self._model_config = json.loads(args["model_config"]) + self._model_instance_kind = args["model_instance_kind"] + self._model_instance_device_id = int(args["model_instance_device_id"]) + self._config_parameters = self._model_config.get("parameters", {}) + self._input_copies = int( + self._config_parameters.get("input_copies", {"string_value": "5"})[ + "string_value" + ] + ) + self._delay = float( + self._config_parameters.get("delay", {"string_value": "0"})["string_value"] + ) + + def execute(self, requests): + responses = [] + input_copies = self._input_copies + delay = self._delay + for request in requests: + output_tensors = [] + parameters = json.loads(request.parameters()) + if parameters: + input_copies = int(parameters.get("input_copies", self._input_copies)) + delay = float(parameters.get("delay", self._delay)) + for input_tensor in request.inputs(): + input_value = input_tensor.as_numpy() + output_value = [] + if self._model_instance_kind == "GPU": + with cupy.cuda.Device(self._model_instance_device_id): + input_value = cupy.array(input_value) + output_value = cupy.invert(input_value) + output_value = output_value[::input_copies] + output_tensor = pb_utils.Tensor.from_dlpack( + "output", output_value + ) + else: + output_value = numpy.invert(input_value) + output_value = output_value[::input_copies] + output_tensor = pb_utils.Tensor("output", output_value) + output_tensors.append(output_tensor) + time.sleep(len(output_value) * delay) + responses.append(pb_utils.InferenceResponse(output_tensors=output_tensors)) + return responses diff --git a/examples/hello_world/operators/triton_core_models/decoder/config.pbtxt b/examples/hello_world/operators/triton_core_models/decoder/config.pbtxt new file mode 100644 index 00000000..5581461c --- /dev/null +++ b/examples/hello_world/operators/triton_core_models/decoder/config.pbtxt @@ -0,0 +1,20 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +## Model Instance and Kind are filled in by configuration when launched +## All other values are filled in by auto_complete in model.py + +backend: "python" + diff --git a/examples/hello_world/operators/triton_core_models/encoder/1/model.py b/examples/hello_world/operators/triton_core_models/encoder/1/model.py new file mode 100644 index 00000000..36a2dbf9 --- /dev/null +++ b/examples/hello_world/operators/triton_core_models/encoder/1/model.py @@ -0,0 +1,130 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import json +import time + +import numpy +import triton_python_backend_utils as pb_utils + +try: + import cupy +except Exception: + cupy = None + + +class TritonPythonModel: + @staticmethod + def auto_complete_config(auto_complete_model_config): + """Auto Completes Model Config + + Model has one input and one output + both of type int64 + + Parameters + ---------- + auto_complete_model_config : config + Enables reading and updating config.pbtxt + + + """ + + input_config = { + "name": "input", + "data_type": "TYPE_INT64", + "dims": [-1], + "optional": False, + } + + output_config = { + "name": "output", + "data_type": "TYPE_INT64", + "dims": [-1], + } + + copies_config = { + "name": "input_copies", + "data_type": "TYPE_INT64", + "dims": [1], + } + + auto_complete_model_config.add_input(input_config) + auto_complete_model_config.add_output(output_config) + auto_complete_model_config.add_output(copies_config) + auto_complete_model_config.set_max_batch_size(0) + auto_complete_model_config.set_model_transaction_policy({"decoupled": False}) + + return auto_complete_model_config + + def initialize(self, args): + self._model_config = json.loads(args["model_config"]) + self._model_instance_kind = args["model_instance_kind"] + self._model_instance_device_id = int(args["model_instance_device_id"]) + self._config_parameters = self._model_config.get("parameters", {}) + self._input_copies = int( + self._config_parameters.get("input_copies", {"string_value": "5"})[ + "string_value" + ] + ) + self._delay = float( + self._config_parameters.get("delay", {"string_value": "0"})["string_value"] + ) + + def execute(self, requests): + responses = [] + input_copies = self._input_copies + delay = self._delay + for request in requests: + output_tensors = [] + parameters = json.loads(request.parameters()) + if parameters: + input_copies = int(parameters.get("input_copies", self._input_copies)) + delay = float(parameters.get("delay", self._delay)) + for input_tensor in request.inputs(): + input_value = input_tensor.as_numpy() + output_value = [] + if self._model_instance_kind == "GPU": + with cupy.cuda.Device(self._model_instance_device_id): + input_value = cupy.array(input_value) + output_value = cupy.tile(input_value, input_copies) + output_value = cupy.invert(output_value) + output_tensor = pb_utils.Tensor.from_dlpack( + "output", output_value + ) + else: + output_value = numpy.tile(input_value, input_copies) + output_value = numpy.invert(output_value) + output_tensor = pb_utils.Tensor("output", output_value) + output_tensors.append(output_tensor) + output_tensors.append( + pb_utils.Tensor( + "input_copies", numpy.array(input_copies).astype("int64") + ) + ) + time.sleep(len(output_value) * delay) + + responses.append(pb_utils.InferenceResponse(output_tensors=output_tensors)) + return responses diff --git a/examples/hello_world/operators/triton_core_models/encoder/config.pbtxt b/examples/hello_world/operators/triton_core_models/encoder/config.pbtxt new file mode 100644 index 00000000..05968c5a --- /dev/null +++ b/examples/hello_world/operators/triton_core_models/encoder/config.pbtxt @@ -0,0 +1,37 @@ +# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +## Model Instance and Kind are filled in by configuration when launched +## All other values are filled in by auto_complete in model.py + +backend: "python" + +# instance_group [ +# { count: {MODEL_INSTANCE_COUNT} +# kind: {MODEL_INSTANCE_KIND} +# } +# ] +# \ No newline at end of file From 97c752dc9a4d22cdba07e384c86e861e4b4d2d5c Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Wed, 15 Jan 2025 23:45:24 -0800 Subject: [PATCH 07/50] updated to use deployment --- examples/hello_world/deploy/__main__.py | 33 ++++++++++--------- .../triton_distributed/worker/__init__.py | 3 ++ .../triton_distributed/worker/deployment.py | 4 +++ .../triton_distributed/worker/worker.py | 1 + 4 files changed, 25 insertions(+), 16 deletions(-) diff --git a/examples/hello_world/deploy/__main__.py b/examples/hello_world/deploy/__main__.py index b2c37b51..37d8df38 100644 --- a/examples/hello_world/deploy/__main__.py +++ b/examples/hello_world/deploy/__main__.py @@ -20,12 +20,16 @@ import numpy from triton_distributed.icp.nats_request_plane import NatsRequestPlane, NatsServer from triton_distributed.icp.ucp_data_plane import UcpDataPlane -from triton_distributed.tritonserver import MemoryType -from triton_distributed.worker import WorkerConfig -from triton_distributed.worker.operator import Operator, OperatorConfig -from triton_distributed.worker.remote_operator import RemoteOperator -from triton_distributed.worker.remote_request import RemoteInferenceRequest -from triton_distributed.worker.triton_core_operator import TritonCoreOperator +from triton_distributed.worker import ( + Deployment, + Operator, + OperatorConfig, + RemoteInferenceRequest, + RemoteOperator, + TritonCoreOperator, + WorkerConfig, +) +from tritonserver import MemoryType class EncodeDecodeOperator(Operator): @@ -116,7 +120,7 @@ async def main(): encoder_op = OperatorConfig( name="encoder", - repository="/workspace/examples/hello_world/operators/models", + repository="/workspace/examples/hello_world/operators/triton_core_models", implementation=TritonCoreOperator, max_inflight_requests=1, parameters={ @@ -129,7 +133,7 @@ async def main(): decoder_op = OperatorConfig( name="decoder", - repository="/workspace/examples/hello_world/operators/models", + repository="/workspace/examples/hello_world/operators/triton_core_models", implementation=TritonCoreOperator, max_inflight_requests=1, 
parameters={ @@ -142,7 +146,7 @@ async def main(): encoder_decoder_op = OperatorConfig( name="encoder_decoder", - implementation=EncodeDecodeOperator, + implementation="/workspace/examples/hello_world/deploy/__main__:EncodeDecodeOperator", max_inflight_requests=100, ) @@ -175,7 +179,9 @@ async def main(): print("Starting Workers") - processes = [process.start() for process in [encoder, decoder, encoder_decoder]] + deployment = Deployment([encoder, decoder, encoder_decoder]) + + deployment.start() print("Sending Requests") @@ -183,12 +189,7 @@ async def main(): print("Stopping Workers") - for process in reversed(processes): - print("shutting down", process) - process.terminate() - print("waiting", process) - process.join() - print("done", process) + deployment.stop() if __name__ == "__main__": diff --git a/worker/src/python/triton_distributed/worker/__init__.py b/worker/src/python/triton_distributed/worker/__init__.py index 365c827e..e681f755 100644 --- a/worker/src/python/triton_distributed/worker/__init__.py +++ b/worker/src/python/triton_distributed/worker/__init__.py @@ -22,5 +22,8 @@ from triton_distributed.worker.remote_response import ( RemoteInferenceResponse as RemoteInferenceResponse, ) +from triton_distributed.worker.triton_core_operator import ( + TritonCoreOperator as TritonCoreOperator, +) from triton_distributed.worker.worker import Worker as Worker from triton_distributed.worker.worker import WorkerConfig as WorkerConfig diff --git a/worker/src/python/triton_distributed/worker/deployment.py b/worker/src/python/triton_distributed/worker/deployment.py index 9fc0b67e..68f5645e 100644 --- a/worker/src/python/triton_distributed/worker/deployment.py +++ b/worker/src/python/triton_distributed/worker/deployment.py @@ -36,6 +36,10 @@ def start(self): args=[worker_config], ) ) + self._workers[-1].start() + + def stop(self): + self.shutdown() def shutdown(self, join=True, timeout=10): for worker in self._workers: diff --git a/worker/src/python/triton_distributed/worker/worker.py b/worker/src/python/triton_distributed/worker/worker.py index 9f3646f7..acfa3853 100644 --- a/worker/src/python/triton_distributed/worker/worker.py +++ b/worker/src/python/triton_distributed/worker/worker.py @@ -111,6 +111,7 @@ def _import_operators(self): sys.path.append(str(module_path.parent.absolute())) try: module = importlib.import_module(module_path.name) + print(dir(module)) class_ = getattr(module, class_name) except Exception as e: logger.exception( From b0f3464a4f3bf37650ac53120b48c1f5145c567a Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Thu, 16 Jan 2025 08:07:43 -0800 Subject: [PATCH 08/50] updating with single_file separate from main --- examples/hello_world/deploy/__main__.py | 7 +- examples/hello_world/single_file.py | 203 ++++++++++++++++++ .../triton_distributed/worker/deployment.py | 38 +++- .../triton_distributed/worker/worker.py | 12 +- 4 files changed, 247 insertions(+), 13 deletions(-) create mode 100644 examples/hello_world/single_file.py diff --git a/examples/hello_world/deploy/__main__.py b/examples/hello_world/deploy/__main__.py index 37d8df38..9ecb9f96 100644 --- a/examples/hello_world/deploy/__main__.py +++ b/examples/hello_world/deploy/__main__.py @@ -46,9 +46,11 @@ def __init__( ): self._encoder = RemoteOperator("encoder", 1, request_plane, data_plane) self._decoder = RemoteOperator("decoder", 1, request_plane, data_plane) + self._logger = logger async def execute(self, requests: list[RemoteInferenceRequest]): for request in requests: + self._logger.info("got request!") 
encoded_responses = await self._encoder.async_infer( inputs={"input": request.inputs["input"]} ) @@ -146,8 +148,9 @@ async def main(): encoder_decoder_op = OperatorConfig( name="encoder_decoder", - implementation="/workspace/examples/hello_world/deploy/__main__:EncodeDecodeOperator", + implementation="EncodeDecodeOperator", max_inflight_requests=100, + repository="/workspace/examples/hello_world/operators", ) encoder = WorkerConfig( @@ -179,7 +182,7 @@ async def main(): print("Starting Workers") - deployment = Deployment([encoder, decoder, encoder_decoder]) + deployment = Deployment([(encoder, 5), decoder, (encoder_decoder, 6)]) deployment.start() diff --git a/examples/hello_world/single_file.py b/examples/hello_world/single_file.py new file mode 100644 index 00000000..531c0abb --- /dev/null +++ b/examples/hello_world/single_file.py @@ -0,0 +1,203 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import shutil + +import cupy +import numpy +from tqdm import tqdm +from triton_distributed.icp.nats_request_plane import NatsRequestPlane, NatsServer +from triton_distributed.icp.ucp_data_plane import UcpDataPlane +from triton_distributed.worker import ( + Deployment, + Operator, + OperatorConfig, + RemoteInferenceRequest, + RemoteOperator, + TritonCoreOperator, + WorkerConfig, +) +from tritonserver import MemoryType + + +class EncodeDecodeOperator(Operator): + def __init__( + self, + name, + version, + triton_core, + request_plane, + data_plane, + parameters, + repository, + logger, + ): + self._encoder = RemoteOperator("encoder", 1, request_plane, data_plane) + self._decoder = RemoteOperator("decoder", 1, request_plane, data_plane) + self._logger = logger + + async def execute(self, requests: list[RemoteInferenceRequest]): + for request in requests: + self._logger.info("got request!") + encoded_responses = await self._encoder.async_infer( + inputs={"input": request.inputs["input"]} + ) + + async for encoded_response in encoded_responses: + input_copies = int( + numpy.from_dlpack(encoded_response.outputs["input_copies"]) + ) + decoded_responses = await self._decoder.async_infer( + inputs={"input": encoded_response.outputs["output"]}, + parameters={"input_copies": input_copies}, + ) + + async for decoded_response in decoded_responses: + await request.response_sender().send( + final=True, + outputs={"output": decoded_response.outputs["output"]}, + ) + del decoded_response + + +async def send_requests(nats_server_url, request_count=100): + request_plane = NatsRequestPlane(nats_server_url) + data_plane = UcpDataPlane() + await request_plane.connect() + data_plane.connect() + + remote_operator: RemoteOperator = RemoteOperator( + "encoder_decoder", 1, request_plane, data_plane + ) + + inputs = [ + numpy.array(numpy.random.randint(0, 100, 10000)).astype("int64") + for _ in range(request_count) + ] + + with tqdm(total=request_count, desc="Sending 
Requests", unit="request") as pbar: + requests = [ + await remote_operator.async_infer( + inputs={"input": inputs[index]}, request_id=str(index) + ) + for index in range(request_count) + ] + + for request in requests: + async for response in request: + for output_name, output_value in response.outputs.items(): + if output_value.memory_type == MemoryType.CPU: + output = numpy.from_dlpack(output_value) + numpy.testing.assert_array_equal( + output, inputs[int(response.request_id)] + ) + else: + output = cupy.from_dlpack(output_value) + cupy.testing.assert_array_equal( + output, inputs[int(response.request_id)] + ) + del output_value + print( + f"Finished Request: {response.request_id} Response From: {response.component_id} Error: {response.error}" + ) + pbar.update(1) + del response + + await request_plane.close() + data_plane.close() + + +async def main(): + shutil.rmtree("logs") + + nats_server = NatsServer() + + encoder_op = OperatorConfig( + name="encoder", + repository="/workspace/examples/hello_world/operators/triton_core_models", + implementation=TritonCoreOperator, + max_inflight_requests=1, + parameters={ + "config": { + "instance_group": [{"count": 1, "kind": "KIND_CPU"}], + "parameters": {"delay": {"string_value": "0"}}, + } + }, + ) + + decoder_op = OperatorConfig( + name="decoder", + repository="/workspace/examples/hello_world/operators/triton_core_models", + implementation=TritonCoreOperator, + max_inflight_requests=1, + parameters={ + "config": { + "instance_group": [{"count": 1, "kind": "KIND_GPU"}], + "parameters": {"delay": {"string_value": "0"}}, + } + }, + ) + + encoder_decoder_op = OperatorConfig( + name="encoder_decoder", + implementation=EncodeDecodeOperator, + max_inflight_requests=100, + ) + + encoder = WorkerConfig( + request_plane_args=([nats_server.url], {}), + log_level=6, + operators=[encoder_op], + name="encoder", + metrics_port=50000, + log_dir="logs", + ) + + decoder = WorkerConfig( + request_plane_args=([nats_server.url], {}), + log_level=6, + operators=[decoder_op], + name="decoder", + metrics_port=50100, + log_dir="logs", + ) + + encoder_decoder = WorkerConfig( + request_plane_args=([nats_server.url], {}), + log_level=6, + operators=[encoder_decoder_op], + name="encoder_decoder", + metrics_port=50200, + log_dir="logs", + ) + + print("Starting Workers") + + deployment = Deployment([encoder, (decoder, 500), (encoder_decoder, 100)]) + + deployment.start() + + print("Sending Requests") + + await send_requests(nats_server.url) + + print("Stopping Workers") + + deployment.stop() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/worker/src/python/triton_distributed/worker/deployment.py b/worker/src/python/triton_distributed/worker/deployment.py index 68f5645e..75650f57 100644 --- a/worker/src/python/triton_distributed/worker/deployment.py +++ b/worker/src/python/triton_distributed/worker/deployment.py @@ -13,15 +13,22 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import multiprocessing +from pprint import pformat +from triton_distributed.worker.log_formatter import setup_logger from triton_distributed.worker.worker import Worker, WorkerConfig +LOGGER_NAME = __name__ + class Deployment: - def __init__(self, worker_configs: list[WorkerConfig]): + def __init__( + self, worker_configs: list[WorkerConfig | tuple[WorkerConfig, int]], log_level=3 + ): self._process_context = multiprocessing.get_context("spawn") self._worker_configs = worker_configs self._workers: list[multiprocessing.context.SpawnProcess] = [] + self._logger = setup_logger(log_level, LOGGER_NAME) @staticmethod def _start_worker(worker_config): @@ -29,14 +36,29 @@ def _start_worker(worker_config): def start(self): for worker_config in self._worker_configs: - self._workers.append( - self._process_context.Process( - target=Deployment._start_worker, - name=worker_config.name, - args=[worker_config], + worker_instances = 1 + if isinstance(worker_config, tuple): + worker_instances = worker_config[1] + worker_config = worker_config[0] + worker_config.log_level = 6 + base_name = worker_config.name + base_port = worker_config.metrics_port + for index in range(worker_instances): + worker_config.name = f"{base_name}.{index}" + worker_config.metrics_port = base_port + index + self._workers.append( + self._process_context.Process( + target=Deployment._start_worker, + name=worker_config.name, + args=[worker_config], + ) + ) + self._logger.info( + "\n\nStarting Worker:\n\n\tConfig:\n\t%s\n\t%s\n", + pformat(worker_config), + self._workers[-1], ) - ) - self._workers[-1].start() + self._workers[-1].start() def stop(self): self.shutdown() diff --git a/worker/src/python/triton_distributed/worker/worker.py b/worker/src/python/triton_distributed/worker/worker.py index acfa3853..50c432b2 100644 --- a/worker/src/python/triton_distributed/worker/worker.py +++ b/worker/src/python/triton_distributed/worker/worker.py @@ -87,6 +87,7 @@ def __init__( self._operators: dict[tuple[str, int], Operator] = {} self._metrics_port = config.metrics_port self._metrics_server: Optional[uvicorn.Server] = None + self._component_id = self._request_plane.component_id def _import_operators(self): for operator_config in self._operator_configs: @@ -328,12 +329,17 @@ async def _wait_for_tasks(self, loop): def start(self): if self._log_dir: + pid = os.getpid() os.makedirs(self._log_dir, exist_ok=True) - stdout_path = os.path.join(self._log_dir, f"{self._name}.stdout.log") - stderr_path = os.path.join(self._log_dir, f"{self._name}.stderr.log") + stdout_path = os.path.join( + self._log_dir, f"{self._name}.{self._component_id}.{pid}.stdout.log" + ) + stderr_path = os.path.join( + self._log_dir, f"{self._name}.{self._component_id}.{pid}.stderr.log" + ) if not self._triton_log_path: self._triton_log_path = os.path.join( - self._log_dir, f"{self._name}.triton.log" + self._log_dir, f"{self._name}.{self._component_id}.{pid}.triton.log" ) sys.stdout = open(stdout_path, "w", buffering=1) sys.stderr = open(stderr_path, "w", buffering=1) From 991358ba333b53dbed2f2fce67d6f1ab7382630e Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Thu, 16 Jan 2025 08:13:41 -0800 Subject: [PATCH 09/50] updating --- examples/hello_world/single_file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/hello_world/single_file.py b/examples/hello_world/single_file.py index 531c0abb..3ad786f0 100644 --- a/examples/hello_world/single_file.py +++ b/examples/hello_world/single_file.py @@ -186,7 +186,7 @@ async def main(): print("Starting Workers") - 
deployment = Deployment([encoder, (decoder, 500), (encoder_decoder, 100)]) + deployment = Deployment([encoder, (decoder, 10), (encoder_decoder, 10)]) deployment.start() From bd57e10be7860134a6a054de07b3ef331f5e1cee Mon Sep 17 00:00:00 2001 From: Piotr Marcinkiewicz Date: Thu, 16 Jan 2025 19:23:39 +0100 Subject: [PATCH 10/50] docs: Add descriptions for example and main folder --- README.md | 72 ++++++++++++++++++++- examples/hello_world/README.md | 114 +++++++++++++++++++++++++++++++++ 2 files changed, 185 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e0b361dd..1a3a6b22 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,6 @@ See the License for the specific language governing permissions and limitations under the License. --> - # Triton Distributed

A Datacenter Scale Distributed Inference Serving Framework

@@ -86,6 +85,77 @@ Please see the instructions in the corresponding example for specific deployment instructions. + +## 1. Big Picture +Triton Distributed extends the standard [NVIDIA Triton Inference Server](https://github.com/triton-inference-server/server) model-serving paradigm with additional “planes” that distribute data and requests across multiple processes or machines. Conceptually, you still write a Triton **Model**, but your inference requests and data transfers can be routed through: + +1. **Request Plane**: Sends model-inference requests among nodes or processes. +2. **Data Plane**: Moves tensor data and references between processes or GPUs. + +This architecture allows you to build large, multi-process or multi-node solutions for AI model inference without manually managing transport and synchronization. You can share GPU memory references and move them among distributed processes. + +--- + +## 2. Key Components +The repo has four major logical layers: + +1. **ICP (Inter-Component Protocol)**: + - Python modules under `triton_distributed/icp/…` + - Defines how requests and data are serialized and transported. + - Implements **NatsRequestPlane** and **UcpDataPlane**, which are concrete transport/connection classes for requests/data. + - **NatsRequestPlane** uses NATS for distributing requests. + - **UcpDataPlane** uses UCX (libucp) for transferring tensor data, possibly GPU-to-GPU. + +2. **Worker**: + - Python modules under `triton_distributed/worker/…` + - Exposes the concept of an **Operator** (a processing node that can serve one or more Triton models or custom logic). + - Runs the main loop that pulls requests from a Request Plane, processes them, and returns responses. + - Contains a Python “mini” server (the `Worker`) that spawns or manages multiple Operators. + +3. **Integration Tests & Examples**: + - A directory structure with unit tests and integration tests showing how to compose multiple workers. + - The “hello world” example is under `examples/hello_world/`. + +4. **Triton Python Models**: + - Under various directories like `.../operators/triton_core_models/...` or `icp/src/python/triton_distributed/icp/...` + - Typical Triton `model.py` files that define custom Python logic behind each “model.” + + +## 3. ICP Planes & Worker Internals + +### 3.1 Request Plane (NATS) +`NatsRequestPlane` handles distributing requests among processes. Under the hood, it: + +- Connects to a NATS server (which might run in a local Docker container or on a remote host). +- Creates “streams” in NATS for each operator or for direct routing. +- On the “client” side (where you call `post_request`), it publishes request messages to the right NATS subjects. +- On the “server” side (the Worker), it “pulls” requests from NATS subscriptions. + +### 3.2 Data Plane (UCX) +`UcpDataPlane` uses UCX-Py (libucp) to exchange the actual tensor data. By default: + +- When you “put” a tensor, the data plane either: + 1. Embeds small data directly in the message (the “contents” approach), or + 2. If large, stores a reference (GPU or CPU memory) in the local `_tensor_store`, then sends a small “URI” like `ucp://hostname:port/` to the remote side. +- The remote side can do “get_tensor” by connecting to `ucp://hostname:port` and pulling the data. + +This allows distributed GPU memory references with minimal overhead. + +### 3.3 Worker +A `Worker` runs in a separate process. It: + +- Starts or registers Triton model(s). +- Connects to the chosen request plane (NATS) and data plane (UCX). +- Enters a loop: + 1.
`pull_requests` from NATS, + 2. routes them to the correct Operator, + 3. gets the results, + 4. returns them to the request plane. + +In the “hello world,” you see three Worker processes—each hosting either the encoder, decoder, or aggregator operator. + + + + + ``` examples/ └── hello_world From 2d0f1db06b43aafafc228780a829d4fa655f62cd Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Fri, 17 Jan 2025 07:41:49 -0800 Subject: [PATCH 35/50] updating copyright --- .../mock_disaggregated_serving/1/model.py | 39 +++++++------------ .../mock_disaggregated_serving/config.pbtxt | 39 +++++++------------ 2 files changed, 28 insertions(+), 50 deletions(-) diff --git a/worker/tests/python/integration/api_server/models/mock_disaggregated_serving/1/model.py b/worker/tests/python/integration/api_server/models/mock_disaggregated_serving/1/model.py index 247913b6..f6c6a95a 100644 --- a/worker/tests/python/integration/api_server/models/mock_disaggregated_serving/1/model.py +++ b/worker/tests/python/integration/api_server/models/mock_disaggregated_serving/1/model.py @@ -1,28 +1,17 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
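To make the plane descriptions above concrete, here is a minimal client sketch condensed from the `send_requests` helper that appears in patches 06 and 08 of this series. The NATS URL is a placeholder, and the outputs are assumed to land in CPU memory; the full client in the series also handles GPU-resident outputs with cupy.

```python
import asyncio

import numpy
from triton_distributed.icp.nats_request_plane import NatsRequestPlane
from triton_distributed.icp.ucp_data_plane import UcpDataPlane
from triton_distributed.worker import RemoteOperator


async def infer_once(nats_url="nats://localhost:4222"):  # placeholder URL
    # Connect to the request plane (NATS) and the data plane (UCX).
    request_plane = NatsRequestPlane(nats_url)
    data_plane = UcpDataPlane()
    await request_plane.connect()
    data_plane.connect()

    # A RemoteOperator is a client-side handle: requests posted through it
    # are pulled by whichever worker hosts the "encoder_decoder" operator.
    encoder_decoder = RemoteOperator("encoder_decoder", 1, request_plane, data_plane)

    responses = await encoder_decoder.async_infer(
        inputs={"input": numpy.random.randint(0, 100, 10000).astype("int64")},
        request_id="0",
    )
    async for response in responses:
        # Per the data-plane description above, small tensors ride along in
        # the message and large ones are pulled over UCX when accessed.
        for name, value in response.outputs.items():
            print(name, numpy.from_dlpack(value))  # assumes CPU outputs

    await request_plane.close()
    data_plane.close()


if __name__ == "__main__":
    asyncio.run(infer_once())
```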
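On the worker side of that loop, pulled requests are dispatched to an Operator. The skeleton below copies the constructor signature from the `EncodeDecodeOperator` defined earlier in this series; the `EchoOperator` class itself is hypothetical and simply returns its input unchanged.

```python
from triton_distributed.worker import Operator, RemoteInferenceRequest


class EchoOperator(Operator):
    # Workers construct operators with this fixed argument list.
    def __init__(
        self, name, version, triton_core, request_plane,
        data_plane, parameters, repository, logger,
    ):
        self._logger = logger

    async def execute(self, requests: list[RemoteInferenceRequest]):
        for request in requests:
            # Assumes each request carries a tensor named "input", as the
            # example models in this series do; final=True ends the stream.
            await request.response_sender().send(
                final=True, outputs={"output": request.inputs["input"]}
            )
```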
import asyncio import gc diff --git a/worker/tests/python/integration/api_server/models/mock_disaggregated_serving/config.pbtxt b/worker/tests/python/integration/api_server/models/mock_disaggregated_serving/config.pbtxt index 1fc2916e..5c68b52e 100644 --- a/worker/tests/python/integration/api_server/models/mock_disaggregated_serving/config.pbtxt +++ b/worker/tests/python/integration/api_server/models/mock_disaggregated_serving/config.pbtxt @@ -1,28 +1,17 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. name: "mock_disaggregated_serving" backend: "python" From 4a5df201e6a6abbe4797f927226936390e3d0fa8 Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Fri, 17 Jan 2025 07:45:39 -0800 Subject: [PATCH 36/50] moving new additions to comments to allow for functional merge --- README.md | 4 +--- examples/hello_world/README.md | 2 ++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index aa0b9aad..539f7a43 100644 --- a/README.md +++ b/README.md @@ -84,7 +84,7 @@ HF_TOKEN```) and mounts common directories such as ```/tmp:/tmp```, Please see the instructions in the corresponding example for specific deployment instructions. 
- + + From 5a4b97f9361d880b3461e272790e19dc5769d68f Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Fri, 17 Jan 2025 07:55:08 -0800 Subject: [PATCH 37/50] updating copyright and license --- .../triton_core_models/encoder/1/model.py | 35 +++++++------------ 1 file changed, 12 insertions(+), 23 deletions(-) diff --git a/examples/hello_world/operators/triton_core_models/encoder/1/model.py b/examples/hello_world/operators/triton_core_models/encoder/1/model.py index 36a2dbf9..f7b458a6 100644 --- a/examples/hello_world/operators/triton_core_models/encoder/1/model.py +++ b/examples/hello_world/operators/triton_core_models/encoder/1/model.py @@ -1,28 +1,17 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import json import time From 13d5fee4d0657910cc919530227b442786bc7f14 Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Fri, 17 Jan 2025 07:56:25 -0800 Subject: [PATCH 38/50] updated copyright --- .../triton_core_models/encoder/config.pbtxt | 41 ++++++------------- 1 file changed, 12 insertions(+), 29 deletions(-) diff --git a/examples/hello_world/operators/triton_core_models/encoder/config.pbtxt b/examples/hello_world/operators/triton_core_models/encoder/config.pbtxt index 05968c5a..5581461c 100644 --- a/examples/hello_world/operators/triton_core_models/encoder/config.pbtxt +++ b/examples/hello_world/operators/triton_core_models/encoder/config.pbtxt @@ -1,37 +1,20 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
## Model Instance and Kind are filled in by configuration when launched ## All other values are filled in by auto_complete in model.py backend: "python" -# instance_group [ -# { count: {MODEL_INSTANCE_COUNT} -# kind: {MODEL_INSTANCE_KIND} -# } -# ] -# \ No newline at end of file From 4eb9b3c2e1824d7425aa85ca6227285fd41c1c5e Mon Sep 17 00:00:00 2001 From: nnshah1 Date: Fri, 17 Jan 2025 09:33:37 -0800 Subject: [PATCH 39/50] WIP --- README.md | 14 +++ examples/hello_world/README.md | 151 +++++++++++++++++++++++++++++++++ 2 files changed, 165 insertions(+) diff --git a/README.md b/README.md index 539f7a43..7d963e34 100644 --- a/README.md +++ b/README.md @@ -84,6 +84,20 @@ HF_TOKEN```) and mounts common directories such as ```/tmp:/tmp```, Please see the instructions in the corresponding example for specific deployment instructions. +## Hello World + +A basic example demonstrating the new interfaces and concepts of +triton distributed. In the hello world example, you can deploy a set +of simple workers to load balance requests from a local work queue. + +The example demonstrates: + +1. How to incorporate an existing Triton Core Model into a triton distributed worker. +1. How to incorporate a standalone python class into a triton distributed worker. +1. How deploy a set of workers +1. How to send requests to the triton distributed deployment + + +# Hello World + +A basic example demonstrating the new interfaces and concepts of +triton distributed. In the hello world example, you can deploy a set +of simple workers to load balance requests from a local work queue. + +The example demonstrates: + +1. How to incorporate an existing Triton Core Model into a triton distributed worker. +1. How to incorporate a standalone python class into a triton distributed worker. +1. How deploy a set of workers +1. How to send requests to the triton distributed deployment + + +## Building the Hello World Environment + +The hello world example is designed to be deployed in a containerized +environment and to work with and without GPU support. + +To get started build the "STANDARD" triton distributed development +environment. 
+
+Note: "STANDARD" is the default framework.
+
+```
+./containers/build.sh
+```
+
+
+## Starting the Deployment
+
+```
+./containers/run.sh -it -- python3 -m hello_world.deploy --initialize-request-plane
+```
+
+#### Expected Output
+
+
+```
+Starting Workers
+17:17:09 deployment.py:115[triton_distributed.worker.deployment] INFO:
+
+Starting Worker:
+
+    Config:
+    WorkerConfig(request_plane=,
+                 data_plane=,
+                 request_plane_args=([], {}),
+                 data_plane_args=([], {}),
+                 log_level=1,
+                 operators=[OperatorConfig(name='encoder',
+                                           implementation=,
+                                           repository='/workspace/examples/hello_world/operators/triton_core_models',
+                                           version=1,
+                                           max_inflight_requests=1,
+                                           parameters={'config': {'instance_group': [{'count': 1,
+                                                                                      'kind': 'KIND_CPU'}],
+                                                                  'parameters': {'delay': {'string_value': '0'},
+                                                                                 'input_copies': {'string_value': '1'}}}},
+                                           log_level=None)],
+                 triton_log_path=None,
+                 name='encoder.0',
+                 log_dir='/workspace/examples/hello_world/logs',
+                 metrics_port=50000)
+
+
+17:17:09 deployment.py:115[triton_distributed.worker.deployment] INFO:
+
+Starting Worker:
+
+    Config:
+    WorkerConfig(request_plane=,
+                 data_plane=,
+                 request_plane_args=([], {}),
+                 data_plane_args=([], {}),
+                 log_level=1,
+                 operators=[OperatorConfig(name='decoder',
+                                           implementation=,
+                                           repository='/workspace/examples/hello_world/operators/triton_core_models',
+                                           version=1,
+                                           max_inflight_requests=1,
+                                           parameters={'config': {'instance_group': [{'count': 1,
+                                                                                      'kind': 'KIND_CPU'}],
+                                                                  'parameters': {'delay': {'string_value': '0'},
+                                                                                 'input_copies': {'string_value': '1'}}}},
+                                           log_level=None)],
+                 triton_log_path=None,
+                 name='decoder.0',
+                 log_dir='/workspace/examples/hello_world/logs',
+                 metrics_port=50001)
+
+
+17:17:09 deployment.py:115[triton_distributed.worker.deployment] INFO:
+
+Starting Worker:
+
+    Config:
+    WorkerConfig(request_plane=,
+                 data_plane=,
+                 request_plane_args=([], {}),
+                 data_plane_args=([], {}),
+                 log_level=1,
+                 operators=[OperatorConfig(name='encoder_decoder',
+                                           implementation='EncodeDecodeOperator',
+                                           repository='/workspace/examples/hello_world/operators',
+                                           version=1,
+                                           max_inflight_requests=1,
+                                           parameters={},
+                                           log_level=None)],
+                 triton_log_path=None,
+                 name='encoder_decoder.0',
+                 log_dir='/workspace/examples/hello_world/logs',
+                 metrics_port=50002)
+
+
+Workers started ... press Ctrl-C to Exit
+```
+
+## Sending Requests
+
+From a separate terminal, run the sample client.
+
+```
+./containers/run.sh -it -- python3 -m hello_world.client
+```
+
+#### Expected Output
+
+```
+
+Client: 0 Received Response: 42 From: 39491f06-d4f7-11ef-be96-047bcba9020e Error: None: 43%|███████▋  | 43/100 [00:04<00:05,  9.83request/s]
+
+Throughput: 9.10294484748811 Total Time: 10.985455989837646
+Clients Stopped Exit Code 0
+
+
+```
+
+## Behind the Scenes
+
+The hello world example is designed to demonstrate and allow
+experimenting with different mixtures of compute and memory loads and
+different numbers of workers for different parts of the hello world
+pipeline.
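+
+For illustration, here is a minimal sketch of how the knobs visible in
+the `OperatorConfig` output above (the `instance_group` count and the
+`delay` and `input_copies` parameters) could be varied to simulate
+heavier compute and memory loads. The import path is an assumption;
+see `deploy/__main__.py` for how the example actually constructs its
+configuration.
+
+```
+# Hypothetical sketch: field names are taken from the OperatorConfig
+# log output above; the import path is an assumption.
+from triton_distributed.worker import OperatorConfig
+
+# A heavier encoder: two CPU instances, an artificial delay, and four
+# input copies to increase the memory moved per request. The
+# `implementation` field (blank in the log output above) is omitted.
+encoder_op = OperatorConfig(
+    name="encoder",
+    repository="/workspace/examples/hello_world/operators/triton_core_models",
+    version=1,
+    max_inflight_requests=1,
+    parameters={
+        "config": {
+            "instance_group": [{"count": 2, "kind": "KIND_CPU"}],
+            "parameters": {
+                "delay": {"string_value": "0.1"},
+                "input_copies": {"string_value": "4"},
+            },
+        }
+    },
+)
+```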
+
+### Hello World Pipeline
+
+The hello world pipeline is a simple two-stage pipeline with an
+encoding stage and a decoding stage plus a
+
+

diff --git a/examples/hello_world/README.md b/examples/hello_world/README.md
index a3bd89e0..96a3d99c 100644
--- a/examples/hello_world/README.md
+++ b/examples/hello_world/README.md
@@ -158,128 +158,114 @@ Clients Stopped Exit Code 0
 
 The hello world example is designed to demonstrate and allow
 experimenting with different mixtures of compute and memory loads and
 different numbers of workers for different parts of the hello world
-pipeline.
+workflow.
 
-### Hello World Pipeline
+### Hello World Workflow
 
-The hello world pipeline is a simple two-stage pipeline with an
-encoding stage and a decoding stage plus a
-
-
-

From 097250055e650fe3a7943230753e6696a7093aee Mon Sep 17 00:00:00 2001
From: nnshah1
Date: Fri, 17 Jan 2025 11:39:18 -0800
Subject: [PATCH 41/50] updated

---
 README.md                      | 4 ++++
 examples/hello_world/README.md | 3 ++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ad496f36..0a180225 100644
--- a/README.md
+++ b/README.md
@@ -86,6 +86,8 @@ deployment instructions.
 
 ## Hello World
 
+[Hello World](./examples/hello_world)
+
 A basic example demonstrating the new interfaces and concepts of
 triton distributed. In the hello world example, you can deploy a set
 of simple workers to load balance requests from a local work queue.
@@ -96,4 +98,6 @@ The example demonstrates:
 1. How to incorporate a standalone python class into a triton distributed worker.
 1. How to deploy a set of workers
 1. How to send requests to the triton distributed deployment
+1. Requests over the Request Plane and Data movement over the Data
+   Plane.
 
diff --git a/examples/hello_world/README.md b/examples/hello_world/README.md
index 96a3d99c..7a187c12 100644
--- a/examples/hello_world/README.md
+++ b/examples/hello_world/README.md
@@ -27,7 +27,8 @@ The example demonstrates:
 1. How to incorporate a standalone python class into a triton distributed worker.
 1. How to deploy a set of workers
 1. How to send requests to the triton distributed deployment
-
+1. Requests over the Request Plane and Data movement over the Data
+   Plane.
 
 ## Building the Hello World Environment

From 78e9eb4888287e105526cc8b2da02f263cd8eaec Mon Sep 17 00:00:00 2001
From: nnshah1
Date: Fri, 17 Jan 2025 12:17:39 -0800
Subject: [PATCH 42/50] update

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 0a180225..c8703e08 100644
--- a/README.md
+++ b/README.md
@@ -49,7 +49,7 @@ We provide 3 types of builds:
 
 For example, if you want to build a container for the `VLLM` backend you can run
 
-`./container/build.sh --framework VLLM`
+`./container/build.sh`
 
 Please see the instructions in the corresponding example for specific
 build instructions.

From e28d58602ecac3c6d8482da736141cace9e20183 Mon Sep 17 00:00:00 2001
From: Neelay Shah
Date: Fri, 17 Jan 2025 13:09:37 -0800
Subject: [PATCH 43/50] Update examples/hello_world/README.md

Co-authored-by: Tanmay Verma
---
 examples/hello_world/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/hello_world/README.md b/examples/hello_world/README.md
index 7a187c12..5d406034 100644
--- a/examples/hello_world/README.md
+++ b/examples/hello_world/README.md
@@ -41,7 +41,7 @@ environment.
 
 Note: "STANDARD" is the default framework.
 
 ```
-./containers/build.sh
+./container/build.sh
 ```
 

From 6e7eee2bd87f91baf7ff8a56c1290e75512c72b7 Mon Sep 17 00:00:00 2001
From: Neelay Shah
Date: Fri, 17 Jan 2025 13:10:00 -0800
Subject: [PATCH 44/50] Update examples/hello_world/README.md

Co-authored-by: Tanmay Verma
---
 examples/hello_world/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/hello_world/README.md b/examples/hello_world/README.md
index 5d406034..6ca03a23 100644
--- a/examples/hello_world/README.md
+++ b/examples/hello_world/README.md
@@ -48,7 +48,7 @@ Note: "STANDARD" is the default framework.
 ## Starting the Deployment
 
 ```
-./containers/run.sh -it -- python3 -m hello_world.deploy --initialize-request-plane
+./container/run.sh -it -- python3 -m hello_world.deploy --initialize-request-plane
 ```
 
 #### Expected Output

From c447cc5aa843025b5d36c9842f089bcc2752742b Mon Sep 17 00:00:00 2001
From: Neelay Shah
Date: Fri, 17 Jan 2025 13:10:14 -0800
Subject: [PATCH 45/50] Update examples/hello_world/README.md

Co-authored-by: Tanmay Verma
---
 examples/hello_world/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/hello_world/README.md b/examples/hello_world/README.md
index 6ca03a23..494d101d 100644
--- a/examples/hello_world/README.md
+++ b/examples/hello_world/README.md
@@ -139,7 +139,7 @@ Workers started ... press Ctrl-C to Exit
 From a separate terminal, run the sample client.
 
 ```
-./containers/run.sh -it -- python3 -m hello_world.client
+./container/run.sh -it -- python3 -m hello_world.client
 ```
 
 #### Expected Output

From 8ce4d95d5cf459cdcb1a6d25799b422cf79874e4 Mon Sep 17 00:00:00 2001
From: Neelay Shah
Date: Fri, 17 Jan 2025 13:11:37 -0800
Subject: [PATCH 46/50] Update examples/hello_world/README.md

Co-authored-by: Ryan McCormick
---
 examples/hello_world/README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/hello_world/README.md b/examples/hello_world/README.md
index 494d101d..1ce74c2c 100644
--- a/examples/hello_world/README.md
+++ b/examples/hello_world/README.md
@@ -24,10 +24,10 @@ of simple workers to load balance requests from a local work queue.
 The example demonstrates:
 
 1. How to incorporate an existing Triton Core Model into a triton distributed worker.
-1. How to incorporate a standalone python class into a triton distributed worker.
-1. How to deploy a set of workers
-1. How to send requests to the triton distributed deployment
-1. Requests over the Request Plane and Data movement over the Data
+2. How to incorporate a standalone python class into a triton distributed worker.
+3. How to deploy a set of workers
+4. How to send requests to the triton distributed deployment
+5. Requests over the Request Plane and Data movement over the Data
    Plane.
 
 ## Building the Hello World Environment

From 68a335d19791ec2ccd6632ae167397aa859f5433 Mon Sep 17 00:00:00 2001
From: nnshah1
Date: Fri, 17 Jan 2025 13:15:28 -0800
Subject: [PATCH 47/50] updated

---
 README.md | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/README.md b/README.md
index c8703e08..cd97382a 100644
--- a/README.md
+++ b/README.md
@@ -92,12 +92,3 @@ A basic example demonstrating the new interfaces and concepts of
 triton distributed. In the hello world example, you can deploy a set
 of simple workers to load balance requests from a local work queue.
 
-The example demonstrates:
-
-1. How to incorporate an existing Triton Core Model into a triton distributed worker.
-1. How to incorporate a standalone python class into a triton distributed worker.
-1. How to deploy a set of workers
-1. How to send requests to the triton distributed deployment
-1. Requests over the Request Plane and Data movement over the Data
-   Plane.
-

From d53bde56bdf02979872f89e782c642c6e82adb29 Mon Sep 17 00:00:00 2001
From: nnshah1
Date: Fri, 17 Jan 2025 13:18:51 -0800
Subject: [PATCH 48/50] updated

---
 examples/hello_world/README.md | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/examples/hello_world/README.md b/examples/hello_world/README.md
index 1ce74c2c..e03adc78 100644
--- a/examples/hello_world/README.md
+++ b/examples/hello_world/README.md
@@ -170,7 +170,7 @@ orchestrate the overall workflow.
 
 ```
 client <-> encoder_decoder <-> encoder
        |
-       -----<-> decoder
+        -----<-> decoder
 ```
 
@@ -207,17 +207,19 @@ by a common Worker class.
 
 #### Triton Core Operator
 
-The triton core operator makes a triton model (following the standard
-model repo and backend structure of the tritonserver) available on the
-request plane. Both the encoder and decoder are implemented as triton
-python backend models.
+The triton core operator makes a triton model (following the [standard
+model
+repo](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_repository.md)
+and backend structure of the tritonserver) available on the request
+plane. Both the encoder and decoder are implemented as triton python
+backend models.
 
-#### Standalone Operator
+#### Generic Operator
 
-The encoder-decoder operator is a standalone python class that
-implements the Operator interface. Internally it makes remote requests
-to other workers. Generally a standalone operator can make use of
-other operators for its work but isn't required to.
+The encoder-decoder operator is a python class that implements the
+Operator interface. Internally it makes remote requests to other
+workers. Generally an operator can make use of other operators for its
+work but isn't required to.
 
 ### Workers

From 566a7a6cc38ed9d351b4f7c440cf524b2bd849a6 Mon Sep 17 00:00:00 2001
From: nnshah1
Date: Fri, 17 Jan 2025 13:20:29 -0800
Subject: [PATCH 49/50] updating default to standard

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index cd97382a..3e689bc4 100644
--- a/README.md
+++ b/README.md
@@ -47,7 +47,7 @@ We provide 3 types of builds:
 2. `TENSORRTLLM` which includes our TRT-LLM backend
 3. `VLLM` which includes our VLLM backend
 
-For example, if you want to build a container for the `VLLM` backend you can run
+For example, if you want to build a container for the `STANDARD` backends you can run
 
 `./container/build.sh`
 

From aa78878642932d0708b446909e69246532d08bd4 Mon Sep 17 00:00:00 2001
From: nnshah1
Date: Fri, 17 Jan 2025 13:27:07 -0800
Subject: [PATCH 50/50] updating

---
 README.md | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/README.md b/README.md
index 3e689bc4..5a4a3fa2 100644
--- a/README.md
+++ b/README.md
@@ -92,3 +92,14 @@ A basic example demonstrating the new interfaces and concepts of
 triton distributed. In the hello world example, you can deploy a set
 of simple workers to load balance requests from a local work queue.
 
+# Disclaimers
+
+> [!NOTE]
+> This project is currently in the alpha / experimental /
+> rapid-prototyping stage and we will be adding new features incrementally.
+
+1. The `TENSORRTLLM` and `VLLM` containers are WIP and not expected to
+   work out of the box.
+
+2. Testing has primarily been on single-node systems with processes
+   launched within a single container.
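
As a closing illustration of the "Generic Operator" concept introduced
in PATCH 48, the sketch below shows the general shape of an operator
such as the `EncodeDecodeOperator` named in the deployment output
above. The `Operator` base interface and every method and parameter
name here are assumptions for illustration only; the README establishes
just the behavior: the operator implements the Operator interface and
internally makes remote requests to the encoder and decoder workers.

```
# Hypothetical sketch only: the Operator interface's real method names
# are not shown in this patch series, so everything below is assumed.
class EncodeDecodeOperator:
    def __init__(self, request_plane, data_plane):
        # Handles for reaching other workers (request plane) and for
        # moving tensor contents between workers (data plane).
        self._request_plane = request_plane
        self._data_plane = data_plane

    async def execute(self, requests):
        # Forward each input to the encoder worker, then feed the
        # encoded result to the decoder worker, mirroring the
        # client <-> encoder_decoder <-> encoder/decoder flow above.
        for request in requests:
            encoded = await self._call_remote("encoder", request.inputs)
            decoded = await self._call_remote("decoder", encoded)
            await request.respond(decoded, final=True)

    async def _call_remote(self, operator_name, inputs):
        # Placeholder for a request-plane round trip to another worker;
        # the actual API lives in the triton_distributed packages.
        ...
```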