From 57ab1145e3b6a60fc6880370961958130e30b530 Mon Sep 17 00:00:00 2001 From: Francesco Petrini Date: Tue, 10 Dec 2024 09:59:28 -0800 Subject: [PATCH 1/8] Pin Pybind Version (#418) * aligning the pybind versions to v2.12 for core and python_backend * Review comment --------- Co-authored-by: Kyle McGill --- python/tritonserver/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tritonserver/CMakeLists.txt b/python/tritonserver/CMakeLists.txt index c958619d8..c06292451 100644 --- a/python/tritonserver/CMakeLists.txt +++ b/python/tritonserver/CMakeLists.txt @@ -43,8 +43,8 @@ include(FetchContent) FetchContent_Declare( pybind11 GIT_REPOSITORY "https://github.com/pybind/pybind11" - # COMMIT ID for v2.10.0 - GIT_TAG "aa304c9c7d725ffb9d10af08a3b34cb372307020" + # COMMIT ID for v2.12.0 + GIT_TAG "3e9dfa2866941655c56877882565e7577de6fc7b" GIT_SHALLOW ON ) FetchContent_MakeAvailable(pybind11) From 8030611f0da50ef541e06729a3dc170cf0040a34 Mon Sep 17 00:00:00 2001 From: pranavm-nvidia <49246958+pranavm-nvidia@users.noreply.github.com> Date: Fri, 13 Dec 2024 17:38:56 -0500 Subject: [PATCH 2/8] ci: Add GitHub action for core build and python testing in pull requests (#416) --- .github/workflows/build-and-test.yml | 40 ++++++++++++++++++++++++++++ python/test/test_api.py | 6 +---- src/test/input_byte_size_test.cc | 2 +- 3 files changed, 42 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/build-and-test.yml diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml new file mode 100644 index 000000000..26bbcfd21 --- /dev/null +++ b/.github/workflows/build-and-test.yml @@ -0,0 +1,40 @@ +name: Build And Test + +on: + pull_request: + branches: + - main + types: [synchronize, opened, reopened, ready_for_review] + + +jobs: + test: + runs-on: ubuntu-latest + container: + image: nvcr.io/nvidia/tritonserver:24.10-py3 + volumes: + - ${{ github.workspace }}:/core + + steps: + - uses: actions/checkout@v3 + + - name: Install dependencies + run: | + apt update + apt install -y --no-install-recommends clang-format-15 cmake libb64-dev rapidjson-dev libre2-dev + wget -O /tmp/boost.tar.gz https://archives.boost.io/release/1.80.0/source/boost_1_80_0.tar.gz && (cd /tmp && tar xzf boost.tar.gz) && mv /tmp/boost_1_80_0/boost /usr/include/boost + pip install build pytest + + - name: Build + run: | + mkdir -p /core/build + cd /core/build + cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install -DTRITON_CORE_HEADERS_ONLY=OFF .. + export TRITON_PYBIND="_c/triton_bindings.cpython-310-x86_64-linux-gnu.so" + make -j8 + + - name: Run tests with pytest + run: | + cd /core + python3 -m pip install --force-reinstall build/python/generic/wheel/dist/tritonserver-*.whl + pytest python/test -v diff --git a/python/test/test_api.py b/python/test/test_api.py index c15847aab..af910e71d 100644 --- a/python/test/test_api.py +++ b/python/test/test_api.py @@ -357,11 +357,7 @@ def test_stop(self): { "backend": "python", "parameters": {"decoupled": {"string_value": "False"}}, - # Keep instance count low for fast startup/cleanup. - # Alternatively can use KIND_CPU here, but keeping gpus/count explicit. 
- "instance_group": [ - {"kind": "KIND_GPU", "gpus": [0], "count": 1} - ], + "instance_group": [{"kind": "KIND_CPU"}], } ) }, diff --git a/src/test/input_byte_size_test.cc b/src/test/input_byte_size_test.cc index cf3e3bd58..1774fe7b4 100644 --- a/src/test/input_byte_size_test.cc +++ b/src/test/input_byte_size_test.cc @@ -378,7 +378,7 @@ TEST_F(InputByteSizeTest, InputByteSizeLarge) "setting request release callback"); // Define input shape and data - size_t element_cnt = (1LL << 31) / sizeof(float); + int64_t element_cnt = (1LL << 31) / sizeof(float); std::vector shape{1, element_cnt}; std::vector input_data(element_cnt, 1); const auto input0_byte_size = sizeof(input_data[0]) * input_data.size(); From 132f1d47864b8c29e37f0ad4e1752e64c8b676dc Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Fri, 13 Dec 2024 14:52:08 -0800 Subject: [PATCH 3/8] fix: Add missing struct keywords to fix support for auto-generated rust bindings from C APIs (#417) --- include/triton/core/tritonserver.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/triton/core/tritonserver.h b/include/triton/core/tritonserver.h index d9701e890..efbac44be 100644 --- a/include/triton/core/tritonserver.h +++ b/include/triton/core/tritonserver.h @@ -847,9 +847,9 @@ TRITONSERVER_InferenceTraceTensorNew( /// \param timestamp The timestamp associated with the trace activity. /// \param name The trace activity name. /// \return a TRITONSERVER_Error indicating success or failure. -TRITONSERVER_DECLSPEC TRITONSERVER_Error* +TRITONSERVER_DECLSPEC struct TRITONSERVER_Error* TRITONSERVER_InferenceTraceReportActivity( - TRITONSERVER_InferenceTrace* trace, uint64_t timestamp, + struct TRITONSERVER_InferenceTrace* trace, uint64_t timestamp, const char* activity_name); /// Delete a trace object. @@ -1938,9 +1938,9 @@ TRITONSERVER_ServerOptionsSetCudaMemoryPoolByteSize( /// \param gpu_device The GPU device to set the CUDA virtual address space size /// \param size The size of the CUDA virtual address space. /// \return a TRITONSERVER_Error indicating success or failure. -TRITONSERVER_DECLSPEC TRITONSERVER_Error* +TRITONSERVER_DECLSPEC struct TRITONSERVER_Error* TRITONSERVER_ServerOptionsSetCudaVirtualAddressSize( - TRITONSERVER_ServerOptions* options, int gpu_device, + struct TRITONSERVER_ServerOptions* options, int gpu_device, size_t cuda_virtual_address_size); /// Deprecated. See TRITONSERVER_ServerOptionsSetCacheConfig instead. From f7ff33f61f527a22c71387665aba41488324a4a5 Mon Sep 17 00:00:00 2001 From: pranavm-nvidia <49246958+pranavm-nvidia@users.noreply.github.com> Date: Fri, 13 Dec 2024 18:51:47 -0500 Subject: [PATCH 4/8] refactor: Migrates Python tests to pytest (#413) * Migrates test_api.py to pytest Migrates test_api.py to pytest and removes `unittest`. Changes setup functions to fixtures and replaces `unittest` assertion methods with regular Python `assert`s. Also lowers the timeout for the server to make the tests run a bit faster. * Updates test_binding.py to use pytest Replaces all `unittest` APIs with equivalent `pytest` ones. This change also updates the tests to use `tempfile` instead of manually creating and removing files and directories. 
* Parametrizes tests instead of running loops * Updates L0 job to free space on host --- .github/workflows/build-and-test.yml | 14 +- python/test/test_api.py | 225 +++++++--------- python/test/test_binding.py | 375 +++++++++++++-------------- 3 files changed, 294 insertions(+), 320 deletions(-) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 26bbcfd21..2ab3d0658 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -14,15 +14,27 @@ jobs: image: nvcr.io/nvidia/tritonserver:24.10-py3 volumes: - ${{ github.workspace }}:/core + # Mount /usr so we can free space + - /usr:/host_usr + env: + AGENT_TOOLSDIRECTORY: "$AGENT_TOOLSDIRECTORY" steps: - uses: actions/checkout@v3 + - name: Free space + run: | + rm -rf \ + /host_usr/share/dotnet /host_usr/local/lib/android /opt/ghc \ + /host_usr/local/share/powershell /host_usr/share/swift /host_usr/local/.ghcup \ + /host_usr/lib/jvm + rm -rf "$AGENT_TOOLSDIRECTORY" + - name: Install dependencies run: | apt update apt install -y --no-install-recommends clang-format-15 cmake libb64-dev rapidjson-dev libre2-dev - wget -O /tmp/boost.tar.gz https://archives.boost.io/release/1.80.0/source/boost_1_80_0.tar.gz && (cd /tmp && tar xzf boost.tar.gz) && mv /tmp/boost_1_80_0/boost /usr/include/boost + wget -O /tmp/boost.tar.gz https://archives.boost.io/release/1.80.0/source/boost_1_80_0.tar.gz && (cd /tmp && tar xzf boost.tar.gz) && mv /tmp/boost_1_80_0/boost /usr/include/boost && rm /tmp/boost.tar.gz pip install build pytest - name: Build diff --git a/python/test/test_api.py b/python/test/test_api.py index af910e71d..68aa7a318 100644 --- a/python/test/test_api.py +++ b/python/test/test_api.py @@ -24,14 +24,9 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import asyncio -import copy import json import os -import queue import shutil -import time -import unittest import numpy import pytest @@ -50,46 +45,44 @@ except ImportError: torch = None -module_directory = os.path.split(os.path.abspath(__file__))[0] -test_model_directory = os.path.abspath( - os.path.join(module_directory, "test_api_models") -) -test_logs_directory = os.path.abspath(os.path.join(module_directory, "test_api_logs")) - -shutil.rmtree(test_logs_directory, ignore_errors=True) - -os.makedirs(test_logs_directory) - -server_options = tritonserver.Options( - server_id="TestServer", - model_repository=test_model_directory, - log_verbose=6, - log_error=True, - log_warn=True, - log_info=True, - exit_on_error=True, - strict_model_config=False, - model_control_mode=tritonserver.ModelControlMode.EXPLICIT, - exit_timeout=30, -) - - -class ModelTests(unittest.TestCase): - def setup_method(self, method): - self._server_options = copy.copy(server_options) - self._server_options.log_file = os.path.join( - test_logs_directory, method.__name__ + ".server.log" - ) +TEST_ROOT = os.path.abspath(os.path.dirname(__file__)) +TEST_MODEL_DIR = os.path.abspath(os.path.join(TEST_ROOT, "test_api_models")) +TEST_LOGS_DIR = os.path.abspath(os.path.join(TEST_ROOT, "test_api_logs")) + + +@pytest.fixture(autouse=True, scope="module") +def create_log_dir(): + shutil.rmtree(TEST_LOGS_DIR, ignore_errors=True) + os.makedirs(TEST_LOGS_DIR) + + +@pytest.fixture() +def server_options(request): + return tritonserver.Options( + server_id="TestServer", + model_repository=TEST_MODEL_DIR, + log_verbose=6, + log_error=True, + log_warn=True, + log_info=True, + exit_on_error=True, + strict_model_config=False, + model_control_mode=tritonserver.ModelControlMode.EXPLICIT, + exit_timeout=5, + log_file=os.path.join(TEST_LOGS_DIR, request.node.name + ".server.log"), + ) + - def test_create_request(self): - server = tritonserver.Server(self._server_options).start(wait_until_ready=True) +class TestModels: + def test_create_request(self, server_options): + server = tritonserver.Server(server_options).start(wait_until_ready=True) request = server.models()["test"].create_request() request = tritonserver.InferenceRequest(server.model("test")) -class AllocatorTests(unittest.TestCase): +class TestAllocators: class MockMemoryAllocator(tritonserver.MemoryAllocator): def __init__(self): pass @@ -97,17 +90,11 @@ def __init__(self): def allocate(self, *args, **kwargs): raise Exception("foo") - def setup_method(self, method): - self._server_options = copy.copy(server_options) - self._server_options.log_file = os.path.join( - test_logs_directory, method.__name__ + ".server.log" - ) - @pytest.mark.skipif(cupy is None, reason="Skipping gpu memory, cupy not installed") - def test_memory_fallback_to_cpu(self): - server = tritonserver.Server(self._server_options).start(wait_until_ready=True) + def test_memory_fallback_to_cpu(self, server_options): + server = tritonserver.Server(server_options).start(wait_until_ready=True) - self.assertTrue(server.ready()) + assert server.ready() allocator = tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU] @@ -133,18 +120,19 @@ def test_memory_fallback_to_cpu(self): for response in server.model("test").infer( inputs={"fp16_input": fp16_input}, ): - self.assertEqual( - response.outputs["fp16_output"].memory_type, tritonserver.MemoryType.CPU + assert ( + response.outputs["fp16_output"].memory_type + == tritonserver.MemoryType.CPU ) fp16_output = numpy.from_dlpack(response.outputs["fp16_output"]) - 
self.assertEqual(fp16_input[0][0], fp16_output[0][0]) + assert fp16_input[0][0] == fp16_output[0][0] tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU] = allocator - def test_memory_allocator_exception(self): - server = tritonserver.Server(self._server_options).start(wait_until_ready=True) + def test_memory_allocator_exception(self, server_options): + server = tritonserver.Server(server_options).start(wait_until_ready=True) - self.assertTrue(server.ready()) + assert server.ready() server.load( "test", @@ -158,20 +146,20 @@ def test_memory_allocator_exception(self): }, ) - with self.assertRaises(tritonserver.InternalError): + with pytest.raises(tritonserver.InternalError): for response in server.model("test").infer( inputs={ "string_input": tritonserver.Tensor.from_string_array([["hello"]]) }, output_memory_type="gpu", - output_memory_allocator=AllocatorTests.MockMemoryAllocator(), + output_memory_allocator=TestAllocators.MockMemoryAllocator(), ): pass - def test_unsupported_memory_type(self): - server = tritonserver.Server(self._server_options).start(wait_until_ready=True) + def test_unsupported_memory_type(self, server_options): + server = tritonserver.Server(server_options).start(wait_until_ready=True) - self.assertTrue(server.ready()) + assert server.ready() server.load( "test", @@ -194,7 +182,7 @@ def test_unsupported_memory_type(self): else: allocator = None - with self.assertRaises(tritonserver.InvalidArgumentError): + with pytest.raises(tritonserver.InvalidArgumentError): for response in server.model("test").infer( inputs={ "string_input": tritonserver.Tensor.from_string_array([["hello"]]) @@ -218,7 +206,7 @@ def test_allocate_on_cpu_and_reshape(self): cpu_array = memory_buffer.owner - self.assertEqual(memory_buffer.size, 200) + assert memory_buffer.size == 200 fp32_size = int(memory_buffer.size / 4) @@ -227,16 +215,13 @@ def test_allocate_on_cpu_and_reshape(self): ) cpu_fp32_array = numpy.from_dlpack(tensor) - self.assertEqual(cpu_array.ctypes.data, cpu_fp32_array.ctypes.data) - self.assertEqual(cpu_fp32_array.dtype, numpy.float32) - self.assertEqual(cpu_fp32_array.nbytes, 200) + assert cpu_array.ctypes.data == cpu_fp32_array.ctypes.data + assert cpu_fp32_array.dtype == numpy.float32 + assert cpu_fp32_array.nbytes == 200 @pytest.mark.skipif(cupy is None, reason="Skipping gpu memory, cupy not installed") @pytest.mark.skipif(torch is None, reason="Skipping test, torch not installed") def test_allocate_on_gpu_and_reshape(self): - if cupy is None: - return - allocator = tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU] memory_buffer = allocator.allocate( @@ -248,7 +233,7 @@ def test_allocate_on_gpu_and_reshape(self): gpu_array = cupy.empty([10, 20], dtype=cupy.uint8) memory_buffer = tritonserver.MemoryBuffer.from_dlpack(gpu_array) - self.assertEqual(memory_buffer.size, 200) + assert memory_buffer.size == 200 fp32_size = int(memory_buffer.size / 4) @@ -257,55 +242,51 @@ def test_allocate_on_gpu_and_reshape(self): ) gpu_fp32_array = cupy.from_dlpack(tensor) - self.assertEqual( - gpu_array.__cuda_array_interface__["data"][0], - gpu_fp32_array.__cuda_array_interface__["data"][0], + assert ( + gpu_array.__cuda_array_interface__["data"][0] + == gpu_fp32_array.__cuda_array_interface__["data"][0] ) - self.assertEqual(gpu_fp32_array.dtype, cupy.float32) - self.assertEqual(gpu_fp32_array.nbytes, 200) + + assert gpu_fp32_array.dtype == cupy.float32 + assert gpu_fp32_array.nbytes == 200 torch_fp32_tensor = torch.from_dlpack(tensor) - 
self.assertEqual(torch_fp32_tensor.dtype, torch.float32) - self.assertEqual( - torch_fp32_tensor.data_ptr(), gpu_array.__cuda_array_interface__["data"][0] + assert torch_fp32_tensor.dtype == torch.float32 + assert ( + torch_fp32_tensor.data_ptr() + == gpu_array.__cuda_array_interface__["data"][0] ) - self.assertEqual(torch_fp32_tensor.nbytes, 200) + assert torch_fp32_tensor.nbytes == 200 -class TensorTests(unittest.TestCase): +class TestTensor: @pytest.mark.skipif(cupy is None, reason="Skipping gpu memory, cupy not installed") def test_cpu_to_gpu(self): - if cupy is None: - return cpu_array = numpy.random.rand(1, 3, 100, 100).astype(numpy.float32) cpu_tensor = tritonserver.Tensor.from_dlpack(cpu_array) gpu_tensor = cpu_tensor.to_device("gpu:0") gpu_array = cupy.from_dlpack(gpu_tensor) - self.assertEqual(gpu_array.device, cupy.cuda.Device(0)) + assert gpu_array.device == cupy.cuda.Device(0) numpy.testing.assert_array_equal(cpu_array, gpu_array.get()) memory_buffer = tritonserver.MemoryBuffer.from_dlpack(gpu_array) - self.assertEqual( - gpu_array.__cuda_array_interface__["data"][0], memory_buffer.data_ptr - ) + assert gpu_array.__cuda_array_interface__["data"][0] == memory_buffer.data_ptr @pytest.mark.skipif( torch is None, reason="Skipping gpu memory, torch not installed" ) @pytest.mark.skipif(cupy is None, reason="Skipping gpu memory, cupy not installed") def test_gpu_tensor_from_dl_pack(self): - if cupy is None or torch is None: - return cupy_array = cupy.ones([100]).astype(cupy.float64) tensor = tritonserver.Tensor.from_dlpack(cupy_array) torch_tensor = torch.from_dlpack(cupy_array) - self.assertEqual(torch_tensor.data_ptr(), tensor.data_ptr) - self.assertEqual(torch_tensor.nbytes, tensor.size) - self.assertEqual(torch_tensor.__dlpack_device__(), tensor.__dlpack_device__()) + assert torch_tensor.data_ptr() == tensor.data_ptr + assert torch_tensor.nbytes == tensor.size + assert torch_tensor.__dlpack_device__() == tensor.__dlpack_device__() @pytest.mark.skipif(torch is None, reason="Skipping test, torch not installed") def test_tensor_from_numpy(self): @@ -313,42 +294,36 @@ def test_tensor_from_numpy(self): tensor = tritonserver.Tensor.from_dlpack(cpu_array) torch_tensor = torch.from_dlpack(tensor) numpy.testing.assert_array_equal(torch_tensor.numpy(), cpu_array) - self.assertEqual(torch_tensor.data_ptr(), cpu_array.ctypes.data) - + assert torch_tensor.data_ptr() == cpu_array.ctypes.data -class ServerTests(unittest.TestCase): - def setup_method(self, method): - self._server_options = copy.copy(server_options) - self._server_options.log_file = os.path.join( - test_logs_directory, method.__name__ + ".server.log" - ) +class TestServer: def test_not_started(self): server = tritonserver.Server() - with self.assertRaises(tritonserver.InvalidArgumentError): + with pytest.raises(tritonserver.InvalidArgumentError): server.ready() def test_invalid_option_type(self): server = tritonserver.Server(server_id=1) - with self.assertRaises(TypeError): + with pytest.raises(TypeError): server.start() server = tritonserver.Server(model_repository=1) - with self.assertRaises(TypeError): + with pytest.raises(TypeError): server.start() def test_invalid_repo(self): - with self.assertRaises(tritonserver.InternalError): + with pytest.raises(tritonserver.InternalError): tritonserver.Server(model_repository="foo").start() - def test_ready(self): - server = tritonserver.Server(self._server_options).start() - self.assertTrue(server.ready()) + def test_ready(self, server_options): + server = 
tritonserver.Server(server_options).start() + assert server.ready() - def test_stop(self): - server = tritonserver.Server(self._server_options).start(wait_until_ready=True) + def test_stop(self, server_options): + server = tritonserver.Server(server_options).start(wait_until_ready=True) - self.assertTrue(server.ready()) + assert server.ready() server.load( "test", @@ -376,22 +351,16 @@ def test_stop(self): server.stop() def test_model_repository_not_specified(self): - with self.assertRaises(tritonserver.InvalidArgumentError): + with pytest.raises(tritonserver.InvalidArgumentError): tritonserver.Server(model_repository=None).start() -class InferenceTests(unittest.TestCase): - def setup_method(self, method): - self._server_options = copy.copy(server_options) - self._server_options.log_file = os.path.join( - test_logs_directory, method.__name__ + ".server.log" - ) - +class TestInference: @pytest.mark.skipif(cupy is None, reason="Skipping gpu memory, cupy not installed") - def test_gpu_output(self): - server = tritonserver.Server(self._server_options).start(wait_until_ready=True) + def test_gpu_output(self, server_options): + server = tritonserver.Server(server_options).start(wait_until_ready=True) - self.assertTrue(server.ready()) + assert server.ready() server.load( "test", @@ -412,14 +381,14 @@ def test_gpu_output(self): output_memory_type="gpu", ): fp16_output = cupy.from_dlpack(response.outputs["fp16_output"]) - self.assertEqual(fp16_input[0][0], fp16_output[0][0]) + assert fp16_input[0][0] == fp16_output[0][0] for response in server.model("test").infer( inputs={"string_input": [["hello"]]}, output_memory_type="gpu", ): text_output = response.outputs["string_output"].to_string_array() - self.assertEqual(text_output[0][0], "hello") + assert text_output[0][0] == "hello" for response in server.model("test").infer( inputs={"string_input": tritonserver.Tensor.from_string_array([["hello"]])}, @@ -427,12 +396,12 @@ def test_gpu_output(self): ): text_output = response.outputs["string_output"].to_string_array() text_output = response.outputs["string_output"].to_string_array() - self.assertEqual(text_output[0][0], "hello") + assert text_output[0][0] == "hello" - def test_basic_inference(self): - server = tritonserver.Server(self._server_options).start(wait_until_ready=True) + def test_basic_inference(self, server_options): + server = tritonserver.Server(server_options).start(wait_until_ready=True) - self.assertTrue(server.ready()) + assert server.ready() server.load( "test", @@ -475,10 +444,10 @@ def test_basic_inference(self): ) numpy.testing.assert_array_equal(input_value, output_value) - def test_parameters(self): - server = tritonserver.Server(self._server_options).start(wait_until_ready=True) + def test_parameters(self, server_options): + server = tritonserver.Server(server_options).start(wait_until_ready=True) - self.assertTrue(server.ready()) + assert server.ready() server.load( "test", @@ -513,7 +482,7 @@ def test_parameters(self): ) assert input_parameters == output_parameters - with self.assertRaises(tritonserver.InvalidArgumentError): + with pytest.raises(tritonserver.InvalidArgumentError): input_parameters = { "invalid": {"test": 1}, } @@ -525,7 +494,7 @@ def test_parameters(self): raise_on_error=True, ) - with self.assertRaises(tritonserver.InvalidArgumentError): + with pytest.raises(tritonserver.InvalidArgumentError): input_parameters = { "invalid": None, } diff --git a/python/test/test_binding.py b/python/test/test_binding.py index 8f084bec5..143e55f50 100644 --- 
a/python/test/test_binding.py +++ b/python/test/test_binding.py @@ -28,10 +28,11 @@ import json import os import queue -import shutil -import unittest +import re +import tempfile import numpy +import pytest from tritonserver import _c as triton_bindings @@ -226,22 +227,18 @@ def execute(self, requests): # ======================================= Test cases =========================== -class BindingTest(unittest.TestCase): - def setUp(self): - self._test_model_repo = os.path.join(os.getcwd(), "binding_test_repo") - # clear model repository that may be created for testing. - if os.path.exists(self._test_model_repo): - shutil.rmtree(self._test_model_repo) - os.makedirs(self._test_model_repo) - self._model_name = "addsub" - self._version = "1" - self._file_name = "model.py" - - def tearDown(self): +class TestBindings: + @pytest.fixture(autouse=True, scope="function") + def model_repo(self): + with tempfile.TemporaryDirectory() as repo: + self._test_model_repo = repo + self._model_name = "addsub" + self._version = "1" + self._file_name = "model.py" + + yield + gc.collect() - # clear model repository that may be created for testing. - if os.path.exists(self._test_model_repo): - shutil.rmtree(self._test_model_repo) # helper functions def _to_pyobject(self, triton_message): @@ -315,8 +312,9 @@ def _prepare_inference_request(self, server): return request, allocator, response_queue, request_counter - def test_exceptions(self): - ex_list = [ + @pytest.mark.parametrize( + "ex_type", + [ triton_bindings.UnknownError, triton_bindings.InternalError, triton_bindings.NotFoundError, @@ -324,15 +322,15 @@ def test_exceptions(self): triton_bindings.UnavailableError, triton_bindings.UnsupportedError, triton_bindings.AlreadyExistsError, - ] - for ex_type in ex_list: - with self.assertRaises(triton_bindings.TritonError) as ctx: - raise ex_type("Error message") - self.assertTrue(isinstance(ctx.exception, ex_type)) - self.assertEqual(str(ctx.exception), "Error message") - - def test_data_type(self): - t_list = [ + ], + ) + def test_exceptions(self, ex_type): + with pytest.raises(ex_type, match="Error message") as ctx: + raise ex_type("Error message") + + @pytest.mark.parametrize( + "t, t_str, t_size", + [ (triton_bindings.TRITONSERVER_DataType.INVALID, "", 0), (triton_bindings.TRITONSERVER_DataType.BOOL, "BOOL", 1), (triton_bindings.TRITONSERVER_DataType.UINT8, "UINT8", 1), @@ -348,31 +346,35 @@ def test_data_type(self): (triton_bindings.TRITONSERVER_DataType.FP64, "FP64", 8), (triton_bindings.TRITONSERVER_DataType.BYTES, "BYTES", 0), (triton_bindings.TRITONSERVER_DataType.BF16, "BF16", 2), - ] - - for t, t_str, t_size in t_list: - self.assertEqual(triton_bindings.TRITONSERVER_DataTypeString(t), t_str) - self.assertEqual(triton_bindings.TRITONSERVER_StringToDataType(t_str), t) - self.assertEqual(triton_bindings.TRITONSERVER_DataTypeByteSize(t), t_size) - - def test_memory_type(self): - t_list = [ + ], + ) + def test_data_type(self, t, t_str, t_size): + assert triton_bindings.TRITONSERVER_DataTypeString(t) == t_str + assert triton_bindings.TRITONSERVER_StringToDataType(t_str) == t + assert triton_bindings.TRITONSERVER_DataTypeByteSize(t) == t_size + + @pytest.mark.parametrize( + "t, t_str", + [ (triton_bindings.TRITONSERVER_MemoryType.CPU, "CPU"), (triton_bindings.TRITONSERVER_MemoryType.CPU_PINNED, "CPU_PINNED"), (triton_bindings.TRITONSERVER_MemoryType.GPU, "GPU"), - ] - for t, t_str in t_list: - self.assertEqual(triton_bindings.TRITONSERVER_MemoryTypeString(t), t_str) - - def test_parameter_type(self): - t_list = 
[ + ], + ) + def test_memory_type(self, t, t_str): + assert triton_bindings.TRITONSERVER_MemoryTypeString(t) == t_str + + @pytest.mark.parametrize( + "t, t_str", + [ (triton_bindings.TRITONSERVER_ParameterType.STRING, "STRING"), (triton_bindings.TRITONSERVER_ParameterType.INT, "INT"), (triton_bindings.TRITONSERVER_ParameterType.BOOL, "BOOL"), (triton_bindings.TRITONSERVER_ParameterType.BYTES, "BYTES"), - ] - for t, t_str in t_list: - self.assertEqual(triton_bindings.TRITONSERVER_ParameterTypeString(t), t_str) + ], + ) + def test_parameter_type(self, t, t_str): + assert triton_bindings.TRITONSERVER_ParameterTypeString(t) == t_str def test_parameter(self): # C API doesn't provide additional API for parameter, can only test @@ -389,17 +391,17 @@ def test_parameter(self): del bytes_param gc.collect() - def test_instance_kind(self): - t_list = [ + @pytest.mark.parametrize( + "t, t_str", + [ (triton_bindings.TRITONSERVER_InstanceGroupKind.AUTO, "AUTO"), (triton_bindings.TRITONSERVER_InstanceGroupKind.CPU, "CPU"), (triton_bindings.TRITONSERVER_InstanceGroupKind.GPU, "GPU"), (triton_bindings.TRITONSERVER_InstanceGroupKind.MODEL, "MODEL"), - ] - for t, t_str in t_list: - self.assertEqual( - triton_bindings.TRITONSERVER_InstanceGroupKindString(t), t_str - ) + ], + ) + def test_instance_kind(self, t, t_str): + assert triton_bindings.TRITONSERVER_InstanceGroupKindString(t) == t_str def test_log(self): # This test depends on 'TRITONSERVER_ServerOptions' operates properly @@ -426,7 +428,7 @@ def test_log(self): (triton_bindings.TRITONSERVER_LogLevel.ERROR, True), (triton_bindings.TRITONSERVER_LogLevel.VERBOSE, False), ]: - self.assertEqual(triton_bindings.TRITONSERVER_LogIsEnabled(ll), enabled) + assert triton_bindings.TRITONSERVER_LogIsEnabled(ll) == enabled # Write message to each of the log level triton_bindings.TRITONSERVER_LogMessage( triton_bindings.TRITONSERVER_LogLevel.INFO, @@ -455,14 +457,14 @@ def test_log(self): with open(log_file, "r") as f: log = f.read() # Check level - self.assertRegex(log, r"filename:123.*info_message") - self.assertNotRegex(log, r"filename:456.*warn_message") - self.assertRegex(log, r"filename:789.*error_message") - self.assertNotRegex(log, r"filename:147.*verbose_message") + assert re.search(r"filename:123.*info_message", log) + assert not re.search(r"filename:456.*warn_message", log) + assert re.search(r"filename:789.*error_message", log) + assert not re.search(r"filename:147.*verbose_message", log) # Check format "MMDD hh:mm:ss.ssssss". 
- self.assertRegex(log, default_format_regex) + assert re.search(default_format_regex, log) # sanity check that there is no log with other format "YYYY-MM-DDThh:mm:ssZ L" - self.assertNotRegex(log, iso8601_format_regex) + assert not re.search(iso8601_format_regex, log) # Test different format options.set_log_format(triton_bindings.TRITONSERVER_LogFormat.ISO8601) triton_bindings.TRITONSERVER_LogMessage( @@ -470,8 +472,8 @@ def test_log(self): ) with open(log_file, "r") as f: log = f.read() - self.assertRegex(log, r"fn:258.*info_message") - self.assertRegex(log, iso8601_format_regex) + assert re.search(r"fn:258.*info_message", log) + assert re.search(iso8601_format_regex, log) finally: # Must make sure the log settings are reset as the logger is unique # within the process @@ -489,11 +491,11 @@ def test_buffer_attributes(self): expected_byte_size = 1024 buffer_attributes = triton_bindings.TRITONSERVER_BufferAttributes() buffer_attributes.memory_type_id = expected_memory_type_id - self.assertEqual(buffer_attributes.memory_type_id, expected_memory_type_id) + assert buffer_attributes.memory_type_id == expected_memory_type_id buffer_attributes.memory_type = expected_memory_type - self.assertEqual(buffer_attributes.memory_type, expected_memory_type) + assert buffer_attributes.memory_type == expected_memory_type buffer_attributes.byte_size = expected_byte_size - self.assertEqual(buffer_attributes.byte_size, expected_byte_size) + assert buffer_attributes.byte_size == expected_byte_size # cuda_ipc_handle is supposed to be cudaIpcMemHandle_t, must initialize buffer # of that size to avoid segfault. The handle getter/setter is different from other # attributes that different pointers may be returned from the getter, but the byte @@ -508,7 +510,7 @@ def test_buffer_attributes(self): buffer_attributes.cuda_ipc_handle ) for i in range(handle_byte_size): - self.assertEqual(int.from_bytes(res_arr[i], "big"), mock_handle[i]) + assert int.from_bytes(res_arr[i], "big") == mock_handle[i] def test_allocator(self): def alloc_fn( @@ -554,7 +556,7 @@ def buffer_fn( def test_message(self): expected_dict = {"key_0": [1, 2, "3"], "key_1": {"nested_key": "nested_value"}} message = triton_bindings.TRITONSERVER_Message(json.dumps(expected_dict)) - self.assertEqual(expected_dict, json.loads(message.serialize_to_json())) + assert expected_dict == json.loads(message.serialize_to_json()) def test_metrics(self): # This test depends on 'TRITONSERVER_Server' operates properly @@ -570,35 +572,39 @@ def test_metrics(self): server = triton_bindings.TRITONSERVER_Server(options) metrics = server.metrics() # Check one of the metrics is reported - self.assertTrue( - "nv_cpu_memory_used_bytes" - in metrics.formatted(triton_bindings.TRITONSERVER_MetricFormat.PROMETHEUS) + assert "nv_cpu_memory_used_bytes" in metrics.formatted( + triton_bindings.TRITONSERVER_MetricFormat.PROMETHEUS ) - def test_trace_enum(self): - t_list = [ + @pytest.mark.parametrize( + "t, t_str", + [ (triton_bindings.TRITONSERVER_InferenceTraceLevel.DISABLED, "DISABLED"), (triton_bindings.TRITONSERVER_InferenceTraceLevel.MIN, "MIN"), (triton_bindings.TRITONSERVER_InferenceTraceLevel.MAX, "MAX"), (triton_bindings.TRITONSERVER_InferenceTraceLevel.TIMESTAMPS, "TIMESTAMPS"), (triton_bindings.TRITONSERVER_InferenceTraceLevel.TENSORS, "TENSORS"), - ] - for t, t_str in t_list: - self.assertEqual( - triton_bindings.TRITONSERVER_InferenceTraceLevelString(t), t_str - ) + ], + ) + def test_trace_enum(self, t, t_str): + assert 
triton_bindings.TRITONSERVER_InferenceTraceLevelString(t) == t_str + + def test_trace_bitwise_operations(self): # bit-wise operation level = int(triton_bindings.TRITONSERVER_InferenceTraceLevel.TIMESTAMPS) | int( triton_bindings.TRITONSERVER_InferenceTraceLevel.TENSORS ) - self.assertNotEqual( - level & int(triton_bindings.TRITONSERVER_InferenceTraceLevel.TIMESTAMPS), 0 + assert ( + level & int(triton_bindings.TRITONSERVER_InferenceTraceLevel.TIMESTAMPS) + != 0 ) - self.assertNotEqual( - level & int(triton_bindings.TRITONSERVER_InferenceTraceLevel.TENSORS), 0 + assert ( + level & int(triton_bindings.TRITONSERVER_InferenceTraceLevel.TENSORS) != 0 ) - t_list = [ + @pytest.mark.parametrize( + "t, t_str", + [ ( triton_bindings.TRITONSERVER_InferenceTraceActivity.REQUEST_START, "REQUEST_START", @@ -639,11 +645,10 @@ def test_trace_enum(self): triton_bindings.TRITONSERVER_InferenceTraceActivity.TENSOR_BACKEND_OUTPUT, "TENSOR_BACKEND_OUTPUT", ), - ] - for t, t_str in t_list: - self.assertEqual( - triton_bindings.TRITONSERVER_InferenceTraceActivityString(t), t_str - ) + ], + ) + def test_trace_activity_enum(self, t, t_str): + assert triton_bindings.TRITONSERVER_InferenceTraceActivityString(t) == t_str def test_trace(self): # This test depends on 'test_infer_async' test to capture @@ -678,7 +683,7 @@ def test_trace(self): _ = trace_dict["signal_queue"].get(block=True, timeout=10) # check 'trace_dict' - self.assertTrue(trace_id in trace_dict) + assert trace_id in trace_dict # check activity are logged correctly, # value of 0 indicate it is timestamp trace, @@ -701,22 +706,22 @@ def test_trace(self): } for tl in trace_dict[trace_id]: # basic check - self.assertEqual(tl["id"], trace_id) - self.assertEqual(tl["parent_id"], 123) - self.assertEqual(tl["model_name"], self._model_name) - self.assertEqual(tl["model_version"], 1) - self.assertEqual(tl["request_id"], "req_0") - self.assertTrue(tl["activity"] in expected_activities) + assert tl["id"] == trace_id + assert tl["parent_id"] == 123 + assert tl["model_name"] == self._model_name + assert tl["model_version"] == 1 + assert tl["request_id"] == "req_0" + assert tl["activity"] in expected_activities if expected_activities[tl["activity"]] == 0: - self.assertTrue("timestamp" in tl) + assert "timestamp" in tl else: - self.assertTrue("tensor" in tl) + assert "tensor" in tl expected_activities[tl["activity"]] -= 1 if expected_activities[tl["activity"]] == 0: del expected_activities[tl["activity"]] # check if dict is empty to ensure the activity are logged in correct # amount. - self.assertFalse(bool(expected_activities)) + assert not (bool(expected_activities)) request_counter.get() def test_options(self): @@ -751,9 +756,10 @@ def test_options(self): triton_bindings.TRITONSERVER_InstanceGroupKind.CPU, triton_bindings.TRITONSERVER_InstanceGroupKind.MODEL, ]: - with self.assertRaises(triton_bindings.TritonError) as context: + with pytest.raises( + triton_bindings.TritonError, match="not supported" + ) as context: options.set_model_load_device_limit(k, 0, 0) - self.assertTrue("not supported" in str(context.exception)) # Backend options.set_backend_directory("backend_dir_0") @@ -780,7 +786,7 @@ def test_options(self): options.set_cache_directory("cache_dir_1") # Log try: - options.set_log_file("some_file") + options.set_log_file(tempfile.NamedTemporaryFile().name) options.set_log_info(True) options.set_log_warn(True) options.set_log_error(True) @@ -807,9 +813,11 @@ def test_options(self): options.set_metrics_config("metrics_group", "setting", "value") # Misc.. 
- with self.assertRaises(triton_bindings.TritonError) as context: + with pytest.raises( + triton_bindings.TritonError, match="Unsupported host policy setting" + ) as context: options.set_host_policy("policy_name", "setting", "value") - self.assertTrue("Unsupported host policy setting" in str(context.exception)) + options.set_repo_agent_directory("repo_agent_dir_0") options.set_repo_agent_directory("repo_agent_dir_1") options.set_buffer_manager_thread_count(4) @@ -817,48 +825,48 @@ def test_options(self): def test_server(self): server = self._start_polling_server() # is_live - self.assertTrue(server.is_live()) + assert server.is_live() # is_ready - self.assertTrue(server.is_ready()) + assert server.is_ready() # model_is_ready - self.assertTrue(server.model_is_ready(self._model_name, -1)) + assert server.model_is_ready(self._model_name, -1) # model_batch_properties expected_batch_properties = ( int(triton_bindings.TRITONSERVER_ModelBatchFlag.UNKNOWN), 0, ) - self.assertEqual( - server.model_batch_properties(self._model_name, -1), - expected_batch_properties, + assert ( + server.model_batch_properties(self._model_name, -1) + == expected_batch_properties ) # model_transaction_properties expected_transaction_policy = ( int(triton_bindings.TRITONSERVER_ModelTxnPropertyFlag.ONE_TO_ONE), 0, ) - self.assertEqual( - server.model_transaction_properties(self._model_name, -1), - expected_transaction_policy, + assert ( + server.model_transaction_properties(self._model_name, -1) + == expected_transaction_policy ) # metadata server_meta_data = self._to_pyobject(server.metadata()) - self.assertTrue("name" in server_meta_data) - self.assertEqual(server_meta_data["name"], "testing_server") + assert "name" in server_meta_data + assert server_meta_data["name"] == "testing_server" # model_metadata model_meta_data = self._to_pyobject(server.model_metadata(self._model_name, -1)) - self.assertTrue("name" in model_meta_data) - self.assertEqual(model_meta_data["name"], self._model_name) + assert "name" in model_meta_data + assert model_meta_data["name"] == self._model_name # model_statistics model_statistics = self._to_pyobject( server.model_statistics(self._model_name, -1) ) - self.assertTrue("model_stats" in model_statistics) + assert "model_stats" in model_statistics # model_config model_config = self._to_pyobject(server.model_config(self._model_name, -1, 1)) - self.assertTrue("input" in model_config) + assert "input" in model_config # model_index model_index = self._to_pyobject(server.model_index(0)) - self.assertEqual(model_index[0]["name"], self._model_name) + assert model_index[0]["name"] == self._model_name # metrics (see test_metrics) # infer_async (see test_infer_async) @@ -867,11 +875,10 @@ def test_request(self): # the request server = self._start_polling_server() - with self.assertRaises(triton_bindings.NotFoundError) as ctx: + with pytest.raises(triton_bindings.NotFoundError, match="unknown model") as ctx: _ = triton_bindings.TRITONSERVER_InferenceRequest( server, "not_existing_model", -1 ) - self.assertTrue("unknown model" in str(ctx.exception)) expected_request_id = "request" expected_flags = int( @@ -889,23 +896,24 @@ def test_request(self): # request metadata request.id = expected_request_id - self.assertEqual(request.id, expected_request_id) + assert request.id == expected_request_id request.flags = expected_flags - self.assertEqual(request.flags, expected_flags) + assert request.flags == expected_flags request.correlation_id = expected_correlation_id - self.assertEqual(request.correlation_id, 
expected_correlation_id) + assert request.correlation_id == expected_correlation_id request.correlation_id_string = expected_correlation_id_string - self.assertEqual(request.correlation_id_string, expected_correlation_id_string) + assert request.correlation_id_string == expected_correlation_id_string # Expect error from retrieving correlation id in a wrong type, # wrap in lambda function to avoid early evaluation that raises # exception before assert - self.assertRaises(triton_bindings.TritonError, lambda: request.correlation_id) + with pytest.raises(triton_bindings.TritonError): + request.correlation_id request.priority = expected_priority - self.assertEqual(request.priority, expected_priority) + assert request.priority == expected_priority request.priority_uint64 = expected_priority_uint64 - self.assertEqual(request.priority_uint64, 10) + assert request.priority_uint64 == 10 request.timeout_microseconds = expected_timeout_microseconds - self.assertEqual(request.timeout_microseconds, expected_timeout_microseconds) + assert request.timeout_microseconds == expected_timeout_microseconds request.set_string_parameter("str_key", "str_val") request.set_int_parameter("int_key", 567) @@ -922,33 +930,26 @@ def test_request(self): request.add_input( "INPUT0", triton_bindings.TRITONSERVER_DataType.FP32, input.shape ) - self.assertRaises(triton_bindings.TritonError, request.remove_input, "INPUT2") + with pytest.raises(triton_bindings.TritonError): + request.remove_input("INPUT2") # raw input assumes single input - self.assertRaises(triton_bindings.TritonError, request.add_raw_input, "INPUT1") + with pytest.raises(triton_bindings.TritonError): + request.add_raw_input("INPUT1") request.remove_input("INPUT0") request.add_raw_input("INPUT1") request.remove_all_inputs() # all inputs are removed, all 'append' functions should raise exceptions aid_args = ["INPUT0", buffer, ba.byte_size, ba.memory_type, ba.memory_type_id] - self.assertRaises( - triton_bindings.TritonError, request.append_input_data, *aid_args - ) - self.assertRaises( + with pytest.raises(triton_bindings.TritonError): + request.append_input_data(*aid_args) + with pytest.raises(triton_bindings.TritonError): + request.append_input_data_with_host_policy(*aid_args, "host_policy_name") + with pytest.raises( triton_bindings.TritonError, - request.append_input_data_with_host_policy, - *aid_args, - "host_policy_name" - ) - self.assertRaises( - triton_bindings.TritonError, - request.append_input_data_with_buffer_attributes, - "INPUT0", - buffer, - ba, - ) - self.assertRaises( - triton_bindings.TritonError, request.remove_all_input_data, "INPUT0" - ) + ): + request.append_input_data_with_buffer_attributes("INPUT0", buffer, ba) + with pytest.raises(triton_bindings.TritonError): + request.remove_all_input_data("INPUT0") # Add back input request.add_input( "INPUT0", triton_bindings.TRITONSERVER_DataType.FP32, input.shape @@ -1004,20 +1005,19 @@ def test_infer_async(self): # Expect every response to be returned in 10 seconds flags, res = response_queue.get(block=True, timeout=10) - self.assertEqual( - flags, int(triton_bindings.TRITONSERVER_ResponseCompleteFlag.FINAL) - ) + assert flags == int(triton_bindings.TRITONSERVER_ResponseCompleteFlag.FINAL) # expect no error res.throw_if_response_error() # version will be actual model version - self.assertEqual(res.model, (self._model_name, 1)) - self.assertEqual(res.id, request.id) - self.assertEqual(res.parameter_count, 0) + assert res.model == (self._model_name, 1) + assert res.id == request.id + assert 
res.parameter_count == 0 # out of range access - self.assertRaises(triton_bindings.TritonError, res.parameter, 0) + with pytest.raises(triton_bindings.TritonError): + res.parameter(0) # read output tensor - self.assertEqual(res.output_count, 2) + assert res.output_count == 2 for out, expected_name, expected_data in [ (res.output(0), "OUTPUT0", input + input), (res.output(1), "OUTPUT1", input - input), @@ -1032,34 +1032,32 @@ def test_infer_async(self): memory_type_id, numpy_buffer, ) = out - self.assertEqual(name, expected_name) - self.assertEqual(data_type, triton_bindings.TRITONSERVER_DataType.FP32) - self.assertEqual(shape, expected_data.shape) - self.assertEqual(out_buffer, numpy_buffer.ctypes.data) + assert name == expected_name + assert data_type == triton_bindings.TRITONSERVER_DataType.FP32 + assert shape == expected_data.shape + assert out_buffer == numpy_buffer.ctypes.data # buffer attribute used for input doesn't necessarily to # match output buffer attributes, this is just knowing the detail. - self.assertEqual(byte_size, ba.byte_size) - self.assertEqual(memory_type, ba.memory_type) - self.assertEqual(memory_type_id, ba.memory_type_id) - self.assertTrue( - numpy.allclose( - numpy_buffer.view(dtype=expected_data.dtype).reshape(shape), - expected_data, - ) + assert byte_size == ba.byte_size + assert memory_type == ba.memory_type + assert memory_type_id == ba.memory_type_id + assert numpy.allclose( + numpy_buffer.view(dtype=expected_data.dtype).reshape(shape), + expected_data, ) # label (no label so empty) - self.assertEqual(len(res.output_classification_label(0, 1)), 0) + assert len(res.output_classification_label(0, 1)) == 0 # [FIXME] keep alive behavior is not established between response # and server, so must explicitly handle the destruction order for now. 
del res # sanity check on user objects - self.assertEqual(allocator_counter["start"], 1) - self.assertEqual(allocator_counter["alloc"], 2) + assert allocator_counter["start"] == 1 + assert allocator_counter["alloc"] == 2 # Knowing implementation detail that the backend doesn't use query API - self.assertTrue("query" not in allocator_counter) - self.assertEqual(allocator_counter["buffer"], 2) + assert "query" not in allocator_counter + assert allocator_counter["buffer"] == 2 # Expect request to be released in 10 seconds request = request_counter.get(block=True, timeout=10) @@ -1080,25 +1078,24 @@ def test_server_explicit(self): ), ] server.load_model_with_parameters("wired_addsub", load_file_params) - self.assertTrue(server.model_is_ready("wired_addsub", -1)) + assert server.model_is_ready("wired_addsub", -1) # Model Repository - self.assertFalse(server.model_is_ready(self._model_name, -1)) + assert not (server.model_is_ready(self._model_name, -1)) # unregister server.unregister_model_repository(self._test_model_repo) - self.assertRaises( - triton_bindings.TritonError, server.load_model, self._model_name - ) + with pytest.raises(triton_bindings.TritonError): + server.load_model(self._model_name) # register server.register_model_repository(self._test_model_repo, []) server.load_model(self._model_name) - self.assertTrue(server.model_is_ready(self._model_name, -1)) + assert server.model_is_ready(self._model_name, -1) # unload server.unload_model("wired_addsub") - self.assertFalse(server.model_is_ready("wired_addsub", -1)) + assert not (server.model_is_ready("wired_addsub", -1)) server.unload_model_and_dependents(self._model_name) - self.assertFalse(server.model_is_ready(self._model_name, -1)) + assert not (server.model_is_ready(self._model_name, -1)) def test_custom_metric(self): options = triton_bindings.TRITONSERVER_ServerOptions() @@ -1116,18 +1113,14 @@ def test_custom_metric(self): ) m = triton_bindings.TRITONSERVER_Metric(mf, []) m.increment(2) - self.assertEqual(m.kind, triton_bindings.TRITONSERVER_MetricKind.COUNTER) - self.assertEqual(m.value, 2) + assert m.kind == triton_bindings.TRITONSERVER_MetricKind.COUNTER + assert m.value == 2 # can't use 'set_value' due to wrong kind - self.assertRaises(triton_bindings.TritonError, m.set_value, 5) + with pytest.raises(triton_bindings.TritonError): + m.set_value(5) # Check custom metric is reported metrics = server.metrics() - self.assertTrue( - "custom_metric_familiy" - in metrics.formatted(triton_bindings.TRITONSERVER_MetricFormat.PROMETHEUS) + assert "custom_metric_familiy" in metrics.formatted( + triton_bindings.TRITONSERVER_MetricFormat.PROMETHEUS ) - - -if __name__ == "__main__": - unittest.main() From fc02544dfcdb028de3a1faf51a0e975e996b0d19 Mon Sep 17 00:00:00 2001 From: pranavm-nvidia <49246958+pranavm-nvidia@users.noreply.github.com> Date: Fri, 20 Dec 2024 18:21:57 -0500 Subject: [PATCH 5/8] build: Improves python packaging infrastructure (#414) --- .github/workflows/build-and-test.yml | 1 - pyproject.toml | 48 ++ python/CMakeLists.txt | 1 + python/build_wheel.py | 5 +- python/setup.py | 95 +-- python/tritonserver/CMakeLists.txt | 3 +- python/tritonserver/_c/__init__.pyi | 39 -- python/tritonserver/_c/triton_bindings.pyi | 696 --------------------- 8 files changed, 67 insertions(+), 821 deletions(-) delete mode 100644 python/tritonserver/_c/__init__.pyi delete mode 100644 python/tritonserver/_c/triton_bindings.pyi diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 2ab3d0658..1baed09b8 100644 
--- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -42,7 +42,6 @@ jobs: mkdir -p /core/build cd /core/build cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install -DTRITON_CORE_HEADERS_ONLY=OFF .. - export TRITON_PYBIND="_c/triton_bindings.cpython-310-x86_64-linux-gnu.so" make -j8 - name: Run tests with pytest diff --git a/pyproject.toml b/pyproject.toml index 5e8749f81..3ce1a8a91 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,54 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +[project] +name = "tritonserver" +authors = [{ name = "NVIDIA Inc.", email = "sw-dl-triton@nvidia.com" }] +description = "Triton Inference Server In-Process Python API" +license = { file = "LICENSE.txt" } +dynamic = ["version"] +dependencies = ["numpy<2"] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Intended Audience :: Information Technology", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Image Recognition", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development :: Libraries", + "Topic :: Utilities", + "License :: OSI Approved :: BSD License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.12", + "Environment :: Console", + "Natural Language :: English", + "Operating System :: OS Independent", +] + +[tool.setuptools] +include-package-data = true + +[tool.setuptools.package-data] +tritonserver = ["_c/triton_bindings.*.so"] + +[build-system] +requires = [ + "setuptools==75.3.0", + "wheel==0.44.0", + # For stubgen: + "mypy==1.11.0", + "numpy<2", +] +build-backend = "setuptools.build_meta" + +[project.optional-dependencies] +GPU = ["cupy-cuda12x"] +test = ["pytest"] +all = ["tritonserver[GPU]", "tritonserver[test]"] + + [tool.codespell] # note: pre-commit passes explicit lists of files here, which this skip file list doesn't override - # this is only to allow you to run codespell interactively diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 871f682f9..df7f1bde0 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -30,6 +30,7 @@ add_subdirectory(tritonserver) file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/TRITON_VERSION ${TRITON_VERSION}) configure_file(../LICENSE LICENSE.txt COPYONLY) configure_file(setup.py setup.py @ONLY) +configure_file(../pyproject.toml pyproject.toml COPYONLY) file(COPY test/ DESTINATION ./test/.) 
set(WHEEL_DEPENDS
diff --git a/python/build_wheel.py b/python/build_wheel.py
index 150a3e346..2888cfe01 100755
--- a/python/build_wheel.py
+++ b/python/build_wheel.py
@@ -108,17 +108,18 @@ def sed(pattern, replace, source, dest=None):
     shutil.copyfile("LICENSE.txt", os.path.join(FLAGS.whl_dir, "LICENSE.txt"))
     shutil.copyfile("setup.py", os.path.join(FLAGS.whl_dir, "setup.py"))
+    shutil.copyfile("pyproject.toml", os.path.join(FLAGS.whl_dir, "pyproject.toml"))
     os.chdir(FLAGS.whl_dir)
     print("=== Building wheel")
-    args = ["python3", "setup.py", "bdist_wheel"]
+    args = ["python3", "-m", "build"]
     wenv = os.environ.copy()
     wenv["VERSION"] = FLAGS.triton_version
     wenv["TRITON_PYBIND"] = PYBIND_LIB
     p = subprocess.Popen(args, env=wenv)
     p.wait()
-    fail_if(p.returncode != 0, "setup.py failed")
+    fail_if(p.returncode != 0, "Building wheel failed")
     cpdir("dist", FLAGS.dest_dir)
diff --git a/python/setup.py b/python/setup.py
index 3d371eaac..9b9b29104 100755
--- a/python/setup.py
+++ b/python/setup.py
@@ -25,90 +25,23 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-import os
-import sys
-from itertools import chain
-from setuptools import find_packages, setup
+import subprocess
-if "--plat-name" in sys.argv:
-    PLATFORM_FLAG = sys.argv[sys.argv.index("--plat-name") + 1]
-else:
-    PLATFORM_FLAG = "any"
+from setuptools import setup
+from setuptools.command.build_py import build_py
-if "VERSION" not in os.environ:
-    raise Exception("envvar VERSION must be specified")
-VERSION = os.environ["VERSION"]
+class BuildPyCommand(build_py):
+    def run(self):
+        build_py.run(self)
+        # Generate stub files:
+        package_name = self.distribution.metadata.name
+        subprocess.run(
+            ["stubgen", "-p", f"{package_name}._c", "-o", f"{self.build_lib}"],
+            check=True,
+        )
-try:
-    from wheel.bdist_wheel import bdist_wheel as _bdist_wheel
-    class bdist_wheel(_bdist_wheel):
-        def finalize_options(self):
-            _bdist_wheel.finalize_options(self)
-            self.root_is_pure = False
-
-        def get_tag(self):
-            pyver, abi, plat = "py3", "none", PLATFORM_FLAG
-            return pyver, abi, plat
-
-except ImportError:
-    bdist_wheel = None
-
-this_directory = os.path.abspath(os.path.dirname(__file__))
-
-data_files = [
-    ("", ["LICENSE.txt"]),
-]
-
-# Type checking marker file indicating support for type checkers.
-# https://peps.python.org/pep-0561/ -# Type hints for c extension generated by mypy -platform_package_data = [ - os.environ["TRITON_PYBIND"], - "py.typed", - "_c/__init__.pyi", - "_c/triton_bindings.pyi", -] - -gpu_extras = ["cupy-cuda12x"] -test_extras = ["pytest"] -all_extras = gpu_extras + test_extras - -setup( - name="tritonserver", - version=VERSION, - author="NVIDIA Inc.", - author_email="sw-dl-triton@nvidia.com", - description="Triton Inference Server In-Process Python API", - license="BSD", - url="https://developer.nvidia.com/nvidia-triton-inference-server", - classifiers=[ - "Development Status :: 5 - Production/Stable", - "Intended Audience :: Developers", - "Intended Audience :: Science/Research", - "Intended Audience :: Information Technology", - "Topic :: Scientific/Engineering", - "Topic :: Scientific/Engineering :: Image Recognition", - "Topic :: Scientific/Engineering :: Artificial Intelligence", - "Topic :: Software Development :: Libraries", - "Topic :: Utilities", - "License :: OSI Approved :: BSD License", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.12", - "Environment :: Console", - "Natural Language :: English", - "Operating System :: OS Independent", - ], - packages=find_packages(), - package_data={ - "": platform_package_data, - }, - zip_safe=False, - cmdclass={"bdist_wheel": bdist_wheel}, - data_files=data_files, - install_requires=["numpy<2"], - extras_require={"GPU": gpu_extras, "test": test_extras, "all": all_extras}, -) +if __name__ == "__main__": + setup(cmdclass={"build_py": BuildPyCommand}) diff --git a/python/tritonserver/CMakeLists.txt b/python/tritonserver/CMakeLists.txt index c06292451..d2480bc94 100644 --- a/python/tritonserver/CMakeLists.txt +++ b/python/tritonserver/CMakeLists.txt @@ -33,8 +33,6 @@ file(COPY __init__.py DESTINATION .) file(COPY py.typed DESTINATION .) # Copy the '__init__.py' for the '_c' module file(COPY _c/__init__.py DESTINATION ./_c/.) -file(COPY _c/__init__.pyi DESTINATION ./_c/.) -file(COPY _c/triton_bindings.pyi DESTINATION ./_c/.) # Find and copy _api modules file(GLOB PYTHON_MODULE_FILES ./_api/*.py) file(COPY ${PYTHON_MODULE_FILES} DESTINATION ./_api/.) @@ -65,3 +63,4 @@ target_compile_features(python-bindings PRIVATE cxx_std_17) set_property(TARGET python-bindings PROPERTY OUTPUT_NAME triton_bindings) # Add Triton library default path in 'rpath' for runtime library lookup set_target_properties(python-bindings PROPERTIES BUILD_RPATH "$ORIGIN:/opt/tritonserver/lib") +set_target_properties(python-bindings PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/python/tritonserver/_c/) diff --git a/python/tritonserver/_c/__init__.pyi b/python/tritonserver/_c/__init__.pyi deleted file mode 100644 index aa7d4a57a..000000000 --- a/python/tritonserver/_c/__init__.pyi +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. 
-# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -"""Type information for Triton _c bindings.""" - -# Note: this file was generated using mypy with an empty __init__.py -# file in the tritonserver package directory to avoid any renaming / -# aliasing done by the wrapper -# -# mypy 1.8.0 (compiled: yes) -# -# stubgen -p tritonserver._c -# -# Todo: add stub generation to build process - -from .triton_bindings import * diff --git a/python/tritonserver/_c/triton_bindings.pyi b/python/tritonserver/_c/triton_bindings.pyi deleted file mode 100644 index 71deaba6b..000000000 --- a/python/tritonserver/_c/triton_bindings.pyi +++ /dev/null @@ -1,696 +0,0 @@ -# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -"""Type information for Triton _c bindings.""" - -# Note: this file was generated using mypy with an empty __init__.py -# file in the tritonserver package directory to avoid any renaming / -# aliasing done by the wrapper -# -# mypy 1.8.0 (compiled: yes) -# -# stubgen -p tritonserver._c -# -# Todo: add stub generation to build process - -from typing import Callable, ClassVar, List, Optional, Tuple, overload - -import numpy - -ALL: TRITONSERVER_RequestReleaseFlag -COMPUTE_END: TRITONSERVER_InferenceTraceActivity -COMPUTE_INPUT_END: TRITONSERVER_InferenceTraceActivity -COMPUTE_OUTPUT_START: TRITONSERVER_InferenceTraceActivity -COMPUTE_START: TRITONSERVER_InferenceTraceActivity -DECOUPLED: TRITONSERVER_ModelTxnPropertyFlag -DISABLED: TRITONSERVER_InferenceTraceLevel -FINAL: TRITONSERVER_ResponseCompleteFlag -FIRST_DIM: TRITONSERVER_ModelBatchFlag -MAX: TRITONSERVER_InferenceTraceLevel -MIN: TRITONSERVER_InferenceTraceLevel -ONE_TO_ONE: TRITONSERVER_ModelTxnPropertyFlag -QUEUE_START: TRITONSERVER_InferenceTraceActivity -READY: TRITONSERVER_ModelIndexFlag -REQUEST_END: TRITONSERVER_InferenceTraceActivity -REQUEST_START: TRITONSERVER_InferenceTraceActivity -SEQUENCE_END: TRITONSERVER_RequestFlag -SEQUENCE_START: TRITONSERVER_RequestFlag -TENSORS: TRITONSERVER_InferenceTraceLevel -TENSOR_BACKEND_INPUT: TRITONSERVER_InferenceTraceActivity -TENSOR_BACKEND_OUTPUT: TRITONSERVER_InferenceTraceActivity -TENSOR_QUEUE_INPUT: TRITONSERVER_InferenceTraceActivity -TIMESTAMPS: TRITONSERVER_InferenceTraceLevel -UNKNOWN: TRITONSERVER_ModelBatchFlag - -class AlreadyExistsError(TritonError): ... -class InternalError(TritonError): ... -class InvalidArgumentError(TritonError): ... -class NotFoundError(TritonError): ... - -class TRITONSERVER_BufferAttributes: - byte_size: int - cuda_ipc_handle: int - memory_type: TRITONSERVER_MemoryType - memory_type_id: int - def __init__(self) -> None: ... - -class TRITONSERVER_DataType: - __members__: ClassVar[dict] = ... # read-only - BF16: ClassVar[TRITONSERVER_DataType] = ... - BOOL: ClassVar[TRITONSERVER_DataType] = ... - BYTES: ClassVar[TRITONSERVER_DataType] = ... - FP16: ClassVar[TRITONSERVER_DataType] = ... - FP32: ClassVar[TRITONSERVER_DataType] = ... - FP64: ClassVar[TRITONSERVER_DataType] = ... - INT16: ClassVar[TRITONSERVER_DataType] = ... - INT32: ClassVar[TRITONSERVER_DataType] = ... - INT64: ClassVar[TRITONSERVER_DataType] = ... - INT8: ClassVar[TRITONSERVER_DataType] = ... - INVALID: ClassVar[TRITONSERVER_DataType] = ... - UINT16: ClassVar[TRITONSERVER_DataType] = ... - UINT32: ClassVar[TRITONSERVER_DataType] = ... - UINT64: ClassVar[TRITONSERVER_DataType] = ... - UINT8: ClassVar[TRITONSERVER_DataType] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... - def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... - -class TRITONSERVER_InferenceRequest: - correlation_id: int - correlation_id_string: str - flags: int - id: str - priority: int - priority_uint64: int - timeout_microseconds: int - def __init__(self, arg0, arg1: str, arg2: int) -> None: ... - def add_input( - self, arg0: str, arg1: TRITONSERVER_DataType, arg2: List[int] - ) -> None: ... - def add_raw_input(self, arg0: str) -> None: ... - def add_requested_output(self, arg0: str) -> None: ... 
- def append_input_data( - self, arg0: str, arg1: int, arg2: int, arg3: TRITONSERVER_MemoryType, arg4: int - ) -> None: ... - def append_input_data_with_buffer_attributes( - self, arg0: str, arg1: int, arg2: TRITONSERVER_BufferAttributes - ) -> None: ... - def append_input_data_with_host_policy( - self, - arg0: str, - arg1: int, - arg2: int, - arg3: TRITONSERVER_MemoryType, - arg4: int, - arg5: str, - ) -> None: ... - def cancel(self) -> None: ... - def remove_all_input_data(self, arg0: str) -> None: ... - def remove_all_inputs(self) -> None: ... - def remove_all_requested_outputs(self) -> None: ... - def remove_input(self, arg0: str) -> None: ... - def remove_requested_output(self, arg0: str) -> None: ... - def set_bool_parameter(self, arg0: str, arg1: bool) -> None: ... - def set_int_parameter(self, arg0: str, arg1: int) -> None: ... - def set_release_callback( - self, - arg0: Callable[[TRITONSERVER_InferenceRequest, int, object], None], - arg1: object, - ) -> None: ... - def set_response_callback( - self, - arg0: object, - arg1: object, - arg2: Callable[[object, int, object], None], - arg3: object, - ) -> None: ... - def set_string_parameter(self, arg0: str, arg1: str) -> None: ... - def set_double_parameter(self, arg0: str, arg1: float) -> None: ... - -class TRITONSERVER_InferenceResponse: - def __init__(self, *args, **kwargs) -> None: ... - def output( - self, arg0: int - ) -> Tuple[ - str, - TRITONSERVER_DataType, - numpy.ndarray[numpy.int64], - int, - int, - TRITONSERVER_MemoryType, - int, - object, - ]: ... - def output_classification_label(self, arg0: int, arg1: int) -> str: ... - def parameter( - self, arg0: int - ) -> Tuple[str, TRITONSERVER_ParameterType, object]: ... - def throw_if_response_error(self) -> None: ... - @property - def id(self) -> str: ... - @property - def model(self) -> Tuple[str, int]: ... - @property - def output_count(self) -> int: ... - @property - def parameter_count(self) -> int: ... - -class TRITONSERVER_InferenceTrace: - @overload - def __init__( - self, - level: int, - parent_id: int, - activity_function: Callable[ - [object, TRITONSERVER_InferenceTraceActivity, int, object], None - ], - tensor_activity_function: Callable[ - [ - object, - TRITONSERVER_InferenceTraceActivity, - str, - TRITONSERVER_DataType, - int, - int, - numpy.ndarray[numpy.int64], - TRITONSERVER_MemoryType, - int, - object, - ], - None, - ], - release_function: Callable[[TRITONSERVER_InferenceTrace, object], None], - trace_userp: object, - ) -> None: ... - @overload - def __init__( - self, - level: int, - parent_id: int, - activity_function: Callable[ - [object, TRITONSERVER_InferenceTraceActivity, int, object], None - ], - release_function: Callable[[TRITONSERVER_InferenceTrace, object], None], - trace_userp: object, - ) -> None: ... - @property - def id(self) -> int: ... - @property - def model_name(self) -> str: ... - @property - def model_version(self) -> int: ... - @property - def parent_id(self) -> int: ... - @property - def request_id(self) -> str: ... - -class TRITONSERVER_InferenceTraceActivity: - __members__: ClassVar[dict] = ... # read-only - COMPUTE_END: ClassVar[TRITONSERVER_InferenceTraceActivity] = ... - COMPUTE_INPUT_END: ClassVar[TRITONSERVER_InferenceTraceActivity] = ... - COMPUTE_OUTPUT_START: ClassVar[TRITONSERVER_InferenceTraceActivity] = ... - COMPUTE_START: ClassVar[TRITONSERVER_InferenceTraceActivity] = ... - QUEUE_START: ClassVar[TRITONSERVER_InferenceTraceActivity] = ... - REQUEST_END: ClassVar[TRITONSERVER_InferenceTraceActivity] = ... 
- REQUEST_START: ClassVar[TRITONSERVER_InferenceTraceActivity] = ... - TENSOR_BACKEND_INPUT: ClassVar[TRITONSERVER_InferenceTraceActivity] = ... - TENSOR_BACKEND_OUTPUT: ClassVar[TRITONSERVER_InferenceTraceActivity] = ... - TENSOR_QUEUE_INPUT: ClassVar[TRITONSERVER_InferenceTraceActivity] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... - def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... - -class TRITONSERVER_InferenceTraceLevel: - __members__: ClassVar[dict] = ... # read-only - DISABLED: ClassVar[TRITONSERVER_InferenceTraceLevel] = ... - MAX: ClassVar[TRITONSERVER_InferenceTraceLevel] = ... - MIN: ClassVar[TRITONSERVER_InferenceTraceLevel] = ... - TENSORS: ClassVar[TRITONSERVER_InferenceTraceLevel] = ... - TIMESTAMPS: ClassVar[TRITONSERVER_InferenceTraceLevel] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... - def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... - -class TRITONSERVER_InstanceGroupKind: - __members__: ClassVar[dict] = ... # read-only - AUTO: ClassVar[TRITONSERVER_InstanceGroupKind] = ... - CPU: ClassVar[TRITONSERVER_InstanceGroupKind] = ... - GPU: ClassVar[TRITONSERVER_InstanceGroupKind] = ... - MODEL: ClassVar[TRITONSERVER_InstanceGroupKind] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... - def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... - -class TRITONSERVER_LogFormat: - __members__: ClassVar[dict] = ... # read-only - DEFAULT: ClassVar[TRITONSERVER_LogFormat] = ... - ISO8601: ClassVar[TRITONSERVER_LogFormat] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... - def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... - -class TRITONSERVER_LogLevel: - __members__: ClassVar[dict] = ... # read-only - ERROR: ClassVar[TRITONSERVER_LogLevel] = ... - INFO: ClassVar[TRITONSERVER_LogLevel] = ... - VERBOSE: ClassVar[TRITONSERVER_LogLevel] = ... - WARN: ClassVar[TRITONSERVER_LogLevel] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... - def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... - -class TRITONSERVER_MemoryType: - __members__: ClassVar[dict] = ... # read-only - CPU: ClassVar[TRITONSERVER_MemoryType] = ... - CPU_PINNED: ClassVar[TRITONSERVER_MemoryType] = ... - GPU: ClassVar[TRITONSERVER_MemoryType] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... 
- def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... - -class TRITONSERVER_Message: - def __init__(self, arg0: str) -> None: ... - def serialize_to_json(self) -> str: ... - -class TRITONSERVER_Metric: - def __init__( - self, arg0: TRITONSERVER_MetricFamily, arg1: List[TRITONSERVER_Parameter] - ) -> None: ... - def increment(self, arg0: float) -> None: ... - def set_value(self, arg0: float) -> None: ... - @property - def kind(self) -> TRITONSERVER_MetricKind: ... - @property - def value(self) -> float: ... - -class TRITONSERVER_MetricFamily: - def __init__(self, arg0: TRITONSERVER_MetricKind, arg1: str, arg2: str) -> None: ... - -class TRITONSERVER_MetricFormat: - __members__: ClassVar[dict] = ... # read-only - PROMETHEUS: ClassVar[TRITONSERVER_MetricFormat] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... - def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... - -class TRITONSERVER_MetricKind: - __members__: ClassVar[dict] = ... # read-only - COUNTER: ClassVar[TRITONSERVER_MetricKind] = ... - GAUGE: ClassVar[TRITONSERVER_MetricKind] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... - def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... - -class TRITONSERVER_Metrics: - def __init__(self, *args, **kwargs) -> None: ... - def formatted(self, arg0: TRITONSERVER_MetricFormat) -> str: ... - -class TRITONSERVER_ModelBatchFlag: - __members__: ClassVar[dict] = ... # read-only - FIRST_DIM: ClassVar[TRITONSERVER_ModelBatchFlag] = ... - UNKNOWN: ClassVar[TRITONSERVER_ModelBatchFlag] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... - def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... - -class TRITONSERVER_ModelControlMode: - __members__: ClassVar[dict] = ... # read-only - EXPLICIT: ClassVar[TRITONSERVER_ModelControlMode] = ... - NONE: ClassVar[TRITONSERVER_ModelControlMode] = ... - POLL: ClassVar[TRITONSERVER_ModelControlMode] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... - def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... - -class TRITONSERVER_ModelIndexFlag: - __members__: ClassVar[dict] = ... # read-only - READY: ClassVar[TRITONSERVER_ModelIndexFlag] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... - def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... 
- def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... - -class TRITONSERVER_ModelTxnPropertyFlag: - __members__: ClassVar[dict] = ... # read-only - DECOUPLED: ClassVar[TRITONSERVER_ModelTxnPropertyFlag] = ... - ONE_TO_ONE: ClassVar[TRITONSERVER_ModelTxnPropertyFlag] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... - def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... - -class TRITONSERVER_Parameter: - @overload - def __init__(self, arg0: str, arg1: bytes) -> None: ... - @overload - def __init__(self, arg0: str, arg1: str) -> None: ... - @overload - def __init__(self, arg0: str, arg1: int) -> None: ... - @overload - def __init__(self, arg0: str, arg1: bool) -> None: ... - -class TRITONSERVER_ParameterType: - __members__: ClassVar[dict] = ... # read-only - BOOL: ClassVar[TRITONSERVER_ParameterType] = ... - BYTES: ClassVar[TRITONSERVER_ParameterType] = ... - INT: ClassVar[TRITONSERVER_ParameterType] = ... - STRING: ClassVar[TRITONSERVER_ParameterType] = ... - DOUBLE: ClassVar[TRITONSERVER_ParameterType] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... - def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... - -class TRITONSERVER_RateLimitMode: - __members__: ClassVar[dict] = ... # read-only - EXEC_COUNT: ClassVar[TRITONSERVER_RateLimitMode] = ... - OFF: ClassVar[TRITONSERVER_RateLimitMode] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... - def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... - -class TRITONSERVER_RequestFlag: - __members__: ClassVar[dict] = ... # read-only - SEQUENCE_END: ClassVar[TRITONSERVER_RequestFlag] = ... - SEQUENCE_START: ClassVar[TRITONSERVER_RequestFlag] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... - def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... - -class TRITONSERVER_RequestReleaseFlag: - __members__: ClassVar[dict] = ... # read-only - ALL: ClassVar[TRITONSERVER_RequestReleaseFlag] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... - def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... 
- -class TRITONSERVER_ResponseAllocator: - @overload - def __init__( - self, - alloc_function: Callable[ - [object, str, int, TRITONSERVER_MemoryType, int, object], - Tuple[int, object, TRITONSERVER_MemoryType, int], - ], - release_function: Callable[ - [object, int, object, int, TRITONSERVER_MemoryType, int], None - ], - start_function: Callable[[object, object], None], - ) -> None: ... - @overload - def __init__( - self, - alloc_function: Callable[ - [object, str, int, TRITONSERVER_MemoryType, int, object], - Tuple[int, object, TRITONSERVER_MemoryType, int], - ], - release_function: Callable[ - [object, int, object, int, TRITONSERVER_MemoryType, int], None - ], - ) -> None: ... - def set_buffer_attributes_function( - self, - buffer_attributes_function: Callable[ - [object, str, object, object, object], object - ], - ) -> None: ... - def set_query_function( - self, - query_function: Callable[ - [object, object, str, Optional[int], TRITONSERVER_MemoryType, int], - Tuple[TRITONSERVER_MemoryType, int], - ], - ) -> None: ... - -class TRITONSERVER_ResponseCompleteFlag: - __members__: ClassVar[dict] = ... # read-only - FINAL: ClassVar[TRITONSERVER_ResponseCompleteFlag] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... - def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... - -class TRITONSERVER_Server: - def __init__(self, arg0: TRITONSERVER_ServerOptions) -> None: ... - @overload - def infer_async( - self, arg0: TRITONSERVER_InferenceRequest, arg1: TRITONSERVER_InferenceTrace - ) -> None: ... - @overload - def infer_async(self, arg0: TRITONSERVER_InferenceRequest) -> None: ... - def is_live(self) -> bool: ... - def is_ready(self) -> bool: ... - def load_model(self, arg0: str) -> None: ... - def load_model_with_parameters( - self, arg0: str, arg1: List[TRITONSERVER_Parameter] - ) -> None: ... - def metadata(self) -> TRITONSERVER_Message: ... - def metrics(self) -> TRITONSERVER_Metrics: ... - def model_batch_properties(self, arg0: str, arg1: int) -> Tuple[int, int]: ... - def model_config(self, arg0: str, arg1: int, arg2: int) -> TRITONSERVER_Message: ... - def model_index(self, arg0: int) -> TRITONSERVER_Message: ... - def model_is_ready(self, arg0: str, arg1: int) -> bool: ... - def model_metadata(self, arg0: str, arg1: int) -> TRITONSERVER_Message: ... - def model_statistics(self, arg0: str, arg1: int) -> TRITONSERVER_Message: ... - def model_transaction_properties(self, arg0: str, arg1: int) -> Tuple[int, int]: ... - @overload - def poll_model_repository(self) -> None: ... - @overload - def poll_model_repository(self) -> None: ... - def register_model_repository( - self, arg0: str, arg1: List[TRITONSERVER_Parameter] - ) -> None: ... - def stop(self) -> None: ... - def unload_model(self, arg0: str) -> None: ... - def unload_model_and_dependents(self, arg0: str) -> None: ... - def unregister_model_repository(self, arg0: str) -> None: ... - -class TRITONSERVER_ServerOptions: - def __init__(self) -> None: ... - def add_rate_limiter_resource(self, arg0: str, arg1: int, arg2: int) -> None: ... - def set_backend_config(self, arg0: str, arg1: str, arg2: str) -> None: ... - def set_backend_directory(self, arg0: str) -> None: ... - def set_buffer_manager_thread_count(self, arg0: int) -> None: ... 
- def set_cache_config(self, arg0: str, arg1: str) -> None: ... - def set_cache_directory(self, arg0: str) -> None: ... - def set_cpu_metrics(self, arg0: bool) -> None: ... - def set_cuda_memory_pool_byte_size(self, arg0: int, arg1: int) -> None: ... - def set_exit_on_error(self, arg0: bool) -> None: ... - def set_exit_timeout(self, arg0: int) -> None: ... - def set_gpu_metrics(self, arg0: bool) -> None: ... - def set_host_policy(self, arg0: str, arg1: str, arg2: str) -> None: ... - def set_log_error(self, arg0: bool) -> None: ... - def set_log_file(self, arg0: str) -> None: ... - def set_log_format(self, arg0: TRITONSERVER_LogFormat) -> None: ... - def set_log_info(self, arg0: bool) -> None: ... - def set_log_verbose(self, arg0: int) -> None: ... - def set_log_warn(self, arg0: bool) -> None: ... - def set_metrics(self, arg0: bool) -> None: ... - def set_metrics_config(self, arg0: str, arg1: str, arg2: str) -> None: ... - def set_metrics_interval(self, arg0: int) -> None: ... - def set_min_supported_compute_capability(self, arg0: float) -> None: ... - def set_model_control_mode(self, arg0: TRITONSERVER_ModelControlMode) -> None: ... - def set_model_load_device_limit( - self, arg0: TRITONSERVER_InstanceGroupKind, arg1: int, arg2: float - ) -> None: ... - def set_model_load_thread_count(self, arg0: int) -> None: ... - def set_model_load_retry_count(self, arg0: int) -> None: ... - def set_model_namespacing(self, arg0: bool) -> None: ... - def set_enable_peer_access(self, arg0: bool) -> None: ... - def set_model_repository_path(self, arg0: str) -> None: ... - def set_pinned_memory_pool_byte_size(self, arg0: int) -> None: ... - def set_rate_limiter_mode(self, arg0: TRITONSERVER_RateLimitMode) -> None: ... - def set_repo_agent_directory(self, arg0: str) -> None: ... - def set_response_cache_byte_size(self, arg0: int) -> None: ... - def set_server_id(self, arg0: str) -> None: ... - def set_startup_model(self, arg0: str) -> None: ... - def set_strict_model_config(self, arg0: bool) -> None: ... - def set_strict_readiness(self, arg0: bool) -> None: ... - -class TritonError(Exception): ... -class UnavailableError(TritonError): ... -class UnknownError(TritonError): ... -class UnsupportedError(TritonError): ... - -def TRITONSERVER_DataTypeByteSize(arg0: TRITONSERVER_DataType) -> int: ... -def TRITONSERVER_DataTypeString(arg0: TRITONSERVER_DataType) -> str: ... -def TRITONSERVER_InferenceTraceActivityString( - arg0: TRITONSERVER_InferenceTraceActivity, -) -> str: ... -def TRITONSERVER_InferenceTraceLevelString( - arg0: TRITONSERVER_InferenceTraceLevel, -) -> str: ... -def TRITONSERVER_InstanceGroupKindString( - arg0: TRITONSERVER_InstanceGroupKind, -) -> str: ... -def TRITONSERVER_LogIsEnabled(arg0: TRITONSERVER_LogLevel) -> bool: ... -def TRITONSERVER_LogMessage( - arg0: TRITONSERVER_LogLevel, arg1: str, arg2: int, arg3: str -) -> None: ... -def TRITONSERVER_MemoryTypeString(arg0: TRITONSERVER_MemoryType) -> str: ... -def TRITONSERVER_ParameterTypeString(arg0: TRITONSERVER_ParameterType) -> str: ... -def TRITONSERVER_StringToDataType(arg0: str) -> TRITONSERVER_DataType: ... -def api_version() -> tuple: ... 
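
The headers of the removed stub files above record how they were produced ("stubgen -p tritonserver._c", run with an empty __init__.py in place so the wrapper does no renaming or aliasing) and carry a TODO to fold stub generation into the build. As a rough sketch only, a helper like the one below could regenerate equivalent stubs from an installed wheel; the helper name and output path are illustrative assumptions, while the stubgen invocation itself is the one documented in the removed files.

    import subprocess

    def regenerate_binding_stubs(output_dir: str = "stubs_out") -> None:
        # Run mypy's stubgen CLI against the installed bindings, as documented
        # in the removed .pyi headers, and write the generated .pyi files
        # (e.g. __init__.pyi and triton_bindings.pyi) under output_dir.
        subprocess.run(
            ["stubgen", "-p", "tritonserver._c", "-o", output_dir],
            check=True,
        )

    if __name__ == "__main__":
        regenerate_binding_stubs()
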
From be3fa6941e6a8e8c421f96ad6f1d6b90d7fa2d31 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Mon, 6 Jan 2025 10:48:42 -0800 Subject: [PATCH 6/8] Expose tritonserver.InferenceResponse type (#394) --- python/tritonserver/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/tritonserver/__init__.py b/python/tritonserver/__init__.py index 4d25c0478..440f69fe9 100644 --- a/python/tritonserver/__init__.py +++ b/python/tritonserver/__init__.py @@ -55,6 +55,7 @@ from tritonserver._api._model import ModelBatchFlag as ModelBatchFlag from tritonserver._api._model import ModelTxnPropertyFlag as ModelTxnPropertyFlag from tritonserver._api._request import InferenceRequest as InferenceRequest +from tritonserver._api._response import InferenceResponse as InferenceResponse from tritonserver._api._server import InstanceGroupKind as InstanceGroupKind from tritonserver._api._server import LogFormat as LogFormat from tritonserver._api._server import Metric as Metric From f3610e46dbaaef230aa208f8ac3170af7e2bb970 Mon Sep 17 00:00:00 2001 From: Neelay Shah Date: Mon, 6 Jan 2025 14:19:49 -0800 Subject: [PATCH 7/8] fix: Fix memory leak with dlpack when using python Tensor objects (#421) co-author: @tanmayv25 --- python/test/test_api.py | 98 +++++++++++++++++++++++++++++ python/tritonserver/_api/_tensor.py | 90 +++++++++++++++----------- 2 files changed, 153 insertions(+), 35 deletions(-) diff --git a/python/test/test_api.py b/python/test/test_api.py index 68aa7a318..ed96e27f0 100644 --- a/python/test/test_api.py +++ b/python/test/test_api.py @@ -24,9 +24,17 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import asyncio +import copy +import gc import json import os import shutil +import sys +import time +import unittest +from collections import Counter +from contextlib import contextmanager import numpy import pytest @@ -296,6 +304,96 @@ def test_tensor_from_numpy(self): numpy.testing.assert_array_equal(torch_tensor.numpy(), cpu_array) assert torch_tensor.data_ptr() == cpu_array.ctypes.data + async def _tensor_from_numpy(self): + owner = numpy.ones(2**27) + tensor = tritonserver.Tensor.from_dlpack(owner) + array = numpy.from_dlpack(tensor) + del owner + del tensor + del array + await asyncio.sleep(0.1) + + async def _async_test_runs(self): + tasks = [] + for _ in range(100): + tasks.append(asyncio.create_task(self._tensor_from_numpy())) + try: + await asyncio.wait(tasks) + except Exception as e: + print(e) + + @staticmethod + @contextmanager + def object_collector(): + gc.collect() + objects_before = gc.get_objects() + yield + objects_after = gc.get_objects() + new_objects = [type(x) for x in objects_after[len(objects_before) :]] + tensor_objects = [ + x for x in objects_after if isinstance(x, tritonserver.Tensor) + ] + if tensor_objects: + print("Tensor objects") + print(len(tensor_objects)) + print(type(tensor_objects[-1].memory_buffer.owner)) + print( + f"\nTotal Collected Objects ({len(new_objects)}) {Counter(new_objects)}" + ) + assert len(tensor_objects) == 0, "Leaked Tensors" + + def test_cpu_memory_leak_async(self): + with TestTensor.object_collector(): + asyncio.run(self._async_test_runs()) + + def test_cpu_memory_leak_sync(self): + with TestTensor.object_collector(): + for _ in range(100): + owner = numpy.ones(2**27) + tensor = tritonserver.Tensor.from_dlpack(owner) + array = numpy.from_dlpack(tensor) + del owner + del tensor + del array + + @pytest.mark.skipif(cupy is None, 
reason="Skipping gpu memory, cupy not installed") + def test_gpu_memory_leak(self): + with TestTensor.object_collector(): + for _ in range(100): + owner = cupy.ones(2**27) + tensor = tritonserver.Tensor.from_dlpack(owner) + array = cupy.from_dlpack(tensor) + del owner + del tensor + del array + + def test_reference_counts(self): + with TestTensor.object_collector(): + owner = numpy.ones(2**27) + owner_data = owner.ctypes.data + assert sys.getrefcount(owner) - 1 == 1, "Invalid Count" + + tensor = tritonserver.Tensor.from_dlpack(owner) + assert sys.getrefcount(owner) - 1 == 2, "Invalid Count" + assert sys.getrefcount(tensor) - 1 == 1, "Invalid Count" + del owner + + numpy_array = numpy.from_dlpack(tensor) + assert owner_data == numpy_array.ctypes.data + assert sys.getrefcount(tensor) - 1 == 2, "Invalid Count" + assert sys.getrefcount(numpy_array) - 1 == 1, "Invalid Count" + + tensor.shape = [2, 2**26] + + assert numpy_array.shape == (2**27,), "Invalid Shape" + + numpy_array_2 = numpy.from_dlpack(tensor) + del tensor + assert owner_data == numpy_array.ctypes.data + assert numpy_array_2.shape == (2, 2**26) + del numpy_array + del numpy_array_2 + class TestServer: def test_not_started(self): diff --git a/python/tritonserver/_api/_tensor.py b/python/tritonserver/_api/_tensor.py index ee21abd59..afac87d9f 100644 --- a/python/tritonserver/_api/_tensor.py +++ b/python/tritonserver/_api/_tensor.py @@ -217,23 +217,8 @@ def __dlpack__(self, *, stream=None): self._sync_on_requested_stream(stream) - dl_managed_tensor = Tensor._create_managed_tensor() - dl_managed_tensor.dl_tensor.data = self.data_ptr - dl_managed_tensor.dl_tensor.device = DLDevice( - TRITON_MEMORY_TYPE_TO_DLPACK_DEVICE_TYPE[self.memory_type], - self.memory_type_id, - ) + dl_managed_tensor = self._create_managed_tensor() - dl_managed_tensor.dl_tensor.dtype = TRITON_TO_DLPACK_DTYPE[self.data_type] - dl_managed_tensor.dl_tensor.ndim = len(self.shape) - dl_managed_tensor.dl_tensor.shape = (ctypes.c_int64 * len(self.shape))( - *self.shape - ) - dl_managed_tensor.dl_tensor.strides = ctypes.POINTER(ctypes.c_int64)() - dl_managed_tensor.dl_tensor.byte_offset = 0 - dl_managed_tensor.deleter = Tensor._managed_tensor_deleter - - self._set_dlpack_manager_ctx(dl_managed_tensor) pycapsule = ctypes.pythonapi.PyCapsule_New( ctypes.byref(dl_managed_tensor), c_str_dltensor, @@ -600,26 +585,39 @@ def _from_numpy(obj: numpy.ndarray | numpy.generic) -> Tensor: size=obj.itemsize * obj.size, owner=obj, ) - return Tensor(data_type, shape, memory_buffer) - @staticmethod - def _create_managed_tensor(): + def _create_managed_tensor(self) -> DLManagedTensor: + # Allocates space for a managed tensor object + # and fills in the fields + # + # To ensure the lifetime of the managed tensor we create a + # context object that includes a newly created shape array and a + # reference to self + size = ctypes.c_size_t(ctypes.sizeof(DLManagedTensor)) address = ctypes.pythonapi.PyMem_RawMalloc(size) - return DLManagedTensor.from_address(address) + dl_managed_tensor = DLManagedTensor.from_address(address) + dl_managed_tensor.dl_tensor.data = self.data_ptr + dl_managed_tensor.dl_tensor.device = DLDevice( + TRITON_MEMORY_TYPE_TO_DLPACK_DEVICE_TYPE[self.memory_type], + self.memory_type_id, + ) + dl_managed_tensor.dl_tensor.dtype = TRITON_TO_DLPACK_DTYPE[self.data_type] + dl_managed_tensor.dl_tensor.ndim = len(self.shape) + manager_ctx = _ManagerCtx(self) + dl_managed_tensor.dl_tensor.shape = manager_ctx.shape + dl_managed_tensor.dl_tensor.strides = manager_ctx.strides + 
dl_managed_tensor.dl_tensor.byte_offset = 0 + dl_managed_tensor.deleter = Tensor._managed_tensor_deleter + dl_managed_tensor.manager_ctx = manager_ctx.reference() + return dl_managed_tensor @staticmethod @ctypes.CFUNCTYPE(None, ctypes.c_void_p) def _managed_tensor_deleter(handle: int) -> None: dl_managed_tensor = DLManagedTensor.from_address(handle) - tensor_obj_ptr = ctypes.cast( - dl_managed_tensor.manager_ctx, ctypes.POINTER(ctypes.py_object) - ) - tensor_obj = tensor_obj_ptr.contents - ctypes.pythonapi.Py_DecRef(tensor_obj) - shape_obj = ctypes.py_object(dl_managed_tensor.dl_tensor.shape) - ctypes.pythonapi.Py_DecRef(shape_obj) + _ManagerCtx.release(dl_managed_tensor.manager_ctx) ctypes.pythonapi.PyMem_RawFree(handle) @staticmethod @@ -639,14 +637,36 @@ def _pycapsule_deleter(handle: ctypes.c_void_p) -> None: print(f"Exception occurred while deleting capsule: {e}") raise e - def _set_dlpack_manager_ctx(self, dl_managed_tensor): - tensor_obj = ctypes.py_object(self) - tensor_obj_ptr = ctypes.pointer(tensor_obj) - dl_managed_tensor.manager_ctx = ctypes.cast(tensor_obj_ptr, ctypes.c_void_p) - shape_obj = ctypes.py_object(dl_managed_tensor.dl_tensor.shape) - ctypes.pythonapi.Py_IncRef(tensor_obj) - ctypes.pythonapi.Py_IncRef(shape_obj) - _from_converters: ClassVar[dict[type, Callable[[Any], Tensor]]] = dict( {numpy.ndarray: _from_numpy, numpy.generic: _from_numpy, list: _from_list}, ) + + +class _ManagerCtx: + # To ensure the lifetime of the managed tensor we create a + # context object that includes a newly created shape array and a + # reference to self + + def __init__(self, tensor: Tensor) -> None: + self._tensor = tensor + self.shape = (ctypes.c_int64 * len(tensor.shape))(*tensor.shape) + self.strides = ctypes.POINTER(ctypes.c_int64)() + + def reference(self) -> ctypes.c_void_p: + py_obj = ctypes.py_object(self) + ctypes.pythonapi.Py_IncRef(py_obj) + + # Note: Could not find a direct way to cast a python object + # to a c_void_p. The mechanism is to either use id(self) or + # cast as described here: + # + # https://groups.google.com/g/dev-python/c/QRRqVC7gkf4/m/zH7l1gTXBwAJ + # + # To avoid relying on the behavior of id() we use the casting mechanism + + return ctypes.POINTER(ctypes.c_void_p)(py_obj)[0] + + @staticmethod + def release(reference: ctypes.c_void_p) -> None: + py_obj = ctypes.cast(reference, ctypes.py_object) + ctypes.pythonapi.Py_DecRef(py_obj) From eeb283a34c8da06c275dd003f2860aaac43f5b06 Mon Sep 17 00:00:00 2001 From: Yingge He <157551214+yinggeh@users.noreply.github.com> Date: Wed, 8 Jan 2025 11:10:05 -0800 Subject: [PATCH 8/8] fix: Validate request correlation ID data type (#425) --- src/constants.h | 3 +- src/infer_request.cc | 62 ++++++++++++++++++- src/infer_request.h | 6 +- .../sequence_batch_scheduler.cc | 8 +-- 4 files changed, 69 insertions(+), 10 deletions(-) diff --git a/src/constants.h b/src/constants.h index 8415f8ee9..119d1e9d2 100644 --- a/src/constants.h +++ b/src/constants.h @@ -1,4 +1,4 @@ -// Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2018-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -92,7 +92,6 @@ constexpr uint64_t NANOS_PER_SECOND = 1000000000; constexpr uint64_t NANOS_PER_MILLIS = 1000000; constexpr int MAX_GRPC_MESSAGE_SIZE = INT32_MAX; constexpr uint64_t SEQUENCE_IDLE_DEFAULT_MICROSECONDS = 1000 * 1000; -constexpr size_t STRING_CORRELATION_ID_MAX_LENGTH_BYTES = 128; constexpr size_t CUDA_IPC_STRUCT_SIZE = 64; #ifdef TRITON_ENABLE_METRICS diff --git a/src/infer_request.cc b/src/infer_request.cc index 0d0c80a0d..83b3bb872 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -1,4 +1,4 @@ -// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -1228,11 +1228,16 @@ InferenceRequest::Normalize() } } } + + if (model_config.has_sequence_batching()) { + RETURN_IF_ERROR(ValidateCorrelationId()); + } + return Status::Success; } Status -InferenceRequest::ValidateRequestInputs() +InferenceRequest::ValidateRequestInputs() const { const inference::ModelConfig& model_config = model_raw_->Config(); if ((original_inputs_.size() > (size_t)model_config.input_size()) || @@ -1404,6 +1409,59 @@ InferenceRequest::ValidateBytesInputs( return Status::Success; } +Status +InferenceRequest::ValidateCorrelationId() const +{ + const inference::ModelConfig& model_config = model_raw_->Config(); + const std::string& model_name = ModelName(); + std::string correlation_id_tensor_name; + inference::DataType correlation_id_datatype; + + RETURN_IF_ERROR(GetTypedSequenceControlProperties( + model_config.sequence_batching(), model_config.name(), + inference::ModelSequenceBatching::Control::CONTROL_SEQUENCE_CORRID, + false /* required */, &correlation_id_tensor_name, + &correlation_id_datatype)); + + // Make sure request correlation ID type matches model configuration. 
+ if (!correlation_id_tensor_name.empty()) { + const auto& correlation_id = CorrelationId(); + bool dtypes_match = true; + std::string request_corrid_datatype; + if ((correlation_id.Type() == + InferenceRequest::SequenceId::DataType::STRING) && + (correlation_id_datatype != inference::DataType::TYPE_STRING)) { + dtypes_match = false; + request_corrid_datatype = triton::common::DataTypeToProtocolString( + inference::DataType::TYPE_STRING); + } else if ( + (correlation_id.Type() == + InferenceRequest::SequenceId::DataType::UINT64) && + ((correlation_id_datatype != inference::DataType::TYPE_UINT64) && + (correlation_id_datatype != inference::DataType::TYPE_INT64) && + (correlation_id_datatype != inference::DataType::TYPE_UINT32) && + (correlation_id_datatype != inference::DataType::TYPE_INT32))) { + dtypes_match = false; + request_corrid_datatype = triton::common::DataTypeToProtocolString( + inference::DataType::TYPE_UINT64); + } + + if (!dtypes_match) { + return Status( + Status::Code::INVALID_ARG, + LogRequest() + "sequence batching control '" + + correlation_id_tensor_name + "' data-type is '" + + request_corrid_datatype + "', but model '" + model_name + + "' expects '" + + std::string(triton::common::DataTypeToProtocolString( + correlation_id_datatype)) + + "'"); + } + } + + return Status::Success; +} + #ifdef TRITON_ENABLE_STATS void diff --git a/src/infer_request.h b/src/infer_request.h index 38c89ed63..e9bfa49bc 100644 --- a/src/infer_request.h +++ b/src/infer_request.h @@ -1,4 +1,4 @@ -// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -771,13 +771,15 @@ class InferenceRequest { Status Normalize(); // Helper for validating Inputs - Status ValidateRequestInputs(); + Status ValidateRequestInputs() const; Status ValidateBytesInputs( const std::string& input_id, const Input& input, const std::string& model_name, TRITONSERVER_MemoryType* buffer_memory_type) const; + Status ValidateCorrelationId() const; + // Helpers for pending request metrics void IncrementPendingRequestCount(); void DecrementPendingRequestCount(); diff --git a/src/sequence_batch_scheduler/sequence_batch_scheduler.cc b/src/sequence_batch_scheduler/sequence_batch_scheduler.cc index 74314e7ab..45e9c037c 100644 --- a/src/sequence_batch_scheduler/sequence_batch_scheduler.cc +++ b/src/sequence_batch_scheduler/sequence_batch_scheduler.cc @@ -1,4 +1,4 @@ -// Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2018-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -1343,9 +1343,9 @@ SequenceBatch::SetControlTensors( auto& seq_corr_id = seq_slot_corrid_override_; size_t size_p = triton::common::GetDataTypeByteSize(seq_corr_id->DType()); if (seq_corr_id->DType() == inference::DataType::TYPE_STRING) { - // 4 bytes for length of string plus pre-defined max string correlation id - // length in bytes - size_p = 4 + triton::core::STRING_CORRELATION_ID_MAX_LENGTH_BYTES; + // 4 bytes for length of string plus string correlation id length in + // bytes. + size_p = 4 + corrid.StringValue().length(); } TRITONSERVER_MemoryType memory_type;
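
For reference, the compatibility rule enforced by the new ValidateCorrelationId helper above can be summarized compactly. The sketch below mirrors the patch's logic in Python and is not the Triton implementation; the function and set names are hypothetical, while the datatype strings are the model-config values used in the diff.

    INTEGRAL_CORRID_DTYPES = {"TYPE_UINT64", "TYPE_INT64", "TYPE_UINT32", "TYPE_INT32"}

    def corrid_dtype_matches(request_corrid, control_dtype: str) -> bool:
        # A string correlation ID requires the model's CONTROL_SEQUENCE_CORRID
        # tensor to be declared as TYPE_STRING; an integer (uint64) correlation
        # ID is accepted for any of the integral control datatypes.
        if isinstance(request_corrid, str):
            return control_dtype == "TYPE_STRING"
        return control_dtype in INTEGRAL_CORRID_DTYPES

    # A string correlation ID sent to a model whose control tensor is
    # TYPE_UINT64 is rejected with INVALID_ARG, as in the new error path.
    assert not corrid_dtype_matches("session-1", "TYPE_UINT64")
    assert corrid_dtype_matches(42, "TYPE_INT32")
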