From 57ab1145e3b6a60fc6880370961958130e30b530 Mon Sep 17 00:00:00 2001 From: Francesco Petrini Date: Tue, 10 Dec 2024 09:59:28 -0800 Subject: [PATCH 1/8] Pin Pybind Version (#418) * aligning the pybind versions to v2.12 for core and python_backend * Review comment --------- Co-authored-by: Kyle McGill --- python/tritonserver/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tritonserver/CMakeLists.txt b/python/tritonserver/CMakeLists.txt index c958619d8..c06292451 100644 --- a/python/tritonserver/CMakeLists.txt +++ b/python/tritonserver/CMakeLists.txt @@ -43,8 +43,8 @@ include(FetchContent) FetchContent_Declare( pybind11 GIT_REPOSITORY "https://github.com/pybind/pybind11" - # COMMIT ID for v2.10.0 - GIT_TAG "aa304c9c7d725ffb9d10af08a3b34cb372307020" + # COMMIT ID for v2.12.0 + GIT_TAG "3e9dfa2866941655c56877882565e7577de6fc7b" GIT_SHALLOW ON ) FetchContent_MakeAvailable(pybind11) From 8030611f0da50ef541e06729a3dc170cf0040a34 Mon Sep 17 00:00:00 2001 From: pranavm-nvidia <49246958+pranavm-nvidia@users.noreply.github.com> Date: Fri, 13 Dec 2024 17:38:56 -0500 Subject: [PATCH 2/8] ci: Add GitHub action for core build and python testing in pull requests (#416) --- .github/workflows/build-and-test.yml | 40 ++++++++++++++++++++++++++++ python/test/test_api.py | 6 +---- src/test/input_byte_size_test.cc | 2 +- 3 files changed, 42 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/build-and-test.yml diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml new file mode 100644 index 000000000..26bbcfd21 --- /dev/null +++ b/.github/workflows/build-and-test.yml @@ -0,0 +1,40 @@ +name: Build And Test + +on: + pull_request: + branches: + - main + types: [synchronize, opened, reopened, ready_for_review] + + +jobs: + test: + runs-on: ubuntu-latest + container: + image: nvcr.io/nvidia/tritonserver:24.10-py3 + volumes: + - ${{ github.workspace }}:/core + + steps: + - uses: actions/checkout@v3 + + - name: Install dependencies + run: | + apt update + apt install -y --no-install-recommends clang-format-15 cmake libb64-dev rapidjson-dev libre2-dev + wget -O /tmp/boost.tar.gz https://archives.boost.io/release/1.80.0/source/boost_1_80_0.tar.gz && (cd /tmp && tar xzf boost.tar.gz) && mv /tmp/boost_1_80_0/boost /usr/include/boost + pip install build pytest + + - name: Build + run: | + mkdir -p /core/build + cd /core/build + cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install -DTRITON_CORE_HEADERS_ONLY=OFF .. + export TRITON_PYBIND="_c/triton_bindings.cpython-310-x86_64-linux-gnu.so" + make -j8 + + - name: Run tests with pytest + run: | + cd /core + python3 -m pip install --force-reinstall build/python/generic/wheel/dist/tritonserver-*.whl + pytest python/test -v diff --git a/python/test/test_api.py b/python/test/test_api.py index c15847aab..af910e71d 100644 --- a/python/test/test_api.py +++ b/python/test/test_api.py @@ -357,11 +357,7 @@ def test_stop(self): { "backend": "python", "parameters": {"decoupled": {"string_value": "False"}}, - # Keep instance count low for fast startup/cleanup. - # Alternatively can use KIND_CPU here, but keeping gpus/count explicit. 
- "instance_group": [ - {"kind": "KIND_GPU", "gpus": [0], "count": 1} - ], + "instance_group": [{"kind": "KIND_CPU"}], } ) }, diff --git a/src/test/input_byte_size_test.cc b/src/test/input_byte_size_test.cc index cf3e3bd58..1774fe7b4 100644 --- a/src/test/input_byte_size_test.cc +++ b/src/test/input_byte_size_test.cc @@ -378,7 +378,7 @@ TEST_F(InputByteSizeTest, InputByteSizeLarge) "setting request release callback"); // Define input shape and data - size_t element_cnt = (1LL << 31) / sizeof(float); + int64_t element_cnt = (1LL << 31) / sizeof(float); std::vector shape{1, element_cnt}; std::vector input_data(element_cnt, 1); const auto input0_byte_size = sizeof(input_data[0]) * input_data.size(); From 132f1d47864b8c29e37f0ad4e1752e64c8b676dc Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Fri, 13 Dec 2024 14:52:08 -0800 Subject: [PATCH 3/8] fix: Add missing struct keywords to fix support for auto-generated rust bindings from C APIs (#417) --- include/triton/core/tritonserver.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/triton/core/tritonserver.h b/include/triton/core/tritonserver.h index d9701e890..efbac44be 100644 --- a/include/triton/core/tritonserver.h +++ b/include/triton/core/tritonserver.h @@ -847,9 +847,9 @@ TRITONSERVER_InferenceTraceTensorNew( /// \param timestamp The timestamp associated with the trace activity. /// \param name The trace activity name. /// \return a TRITONSERVER_Error indicating success or failure. -TRITONSERVER_DECLSPEC TRITONSERVER_Error* +TRITONSERVER_DECLSPEC struct TRITONSERVER_Error* TRITONSERVER_InferenceTraceReportActivity( - TRITONSERVER_InferenceTrace* trace, uint64_t timestamp, + struct TRITONSERVER_InferenceTrace* trace, uint64_t timestamp, const char* activity_name); /// Delete a trace object. @@ -1938,9 +1938,9 @@ TRITONSERVER_ServerOptionsSetCudaMemoryPoolByteSize( /// \param gpu_device The GPU device to set the CUDA virtual address space size /// \param size The size of the CUDA virtual address space. /// \return a TRITONSERVER_Error indicating success or failure. -TRITONSERVER_DECLSPEC TRITONSERVER_Error* +TRITONSERVER_DECLSPEC struct TRITONSERVER_Error* TRITONSERVER_ServerOptionsSetCudaVirtualAddressSize( - TRITONSERVER_ServerOptions* options, int gpu_device, + struct TRITONSERVER_ServerOptions* options, int gpu_device, size_t cuda_virtual_address_size); /// Deprecated. See TRITONSERVER_ServerOptionsSetCacheConfig instead. From f7ff33f61f527a22c71387665aba41488324a4a5 Mon Sep 17 00:00:00 2001 From: pranavm-nvidia <49246958+pranavm-nvidia@users.noreply.github.com> Date: Fri, 13 Dec 2024 18:51:47 -0500 Subject: [PATCH 4/8] refactor: Migrates Python tests to pytest (#413) * Migrates test_api.py to pytest Migrates test_api.py to pytest and removes `unittest`. Changes setup functions to fixtures and replaces `unittest` assertion methods with regular Python `assert`s. Also lowers the timeout for the server to make the tests run a bit faster. * Updates test_binding.py to use pytest Replaces all `unittest` APIs with equivalent `pytest` ones. This change also updates the tests to use `tempfile` instead of manually creating and removing files and directories. 
* Parametrizes tests instead of running loops * Updates L0 job to free space on host --- .github/workflows/build-and-test.yml | 14 +- python/test/test_api.py | 225 +++++++--------- python/test/test_binding.py | 375 +++++++++++++-------------- 3 files changed, 294 insertions(+), 320 deletions(-) diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 26bbcfd21..2ab3d0658 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -14,15 +14,27 @@ jobs: image: nvcr.io/nvidia/tritonserver:24.10-py3 volumes: - ${{ github.workspace }}:/core + # Mount /usr so we can free space + - /usr:/host_usr + env: + AGENT_TOOLSDIRECTORY: "$AGENT_TOOLSDIRECTORY" steps: - uses: actions/checkout@v3 + - name: Free space + run: | + rm -rf \ + /host_usr/share/dotnet /host_usr/local/lib/android /opt/ghc \ + /host_usr/local/share/powershell /host_usr/share/swift /host_usr/local/.ghcup \ + /host_usr/lib/jvm + rm -rf "$AGENT_TOOLSDIRECTORY" + - name: Install dependencies run: | apt update apt install -y --no-install-recommends clang-format-15 cmake libb64-dev rapidjson-dev libre2-dev - wget -O /tmp/boost.tar.gz https://archives.boost.io/release/1.80.0/source/boost_1_80_0.tar.gz && (cd /tmp && tar xzf boost.tar.gz) && mv /tmp/boost_1_80_0/boost /usr/include/boost + wget -O /tmp/boost.tar.gz https://archives.boost.io/release/1.80.0/source/boost_1_80_0.tar.gz && (cd /tmp && tar xzf boost.tar.gz) && mv /tmp/boost_1_80_0/boost /usr/include/boost && rm /tmp/boost.tar.gz pip install build pytest - name: Build diff --git a/python/test/test_api.py b/python/test/test_api.py index af910e71d..68aa7a318 100644 --- a/python/test/test_api.py +++ b/python/test/test_api.py @@ -24,14 +24,9 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import asyncio -import copy import json import os -import queue import shutil -import time -import unittest import numpy import pytest @@ -50,46 +45,44 @@ except ImportError: torch = None -module_directory = os.path.split(os.path.abspath(__file__))[0] -test_model_directory = os.path.abspath( - os.path.join(module_directory, "test_api_models") -) -test_logs_directory = os.path.abspath(os.path.join(module_directory, "test_api_logs")) - -shutil.rmtree(test_logs_directory, ignore_errors=True) - -os.makedirs(test_logs_directory) - -server_options = tritonserver.Options( - server_id="TestServer", - model_repository=test_model_directory, - log_verbose=6, - log_error=True, - log_warn=True, - log_info=True, - exit_on_error=True, - strict_model_config=False, - model_control_mode=tritonserver.ModelControlMode.EXPLICIT, - exit_timeout=30, -) - - -class ModelTests(unittest.TestCase): - def setup_method(self, method): - self._server_options = copy.copy(server_options) - self._server_options.log_file = os.path.join( - test_logs_directory, method.__name__ + ".server.log" - ) +TEST_ROOT = os.path.abspath(os.path.dirname(__file__)) +TEST_MODEL_DIR = os.path.abspath(os.path.join(TEST_ROOT, "test_api_models")) +TEST_LOGS_DIR = os.path.abspath(os.path.join(TEST_ROOT, "test_api_logs")) + + +@pytest.fixture(autouse=True, scope="module") +def create_log_dir(): + shutil.rmtree(TEST_LOGS_DIR, ignore_errors=True) + os.makedirs(TEST_LOGS_DIR) + + +@pytest.fixture() +def server_options(request): + return tritonserver.Options( + server_id="TestServer", + model_repository=TEST_MODEL_DIR, + log_verbose=6, + log_error=True, + log_warn=True, + log_info=True, + exit_on_error=True, + strict_model_config=False, + model_control_mode=tritonserver.ModelControlMode.EXPLICIT, + exit_timeout=5, + log_file=os.path.join(TEST_LOGS_DIR, request.node.name + ".server.log"), + ) + - def test_create_request(self): - server = tritonserver.Server(self._server_options).start(wait_until_ready=True) +class TestModels: + def test_create_request(self, server_options): + server = tritonserver.Server(server_options).start(wait_until_ready=True) request = server.models()["test"].create_request() request = tritonserver.InferenceRequest(server.model("test")) -class AllocatorTests(unittest.TestCase): +class TestAllocators: class MockMemoryAllocator(tritonserver.MemoryAllocator): def __init__(self): pass @@ -97,17 +90,11 @@ def __init__(self): def allocate(self, *args, **kwargs): raise Exception("foo") - def setup_method(self, method): - self._server_options = copy.copy(server_options) - self._server_options.log_file = os.path.join( - test_logs_directory, method.__name__ + ".server.log" - ) - @pytest.mark.skipif(cupy is None, reason="Skipping gpu memory, cupy not installed") - def test_memory_fallback_to_cpu(self): - server = tritonserver.Server(self._server_options).start(wait_until_ready=True) + def test_memory_fallback_to_cpu(self, server_options): + server = tritonserver.Server(server_options).start(wait_until_ready=True) - self.assertTrue(server.ready()) + assert server.ready() allocator = tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU] @@ -133,18 +120,19 @@ def test_memory_fallback_to_cpu(self): for response in server.model("test").infer( inputs={"fp16_input": fp16_input}, ): - self.assertEqual( - response.outputs["fp16_output"].memory_type, tritonserver.MemoryType.CPU + assert ( + response.outputs["fp16_output"].memory_type + == tritonserver.MemoryType.CPU ) fp16_output = numpy.from_dlpack(response.outputs["fp16_output"]) - 
self.assertEqual(fp16_input[0][0], fp16_output[0][0]) + assert fp16_input[0][0] == fp16_output[0][0] tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU] = allocator - def test_memory_allocator_exception(self): - server = tritonserver.Server(self._server_options).start(wait_until_ready=True) + def test_memory_allocator_exception(self, server_options): + server = tritonserver.Server(server_options).start(wait_until_ready=True) - self.assertTrue(server.ready()) + assert server.ready() server.load( "test", @@ -158,20 +146,20 @@ def test_memory_allocator_exception(self): }, ) - with self.assertRaises(tritonserver.InternalError): + with pytest.raises(tritonserver.InternalError): for response in server.model("test").infer( inputs={ "string_input": tritonserver.Tensor.from_string_array([["hello"]]) }, output_memory_type="gpu", - output_memory_allocator=AllocatorTests.MockMemoryAllocator(), + output_memory_allocator=TestAllocators.MockMemoryAllocator(), ): pass - def test_unsupported_memory_type(self): - server = tritonserver.Server(self._server_options).start(wait_until_ready=True) + def test_unsupported_memory_type(self, server_options): + server = tritonserver.Server(server_options).start(wait_until_ready=True) - self.assertTrue(server.ready()) + assert server.ready() server.load( "test", @@ -194,7 +182,7 @@ def test_unsupported_memory_type(self): else: allocator = None - with self.assertRaises(tritonserver.InvalidArgumentError): + with pytest.raises(tritonserver.InvalidArgumentError): for response in server.model("test").infer( inputs={ "string_input": tritonserver.Tensor.from_string_array([["hello"]]) @@ -218,7 +206,7 @@ def test_allocate_on_cpu_and_reshape(self): cpu_array = memory_buffer.owner - self.assertEqual(memory_buffer.size, 200) + assert memory_buffer.size == 200 fp32_size = int(memory_buffer.size / 4) @@ -227,16 +215,13 @@ def test_allocate_on_cpu_and_reshape(self): ) cpu_fp32_array = numpy.from_dlpack(tensor) - self.assertEqual(cpu_array.ctypes.data, cpu_fp32_array.ctypes.data) - self.assertEqual(cpu_fp32_array.dtype, numpy.float32) - self.assertEqual(cpu_fp32_array.nbytes, 200) + assert cpu_array.ctypes.data == cpu_fp32_array.ctypes.data + assert cpu_fp32_array.dtype == numpy.float32 + assert cpu_fp32_array.nbytes == 200 @pytest.mark.skipif(cupy is None, reason="Skipping gpu memory, cupy not installed") @pytest.mark.skipif(torch is None, reason="Skipping test, torch not installed") def test_allocate_on_gpu_and_reshape(self): - if cupy is None: - return - allocator = tritonserver.default_memory_allocators[tritonserver.MemoryType.GPU] memory_buffer = allocator.allocate( @@ -248,7 +233,7 @@ def test_allocate_on_gpu_and_reshape(self): gpu_array = cupy.empty([10, 20], dtype=cupy.uint8) memory_buffer = tritonserver.MemoryBuffer.from_dlpack(gpu_array) - self.assertEqual(memory_buffer.size, 200) + assert memory_buffer.size == 200 fp32_size = int(memory_buffer.size / 4) @@ -257,55 +242,51 @@ def test_allocate_on_gpu_and_reshape(self): ) gpu_fp32_array = cupy.from_dlpack(tensor) - self.assertEqual( - gpu_array.__cuda_array_interface__["data"][0], - gpu_fp32_array.__cuda_array_interface__["data"][0], + assert ( + gpu_array.__cuda_array_interface__["data"][0] + == gpu_fp32_array.__cuda_array_interface__["data"][0] ) - self.assertEqual(gpu_fp32_array.dtype, cupy.float32) - self.assertEqual(gpu_fp32_array.nbytes, 200) + + assert gpu_fp32_array.dtype == cupy.float32 + assert gpu_fp32_array.nbytes == 200 torch_fp32_tensor = torch.from_dlpack(tensor) - 
self.assertEqual(torch_fp32_tensor.dtype, torch.float32) - self.assertEqual( - torch_fp32_tensor.data_ptr(), gpu_array.__cuda_array_interface__["data"][0] + assert torch_fp32_tensor.dtype == torch.float32 + assert ( + torch_fp32_tensor.data_ptr() + == gpu_array.__cuda_array_interface__["data"][0] ) - self.assertEqual(torch_fp32_tensor.nbytes, 200) + assert torch_fp32_tensor.nbytes == 200 -class TensorTests(unittest.TestCase): +class TestTensor: @pytest.mark.skipif(cupy is None, reason="Skipping gpu memory, cupy not installed") def test_cpu_to_gpu(self): - if cupy is None: - return cpu_array = numpy.random.rand(1, 3, 100, 100).astype(numpy.float32) cpu_tensor = tritonserver.Tensor.from_dlpack(cpu_array) gpu_tensor = cpu_tensor.to_device("gpu:0") gpu_array = cupy.from_dlpack(gpu_tensor) - self.assertEqual(gpu_array.device, cupy.cuda.Device(0)) + assert gpu_array.device == cupy.cuda.Device(0) numpy.testing.assert_array_equal(cpu_array, gpu_array.get()) memory_buffer = tritonserver.MemoryBuffer.from_dlpack(gpu_array) - self.assertEqual( - gpu_array.__cuda_array_interface__["data"][0], memory_buffer.data_ptr - ) + assert gpu_array.__cuda_array_interface__["data"][0] == memory_buffer.data_ptr @pytest.mark.skipif( torch is None, reason="Skipping gpu memory, torch not installed" ) @pytest.mark.skipif(cupy is None, reason="Skipping gpu memory, cupy not installed") def test_gpu_tensor_from_dl_pack(self): - if cupy is None or torch is None: - return cupy_array = cupy.ones([100]).astype(cupy.float64) tensor = tritonserver.Tensor.from_dlpack(cupy_array) torch_tensor = torch.from_dlpack(cupy_array) - self.assertEqual(torch_tensor.data_ptr(), tensor.data_ptr) - self.assertEqual(torch_tensor.nbytes, tensor.size) - self.assertEqual(torch_tensor.__dlpack_device__(), tensor.__dlpack_device__()) + assert torch_tensor.data_ptr() == tensor.data_ptr + assert torch_tensor.nbytes == tensor.size + assert torch_tensor.__dlpack_device__() == tensor.__dlpack_device__() @pytest.mark.skipif(torch is None, reason="Skipping test, torch not installed") def test_tensor_from_numpy(self): @@ -313,42 +294,36 @@ def test_tensor_from_numpy(self): tensor = tritonserver.Tensor.from_dlpack(cpu_array) torch_tensor = torch.from_dlpack(tensor) numpy.testing.assert_array_equal(torch_tensor.numpy(), cpu_array) - self.assertEqual(torch_tensor.data_ptr(), cpu_array.ctypes.data) - + assert torch_tensor.data_ptr() == cpu_array.ctypes.data -class ServerTests(unittest.TestCase): - def setup_method(self, method): - self._server_options = copy.copy(server_options) - self._server_options.log_file = os.path.join( - test_logs_directory, method.__name__ + ".server.log" - ) +class TestServer: def test_not_started(self): server = tritonserver.Server() - with self.assertRaises(tritonserver.InvalidArgumentError): + with pytest.raises(tritonserver.InvalidArgumentError): server.ready() def test_invalid_option_type(self): server = tritonserver.Server(server_id=1) - with self.assertRaises(TypeError): + with pytest.raises(TypeError): server.start() server = tritonserver.Server(model_repository=1) - with self.assertRaises(TypeError): + with pytest.raises(TypeError): server.start() def test_invalid_repo(self): - with self.assertRaises(tritonserver.InternalError): + with pytest.raises(tritonserver.InternalError): tritonserver.Server(model_repository="foo").start() - def test_ready(self): - server = tritonserver.Server(self._server_options).start() - self.assertTrue(server.ready()) + def test_ready(self, server_options): + server = 
tritonserver.Server(server_options).start() + assert server.ready() - def test_stop(self): - server = tritonserver.Server(self._server_options).start(wait_until_ready=True) + def test_stop(self, server_options): + server = tritonserver.Server(server_options).start(wait_until_ready=True) - self.assertTrue(server.ready()) + assert server.ready() server.load( "test", @@ -376,22 +351,16 @@ def test_stop(self): server.stop() def test_model_repository_not_specified(self): - with self.assertRaises(tritonserver.InvalidArgumentError): + with pytest.raises(tritonserver.InvalidArgumentError): tritonserver.Server(model_repository=None).start() -class InferenceTests(unittest.TestCase): - def setup_method(self, method): - self._server_options = copy.copy(server_options) - self._server_options.log_file = os.path.join( - test_logs_directory, method.__name__ + ".server.log" - ) - +class TestInference: @pytest.mark.skipif(cupy is None, reason="Skipping gpu memory, cupy not installed") - def test_gpu_output(self): - server = tritonserver.Server(self._server_options).start(wait_until_ready=True) + def test_gpu_output(self, server_options): + server = tritonserver.Server(server_options).start(wait_until_ready=True) - self.assertTrue(server.ready()) + assert server.ready() server.load( "test", @@ -412,14 +381,14 @@ def test_gpu_output(self): output_memory_type="gpu", ): fp16_output = cupy.from_dlpack(response.outputs["fp16_output"]) - self.assertEqual(fp16_input[0][0], fp16_output[0][0]) + assert fp16_input[0][0] == fp16_output[0][0] for response in server.model("test").infer( inputs={"string_input": [["hello"]]}, output_memory_type="gpu", ): text_output = response.outputs["string_output"].to_string_array() - self.assertEqual(text_output[0][0], "hello") + assert text_output[0][0] == "hello" for response in server.model("test").infer( inputs={"string_input": tritonserver.Tensor.from_string_array([["hello"]])}, @@ -427,12 +396,12 @@ def test_gpu_output(self): ): text_output = response.outputs["string_output"].to_string_array() text_output = response.outputs["string_output"].to_string_array() - self.assertEqual(text_output[0][0], "hello") + assert text_output[0][0] == "hello" - def test_basic_inference(self): - server = tritonserver.Server(self._server_options).start(wait_until_ready=True) + def test_basic_inference(self, server_options): + server = tritonserver.Server(server_options).start(wait_until_ready=True) - self.assertTrue(server.ready()) + assert server.ready() server.load( "test", @@ -475,10 +444,10 @@ def test_basic_inference(self): ) numpy.testing.assert_array_equal(input_value, output_value) - def test_parameters(self): - server = tritonserver.Server(self._server_options).start(wait_until_ready=True) + def test_parameters(self, server_options): + server = tritonserver.Server(server_options).start(wait_until_ready=True) - self.assertTrue(server.ready()) + assert server.ready() server.load( "test", @@ -513,7 +482,7 @@ def test_parameters(self): ) assert input_parameters == output_parameters - with self.assertRaises(tritonserver.InvalidArgumentError): + with pytest.raises(tritonserver.InvalidArgumentError): input_parameters = { "invalid": {"test": 1}, } @@ -525,7 +494,7 @@ def test_parameters(self): raise_on_error=True, ) - with self.assertRaises(tritonserver.InvalidArgumentError): + with pytest.raises(tritonserver.InvalidArgumentError): input_parameters = { "invalid": None, } diff --git a/python/test/test_binding.py b/python/test/test_binding.py index 8f084bec5..143e55f50 100644 --- 
a/python/test/test_binding.py +++ b/python/test/test_binding.py @@ -28,10 +28,11 @@ import json import os import queue -import shutil -import unittest +import re +import tempfile import numpy +import pytest from tritonserver import _c as triton_bindings @@ -226,22 +227,18 @@ def execute(self, requests): # ======================================= Test cases =========================== -class BindingTest(unittest.TestCase): - def setUp(self): - self._test_model_repo = os.path.join(os.getcwd(), "binding_test_repo") - # clear model repository that may be created for testing. - if os.path.exists(self._test_model_repo): - shutil.rmtree(self._test_model_repo) - os.makedirs(self._test_model_repo) - self._model_name = "addsub" - self._version = "1" - self._file_name = "model.py" - - def tearDown(self): +class TestBindings: + @pytest.fixture(autouse=True, scope="function") + def model_repo(self): + with tempfile.TemporaryDirectory() as repo: + self._test_model_repo = repo + self._model_name = "addsub" + self._version = "1" + self._file_name = "model.py" + + yield + gc.collect() - # clear model repository that may be created for testing. - if os.path.exists(self._test_model_repo): - shutil.rmtree(self._test_model_repo) # helper functions def _to_pyobject(self, triton_message): @@ -315,8 +312,9 @@ def _prepare_inference_request(self, server): return request, allocator, response_queue, request_counter - def test_exceptions(self): - ex_list = [ + @pytest.mark.parametrize( + "ex_type", + [ triton_bindings.UnknownError, triton_bindings.InternalError, triton_bindings.NotFoundError, @@ -324,15 +322,15 @@ def test_exceptions(self): triton_bindings.UnavailableError, triton_bindings.UnsupportedError, triton_bindings.AlreadyExistsError, - ] - for ex_type in ex_list: - with self.assertRaises(triton_bindings.TritonError) as ctx: - raise ex_type("Error message") - self.assertTrue(isinstance(ctx.exception, ex_type)) - self.assertEqual(str(ctx.exception), "Error message") - - def test_data_type(self): - t_list = [ + ], + ) + def test_exceptions(self, ex_type): + with pytest.raises(ex_type, match="Error message") as ctx: + raise ex_type("Error message") + + @pytest.mark.parametrize( + "t, t_str, t_size", + [ (triton_bindings.TRITONSERVER_DataType.INVALID, "", 0), (triton_bindings.TRITONSERVER_DataType.BOOL, "BOOL", 1), (triton_bindings.TRITONSERVER_DataType.UINT8, "UINT8", 1), @@ -348,31 +346,35 @@ def test_data_type(self): (triton_bindings.TRITONSERVER_DataType.FP64, "FP64", 8), (triton_bindings.TRITONSERVER_DataType.BYTES, "BYTES", 0), (triton_bindings.TRITONSERVER_DataType.BF16, "BF16", 2), - ] - - for t, t_str, t_size in t_list: - self.assertEqual(triton_bindings.TRITONSERVER_DataTypeString(t), t_str) - self.assertEqual(triton_bindings.TRITONSERVER_StringToDataType(t_str), t) - self.assertEqual(triton_bindings.TRITONSERVER_DataTypeByteSize(t), t_size) - - def test_memory_type(self): - t_list = [ + ], + ) + def test_data_type(self, t, t_str, t_size): + assert triton_bindings.TRITONSERVER_DataTypeString(t) == t_str + assert triton_bindings.TRITONSERVER_StringToDataType(t_str) == t + assert triton_bindings.TRITONSERVER_DataTypeByteSize(t) == t_size + + @pytest.mark.parametrize( + "t, t_str", + [ (triton_bindings.TRITONSERVER_MemoryType.CPU, "CPU"), (triton_bindings.TRITONSERVER_MemoryType.CPU_PINNED, "CPU_PINNED"), (triton_bindings.TRITONSERVER_MemoryType.GPU, "GPU"), - ] - for t, t_str in t_list: - self.assertEqual(triton_bindings.TRITONSERVER_MemoryTypeString(t), t_str) - - def test_parameter_type(self): - t_list = 
[ + ], + ) + def test_memory_type(self, t, t_str): + assert triton_bindings.TRITONSERVER_MemoryTypeString(t) == t_str + + @pytest.mark.parametrize( + "t, t_str", + [ (triton_bindings.TRITONSERVER_ParameterType.STRING, "STRING"), (triton_bindings.TRITONSERVER_ParameterType.INT, "INT"), (triton_bindings.TRITONSERVER_ParameterType.BOOL, "BOOL"), (triton_bindings.TRITONSERVER_ParameterType.BYTES, "BYTES"), - ] - for t, t_str in t_list: - self.assertEqual(triton_bindings.TRITONSERVER_ParameterTypeString(t), t_str) + ], + ) + def test_parameter_type(self, t, t_str): + assert triton_bindings.TRITONSERVER_ParameterTypeString(t) == t_str def test_parameter(self): # C API doesn't provide additional API for parameter, can only test @@ -389,17 +391,17 @@ def test_parameter(self): del bytes_param gc.collect() - def test_instance_kind(self): - t_list = [ + @pytest.mark.parametrize( + "t, t_str", + [ (triton_bindings.TRITONSERVER_InstanceGroupKind.AUTO, "AUTO"), (triton_bindings.TRITONSERVER_InstanceGroupKind.CPU, "CPU"), (triton_bindings.TRITONSERVER_InstanceGroupKind.GPU, "GPU"), (triton_bindings.TRITONSERVER_InstanceGroupKind.MODEL, "MODEL"), - ] - for t, t_str in t_list: - self.assertEqual( - triton_bindings.TRITONSERVER_InstanceGroupKindString(t), t_str - ) + ], + ) + def test_instance_kind(self, t, t_str): + assert triton_bindings.TRITONSERVER_InstanceGroupKindString(t) == t_str def test_log(self): # This test depends on 'TRITONSERVER_ServerOptions' operates properly @@ -426,7 +428,7 @@ def test_log(self): (triton_bindings.TRITONSERVER_LogLevel.ERROR, True), (triton_bindings.TRITONSERVER_LogLevel.VERBOSE, False), ]: - self.assertEqual(triton_bindings.TRITONSERVER_LogIsEnabled(ll), enabled) + assert triton_bindings.TRITONSERVER_LogIsEnabled(ll) == enabled # Write message to each of the log level triton_bindings.TRITONSERVER_LogMessage( triton_bindings.TRITONSERVER_LogLevel.INFO, @@ -455,14 +457,14 @@ def test_log(self): with open(log_file, "r") as f: log = f.read() # Check level - self.assertRegex(log, r"filename:123.*info_message") - self.assertNotRegex(log, r"filename:456.*warn_message") - self.assertRegex(log, r"filename:789.*error_message") - self.assertNotRegex(log, r"filename:147.*verbose_message") + assert re.search(r"filename:123.*info_message", log) + assert not re.search(r"filename:456.*warn_message", log) + assert re.search(r"filename:789.*error_message", log) + assert not re.search(r"filename:147.*verbose_message", log) # Check format "MMDD hh:mm:ss.ssssss". 
- self.assertRegex(log, default_format_regex) + assert re.search(default_format_regex, log) # sanity check that there is no log with other format "YYYY-MM-DDThh:mm:ssZ L" - self.assertNotRegex(log, iso8601_format_regex) + assert not re.search(iso8601_format_regex, log) # Test different format options.set_log_format(triton_bindings.TRITONSERVER_LogFormat.ISO8601) triton_bindings.TRITONSERVER_LogMessage( @@ -470,8 +472,8 @@ def test_log(self): ) with open(log_file, "r") as f: log = f.read() - self.assertRegex(log, r"fn:258.*info_message") - self.assertRegex(log, iso8601_format_regex) + assert re.search(r"fn:258.*info_message", log) + assert re.search(iso8601_format_regex, log) finally: # Must make sure the log settings are reset as the logger is unique # within the process @@ -489,11 +491,11 @@ def test_buffer_attributes(self): expected_byte_size = 1024 buffer_attributes = triton_bindings.TRITONSERVER_BufferAttributes() buffer_attributes.memory_type_id = expected_memory_type_id - self.assertEqual(buffer_attributes.memory_type_id, expected_memory_type_id) + assert buffer_attributes.memory_type_id == expected_memory_type_id buffer_attributes.memory_type = expected_memory_type - self.assertEqual(buffer_attributes.memory_type, expected_memory_type) + assert buffer_attributes.memory_type == expected_memory_type buffer_attributes.byte_size = expected_byte_size - self.assertEqual(buffer_attributes.byte_size, expected_byte_size) + assert buffer_attributes.byte_size == expected_byte_size # cuda_ipc_handle is supposed to be cudaIpcMemHandle_t, must initialize buffer # of that size to avoid segfault. The handle getter/setter is different from other # attributes that different pointers may be returned from the getter, but the byte @@ -508,7 +510,7 @@ def test_buffer_attributes(self): buffer_attributes.cuda_ipc_handle ) for i in range(handle_byte_size): - self.assertEqual(int.from_bytes(res_arr[i], "big"), mock_handle[i]) + assert int.from_bytes(res_arr[i], "big") == mock_handle[i] def test_allocator(self): def alloc_fn( @@ -554,7 +556,7 @@ def buffer_fn( def test_message(self): expected_dict = {"key_0": [1, 2, "3"], "key_1": {"nested_key": "nested_value"}} message = triton_bindings.TRITONSERVER_Message(json.dumps(expected_dict)) - self.assertEqual(expected_dict, json.loads(message.serialize_to_json())) + assert expected_dict == json.loads(message.serialize_to_json()) def test_metrics(self): # This test depends on 'TRITONSERVER_Server' operates properly @@ -570,35 +572,39 @@ def test_metrics(self): server = triton_bindings.TRITONSERVER_Server(options) metrics = server.metrics() # Check one of the metrics is reported - self.assertTrue( - "nv_cpu_memory_used_bytes" - in metrics.formatted(triton_bindings.TRITONSERVER_MetricFormat.PROMETHEUS) + assert "nv_cpu_memory_used_bytes" in metrics.formatted( + triton_bindings.TRITONSERVER_MetricFormat.PROMETHEUS ) - def test_trace_enum(self): - t_list = [ + @pytest.mark.parametrize( + "t, t_str", + [ (triton_bindings.TRITONSERVER_InferenceTraceLevel.DISABLED, "DISABLED"), (triton_bindings.TRITONSERVER_InferenceTraceLevel.MIN, "MIN"), (triton_bindings.TRITONSERVER_InferenceTraceLevel.MAX, "MAX"), (triton_bindings.TRITONSERVER_InferenceTraceLevel.TIMESTAMPS, "TIMESTAMPS"), (triton_bindings.TRITONSERVER_InferenceTraceLevel.TENSORS, "TENSORS"), - ] - for t, t_str in t_list: - self.assertEqual( - triton_bindings.TRITONSERVER_InferenceTraceLevelString(t), t_str - ) + ], + ) + def test_trace_enum(self, t, t_str): + assert 
triton_bindings.TRITONSERVER_InferenceTraceLevelString(t) == t_str + + def test_trace_bitwise_operations(self): # bit-wise operation level = int(triton_bindings.TRITONSERVER_InferenceTraceLevel.TIMESTAMPS) | int( triton_bindings.TRITONSERVER_InferenceTraceLevel.TENSORS ) - self.assertNotEqual( - level & int(triton_bindings.TRITONSERVER_InferenceTraceLevel.TIMESTAMPS), 0 + assert ( + level & int(triton_bindings.TRITONSERVER_InferenceTraceLevel.TIMESTAMPS) + != 0 ) - self.assertNotEqual( - level & int(triton_bindings.TRITONSERVER_InferenceTraceLevel.TENSORS), 0 + assert ( + level & int(triton_bindings.TRITONSERVER_InferenceTraceLevel.TENSORS) != 0 ) - t_list = [ + @pytest.mark.parametrize( + "t, t_str", + [ ( triton_bindings.TRITONSERVER_InferenceTraceActivity.REQUEST_START, "REQUEST_START", @@ -639,11 +645,10 @@ def test_trace_enum(self): triton_bindings.TRITONSERVER_InferenceTraceActivity.TENSOR_BACKEND_OUTPUT, "TENSOR_BACKEND_OUTPUT", ), - ] - for t, t_str in t_list: - self.assertEqual( - triton_bindings.TRITONSERVER_InferenceTraceActivityString(t), t_str - ) + ], + ) + def test_trace_activity_enum(self, t, t_str): + assert triton_bindings.TRITONSERVER_InferenceTraceActivityString(t) == t_str def test_trace(self): # This test depends on 'test_infer_async' test to capture @@ -678,7 +683,7 @@ def test_trace(self): _ = trace_dict["signal_queue"].get(block=True, timeout=10) # check 'trace_dict' - self.assertTrue(trace_id in trace_dict) + assert trace_id in trace_dict # check activity are logged correctly, # value of 0 indicate it is timestamp trace, @@ -701,22 +706,22 @@ def test_trace(self): } for tl in trace_dict[trace_id]: # basic check - self.assertEqual(tl["id"], trace_id) - self.assertEqual(tl["parent_id"], 123) - self.assertEqual(tl["model_name"], self._model_name) - self.assertEqual(tl["model_version"], 1) - self.assertEqual(tl["request_id"], "req_0") - self.assertTrue(tl["activity"] in expected_activities) + assert tl["id"] == trace_id + assert tl["parent_id"] == 123 + assert tl["model_name"] == self._model_name + assert tl["model_version"] == 1 + assert tl["request_id"] == "req_0" + assert tl["activity"] in expected_activities if expected_activities[tl["activity"]] == 0: - self.assertTrue("timestamp" in tl) + assert "timestamp" in tl else: - self.assertTrue("tensor" in tl) + assert "tensor" in tl expected_activities[tl["activity"]] -= 1 if expected_activities[tl["activity"]] == 0: del expected_activities[tl["activity"]] # check if dict is empty to ensure the activity are logged in correct # amount. - self.assertFalse(bool(expected_activities)) + assert not (bool(expected_activities)) request_counter.get() def test_options(self): @@ -751,9 +756,10 @@ def test_options(self): triton_bindings.TRITONSERVER_InstanceGroupKind.CPU, triton_bindings.TRITONSERVER_InstanceGroupKind.MODEL, ]: - with self.assertRaises(triton_bindings.TritonError) as context: + with pytest.raises( + triton_bindings.TritonError, match="not supported" + ) as context: options.set_model_load_device_limit(k, 0, 0) - self.assertTrue("not supported" in str(context.exception)) # Backend options.set_backend_directory("backend_dir_0") @@ -780,7 +786,7 @@ def test_options(self): options.set_cache_directory("cache_dir_1") # Log try: - options.set_log_file("some_file") + options.set_log_file(tempfile.NamedTemporaryFile().name) options.set_log_info(True) options.set_log_warn(True) options.set_log_error(True) @@ -807,9 +813,11 @@ def test_options(self): options.set_metrics_config("metrics_group", "setting", "value") # Misc.. 
- with self.assertRaises(triton_bindings.TritonError) as context: + with pytest.raises( + triton_bindings.TritonError, match="Unsupported host policy setting" + ) as context: options.set_host_policy("policy_name", "setting", "value") - self.assertTrue("Unsupported host policy setting" in str(context.exception)) + options.set_repo_agent_directory("repo_agent_dir_0") options.set_repo_agent_directory("repo_agent_dir_1") options.set_buffer_manager_thread_count(4) @@ -817,48 +825,48 @@ def test_options(self): def test_server(self): server = self._start_polling_server() # is_live - self.assertTrue(server.is_live()) + assert server.is_live() # is_ready - self.assertTrue(server.is_ready()) + assert server.is_ready() # model_is_ready - self.assertTrue(server.model_is_ready(self._model_name, -1)) + assert server.model_is_ready(self._model_name, -1) # model_batch_properties expected_batch_properties = ( int(triton_bindings.TRITONSERVER_ModelBatchFlag.UNKNOWN), 0, ) - self.assertEqual( - server.model_batch_properties(self._model_name, -1), - expected_batch_properties, + assert ( + server.model_batch_properties(self._model_name, -1) + == expected_batch_properties ) # model_transaction_properties expected_transaction_policy = ( int(triton_bindings.TRITONSERVER_ModelTxnPropertyFlag.ONE_TO_ONE), 0, ) - self.assertEqual( - server.model_transaction_properties(self._model_name, -1), - expected_transaction_policy, + assert ( + server.model_transaction_properties(self._model_name, -1) + == expected_transaction_policy ) # metadata server_meta_data = self._to_pyobject(server.metadata()) - self.assertTrue("name" in server_meta_data) - self.assertEqual(server_meta_data["name"], "testing_server") + assert "name" in server_meta_data + assert server_meta_data["name"] == "testing_server" # model_metadata model_meta_data = self._to_pyobject(server.model_metadata(self._model_name, -1)) - self.assertTrue("name" in model_meta_data) - self.assertEqual(model_meta_data["name"], self._model_name) + assert "name" in model_meta_data + assert model_meta_data["name"] == self._model_name # model_statistics model_statistics = self._to_pyobject( server.model_statistics(self._model_name, -1) ) - self.assertTrue("model_stats" in model_statistics) + assert "model_stats" in model_statistics # model_config model_config = self._to_pyobject(server.model_config(self._model_name, -1, 1)) - self.assertTrue("input" in model_config) + assert "input" in model_config # model_index model_index = self._to_pyobject(server.model_index(0)) - self.assertEqual(model_index[0]["name"], self._model_name) + assert model_index[0]["name"] == self._model_name # metrics (see test_metrics) # infer_async (see test_infer_async) @@ -867,11 +875,10 @@ def test_request(self): # the request server = self._start_polling_server() - with self.assertRaises(triton_bindings.NotFoundError) as ctx: + with pytest.raises(triton_bindings.NotFoundError, match="unknown model") as ctx: _ = triton_bindings.TRITONSERVER_InferenceRequest( server, "not_existing_model", -1 ) - self.assertTrue("unknown model" in str(ctx.exception)) expected_request_id = "request" expected_flags = int( @@ -889,23 +896,24 @@ def test_request(self): # request metadata request.id = expected_request_id - self.assertEqual(request.id, expected_request_id) + assert request.id == expected_request_id request.flags = expected_flags - self.assertEqual(request.flags, expected_flags) + assert request.flags == expected_flags request.correlation_id = expected_correlation_id - self.assertEqual(request.correlation_id, 
expected_correlation_id) + assert request.correlation_id == expected_correlation_id request.correlation_id_string = expected_correlation_id_string - self.assertEqual(request.correlation_id_string, expected_correlation_id_string) + assert request.correlation_id_string == expected_correlation_id_string # Expect error from retrieving correlation id in a wrong type, # wrap in lambda function to avoid early evaluation that raises # exception before assert - self.assertRaises(triton_bindings.TritonError, lambda: request.correlation_id) + with pytest.raises(triton_bindings.TritonError): + request.correlation_id request.priority = expected_priority - self.assertEqual(request.priority, expected_priority) + assert request.priority == expected_priority request.priority_uint64 = expected_priority_uint64 - self.assertEqual(request.priority_uint64, 10) + assert request.priority_uint64 == 10 request.timeout_microseconds = expected_timeout_microseconds - self.assertEqual(request.timeout_microseconds, expected_timeout_microseconds) + assert request.timeout_microseconds == expected_timeout_microseconds request.set_string_parameter("str_key", "str_val") request.set_int_parameter("int_key", 567) @@ -922,33 +930,26 @@ def test_request(self): request.add_input( "INPUT0", triton_bindings.TRITONSERVER_DataType.FP32, input.shape ) - self.assertRaises(triton_bindings.TritonError, request.remove_input, "INPUT2") + with pytest.raises(triton_bindings.TritonError): + request.remove_input("INPUT2") # raw input assumes single input - self.assertRaises(triton_bindings.TritonError, request.add_raw_input, "INPUT1") + with pytest.raises(triton_bindings.TritonError): + request.add_raw_input("INPUT1") request.remove_input("INPUT0") request.add_raw_input("INPUT1") request.remove_all_inputs() # all inputs are removed, all 'append' functions should raise exceptions aid_args = ["INPUT0", buffer, ba.byte_size, ba.memory_type, ba.memory_type_id] - self.assertRaises( - triton_bindings.TritonError, request.append_input_data, *aid_args - ) - self.assertRaises( + with pytest.raises(triton_bindings.TritonError): + request.append_input_data(*aid_args) + with pytest.raises(triton_bindings.TritonError): + request.append_input_data_with_host_policy(*aid_args, "host_policy_name") + with pytest.raises( triton_bindings.TritonError, - request.append_input_data_with_host_policy, - *aid_args, - "host_policy_name" - ) - self.assertRaises( - triton_bindings.TritonError, - request.append_input_data_with_buffer_attributes, - "INPUT0", - buffer, - ba, - ) - self.assertRaises( - triton_bindings.TritonError, request.remove_all_input_data, "INPUT0" - ) + ): + request.append_input_data_with_buffer_attributes("INPUT0", buffer, ba) + with pytest.raises(triton_bindings.TritonError): + request.remove_all_input_data("INPUT0") # Add back input request.add_input( "INPUT0", triton_bindings.TRITONSERVER_DataType.FP32, input.shape @@ -1004,20 +1005,19 @@ def test_infer_async(self): # Expect every response to be returned in 10 seconds flags, res = response_queue.get(block=True, timeout=10) - self.assertEqual( - flags, int(triton_bindings.TRITONSERVER_ResponseCompleteFlag.FINAL) - ) + assert flags == int(triton_bindings.TRITONSERVER_ResponseCompleteFlag.FINAL) # expect no error res.throw_if_response_error() # version will be actual model version - self.assertEqual(res.model, (self._model_name, 1)) - self.assertEqual(res.id, request.id) - self.assertEqual(res.parameter_count, 0) + assert res.model == (self._model_name, 1) + assert res.id == request.id + assert 
res.parameter_count == 0 # out of range access - self.assertRaises(triton_bindings.TritonError, res.parameter, 0) + with pytest.raises(triton_bindings.TritonError): + res.parameter(0) # read output tensor - self.assertEqual(res.output_count, 2) + assert res.output_count == 2 for out, expected_name, expected_data in [ (res.output(0), "OUTPUT0", input + input), (res.output(1), "OUTPUT1", input - input), @@ -1032,34 +1032,32 @@ def test_infer_async(self): memory_type_id, numpy_buffer, ) = out - self.assertEqual(name, expected_name) - self.assertEqual(data_type, triton_bindings.TRITONSERVER_DataType.FP32) - self.assertEqual(shape, expected_data.shape) - self.assertEqual(out_buffer, numpy_buffer.ctypes.data) + assert name == expected_name + assert data_type == triton_bindings.TRITONSERVER_DataType.FP32 + assert shape == expected_data.shape + assert out_buffer == numpy_buffer.ctypes.data # buffer attribute used for input doesn't necessarily to # match output buffer attributes, this is just knowing the detail. - self.assertEqual(byte_size, ba.byte_size) - self.assertEqual(memory_type, ba.memory_type) - self.assertEqual(memory_type_id, ba.memory_type_id) - self.assertTrue( - numpy.allclose( - numpy_buffer.view(dtype=expected_data.dtype).reshape(shape), - expected_data, - ) + assert byte_size == ba.byte_size + assert memory_type == ba.memory_type + assert memory_type_id == ba.memory_type_id + assert numpy.allclose( + numpy_buffer.view(dtype=expected_data.dtype).reshape(shape), + expected_data, ) # label (no label so empty) - self.assertEqual(len(res.output_classification_label(0, 1)), 0) + assert len(res.output_classification_label(0, 1)) == 0 # [FIXME] keep alive behavior is not established between response # and server, so must explicitly handle the destruction order for now. 
del res # sanity check on user objects - self.assertEqual(allocator_counter["start"], 1) - self.assertEqual(allocator_counter["alloc"], 2) + assert allocator_counter["start"] == 1 + assert allocator_counter["alloc"] == 2 # Knowing implementation detail that the backend doesn't use query API - self.assertTrue("query" not in allocator_counter) - self.assertEqual(allocator_counter["buffer"], 2) + assert "query" not in allocator_counter + assert allocator_counter["buffer"] == 2 # Expect request to be released in 10 seconds request = request_counter.get(block=True, timeout=10) @@ -1080,25 +1078,24 @@ def test_server_explicit(self): ), ] server.load_model_with_parameters("wired_addsub", load_file_params) - self.assertTrue(server.model_is_ready("wired_addsub", -1)) + assert server.model_is_ready("wired_addsub", -1) # Model Repository - self.assertFalse(server.model_is_ready(self._model_name, -1)) + assert not (server.model_is_ready(self._model_name, -1)) # unregister server.unregister_model_repository(self._test_model_repo) - self.assertRaises( - triton_bindings.TritonError, server.load_model, self._model_name - ) + with pytest.raises(triton_bindings.TritonError): + server.load_model(self._model_name) # register server.register_model_repository(self._test_model_repo, []) server.load_model(self._model_name) - self.assertTrue(server.model_is_ready(self._model_name, -1)) + assert server.model_is_ready(self._model_name, -1) # unload server.unload_model("wired_addsub") - self.assertFalse(server.model_is_ready("wired_addsub", -1)) + assert not (server.model_is_ready("wired_addsub", -1)) server.unload_model_and_dependents(self._model_name) - self.assertFalse(server.model_is_ready(self._model_name, -1)) + assert not (server.model_is_ready(self._model_name, -1)) def test_custom_metric(self): options = triton_bindings.TRITONSERVER_ServerOptions() @@ -1116,18 +1113,14 @@ def test_custom_metric(self): ) m = triton_bindings.TRITONSERVER_Metric(mf, []) m.increment(2) - self.assertEqual(m.kind, triton_bindings.TRITONSERVER_MetricKind.COUNTER) - self.assertEqual(m.value, 2) + assert m.kind == triton_bindings.TRITONSERVER_MetricKind.COUNTER + assert m.value == 2 # can't use 'set_value' due to wrong kind - self.assertRaises(triton_bindings.TritonError, m.set_value, 5) + with pytest.raises(triton_bindings.TritonError): + m.set_value(5) # Check custom metric is reported metrics = server.metrics() - self.assertTrue( - "custom_metric_familiy" - in metrics.formatted(triton_bindings.TRITONSERVER_MetricFormat.PROMETHEUS) + assert "custom_metric_familiy" in metrics.formatted( + triton_bindings.TRITONSERVER_MetricFormat.PROMETHEUS ) - - -if __name__ == "__main__": - unittest.main() From fc02544dfcdb028de3a1faf51a0e975e996b0d19 Mon Sep 17 00:00:00 2001 From: pranavm-nvidia <49246958+pranavm-nvidia@users.noreply.github.com> Date: Fri, 20 Dec 2024 18:21:57 -0500 Subject: [PATCH 5/8] build: Improves python packaging infrastructure (#414) --- .github/workflows/build-and-test.yml | 1 - pyproject.toml | 48 ++ python/CMakeLists.txt | 1 + python/build_wheel.py | 5 +- python/setup.py | 95 +-- python/tritonserver/CMakeLists.txt | 3 +- python/tritonserver/_c/__init__.pyi | 39 -- python/tritonserver/_c/triton_bindings.pyi | 696 --------------------- 8 files changed, 67 insertions(+), 821 deletions(-) delete mode 100644 python/tritonserver/_c/__init__.pyi delete mode 100644 python/tritonserver/_c/triton_bindings.pyi diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 2ab3d0658..1baed09b8 100644 
--- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -42,7 +42,6 @@ jobs: mkdir -p /core/build cd /core/build cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install -DTRITON_CORE_HEADERS_ONLY=OFF .. - export TRITON_PYBIND="_c/triton_bindings.cpython-310-x86_64-linux-gnu.so" make -j8 - name: Run tests with pytest diff --git a/pyproject.toml b/pyproject.toml index 5e8749f81..3ce1a8a91 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,54 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +[project] +name = "tritonserver" +authors = [{ name = "NVIDIA Inc.", email = "sw-dl-triton@nvidia.com" }] +description = "Triton Inference Server In-Process Python API" +license = { file = "LICENSE.txt" } +dynamic = ["version"] +dependencies = ["numpy<2"] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Intended Audience :: Information Technology", + "Topic :: Scientific/Engineering", + "Topic :: Scientific/Engineering :: Image Recognition", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Software Development :: Libraries", + "Topic :: Utilities", + "License :: OSI Approved :: BSD License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.12", + "Environment :: Console", + "Natural Language :: English", + "Operating System :: OS Independent", +] + +[tool.setuptools] +include-package-data = true + +[tool.setuptools.package-data] +tritonserver = ["_c/triton_bindings.*.so"] + +[build-system] +requires = [ + "setuptools==75.3.0", + "wheel==0.44.0", + # For stubgen: + "mypy==1.11.0", + "numpy<2", +] +build-backend = "setuptools.build_meta" + +[project.optional-dependencies] +GPU = ["cupy-cuda12x"] +test = ["pytest"] +all = ["tritonserver[GPU]", "tritonserver[test]"] + + [tool.codespell] # note: pre-commit passes explicit lists of files here, which this skip file list doesn't override - # this is only to allow you to run codespell interactively diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 871f682f9..df7f1bde0 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -30,6 +30,7 @@ add_subdirectory(tritonserver) file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/TRITON_VERSION ${TRITON_VERSION}) configure_file(../LICENSE LICENSE.txt COPYONLY) configure_file(setup.py setup.py @ONLY) +configure_file(../pyproject.toml pyproject.toml COPYONLY) file(COPY test/ DESTINATION ./test/.) 
set(WHEEL_DEPENDS
diff --git a/python/build_wheel.py b/python/build_wheel.py
index 150a3e346..2888cfe01 100755
--- a/python/build_wheel.py
+++ b/python/build_wheel.py
@@ -108,17 +108,18 @@ def sed(pattern, replace, source, dest=None):
     shutil.copyfile("LICENSE.txt", os.path.join(FLAGS.whl_dir, "LICENSE.txt"))
     shutil.copyfile("setup.py", os.path.join(FLAGS.whl_dir, "setup.py"))
+    shutil.copyfile("pyproject.toml", os.path.join(FLAGS.whl_dir, "pyproject.toml"))
     os.chdir(FLAGS.whl_dir)
     print("=== Building wheel")
-    args = ["python3", "setup.py", "bdist_wheel"]
+    args = ["python3", "-m", "build"]
     wenv = os.environ.copy()
     wenv["VERSION"] = FLAGS.triton_version
     wenv["TRITON_PYBIND"] = PYBIND_LIB
     p = subprocess.Popen(args, env=wenv)
     p.wait()
-    fail_if(p.returncode != 0, "setup.py failed")
+    fail_if(p.returncode != 0, "Building wheel failed")
     cpdir("dist", FLAGS.dest_dir)
diff --git a/python/setup.py b/python/setup.py
index 3d371eaac..9b9b29104 100755
--- a/python/setup.py
+++ b/python/setup.py
@@ -25,90 +25,23 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-import os
-import sys
-from itertools import chain
-from setuptools import find_packages, setup
+import subprocess
-if "--plat-name" in sys.argv:
-    PLATFORM_FLAG = sys.argv[sys.argv.index("--plat-name") + 1]
-else:
-    PLATFORM_FLAG = "any"
+from setuptools import setup
+from setuptools.command.build_py import build_py
-if "VERSION" not in os.environ:
-    raise Exception("envvar VERSION must be specified")
-VERSION = os.environ["VERSION"]
+class BuildPyCommand(build_py):
+    def run(self):
+        build_py.run(self)
+        # Generate stub files:
+        package_name = self.distribution.metadata.name
+        subprocess.run(
+            ["stubgen", "-p", f"{package_name}._c", "-o", f"{self.build_lib}"],
+            check=True,
+        )
-try:
-    from wheel.bdist_wheel import bdist_wheel as _bdist_wheel
-    class bdist_wheel(_bdist_wheel):
-        def finalize_options(self):
-            _bdist_wheel.finalize_options(self)
-            self.root_is_pure = False
-
-        def get_tag(self):
-            pyver, abi, plat = "py3", "none", PLATFORM_FLAG
-            return pyver, abi, plat
-
-except ImportError:
-    bdist_wheel = None
-
-this_directory = os.path.abspath(os.path.dirname(__file__))
-
-data_files = [
-    ("", ["LICENSE.txt"]),
-]
-
-# Type checking marker file indicating support for type checkers.
-# https://peps.python.org/pep-0561/ -# Type hints for c extension generated by mypy -platform_package_data = [ - os.environ["TRITON_PYBIND"], - "py.typed", - "_c/__init__.pyi", - "_c/triton_bindings.pyi", -] - -gpu_extras = ["cupy-cuda12x"] -test_extras = ["pytest"] -all_extras = gpu_extras + test_extras - -setup( - name="tritonserver", - version=VERSION, - author="NVIDIA Inc.", - author_email="sw-dl-triton@nvidia.com", - description="Triton Inference Server In-Process Python API", - license="BSD", - url="https://developer.nvidia.com/nvidia-triton-inference-server", - classifiers=[ - "Development Status :: 5 - Production/Stable", - "Intended Audience :: Developers", - "Intended Audience :: Science/Research", - "Intended Audience :: Information Technology", - "Topic :: Scientific/Engineering", - "Topic :: Scientific/Engineering :: Image Recognition", - "Topic :: Scientific/Engineering :: Artificial Intelligence", - "Topic :: Software Development :: Libraries", - "Topic :: Utilities", - "License :: OSI Approved :: BSD License", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.12", - "Environment :: Console", - "Natural Language :: English", - "Operating System :: OS Independent", - ], - packages=find_packages(), - package_data={ - "": platform_package_data, - }, - zip_safe=False, - cmdclass={"bdist_wheel": bdist_wheel}, - data_files=data_files, - install_requires=["numpy<2"], - extras_require={"GPU": gpu_extras, "test": test_extras, "all": all_extras}, -) +if __name__ == "__main__": + setup(cmdclass={"build_py": BuildPyCommand}) diff --git a/python/tritonserver/CMakeLists.txt b/python/tritonserver/CMakeLists.txt index c06292451..d2480bc94 100644 --- a/python/tritonserver/CMakeLists.txt +++ b/python/tritonserver/CMakeLists.txt @@ -33,8 +33,6 @@ file(COPY __init__.py DESTINATION .) file(COPY py.typed DESTINATION .) # Copy the '__init__.py' for the '_c' module file(COPY _c/__init__.py DESTINATION ./_c/.) -file(COPY _c/__init__.pyi DESTINATION ./_c/.) -file(COPY _c/triton_bindings.pyi DESTINATION ./_c/.) # Find and copy _api modules file(GLOB PYTHON_MODULE_FILES ./_api/*.py) file(COPY ${PYTHON_MODULE_FILES} DESTINATION ./_api/.) @@ -65,3 +63,4 @@ target_compile_features(python-bindings PRIVATE cxx_std_17) set_property(TARGET python-bindings PROPERTY OUTPUT_NAME triton_bindings) # Add Triton library default path in 'rpath' for runtime library lookup set_target_properties(python-bindings PROPERTIES BUILD_RPATH "$ORIGIN:/opt/tritonserver/lib") +set_target_properties(python-bindings PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/python/tritonserver/_c/) diff --git a/python/tritonserver/_c/__init__.pyi b/python/tritonserver/_c/__init__.pyi deleted file mode 100644 index aa7d4a57a..000000000 --- a/python/tritonserver/_c/__init__.pyi +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. 
-# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -"""Type information for Triton _c bindings.""" - -# Note: this file was generated using mypy with an empty __init__.py -# file in the tritonserver package directory to avoid any renaming / -# aliasing done by the wrapper -# -# mypy 1.8.0 (compiled: yes) -# -# stubgen -p tritonserver._c -# -# Todo: add stub generation to build process - -from .triton_bindings import * diff --git a/python/tritonserver/_c/triton_bindings.pyi b/python/tritonserver/_c/triton_bindings.pyi deleted file mode 100644 index 71deaba6b..000000000 --- a/python/tritonserver/_c/triton_bindings.pyi +++ /dev/null @@ -1,696 +0,0 @@ -# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of NVIDIA CORPORATION nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY -# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY -# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -"""Type information for Triton _c bindings.""" - -# Note: this file was generated using mypy with an empty __init__.py -# file in the tritonserver package directory to avoid any renaming / -# aliasing done by the wrapper -# -# mypy 1.8.0 (compiled: yes) -# -# stubgen -p tritonserver._c -# -# Todo: add stub generation to build process - -from typing import Callable, ClassVar, List, Optional, Tuple, overload - -import numpy - -ALL: TRITONSERVER_RequestReleaseFlag -COMPUTE_END: TRITONSERVER_InferenceTraceActivity -COMPUTE_INPUT_END: TRITONSERVER_InferenceTraceActivity -COMPUTE_OUTPUT_START: TRITONSERVER_InferenceTraceActivity -COMPUTE_START: TRITONSERVER_InferenceTraceActivity -DECOUPLED: TRITONSERVER_ModelTxnPropertyFlag -DISABLED: TRITONSERVER_InferenceTraceLevel -FINAL: TRITONSERVER_ResponseCompleteFlag -FIRST_DIM: TRITONSERVER_ModelBatchFlag -MAX: TRITONSERVER_InferenceTraceLevel -MIN: TRITONSERVER_InferenceTraceLevel -ONE_TO_ONE: TRITONSERVER_ModelTxnPropertyFlag -QUEUE_START: TRITONSERVER_InferenceTraceActivity -READY: TRITONSERVER_ModelIndexFlag -REQUEST_END: TRITONSERVER_InferenceTraceActivity -REQUEST_START: TRITONSERVER_InferenceTraceActivity -SEQUENCE_END: TRITONSERVER_RequestFlag -SEQUENCE_START: TRITONSERVER_RequestFlag -TENSORS: TRITONSERVER_InferenceTraceLevel -TENSOR_BACKEND_INPUT: TRITONSERVER_InferenceTraceActivity -TENSOR_BACKEND_OUTPUT: TRITONSERVER_InferenceTraceActivity -TENSOR_QUEUE_INPUT: TRITONSERVER_InferenceTraceActivity -TIMESTAMPS: TRITONSERVER_InferenceTraceLevel -UNKNOWN: TRITONSERVER_ModelBatchFlag - -class AlreadyExistsError(TritonError): ... -class InternalError(TritonError): ... -class InvalidArgumentError(TritonError): ... -class NotFoundError(TritonError): ... - -class TRITONSERVER_BufferAttributes: - byte_size: int - cuda_ipc_handle: int - memory_type: TRITONSERVER_MemoryType - memory_type_id: int - def __init__(self) -> None: ... - -class TRITONSERVER_DataType: - __members__: ClassVar[dict] = ... # read-only - BF16: ClassVar[TRITONSERVER_DataType] = ... - BOOL: ClassVar[TRITONSERVER_DataType] = ... - BYTES: ClassVar[TRITONSERVER_DataType] = ... - FP16: ClassVar[TRITONSERVER_DataType] = ... - FP32: ClassVar[TRITONSERVER_DataType] = ... - FP64: ClassVar[TRITONSERVER_DataType] = ... - INT16: ClassVar[TRITONSERVER_DataType] = ... - INT32: ClassVar[TRITONSERVER_DataType] = ... - INT64: ClassVar[TRITONSERVER_DataType] = ... - INT8: ClassVar[TRITONSERVER_DataType] = ... - INVALID: ClassVar[TRITONSERVER_DataType] = ... - UINT16: ClassVar[TRITONSERVER_DataType] = ... - UINT32: ClassVar[TRITONSERVER_DataType] = ... - UINT64: ClassVar[TRITONSERVER_DataType] = ... - UINT8: ClassVar[TRITONSERVER_DataType] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... - def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... - -class TRITONSERVER_InferenceRequest: - correlation_id: int - correlation_id_string: str - flags: int - id: str - priority: int - priority_uint64: int - timeout_microseconds: int - def __init__(self, arg0, arg1: str, arg2: int) -> None: ... - def add_input( - self, arg0: str, arg1: TRITONSERVER_DataType, arg2: List[int] - ) -> None: ... - def add_raw_input(self, arg0: str) -> None: ... - def add_requested_output(self, arg0: str) -> None: ... 
- def append_input_data( - self, arg0: str, arg1: int, arg2: int, arg3: TRITONSERVER_MemoryType, arg4: int - ) -> None: ... - def append_input_data_with_buffer_attributes( - self, arg0: str, arg1: int, arg2: TRITONSERVER_BufferAttributes - ) -> None: ... - def append_input_data_with_host_policy( - self, - arg0: str, - arg1: int, - arg2: int, - arg3: TRITONSERVER_MemoryType, - arg4: int, - arg5: str, - ) -> None: ... - def cancel(self) -> None: ... - def remove_all_input_data(self, arg0: str) -> None: ... - def remove_all_inputs(self) -> None: ... - def remove_all_requested_outputs(self) -> None: ... - def remove_input(self, arg0: str) -> None: ... - def remove_requested_output(self, arg0: str) -> None: ... - def set_bool_parameter(self, arg0: str, arg1: bool) -> None: ... - def set_int_parameter(self, arg0: str, arg1: int) -> None: ... - def set_release_callback( - self, - arg0: Callable[[TRITONSERVER_InferenceRequest, int, object], None], - arg1: object, - ) -> None: ... - def set_response_callback( - self, - arg0: object, - arg1: object, - arg2: Callable[[object, int, object], None], - arg3: object, - ) -> None: ... - def set_string_parameter(self, arg0: str, arg1: str) -> None: ... - def set_double_parameter(self, arg0: str, arg1: float) -> None: ... - -class TRITONSERVER_InferenceResponse: - def __init__(self, *args, **kwargs) -> None: ... - def output( - self, arg0: int - ) -> Tuple[ - str, - TRITONSERVER_DataType, - numpy.ndarray[numpy.int64], - int, - int, - TRITONSERVER_MemoryType, - int, - object, - ]: ... - def output_classification_label(self, arg0: int, arg1: int) -> str: ... - def parameter( - self, arg0: int - ) -> Tuple[str, TRITONSERVER_ParameterType, object]: ... - def throw_if_response_error(self) -> None: ... - @property - def id(self) -> str: ... - @property - def model(self) -> Tuple[str, int]: ... - @property - def output_count(self) -> int: ... - @property - def parameter_count(self) -> int: ... - -class TRITONSERVER_InferenceTrace: - @overload - def __init__( - self, - level: int, - parent_id: int, - activity_function: Callable[ - [object, TRITONSERVER_InferenceTraceActivity, int, object], None - ], - tensor_activity_function: Callable[ - [ - object, - TRITONSERVER_InferenceTraceActivity, - str, - TRITONSERVER_DataType, - int, - int, - numpy.ndarray[numpy.int64], - TRITONSERVER_MemoryType, - int, - object, - ], - None, - ], - release_function: Callable[[TRITONSERVER_InferenceTrace, object], None], - trace_userp: object, - ) -> None: ... - @overload - def __init__( - self, - level: int, - parent_id: int, - activity_function: Callable[ - [object, TRITONSERVER_InferenceTraceActivity, int, object], None - ], - release_function: Callable[[TRITONSERVER_InferenceTrace, object], None], - trace_userp: object, - ) -> None: ... - @property - def id(self) -> int: ... - @property - def model_name(self) -> str: ... - @property - def model_version(self) -> int: ... - @property - def parent_id(self) -> int: ... - @property - def request_id(self) -> str: ... - -class TRITONSERVER_InferenceTraceActivity: - __members__: ClassVar[dict] = ... # read-only - COMPUTE_END: ClassVar[TRITONSERVER_InferenceTraceActivity] = ... - COMPUTE_INPUT_END: ClassVar[TRITONSERVER_InferenceTraceActivity] = ... - COMPUTE_OUTPUT_START: ClassVar[TRITONSERVER_InferenceTraceActivity] = ... - COMPUTE_START: ClassVar[TRITONSERVER_InferenceTraceActivity] = ... - QUEUE_START: ClassVar[TRITONSERVER_InferenceTraceActivity] = ... - REQUEST_END: ClassVar[TRITONSERVER_InferenceTraceActivity] = ... 
- REQUEST_START: ClassVar[TRITONSERVER_InferenceTraceActivity] = ... - TENSOR_BACKEND_INPUT: ClassVar[TRITONSERVER_InferenceTraceActivity] = ... - TENSOR_BACKEND_OUTPUT: ClassVar[TRITONSERVER_InferenceTraceActivity] = ... - TENSOR_QUEUE_INPUT: ClassVar[TRITONSERVER_InferenceTraceActivity] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... - def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... - -class TRITONSERVER_InferenceTraceLevel: - __members__: ClassVar[dict] = ... # read-only - DISABLED: ClassVar[TRITONSERVER_InferenceTraceLevel] = ... - MAX: ClassVar[TRITONSERVER_InferenceTraceLevel] = ... - MIN: ClassVar[TRITONSERVER_InferenceTraceLevel] = ... - TENSORS: ClassVar[TRITONSERVER_InferenceTraceLevel] = ... - TIMESTAMPS: ClassVar[TRITONSERVER_InferenceTraceLevel] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... - def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... - -class TRITONSERVER_InstanceGroupKind: - __members__: ClassVar[dict] = ... # read-only - AUTO: ClassVar[TRITONSERVER_InstanceGroupKind] = ... - CPU: ClassVar[TRITONSERVER_InstanceGroupKind] = ... - GPU: ClassVar[TRITONSERVER_InstanceGroupKind] = ... - MODEL: ClassVar[TRITONSERVER_InstanceGroupKind] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... - def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... - -class TRITONSERVER_LogFormat: - __members__: ClassVar[dict] = ... # read-only - DEFAULT: ClassVar[TRITONSERVER_LogFormat] = ... - ISO8601: ClassVar[TRITONSERVER_LogFormat] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... - def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... - -class TRITONSERVER_LogLevel: - __members__: ClassVar[dict] = ... # read-only - ERROR: ClassVar[TRITONSERVER_LogLevel] = ... - INFO: ClassVar[TRITONSERVER_LogLevel] = ... - VERBOSE: ClassVar[TRITONSERVER_LogLevel] = ... - WARN: ClassVar[TRITONSERVER_LogLevel] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... - def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... - -class TRITONSERVER_MemoryType: - __members__: ClassVar[dict] = ... # read-only - CPU: ClassVar[TRITONSERVER_MemoryType] = ... - CPU_PINNED: ClassVar[TRITONSERVER_MemoryType] = ... - GPU: ClassVar[TRITONSERVER_MemoryType] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... 
- def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... - -class TRITONSERVER_Message: - def __init__(self, arg0: str) -> None: ... - def serialize_to_json(self) -> str: ... - -class TRITONSERVER_Metric: - def __init__( - self, arg0: TRITONSERVER_MetricFamily, arg1: List[TRITONSERVER_Parameter] - ) -> None: ... - def increment(self, arg0: float) -> None: ... - def set_value(self, arg0: float) -> None: ... - @property - def kind(self) -> TRITONSERVER_MetricKind: ... - @property - def value(self) -> float: ... - -class TRITONSERVER_MetricFamily: - def __init__(self, arg0: TRITONSERVER_MetricKind, arg1: str, arg2: str) -> None: ... - -class TRITONSERVER_MetricFormat: - __members__: ClassVar[dict] = ... # read-only - PROMETHEUS: ClassVar[TRITONSERVER_MetricFormat] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... - def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... - -class TRITONSERVER_MetricKind: - __members__: ClassVar[dict] = ... # read-only - COUNTER: ClassVar[TRITONSERVER_MetricKind] = ... - GAUGE: ClassVar[TRITONSERVER_MetricKind] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... - def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... - -class TRITONSERVER_Metrics: - def __init__(self, *args, **kwargs) -> None: ... - def formatted(self, arg0: TRITONSERVER_MetricFormat) -> str: ... - -class TRITONSERVER_ModelBatchFlag: - __members__: ClassVar[dict] = ... # read-only - FIRST_DIM: ClassVar[TRITONSERVER_ModelBatchFlag] = ... - UNKNOWN: ClassVar[TRITONSERVER_ModelBatchFlag] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... - def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... - -class TRITONSERVER_ModelControlMode: - __members__: ClassVar[dict] = ... # read-only - EXPLICIT: ClassVar[TRITONSERVER_ModelControlMode] = ... - NONE: ClassVar[TRITONSERVER_ModelControlMode] = ... - POLL: ClassVar[TRITONSERVER_ModelControlMode] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... - def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... - -class TRITONSERVER_ModelIndexFlag: - __members__: ClassVar[dict] = ... # read-only - READY: ClassVar[TRITONSERVER_ModelIndexFlag] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... - def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... 
- def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... - -class TRITONSERVER_ModelTxnPropertyFlag: - __members__: ClassVar[dict] = ... # read-only - DECOUPLED: ClassVar[TRITONSERVER_ModelTxnPropertyFlag] = ... - ONE_TO_ONE: ClassVar[TRITONSERVER_ModelTxnPropertyFlag] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... - def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... - -class TRITONSERVER_Parameter: - @overload - def __init__(self, arg0: str, arg1: bytes) -> None: ... - @overload - def __init__(self, arg0: str, arg1: str) -> None: ... - @overload - def __init__(self, arg0: str, arg1: int) -> None: ... - @overload - def __init__(self, arg0: str, arg1: bool) -> None: ... - -class TRITONSERVER_ParameterType: - __members__: ClassVar[dict] = ... # read-only - BOOL: ClassVar[TRITONSERVER_ParameterType] = ... - BYTES: ClassVar[TRITONSERVER_ParameterType] = ... - INT: ClassVar[TRITONSERVER_ParameterType] = ... - STRING: ClassVar[TRITONSERVER_ParameterType] = ... - DOUBLE: ClassVar[TRITONSERVER_ParameterType] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... - def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... - -class TRITONSERVER_RateLimitMode: - __members__: ClassVar[dict] = ... # read-only - EXEC_COUNT: ClassVar[TRITONSERVER_RateLimitMode] = ... - OFF: ClassVar[TRITONSERVER_RateLimitMode] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... - def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... - -class TRITONSERVER_RequestFlag: - __members__: ClassVar[dict] = ... # read-only - SEQUENCE_END: ClassVar[TRITONSERVER_RequestFlag] = ... - SEQUENCE_START: ClassVar[TRITONSERVER_RequestFlag] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... - def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... - -class TRITONSERVER_RequestReleaseFlag: - __members__: ClassVar[dict] = ... # read-only - ALL: ClassVar[TRITONSERVER_RequestReleaseFlag] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... - def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... 
- -class TRITONSERVER_ResponseAllocator: - @overload - def __init__( - self, - alloc_function: Callable[ - [object, str, int, TRITONSERVER_MemoryType, int, object], - Tuple[int, object, TRITONSERVER_MemoryType, int], - ], - release_function: Callable[ - [object, int, object, int, TRITONSERVER_MemoryType, int], None - ], - start_function: Callable[[object, object], None], - ) -> None: ... - @overload - def __init__( - self, - alloc_function: Callable[ - [object, str, int, TRITONSERVER_MemoryType, int, object], - Tuple[int, object, TRITONSERVER_MemoryType, int], - ], - release_function: Callable[ - [object, int, object, int, TRITONSERVER_MemoryType, int], None - ], - ) -> None: ... - def set_buffer_attributes_function( - self, - buffer_attributes_function: Callable[ - [object, str, object, object, object], object - ], - ) -> None: ... - def set_query_function( - self, - query_function: Callable[ - [object, object, str, Optional[int], TRITONSERVER_MemoryType, int], - Tuple[TRITONSERVER_MemoryType, int], - ], - ) -> None: ... - -class TRITONSERVER_ResponseCompleteFlag: - __members__: ClassVar[dict] = ... # read-only - FINAL: ClassVar[TRITONSERVER_ResponseCompleteFlag] = ... - __entries: ClassVar[dict] = ... - def __init__(self, value: int) -> None: ... - def __eq__(self, other: object) -> bool: ... - def __hash__(self) -> int: ... - def __index__(self) -> int: ... - def __int__(self) -> int: ... - def __ne__(self, other: object) -> bool: ... - @property - def name(self) -> str: ... - @property - def value(self) -> int: ... - -class TRITONSERVER_Server: - def __init__(self, arg0: TRITONSERVER_ServerOptions) -> None: ... - @overload - def infer_async( - self, arg0: TRITONSERVER_InferenceRequest, arg1: TRITONSERVER_InferenceTrace - ) -> None: ... - @overload - def infer_async(self, arg0: TRITONSERVER_InferenceRequest) -> None: ... - def is_live(self) -> bool: ... - def is_ready(self) -> bool: ... - def load_model(self, arg0: str) -> None: ... - def load_model_with_parameters( - self, arg0: str, arg1: List[TRITONSERVER_Parameter] - ) -> None: ... - def metadata(self) -> TRITONSERVER_Message: ... - def metrics(self) -> TRITONSERVER_Metrics: ... - def model_batch_properties(self, arg0: str, arg1: int) -> Tuple[int, int]: ... - def model_config(self, arg0: str, arg1: int, arg2: int) -> TRITONSERVER_Message: ... - def model_index(self, arg0: int) -> TRITONSERVER_Message: ... - def model_is_ready(self, arg0: str, arg1: int) -> bool: ... - def model_metadata(self, arg0: str, arg1: int) -> TRITONSERVER_Message: ... - def model_statistics(self, arg0: str, arg1: int) -> TRITONSERVER_Message: ... - def model_transaction_properties(self, arg0: str, arg1: int) -> Tuple[int, int]: ... - @overload - def poll_model_repository(self) -> None: ... - @overload - def poll_model_repository(self) -> None: ... - def register_model_repository( - self, arg0: str, arg1: List[TRITONSERVER_Parameter] - ) -> None: ... - def stop(self) -> None: ... - def unload_model(self, arg0: str) -> None: ... - def unload_model_and_dependents(self, arg0: str) -> None: ... - def unregister_model_repository(self, arg0: str) -> None: ... - -class TRITONSERVER_ServerOptions: - def __init__(self) -> None: ... - def add_rate_limiter_resource(self, arg0: str, arg1: int, arg2: int) -> None: ... - def set_backend_config(self, arg0: str, arg1: str, arg2: str) -> None: ... - def set_backend_directory(self, arg0: str) -> None: ... - def set_buffer_manager_thread_count(self, arg0: int) -> None: ... 
- def set_cache_config(self, arg0: str, arg1: str) -> None: ... - def set_cache_directory(self, arg0: str) -> None: ... - def set_cpu_metrics(self, arg0: bool) -> None: ... - def set_cuda_memory_pool_byte_size(self, arg0: int, arg1: int) -> None: ... - def set_exit_on_error(self, arg0: bool) -> None: ... - def set_exit_timeout(self, arg0: int) -> None: ... - def set_gpu_metrics(self, arg0: bool) -> None: ... - def set_host_policy(self, arg0: str, arg1: str, arg2: str) -> None: ... - def set_log_error(self, arg0: bool) -> None: ... - def set_log_file(self, arg0: str) -> None: ... - def set_log_format(self, arg0: TRITONSERVER_LogFormat) -> None: ... - def set_log_info(self, arg0: bool) -> None: ... - def set_log_verbose(self, arg0: int) -> None: ... - def set_log_warn(self, arg0: bool) -> None: ... - def set_metrics(self, arg0: bool) -> None: ... - def set_metrics_config(self, arg0: str, arg1: str, arg2: str) -> None: ... - def set_metrics_interval(self, arg0: int) -> None: ... - def set_min_supported_compute_capability(self, arg0: float) -> None: ... - def set_model_control_mode(self, arg0: TRITONSERVER_ModelControlMode) -> None: ... - def set_model_load_device_limit( - self, arg0: TRITONSERVER_InstanceGroupKind, arg1: int, arg2: float - ) -> None: ... - def set_model_load_thread_count(self, arg0: int) -> None: ... - def set_model_load_retry_count(self, arg0: int) -> None: ... - def set_model_namespacing(self, arg0: bool) -> None: ... - def set_enable_peer_access(self, arg0: bool) -> None: ... - def set_model_repository_path(self, arg0: str) -> None: ... - def set_pinned_memory_pool_byte_size(self, arg0: int) -> None: ... - def set_rate_limiter_mode(self, arg0: TRITONSERVER_RateLimitMode) -> None: ... - def set_repo_agent_directory(self, arg0: str) -> None: ... - def set_response_cache_byte_size(self, arg0: int) -> None: ... - def set_server_id(self, arg0: str) -> None: ... - def set_startup_model(self, arg0: str) -> None: ... - def set_strict_model_config(self, arg0: bool) -> None: ... - def set_strict_readiness(self, arg0: bool) -> None: ... - -class TritonError(Exception): ... -class UnavailableError(TritonError): ... -class UnknownError(TritonError): ... -class UnsupportedError(TritonError): ... - -def TRITONSERVER_DataTypeByteSize(arg0: TRITONSERVER_DataType) -> int: ... -def TRITONSERVER_DataTypeString(arg0: TRITONSERVER_DataType) -> str: ... -def TRITONSERVER_InferenceTraceActivityString( - arg0: TRITONSERVER_InferenceTraceActivity, -) -> str: ... -def TRITONSERVER_InferenceTraceLevelString( - arg0: TRITONSERVER_InferenceTraceLevel, -) -> str: ... -def TRITONSERVER_InstanceGroupKindString( - arg0: TRITONSERVER_InstanceGroupKind, -) -> str: ... -def TRITONSERVER_LogIsEnabled(arg0: TRITONSERVER_LogLevel) -> bool: ... -def TRITONSERVER_LogMessage( - arg0: TRITONSERVER_LogLevel, arg1: str, arg2: int, arg3: str -) -> None: ... -def TRITONSERVER_MemoryTypeString(arg0: TRITONSERVER_MemoryType) -> str: ... -def TRITONSERVER_ParameterTypeString(arg0: TRITONSERVER_ParameterType) -> str: ... -def TRITONSERVER_StringToDataType(arg0: str) -> TRITONSERVER_DataType: ... -def api_version() -> tuple: ... 
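
The headers of the removed stub files above record how they were produced ("stubgen -p tritonserver._c", run with an empty __init__.py in place so the wrapper does no renaming or aliasing) and carry a TODO to fold stub generation into the build. As a rough sketch only, a helper like the one below could regenerate equivalent stubs from an installed wheel; the helper name and output path are illustrative assumptions, while the stubgen invocation itself is the one documented in the removed files.

    import subprocess

    def regenerate_binding_stubs(output_dir: str = "stubs_out") -> None:
        # Run mypy's stubgen CLI against the installed bindings, as documented
        # in the removed .pyi headers, and write the generated .pyi files
        # (e.g. __init__.pyi and triton_bindings.pyi) under output_dir.
        subprocess.run(
            ["stubgen", "-p", "tritonserver._c", "-o", output_dir],
            check=True,
        )

    if __name__ == "__main__":
        regenerate_binding_stubs()
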
From be3fa6941e6a8e8c421f96ad6f1d6b90d7fa2d31 Mon Sep 17 00:00:00 2001 From: Ryan McCormick Date: Mon, 6 Jan 2025 10:48:42 -0800 Subject: [PATCH 6/8] Expose tritonserver.InferenceResponse type (#394) --- python/tritonserver/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/tritonserver/__init__.py b/python/tritonserver/__init__.py index 4d25c0478..440f69fe9 100644 --- a/python/tritonserver/__init__.py +++ b/python/tritonserver/__init__.py @@ -55,6 +55,7 @@ from tritonserver._api._model import ModelBatchFlag as ModelBatchFlag from tritonserver._api._model import ModelTxnPropertyFlag as ModelTxnPropertyFlag from tritonserver._api._request import InferenceRequest as InferenceRequest +from tritonserver._api._response import InferenceResponse as InferenceResponse from tritonserver._api._server import InstanceGroupKind as InstanceGroupKind from tritonserver._api._server import LogFormat as LogFormat from tritonserver._api._server import Metric as Metric From f3610e46dbaaef230aa208f8ac3170af7e2bb970 Mon Sep 17 00:00:00 2001 From: Neelay Shah Date: Mon, 6 Jan 2025 14:19:49 -0800 Subject: [PATCH 7/8] fix: Fix memory leak with dlpack when using python Tensor objects (#421) co-author: @tanmayv25 --- python/test/test_api.py | 98 +++++++++++++++++++++++++++++ python/tritonserver/_api/_tensor.py | 90 +++++++++++++++----------- 2 files changed, 153 insertions(+), 35 deletions(-) diff --git a/python/test/test_api.py b/python/test/test_api.py index 68aa7a318..ed96e27f0 100644 --- a/python/test/test_api.py +++ b/python/test/test_api.py @@ -24,9 +24,17 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import asyncio +import copy +import gc import json import os import shutil +import sys +import time +import unittest +from collections import Counter +from contextlib import contextmanager import numpy import pytest @@ -296,6 +304,96 @@ def test_tensor_from_numpy(self): numpy.testing.assert_array_equal(torch_tensor.numpy(), cpu_array) assert torch_tensor.data_ptr() == cpu_array.ctypes.data + async def _tensor_from_numpy(self): + owner = numpy.ones(2**27) + tensor = tritonserver.Tensor.from_dlpack(owner) + array = numpy.from_dlpack(tensor) + del owner + del tensor + del array + await asyncio.sleep(0.1) + + async def _async_test_runs(self): + tasks = [] + for _ in range(100): + tasks.append(asyncio.create_task(self._tensor_from_numpy())) + try: + await asyncio.wait(tasks) + except Exception as e: + print(e) + + @staticmethod + @contextmanager + def object_collector(): + gc.collect() + objects_before = gc.get_objects() + yield + objects_after = gc.get_objects() + new_objects = [type(x) for x in objects_after[len(objects_before) :]] + tensor_objects = [ + x for x in objects_after if isinstance(x, tritonserver.Tensor) + ] + if tensor_objects: + print("Tensor objects") + print(len(tensor_objects)) + print(type(tensor_objects[-1].memory_buffer.owner)) + print( + f"\nTotal Collected Objects ({len(new_objects)}) {Counter(new_objects)}" + ) + assert len(tensor_objects) == 0, "Leaked Tensors" + + def test_cpu_memory_leak_async(self): + with TestTensor.object_collector(): + asyncio.run(self._async_test_runs()) + + def test_cpu_memory_leak_sync(self): + with TestTensor.object_collector(): + for _ in range(100): + owner = numpy.ones(2**27) + tensor = tritonserver.Tensor.from_dlpack(owner) + array = numpy.from_dlpack(tensor) + del owner + del tensor + del array + + @pytest.mark.skipif(cupy is None, 
reason="Skipping gpu memory, cupy not installed") + def test_gpu_memory_leak(self): + with TestTensor.object_collector(): + for _ in range(100): + owner = cupy.ones(2**27) + tensor = tritonserver.Tensor.from_dlpack(owner) + array = cupy.from_dlpack(tensor) + del owner + del tensor + del array + + def test_reference_counts(self): + with TestTensor.object_collector(): + owner = numpy.ones(2**27) + owner_data = owner.ctypes.data + assert sys.getrefcount(owner) - 1 == 1, "Invalid Count" + + tensor = tritonserver.Tensor.from_dlpack(owner) + assert sys.getrefcount(owner) - 1 == 2, "Invalid Count" + assert sys.getrefcount(tensor) - 1 == 1, "Invalid Count" + del owner + + numpy_array = numpy.from_dlpack(tensor) + assert owner_data == numpy_array.ctypes.data + assert sys.getrefcount(tensor) - 1 == 2, "Invalid Count" + assert sys.getrefcount(numpy_array) - 1 == 1, "Invalid Count" + + tensor.shape = [2, 2**26] + + assert numpy_array.shape == (2**27,), "Invalid Shape" + + numpy_array_2 = numpy.from_dlpack(tensor) + del tensor + assert owner_data == numpy_array.ctypes.data + assert numpy_array_2.shape == (2, 2**26) + del numpy_array + del numpy_array_2 + class TestServer: def test_not_started(self): diff --git a/python/tritonserver/_api/_tensor.py b/python/tritonserver/_api/_tensor.py index ee21abd59..afac87d9f 100644 --- a/python/tritonserver/_api/_tensor.py +++ b/python/tritonserver/_api/_tensor.py @@ -217,23 +217,8 @@ def __dlpack__(self, *, stream=None): self._sync_on_requested_stream(stream) - dl_managed_tensor = Tensor._create_managed_tensor() - dl_managed_tensor.dl_tensor.data = self.data_ptr - dl_managed_tensor.dl_tensor.device = DLDevice( - TRITON_MEMORY_TYPE_TO_DLPACK_DEVICE_TYPE[self.memory_type], - self.memory_type_id, - ) + dl_managed_tensor = self._create_managed_tensor() - dl_managed_tensor.dl_tensor.dtype = TRITON_TO_DLPACK_DTYPE[self.data_type] - dl_managed_tensor.dl_tensor.ndim = len(self.shape) - dl_managed_tensor.dl_tensor.shape = (ctypes.c_int64 * len(self.shape))( - *self.shape - ) - dl_managed_tensor.dl_tensor.strides = ctypes.POINTER(ctypes.c_int64)() - dl_managed_tensor.dl_tensor.byte_offset = 0 - dl_managed_tensor.deleter = Tensor._managed_tensor_deleter - - self._set_dlpack_manager_ctx(dl_managed_tensor) pycapsule = ctypes.pythonapi.PyCapsule_New( ctypes.byref(dl_managed_tensor), c_str_dltensor, @@ -600,26 +585,39 @@ def _from_numpy(obj: numpy.ndarray | numpy.generic) -> Tensor: size=obj.itemsize * obj.size, owner=obj, ) - return Tensor(data_type, shape, memory_buffer) - @staticmethod - def _create_managed_tensor(): + def _create_managed_tensor(self) -> DLManagedTensor: + # Allocates space for a managed tensor object + # and fills in the fields + # + # To ensure the lifetime of the managed tensor we create a + # context object that includes a newly created shape array and a + # reference to self + size = ctypes.c_size_t(ctypes.sizeof(DLManagedTensor)) address = ctypes.pythonapi.PyMem_RawMalloc(size) - return DLManagedTensor.from_address(address) + dl_managed_tensor = DLManagedTensor.from_address(address) + dl_managed_tensor.dl_tensor.data = self.data_ptr + dl_managed_tensor.dl_tensor.device = DLDevice( + TRITON_MEMORY_TYPE_TO_DLPACK_DEVICE_TYPE[self.memory_type], + self.memory_type_id, + ) + dl_managed_tensor.dl_tensor.dtype = TRITON_TO_DLPACK_DTYPE[self.data_type] + dl_managed_tensor.dl_tensor.ndim = len(self.shape) + manager_ctx = _ManagerCtx(self) + dl_managed_tensor.dl_tensor.shape = manager_ctx.shape + dl_managed_tensor.dl_tensor.strides = manager_ctx.strides + 
dl_managed_tensor.dl_tensor.byte_offset = 0 + dl_managed_tensor.deleter = Tensor._managed_tensor_deleter + dl_managed_tensor.manager_ctx = manager_ctx.reference() + return dl_managed_tensor @staticmethod @ctypes.CFUNCTYPE(None, ctypes.c_void_p) def _managed_tensor_deleter(handle: int) -> None: dl_managed_tensor = DLManagedTensor.from_address(handle) - tensor_obj_ptr = ctypes.cast( - dl_managed_tensor.manager_ctx, ctypes.POINTER(ctypes.py_object) - ) - tensor_obj = tensor_obj_ptr.contents - ctypes.pythonapi.Py_DecRef(tensor_obj) - shape_obj = ctypes.py_object(dl_managed_tensor.dl_tensor.shape) - ctypes.pythonapi.Py_DecRef(shape_obj) + _ManagerCtx.release(dl_managed_tensor.manager_ctx) ctypes.pythonapi.PyMem_RawFree(handle) @staticmethod @@ -639,14 +637,36 @@ def _pycapsule_deleter(handle: ctypes.c_void_p) -> None: print(f"Exception occurred while deleting capsule: {e}") raise e - def _set_dlpack_manager_ctx(self, dl_managed_tensor): - tensor_obj = ctypes.py_object(self) - tensor_obj_ptr = ctypes.pointer(tensor_obj) - dl_managed_tensor.manager_ctx = ctypes.cast(tensor_obj_ptr, ctypes.c_void_p) - shape_obj = ctypes.py_object(dl_managed_tensor.dl_tensor.shape) - ctypes.pythonapi.Py_IncRef(tensor_obj) - ctypes.pythonapi.Py_IncRef(shape_obj) - _from_converters: ClassVar[dict[type, Callable[[Any], Tensor]]] = dict( {numpy.ndarray: _from_numpy, numpy.generic: _from_numpy, list: _from_list}, ) + + +class _ManagerCtx: + # To ensure the lifetime of the managed tensor we create a + # context object that includes a newly created shape array and a + # reference to self + + def __init__(self, tensor: Tensor) -> None: + self._tensor = tensor + self.shape = (ctypes.c_int64 * len(tensor.shape))(*tensor.shape) + self.strides = ctypes.POINTER(ctypes.c_int64)() + + def reference(self) -> ctypes.c_void_p: + py_obj = ctypes.py_object(self) + ctypes.pythonapi.Py_IncRef(py_obj) + + # Note: Could not find a direct way to cast a python object + # to a c_void_p. The mechanism is to either use id(self) or + # cast as described here: + # + # https://groups.google.com/g/dev-python/c/QRRqVC7gkf4/m/zH7l1gTXBwAJ + # + # To avoid relying on the behavior of id() we use the casting mechanism + + return ctypes.POINTER(ctypes.c_void_p)(py_obj)[0] + + @staticmethod + def release(reference: ctypes.c_void_p) -> None: + py_obj = ctypes.cast(reference, ctypes.py_object) + ctypes.pythonapi.Py_DecRef(py_obj) From eeb283a34c8da06c275dd003f2860aaac43f5b06 Mon Sep 17 00:00:00 2001 From: Yingge He <157551214+yinggeh@users.noreply.github.com> Date: Wed, 8 Jan 2025 11:10:05 -0800 Subject: [PATCH 8/8] fix: Validate request correlation ID data type (#425) --- src/constants.h | 3 +- src/infer_request.cc | 62 ++++++++++++++++++- src/infer_request.h | 6 +- .../sequence_batch_scheduler.cc | 8 +-- 4 files changed, 69 insertions(+), 10 deletions(-) diff --git a/src/constants.h b/src/constants.h index 8415f8ee9..119d1e9d2 100644 --- a/src/constants.h +++ b/src/constants.h @@ -1,4 +1,4 @@ -// Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2018-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -92,7 +92,6 @@ constexpr uint64_t NANOS_PER_SECOND = 1000000000; constexpr uint64_t NANOS_PER_MILLIS = 1000000; constexpr int MAX_GRPC_MESSAGE_SIZE = INT32_MAX; constexpr uint64_t SEQUENCE_IDLE_DEFAULT_MICROSECONDS = 1000 * 1000; -constexpr size_t STRING_CORRELATION_ID_MAX_LENGTH_BYTES = 128; constexpr size_t CUDA_IPC_STRUCT_SIZE = 64; #ifdef TRITON_ENABLE_METRICS diff --git a/src/infer_request.cc b/src/infer_request.cc index 0d0c80a0d..83b3bb872 100644 --- a/src/infer_request.cc +++ b/src/infer_request.cc @@ -1,4 +1,4 @@ -// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -1228,11 +1228,16 @@ InferenceRequest::Normalize() } } } + + if (model_config.has_sequence_batching()) { + RETURN_IF_ERROR(ValidateCorrelationId()); + } + return Status::Success; } Status -InferenceRequest::ValidateRequestInputs() +InferenceRequest::ValidateRequestInputs() const { const inference::ModelConfig& model_config = model_raw_->Config(); if ((original_inputs_.size() > (size_t)model_config.input_size()) || @@ -1404,6 +1409,59 @@ InferenceRequest::ValidateBytesInputs( return Status::Success; } +Status +InferenceRequest::ValidateCorrelationId() const +{ + const inference::ModelConfig& model_config = model_raw_->Config(); + const std::string& model_name = ModelName(); + std::string correlation_id_tensor_name; + inference::DataType correlation_id_datatype; + + RETURN_IF_ERROR(GetTypedSequenceControlProperties( + model_config.sequence_batching(), model_config.name(), + inference::ModelSequenceBatching::Control::CONTROL_SEQUENCE_CORRID, + false /* required */, &correlation_id_tensor_name, + &correlation_id_datatype)); + + // Make sure request correlation ID type matches model configuration. 
+ if (!correlation_id_tensor_name.empty()) { + const auto& correlation_id = CorrelationId(); + bool dtypes_match = true; + std::string request_corrid_datatype; + if ((correlation_id.Type() == + InferenceRequest::SequenceId::DataType::STRING) && + (correlation_id_datatype != inference::DataType::TYPE_STRING)) { + dtypes_match = false; + request_corrid_datatype = triton::common::DataTypeToProtocolString( + inference::DataType::TYPE_STRING); + } else if ( + (correlation_id.Type() == + InferenceRequest::SequenceId::DataType::UINT64) && + ((correlation_id_datatype != inference::DataType::TYPE_UINT64) && + (correlation_id_datatype != inference::DataType::TYPE_INT64) && + (correlation_id_datatype != inference::DataType::TYPE_UINT32) && + (correlation_id_datatype != inference::DataType::TYPE_INT32))) { + dtypes_match = false; + request_corrid_datatype = triton::common::DataTypeToProtocolString( + inference::DataType::TYPE_UINT64); + } + + if (!dtypes_match) { + return Status( + Status::Code::INVALID_ARG, + LogRequest() + "sequence batching control '" + + correlation_id_tensor_name + "' data-type is '" + + request_corrid_datatype + "', but model '" + model_name + + "' expects '" + + std::string(triton::common::DataTypeToProtocolString( + correlation_id_datatype)) + + "'"); + } + } + + return Status::Success; +} + #ifdef TRITON_ENABLE_STATS void diff --git a/src/infer_request.h b/src/infer_request.h index 38c89ed63..e9bfa49bc 100644 --- a/src/infer_request.h +++ b/src/infer_request.h @@ -1,4 +1,4 @@ -// Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -771,13 +771,15 @@ class InferenceRequest { Status Normalize(); // Helper for validating Inputs - Status ValidateRequestInputs(); + Status ValidateRequestInputs() const; Status ValidateBytesInputs( const std::string& input_id, const Input& input, const std::string& model_name, TRITONSERVER_MemoryType* buffer_memory_type) const; + Status ValidateCorrelationId() const; + // Helpers for pending request metrics void IncrementPendingRequestCount(); void DecrementPendingRequestCount(); diff --git a/src/sequence_batch_scheduler/sequence_batch_scheduler.cc b/src/sequence_batch_scheduler/sequence_batch_scheduler.cc index 74314e7ab..45e9c037c 100644 --- a/src/sequence_batch_scheduler/sequence_batch_scheduler.cc +++ b/src/sequence_batch_scheduler/sequence_batch_scheduler.cc @@ -1,4 +1,4 @@ -// Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright 2018-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions @@ -1343,9 +1343,9 @@ SequenceBatch::SetControlTensors( auto& seq_corr_id = seq_slot_corrid_override_; size_t size_p = triton::common::GetDataTypeByteSize(seq_corr_id->DType()); if (seq_corr_id->DType() == inference::DataType::TYPE_STRING) { - // 4 bytes for length of string plus pre-defined max string correlation id - // length in bytes - size_p = 4 + triton::core::STRING_CORRELATION_ID_MAX_LENGTH_BYTES; + // 4 bytes for length of string plus string correlation id length in + // bytes. + size_p = 4 + corrid.StringValue().length(); } TRITONSERVER_MemoryType memory_type;
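
For reference, the compatibility rule enforced by the new ValidateCorrelationId helper above can be summarized compactly. The sketch below mirrors the patch's logic in Python and is not the Triton implementation; the function and set names are hypothetical, while the datatype strings are the model-config values used in the diff.

    INTEGRAL_CORRID_DTYPES = {"TYPE_UINT64", "TYPE_INT64", "TYPE_UINT32", "TYPE_INT32"}

    def corrid_dtype_matches(request_corrid, control_dtype: str) -> bool:
        # A string correlation ID requires the model's CONTROL_SEQUENCE_CORRID
        # tensor to be declared as TYPE_STRING; an integer (uint64) correlation
        # ID is accepted for any of the integral control datatypes.
        if isinstance(request_corrid, str):
            return control_dtype == "TYPE_STRING"
        return control_dtype in INTEGRAL_CORRID_DTYPES

    # A string correlation ID sent to a model whose control tensor is
    # TYPE_UINT64 is rejected with INVALID_ARG, as in the new error path.
    assert not corrid_dtype_matches("session-1", "TYPE_UINT64")
    assert corrid_dtype_matches(42, "TYPE_INT32")
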