diff --git a/.clang-format b/.clang-format
index 2e606ba4bb..1defc175de 100644
--- a/.clang-format
+++ b/.clang-format
@@ -2,7 +2,8 @@
BasedOnStyle: Google
IndentWidth: 2
-ContinuationIndentWidth: 2
+ColumnLimit: 80
+ContinuationIndentWidth: 4
UseTab: Never
MaxEmptyLinesToKeep: 2
@@ -34,4 +35,5 @@ BinPackArguments: true
BinPackParameters: true
ConstructorInitializerAllOnOneLineOrOnePerLine: false
-IndentCaseLabels: true
\ No newline at end of file
+IndentCaseLabels: true
+
diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
new file mode 100644
index 0000000000..df06a0e5fb
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,24 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+**Description**
+A clear and concise description of what the bug is.
+
+**Triton Information**
+What version of Triton are you using?
+
+Are you using the Triton container or did you build it yourself?
+
+**To Reproduce**
+Steps to reproduce the behavior.
+
+Describe the models (framework, inputs, outputs), and ideally include the model configuration file (if using an ensemble, include the model configuration file for that as well).
+
+**Expected behavior**
+A clear and concise description of what you expected to happen.
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
new file mode 100644
index 0000000000..bbcbbe7d61
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,20 @@
+---
+name: Feature request
+about: Suggest an idea for this project
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+**Is your feature request related to a problem? Please describe.**
+A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
+
+**Describe the solution you'd like**
+A clear and concise description of what you want to happen.
+
+**Describe alternatives you've considered**
+A clear and concise description of any alternative solutions or features you've considered.
+
+**Additional context**
+Add any other context or screenshots about the feature request here.
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
new file mode 100644
index 0000000000..745a33730b
--- /dev/null
+++ b/.github/workflows/codeql.yml
@@ -0,0 +1,84 @@
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+name: "CodeQL"
+
+on:
+ pull_request:
+
+jobs:
+ analyze:
+ name: Analyze
+ runs-on: ubuntu-latest
+ permissions:
+ actions: read
+ contents: read
+ security-events: write
+
+ strategy:
+ fail-fast: false
+ matrix:
+ language: [ 'python' ]
+ # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
+ # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
+
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v3
+
+ # Initializes the CodeQL tools for scanning.
+ - name: Initialize CodeQL
+ uses: github/codeql-action/init@v2
+ with:
+ languages: ${{ matrix.language }}
+ # If you wish to specify custom queries, you can do so here or in a config file.
+ # By default, queries listed here will override any specified in a config file.
+ # Prefix the list here with "+" to use these queries and those in the config file.
+
+        # For details on CodeQL's query packs, refer to:
+ # https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
+ queries: +security-and-quality
+
+
+ # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java).
+ # If this step fails, then you should remove it and run the build manually (see below)
+ - name: Autobuild
+ uses: github/codeql-action/autobuild@v2
+
+ # Command-line programs to run using the OS shell.
+ # See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
+
+ # If the Autobuild fails above, remove it and uncomment the following three lines.
+    # Modify them (or add more) to build your code; refer to the commented example below for guidance.
+
+ # - run: |
+ # echo "Run, Build Application using script"
+ # ./location_of_script_within_repo/buildscript.sh
+
+ - name: Perform CodeQL Analysis
+ uses: github/codeql-action/analyze@v2
+ with:
+ category: "/language:${{matrix.language}}"
diff --git a/.github/workflows/pre-commit.yaml b/.github/workflows/pre-commit.yaml
new file mode 100644
index 0000000000..531cc2911b
--- /dev/null
+++ b/.github/workflows/pre-commit.yaml
@@ -0,0 +1,39 @@
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+name: pre-commit
+
+on:
+ pull_request:
+
+jobs:
+ pre-commit:
+ runs-on: ubuntu-22.04
+ steps:
+ - uses: actions/checkout@v3
+ - uses: actions/setup-python@v3
+ - uses: pre-commit/action@v3.0.0
+
diff --git a/.gitignore b/.gitignore
index 4e1f8ef0cc..f1b69cb25e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,11 +1,8 @@
-/bazel-bin
-/bazel-ci_build-cache
-/bazel-genfiles
-/bazel-trtserver
-/bazel-out
-/bazel-serving
-/bazel-tensorflow
-/bazel-tensorflow_serving
-/bazel-testlogs
-/bazel-tf
-/bazel-workspace
+/build
+/builddir
+/.vscode
+*.so
+__pycache__
+tmp
+*.log
+test_results.txt
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000000..f44f815351
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,74 @@
+# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+repos:
+- repo: https://github.com/timothycrosley/isort
+ rev: 5.12.0
+ hooks:
+ - id: isort
+ additional_dependencies: [toml]
+- repo: https://github.com/psf/black
+ rev: 23.1.0
+ hooks:
+ - id: black
+ types_or: [python, cython]
+- repo: https://github.com/PyCQA/flake8
+ rev: 5.0.4
+ hooks:
+ - id: flake8
+ args: [--max-line-length=88, --select=C,E,F,W,B,B950, --extend-ignore = E203,E501]
+ types_or: [python, cython]
+- repo: https://github.com/pre-commit/mirrors-clang-format
+ rev: v16.0.5
+ hooks:
+ - id: clang-format
+ types_or: [c, c++, cuda, proto, textproto, java]
+ args: ["-fallback-style=none", "-style=file", "-i"]
+- repo: https://github.com/codespell-project/codespell
+ rev: v2.2.4
+ hooks:
+ - id: codespell
+ additional_dependencies: [tomli]
+ args: ["--toml", "pyproject.toml"]
+ exclude: (?x)^(.*stemmer.*|.*stop_words.*|^CHANGELOG.md$)
+# More details about these pre-commit hooks are available at:
+# https://pre-commit.com/hooks.html
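+# To try a single hook locally, pre-commit can run it by id across the
+# whole tree (illustrative invocation):
+#   pre-commit run clang-format --all-files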
+- repo: https://github.com/pre-commit/pre-commit-hooks
+ rev: v4.4.0
+ hooks:
+ - id: check-case-conflict
+ - id: check-executables-have-shebangs
+ - id: check-merge-conflict
+ - id: check-json
+ - id: check-toml
+ - id: check-yaml
+ exclude: ^deploy(\/[^\/]+)*\/templates\/.*$
+ - id: check-shebang-scripts-are-executable
+ - id: end-of-file-fixer
+ types_or: [c, c++, cuda, proto, textproto, java, python]
+ - id: mixed-line-ending
+ - id: requirements-txt-fixer
+ - id: trailing-whitespace
diff --git a/CITATION.cff b/CITATION.cff
new file mode 100644
index 0000000000..f8fb8d09fb
--- /dev/null
+++ b/CITATION.cff
@@ -0,0 +1,7 @@
+cff-version: 1.2.0
+message: "If you use this software, please cite it as below."
+title: "Triton Inference Server: An Optimized Cloud and Edge Inferencing Solution."
+url: https://github.com/triton-inference-server
+repository-code: https://github.com/triton-inference-server/server
+authors:
+ - name: "NVIDIA Corporation"
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000000..ff578c9724
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,273 @@
+# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+cmake_minimum_required(VERSION 3.18)
+
+project(tritonserver LANGUAGES C CXX)
+
+include(CMakeDependentOption)
+
+# Use the C++17 standard as Triton's minimum requirement.
+set(TRITON_MIN_CXX_STANDARD 17 CACHE STRING "The minimum C++ standard whose features are required to build this target.")
+
+set(TRITON_VERSION "0.0.0" CACHE STRING "The version of the Triton shared library" )
+
+option(TRITON_ENABLE_LOGGING "Include logging support in server" ON)
+option(TRITON_ENABLE_STATS "Include statistics collections in server" ON)
+option(TRITON_ENABLE_TRACING "Include tracing support in server" OFF)
+option(TRITON_ENABLE_NVTX "Include NVTX support in server" OFF)
+option(TRITON_ENABLE_GPU "Enable GPU support in server" ON)
+option(TRITON_ENABLE_MALI_GPU "Enable Arm Mali GPU support in server" OFF)
+option(TRITON_IGPU_BUILD "Enable options for iGPU compilation in server" OFF)
+set(TRITON_MIN_COMPUTE_CAPABILITY "6.0" CACHE STRING
+ "The minimum CUDA compute capability supported by Triton" )
+set(TRITON_EXTRA_LIB_PATHS "" CACHE PATH "Extra library paths for Triton Server build")
+
+# Ensemble
+option(TRITON_ENABLE_ENSEMBLE "Include ensemble support in server" OFF)
+
+# Endpoints
+option(TRITON_ENABLE_HTTP "Include HTTP API in server" ON)
+option(TRITON_ENABLE_GRPC "Include GRPC API in server" ON)
+option(TRITON_ENABLE_SAGEMAKER "Include AWS SageMaker API in server" OFF)
+option(TRITON_ENABLE_VERTEX_AI "Include Vertex AI API in server" OFF)
+
+# Metrics
+option(TRITON_ENABLE_METRICS "Include metrics support in server" ON)
+option(TRITON_ENABLE_METRICS_GPU "Include GPU metrics support in server" ON)
+option(TRITON_ENABLE_METRICS_CPU "Include CPU metrics support in server" ON)
+
+# Cloud storage
+option(TRITON_ENABLE_GCS "Include GCS Filesystem support in server" OFF)
+option(TRITON_ENABLE_S3 "Include S3 Filesystem support in server" OFF)
+option(TRITON_ENABLE_AZURE_STORAGE "Include Azure Storage Filesystem support in server" OFF)
+
+# Need to know if TensorRT is available when building unit tests
+option(TRITON_ENABLE_TENSORRT "Include TensorRT backend in server" OFF)
+
+# ASAN
+option(TRITON_ENABLE_ASAN "Build with address sanitizer" OFF)
+
+# Repo tags
+set(TRITON_REPO_ORGANIZATION "https://github.com/triton-inference-server" CACHE STRING "Git repository to pull from")
+set(TRITON_THIRD_PARTY_REPO_TAG "main" CACHE STRING
+ "Tag for triton-inference-server/third_party repo")
+set(TRITON_COMMON_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/common repo")
+set(TRITON_CORE_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/core repo")
+set(TRITON_BACKEND_REPO_TAG "main" CACHE STRING "Tag for triton-inference-server/backend repo")
+
+# Third-party location
+set(TRITON_THIRD_PARTY_INSTALL_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/third-party" CACHE STRING "Location of third-party build")
+set(TRITON_THIRD_PARTY_SRC_INSTALL_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/third-party-src" CACHE STRING "Location of third-party source")
+
+if(TRITON_ENABLE_METRICS AND NOT TRITON_ENABLE_STATS)
+ message(FATAL_ERROR "TRITON_ENABLE_METRICS=ON requires TRITON_ENABLE_STATS=ON")
+endif()
+
+if(TRITON_ENABLE_TRACING AND NOT TRITON_ENABLE_STATS)
+ message(FATAL_ERROR "TRITON_ENABLE_TRACING=ON requires TRITON_ENABLE_STATS=ON")
+endif()
+
+if (TRITON_ENABLE_METRICS_CPU AND NOT TRITON_ENABLE_METRICS)
+ message(FATAL_ERROR "TRITON_ENABLE_METRICS_CPU=ON requires TRITON_ENABLE_METRICS=ON")
+endif()
+
+if (TRITON_ENABLE_METRICS_GPU AND NOT TRITON_ENABLE_METRICS)
+ message(FATAL_ERROR "TRITON_ENABLE_METRICS_GPU=ON requires TRITON_ENABLE_METRICS=ON")
+endif()
+
+if (TRITON_ENABLE_METRICS_GPU AND NOT TRITON_ENABLE_GPU)
+ message(FATAL_ERROR "TRITON_ENABLE_METRICS_GPU=ON requires TRITON_ENABLE_GPU=ON")
+endif()
+
+if(TRITON_ENABLE_ASAN AND TRITON_ENABLE_GPU)
+ message(FATAL_ERROR "TRITON_ENABLE_ASAN=ON requires TRITON_ENABLE_GPU=OFF")
+endif()
+
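+# A typical configure-and-build invocation (a sketch only; the option
+# values shown are illustrative, not required):
+#
+#   cmake -S . -B build \
+#     -DCMAKE_BUILD_TYPE=Release \
+#     -DTRITON_ENABLE_GRPC=ON \
+#     -DTRITON_ENABLE_HTTP=ON \
+#     -DTRITON_ENABLE_GPU=ON
+#   cmake --build build --parallel
+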
+#
+# Dependencies
+#
+include(FetchContent)
+
+FetchContent_Declare(
+ repo-core
+ GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/core.git
+ GIT_TAG ${TRITON_CORE_REPO_TAG}
+)
+FetchContent_Declare(
+ repo-third-party
+ GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/third_party.git
+ GIT_TAG ${TRITON_THIRD_PARTY_REPO_TAG}
+)
+
+# Some libs are installed to ${TRITON_THIRD_PARTY_INSTALL_PREFIX}/{LIB}/lib64 instead
+# of ${TRITON_THIRD_PARTY_INSTALL_PREFIX}/{LIB}/lib on CentOS
+set (LIB_DIR "lib")
+# /etc/os-release does not exist on Windows
+if(EXISTS "/etc/os-release")
+ file(STRINGS /etc/os-release DISTRO REGEX "^NAME=")
+ string(REGEX REPLACE "NAME=\"(.*)\"" "\\1" DISTRO "${DISTRO}")
+ message(STATUS "Distro Name: ${DISTRO}")
+ if(DISTRO MATCHES "CentOS.*")
+ set (LIB_DIR "lib64")
+ endif()
+endif()
+
+set(TRITON_CORE_HEADERS_ONLY OFF)
+
+FetchContent_MakeAvailable(repo-third-party repo-core)
+
+#
+# Triton server executable and examples
+#
+
+# Need to use ExternalProject for our builds so that we can get the
+# correct dependencies between the Triton executable and the
+# ExternalProject dependencies (found in the third_party repo)
+include(ExternalProject)
+
+# If CMAKE_TOOLCHAIN_FILE is set, propagate that hint path to the external
+# projects.
+set(_CMAKE_ARGS_CMAKE_TOOLCHAIN_FILE "")
+if (CMAKE_TOOLCHAIN_FILE)
+ set(_CMAKE_ARGS_CMAKE_TOOLCHAIN_FILE "-DCMAKE_TOOLCHAIN_FILE:PATH=${CMAKE_TOOLCHAIN_FILE}")
+endif()
+
+# If VCPKG_TARGET_TRIPLET is set, propagate that hint path to the external
+# projects.
+set(_CMAKE_ARGS_VCPKG_TARGET_TRIPLET "")
+if (VCPKG_TARGET_TRIPLET)
+ set(_CMAKE_ARGS_VCPKG_TARGET_TRIPLET "-DVCPKG_TARGET_TRIPLET:STRING=${VCPKG_TARGET_TRIPLET}")
+endif()
+
+# If OPENSSL_ROOT_DIR is set, propagate that hint path to the external
+# projects with OpenSSL dependency.
+set(_CMAKE_ARGS_OPENSSL_ROOT_DIR "")
+if (OPENSSL_ROOT_DIR)
+ set(_CMAKE_ARGS_OPENSSL_ROOT_DIR "-DOPENSSL_ROOT_DIR:PATH=${OPENSSL_ROOT_DIR}")
+endif()
+
+# Location where protobuf-config.cmake will be installed varies by
+# platform
+if (WIN32)
+ set(_FINDPACKAGE_PROTOBUF_CONFIG_DIR "${TRITON_THIRD_PARTY_INSTALL_PREFIX}/protobuf/cmake")
+else()
+ set(_FINDPACKAGE_PROTOBUF_CONFIG_DIR "${TRITON_THIRD_PARTY_INSTALL_PREFIX}/protobuf/${LIB_DIR}/cmake/protobuf")
+endif()
+
+# Triton with OpenTelemetry is not supported on Windows
+# FIXME: add location for Windows when support is added
+# JIRA DLIS-4786
+if (WIN32)
+ set(_FINDPACKAGE_OPENTELEMETRY_CONFIG_DIR "")
+else()
+ set(_FINDPACKAGE_OPENTELEMETRY_CONFIG_DIR "${TRITON_THIRD_PARTY_INSTALL_PREFIX}/opentelemetry-cpp/${LIB_DIR}/cmake/opentelemetry-cpp")
+endif()
+
+if (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
+ set(TRITON_INSTALL_PREFIX ${CMAKE_CURRENT_BINARY_DIR}/install)
+else()
+ set(TRITON_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX})
+endif()
+
+set(TRITON_DEPENDS triton-core protobuf googletest re2)
+if(${TRITON_ENABLE_GCS})
+ set(TRITON_DEPENDS ${TRITON_DEPENDS} google-cloud-cpp)
+endif() # TRITON_ENABLE_GCS
+if(${TRITON_ENABLE_S3})
+ set(TRITON_DEPENDS ${TRITON_DEPENDS} aws-sdk-cpp)
+endif() # TRITON_ENABLE_S3
+if(${TRITON_ENABLE_HTTP} OR ${TRITON_ENABLE_METRICS} OR ${TRITON_ENABLE_SAGEMAKER} OR ${TRITON_ENABLE_VERTEX_AI})
+ set(TRITON_DEPENDS ${TRITON_DEPENDS} libevent libevhtp)
+endif() # TRITON_ENABLE_HTTP || TRITON_ENABLE_METRICS || TRITON_ENABLE_SAGEMAKER || TRITON_ENABLE_VERTEX_AI
+if(${TRITON_ENABLE_GRPC})
+ set(TRITON_DEPENDS ${TRITON_DEPENDS} grpc)
+endif() # TRITON_ENABLE_GRPC
+if(NOT WIN32 AND ${TRITON_ENABLE_TRACING})
+ set(TRITON_DEPENDS ${TRITON_DEPENDS} opentelemetry-cpp)
+endif() # TRITON_ENABLE_TRACING
+
+ExternalProject_Add(triton-server
+ PREFIX triton-server
+ SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/src"
+ BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/triton-server"
+ CMAKE_CACHE_ARGS
+ -DProtobuf_DIR:PATH=${_FINDPACKAGE_PROTOBUF_CONFIG_DIR}
+ ${_CMAKE_ARGS_OPENSSL_ROOT_DIR}
+ ${_CMAKE_ARGS_CMAKE_TOOLCHAIN_FILE}
+ ${_CMAKE_ARGS_VCPKG_TARGET_TRIPLET}
+ -DGTEST_ROOT:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/googletest
+ -DgRPC_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/grpc/lib/cmake/grpc
+ -Dc-ares_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/c-ares/${LIB_DIR}/cmake/c-ares
+ -Dre2_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/re2/${LIB_DIR}/cmake/re2
+ -Dabsl_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/absl/${LIB_DIR}/cmake/absl
+ -DCURL_DIR:STRING=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/curl/${LIB_DIR}/cmake/CURL
+ -Dnlohmann_json_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/nlohmann_json/${LIB_DIR}/cmake/nlohmann_json
+ -DLibevent_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/libevent/lib/cmake/libevent
+ -Dlibevhtp_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/libevhtp/lib/cmake/libevhtp
+ -Dstorage_client_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/google-cloud-cpp/${LIB_DIR}/cmake/storage_client
+ -Dgoogle_cloud_cpp_common_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/google-cloud-cpp/${LIB_DIR}/cmake/google_cloud_cpp_common
+ -DCrc32c_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/crc32c/${LIB_DIR}/cmake/Crc32c
+ -DAWSSDK_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/aws-sdk-cpp/${LIB_DIR}/cmake/AWSSDK
+ -Daws-cpp-sdk-core_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/aws-sdk-cpp/${LIB_DIR}/cmake/aws-cpp-sdk-core
+ -Daws-cpp-sdk-s3_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/aws-sdk-cpp/${LIB_DIR}/cmake/aws-cpp-sdk-s3
+ -Daws-c-event-stream_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/aws-sdk-cpp/${LIB_DIR}/aws-c-event-stream/cmake
+ -Daws-c-common_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/aws-sdk-cpp/${LIB_DIR}/aws-c-common/cmake
+ -Daws-checksums_DIR:PATH=${TRITON_THIRD_PARTY_INSTALL_PREFIX}/aws-sdk-cpp/${LIB_DIR}/aws-checksums/cmake
+ -Dopentelemetry-cpp_DIR:PATH=${_FINDPACKAGE_OPENTELEMETRY_CONFIG_DIR}
+ -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION}
+ -DTRITON_IGPU_BUILD:BOOL=${TRITON_IGPU_BUILD}
+ -DTRITON_THIRD_PARTY_REPO_TAG:STRING=${TRITON_THIRD_PARTY_REPO_TAG}
+ -DTRITON_COMMON_REPO_TAG:STRING=${TRITON_COMMON_REPO_TAG}
+ -DTRITON_CORE_REPO_TAG:STRING=${TRITON_CORE_REPO_TAG}
+ -DTRITON_BACKEND_REPO_TAG:STRING=${TRITON_BACKEND_REPO_TAG}
+ -DTRITON_EXTRA_LIB_PATHS:PATH=${TRITON_EXTRA_LIB_PATHS}
+ -DTRITON_ENABLE_ASAN:BOOL=${TRITON_ENABLE_ASAN}
+ -DTRITON_ENABLE_NVTX:BOOL=${TRITON_ENABLE_NVTX}
+ -DTRITON_ENABLE_TRACING:BOOL=${TRITON_ENABLE_TRACING}
+ -DTRITON_ENABLE_LOGGING:BOOL=${TRITON_ENABLE_LOGGING}
+ -DTRITON_ENABLE_STATS:BOOL=${TRITON_ENABLE_STATS}
+ -DTRITON_ENABLE_GPU:BOOL=${TRITON_ENABLE_GPU}
+ -DTRITON_ENABLE_MALI_GPU:BOOL=${TRITON_ENABLE_MALI_GPU}
+ -DTRITON_ENABLE_HTTP:BOOL=${TRITON_ENABLE_HTTP}
+ -DTRITON_ENABLE_SAGEMAKER:BOOL=${TRITON_ENABLE_SAGEMAKER}
+ -DTRITON_ENABLE_VERTEX_AI:BOOL=${TRITON_ENABLE_VERTEX_AI}
+ -DTRITON_ENABLE_GRPC:BOOL=${TRITON_ENABLE_GRPC}
+ -DTRITON_MIN_COMPUTE_CAPABILITY:STRING=${TRITON_MIN_COMPUTE_CAPABILITY}
+ -DTRITON_ENABLE_METRICS:BOOL=${TRITON_ENABLE_METRICS}
+ -DTRITON_ENABLE_METRICS_GPU:BOOL=${TRITON_ENABLE_METRICS_GPU}
+ -DTRITON_ENABLE_METRICS_CPU:BOOL=${TRITON_ENABLE_METRICS_CPU}
+ -DTRITON_ENABLE_GCS:BOOL=${TRITON_ENABLE_GCS}
+ -DTRITON_ENABLE_AZURE_STORAGE:BOOL=${TRITON_ENABLE_AZURE_STORAGE}
+ -DTRITON_ENABLE_S3:BOOL=${TRITON_ENABLE_S3}
+ -DTRITON_ENABLE_TENSORRT:BOOL=${TRITON_ENABLE_TENSORRT}
+ -DTRITON_ENABLE_ENSEMBLE:BOOL=${TRITON_ENABLE_ENSEMBLE}
+ -DTRITON_MIN_CXX_STANDARD:STRING=${TRITON_MIN_CXX_STANDARD}
+ -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE}
+ -DCMAKE_INSTALL_PREFIX:PATH=${TRITON_INSTALL_PREFIX}
+ -DTRITON_VERSION:STRING=${TRITON_VERSION}
+ DEPENDS ${TRITON_DEPENDS}
+)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 84be37f175..59e0ace975 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,5 +1,5 @@
+# Contribution Guidelines
+
+Contributions that fix documentation errors or that make small changes
+to existing code can be submitted directly by following the rules
+below and opening an appropriate PR.
+
+Contributions intended to add significant new functionality must
+follow a more collaborative path described in the following
+points. Before submitting a large PR that adds a major enhancement or
+extension, be sure to submit a GitHub issue that describes the
+proposed change so that the Triton team can provide feedback.
+
+- As part of the GitHub issue discussion, a design for your change
+ will be agreed upon. An up-front design discussion is required to
+ ensure that your enhancement is done in a manner that is consistent
+ with Triton's overall architecture.
+
+- The Triton project is spread across multiple repos. The Triton team
+ will provide guidance about how and where your enhancement should be
+ implemented.
+
+- [Testing](docs/customization_guide/test.md) is a critical part of any Triton
+ enhancement. You should plan on spending significant time on
+ creating tests for your change. The Triton team will help you to
+ design your testing so that it is compatible with existing testing
+ infrastructure.
+
+- If your enhancement provides a user-visible feature, then you need to
+ provide documentation.
+
# Contribution Rules
-- The code style convention is enforced by clang-format. See the
- Developer Guide for instructions on how to ensure your contributions
- conform. In general please follow the existing conventions in the
- relevant file, submodule, module, and project when you add new code
- or when you extend/fix existing functionality.
+- The code style convention is enforced by clang-format. See below on
+  how to ensure your contributions conform. In general, please follow
+ the existing conventions in the relevant file, submodule, module,
+ and project when you add new code or when you extend/fix existing
+ functionality.
- Avoid introducing unnecessary complexity into existing code so that
maintainability and readability are preserved.
@@ -54,10 +84,10 @@
- Make sure all `L0_*` tests pass:
- In the `qa/` directory, there are basic sanity tests scripted in
- directories named `L0_...`. See the Testing section in the
- Developer Guide for instructions on running these tests.
+ directories named `L0_...`. See the [Test](docs/customization_guide/test.md)
+ documentation for instructions on running these tests.
-- TensorRT Inference Server's default build assumes recent versions of
+- Triton Inference Server's default build assumes recent versions of
dependencies (CUDA, TensorFlow, PyTorch, TensorRT,
etc.). Contributions that add compatibility with older versions of
those dependencies will be considered, but NVIDIA cannot guarantee
@@ -66,64 +96,32 @@
- Make sure that you can contribute your work to open source (no
license and/or patent conflict is introduced by your code). You need
- to [`sign`](#Sign) your commit.
+ to complete the CLA described below before your PR can be merged.
- Thanks in advance for your patience as we review your contributions;
we do appreciate them!
-Sign Your Work
---------------
-
-We require that all contributors "sign-off" on their commits. This
-certifies that the contribution is your original work, or you have
-rights to submit it under the same license, or a compatible license.
-
-Any contribution which contains commits that are not Signed-Off will
-not be accepted.
-
-To sign off on a commit you simply use the `--signoff` (or `-s`)
-option when committing your changes:
-
- $ git commit -s -m "Add cool feature."
-
-This will append the following to your commit message:
-
- Signed-off-by: Your Name
-
-By doing this you certify the below:
-
- Developer Certificate of Origin
- Version 1.1
-
- Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
- 1 Letterman Drive
- Suite D4700
- San Francisco, CA, 94129
-
- Everyone is permitted to copy and distribute verbatim copies of
- this license document, but changing it is not allowed.
-
-
- Developer's Certificate of Origin 1.1
-
- By making a contribution to this project, I certify that:
-
- (a) The contribution was created in whole or in part by me and I
- have the right to submit it under the open source license
- indicated in the file; or
-
- (b) The contribution is based upon previous work that, to the best
- of my knowledge, is covered under an appropriate open source
- license and I have the right under that license to submit that
- work with modifications, whether created in whole or in part by
- me, under the same open source license (unless I am permitted to
- submit under a different license), as indicated in the file; or
-
- (c) The contribution was provided directly to me by some other
- person who certified (a), (b) or (c) and I have not modified it.
-
- (d) I understand and agree that this project and the contribution
- are public and that a record of the contribution (including all
- personal information I submit with it, including my sign-off) is
- maintained indefinitely and may be redistributed consistent with
- this project or the open source license(s) involved.
+# Coding Convention
+
+All pull requests are checked against the
+[pre-commit hooks](https://github.com/pre-commit/pre-commit-hooks)
+located [in the repository's top-level .pre-commit-config.yaml](https://github.com/NVIDIA/triton-inference-server/blob/master/.pre-commit-config.yaml).
+The hooks do some sanity checking like linting and formatting.
+These checks must pass to merge a change.
+
+To run these locally, you can
+[install pre-commit](https://pre-commit.com/#install),
+then run `pre-commit install` inside the cloned repo. When you
+commit a change, the pre-commit hooks will run automatically.
+If a hook applies a fix, stage the modified files again and run
+`git commit` a second time; that second commit will pass.
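+
+As a sketch, a typical local workflow (the commit message below is
+illustrative) looks like:
+
+    $ pip install pre-commit
+    $ pre-commit install
+    $ git commit -m "Add cool feature."   # hooks run automatically
+    $ git add -u                          # re-stage files a hook may have fixed
+    $ git commit -m "Add cool feature."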
+
+# Contributor License Agreement (CLA)
+
+Triton requires that all contributors (or their corporate entity) send
+a signed copy of the [Contributor License
+Agreement](https://github.com/NVIDIA/triton-inference-server/blob/master/Triton-CCLA-v1.pdf)
+to triton-cla@nvidia.com.
+*NOTE*: Contributors with no company affiliation can fill `N/A` in the
+`Corporation Name` and `Corporation Address` fields.
diff --git a/Dockerfile b/Dockerfile
deleted file mode 100644
index ff42f64f4a..0000000000
--- a/Dockerfile
+++ /dev/null
@@ -1,269 +0,0 @@
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-# * Neither the name of NVIDIA CORPORATION nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-#
-# Multistage build.
-#
-
-ARG BASE_IMAGE=nvcr.io/nvidia/tensorrtserver:18.11-py3
-ARG PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:18.11-py3
-ARG TENSORFLOW_IMAGE=nvcr.io/nvidia/tensorflow:18.11-py3
-
-############################################################################
-## Caffe2 stage: Use PyTorch container to get Caffe2 backend
-############################################################################
-FROM ${PYTORCH_IMAGE} AS trtserver_caffe2
-
-ARG BUILD_CLIENTS_ONLY=0
-
-# We cannot just pull libraries from the PyTorch container... we need
-# to:
-# - copy over netdef_bundle_c2 interface so it can build with other
-# C2 sources
-# - need to patch to delegate logging to the inference server.
-
-# Copy netdef_bundle_c2 into Caffe2 core so it builds into the
-# libcaffe2 library. We want netdef_bundle_c2 to build against the
-# Caffe2 protobuf since it interfaces with that code.
-COPY src/servables/caffe2/netdef_bundle_c2.* \
- /opt/pytorch/pytorch/caffe2/core/
-
-# Modify the C2 logging library to delegate logging to the trtserver
-# logger. Use a checksum to detect if the C2 logging file has
-# changed... if it has need to verify our patch is still valid and
-# update the patch/checksum as necessary.
-COPY tools/patch/caffe2 /tmp/patch/caffe2
-RUN sha1sum -c /tmp/patch/caffe2/checksums && \
- patch -i /tmp/patch/caffe2/core/logging.cc \
- /opt/pytorch/pytorch/caffe2/core/logging.cc && \
- patch -i /tmp/patch/caffe2/core/logging_is_not_google_glog.h \
- /opt/pytorch/pytorch/caffe2/core/logging_is_not_google_glog.h && \
- patch -i /tmp/patch/caffe2/core/context_gpu.cu \
- /opt/pytorch/pytorch/caffe2/core/context_gpu.cu
-
-# Build same as in pytorch container... except for the NO_DISTRIBUTED
-# line where we turn off features not needed for trtserver
-WORKDIR /opt/pytorch
-RUN pip uninstall -y torch
-RUN bash -c 'if [ "$BUILD_CLIENTS_ONLY" != "1" ]; then \
- cd pytorch && \
- TORCH_CUDA_ARCH_LIST="5.2 6.0 6.1 7.0 7.5+PTX" \
- CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \
- NCCL_INCLUDE_DIR="/usr/include/" \
- NCCL_LIB_DIR="/usr/lib/" \
- NO_DISTRIBUTED=1 NO_TEST=1 NO_MIOPEN=1 USE_OPENCV=OFF USE_LEVELDB=OFF \
- python setup.py install && python setup.py clean; \
- else \
- mkdir -p /opt/conda/lib/python3.6/site-packages/torch/lib; \
- mkdir -p /opt/conda/lib; \
- touch /opt/conda/lib/python3.6/site-packages/torch/lib/libcaffe2_detectron_ops_gpu.so; \
- touch /opt/conda/lib/python3.6/site-packages/torch/lib/libcaffe2.so; \
- touch /opt/conda/lib/python3.6/site-packages/torch/lib/libcaffe2_gpu.so; \
- touch /opt/conda/lib/python3.6/site-packages/torch/lib/libc10.so; \
- touch /opt/conda/lib/libmkl_avx2.so; \
- touch /opt/conda/lib/libmkl_core.so; \
- touch /opt/conda/lib/libmkl_def.so; \
- touch /opt/conda/lib/libmkl_gnu_thread.so; \
- touch /opt/conda/lib/libmkl_intel_lp64.so; fi'
-
-############################################################################
-## Build stage: Build inference server based on TensorFlow container
-############################################################################
-FROM ${TENSORFLOW_IMAGE} AS trtserver_build
-
-ARG TRTIS_VERSION=0.10.0dev
-ARG TRTIS_CONTAINER_VERSION=19.01dev
-ARG PYVER=3.5
-ARG BUILD_CLIENTS_ONLY=0
-
-# The TFServing release branch must match the TF release used by
-# TENSORFLOW_IMAGE
-ARG TFS_BRANCH=r1.12
-
-RUN apt-get update && \
- apt-get install -y --no-install-recommends \
- automake \
- libcurl3-dev \
- libopencv-dev \
- libopencv-core-dev \
- libtool
-
-RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
- python$PYVER get-pip.py && \
- rm get-pip.py
-
-RUN pip install --upgrade setuptools
-
-# Caffe2 library requirements...
-COPY --from=trtserver_caffe2 \
- /opt/conda/lib/python3.6/site-packages/torch/lib/libcaffe2_detectron_ops_gpu.so \
- /opt/tensorrtserver/lib/
-COPY --from=trtserver_caffe2 \
- /opt/conda/lib/python3.6/site-packages/torch/lib/libcaffe2.so \
- /opt/tensorrtserver/lib/
-COPY --from=trtserver_caffe2 \
- /opt/conda/lib/python3.6/site-packages/torch/lib/libcaffe2_gpu.so \
- /opt/tensorrtserver/lib/
-COPY --from=trtserver_caffe2 \
- /opt/conda/lib/python3.6/site-packages/torch/lib/libc10.so \
- /opt/tensorrtserver/lib/
-COPY --from=trtserver_caffe2 /opt/conda/lib/libmkl_avx2.so /opt/tensorrtserver/lib/
-COPY --from=trtserver_caffe2 /opt/conda/lib/libmkl_core.so /opt/tensorrtserver/lib/
-COPY --from=trtserver_caffe2 /opt/conda/lib/libmkl_def.so /opt/tensorrtserver/lib/
-COPY --from=trtserver_caffe2 /opt/conda/lib/libmkl_gnu_thread.so /opt/tensorrtserver/lib/
-COPY --from=trtserver_caffe2 /opt/conda/lib/libmkl_intel_lp64.so /opt/tensorrtserver/lib/
-
-# Copy entire repo into container even though some is not needed for
-# build itself... because we want to be able to copyright check on
-# files that aren't directly needed for build.
-WORKDIR /workspace
-RUN rm -fr *
-COPY . .
-
-# Pull the TFS release that matches the version of TF being used.
-RUN git clone --single-branch -b ${TFS_BRANCH} https://github.com/tensorflow/serving.git
-
-# Modify the TF logging library to delegate logging to the trtserver
-# logger. Use a checksum to detect if the TF logging file has
-# changed... if it has need to verify our patch is still valid and
-# update the patch/checksum as necessary.
-RUN sha1sum -c tools/patch/tensorflow/checksums && \
- patch -i tools/patch/tensorflow/cc/saved_model/loader.cc \
- /opt/tensorflow/tensorflow/cc/saved_model/loader.cc && \
- patch -i tools/patch/tensorflow/core/platform/default/logging.cc \
- /opt/tensorflow/tensorflow/core/platform/default/logging.cc
-
-# TFS modifications. Use a checksum to detect if the TFS file has
-# changed... if it has need to verify our patch is still valid and
-# update the patch/checksum as necessary.
-RUN sha1sum -c tools/patch/tfs/checksums && \
- patch -i tools/patch/tfs/model_servers/server_core.cc \
- /workspace/serving/tensorflow_serving/model_servers/server_core.cc && \
- patch -i tools/patch/tfs/sources/storage_path/file_system_storage_path_source.cc \
- /workspace/serving/tensorflow_serving/sources/storage_path/file_system_storage_path_source.cc && \
- patch -i tools/patch/tfs/sources/storage_path/file_system_storage_path_source.h \
- /workspace/serving/tensorflow_serving/sources/storage_path/file_system_storage_path_source.h && \
- patch -i tools/patch/tfs/sources/storage_path/file_system_storage_path_source.proto \
- /workspace/serving/tensorflow_serving/sources/storage_path/file_system_storage_path_source.proto && \
- patch -i tools/patch/tfs/util/retrier.cc \
- /workspace/serving/tensorflow_serving/util/retrier.cc && \
- patch -i tools/patch/tfs/util/BUILD \
- /workspace/serving/tensorflow_serving/util/BUILD && \
- patch -i tools/patch/tfs/util/net_http/server/internal/evhttp_request.cc \
- /workspace/serving/tensorflow_serving/util/net_http/server/internal/evhttp_request.cc && \
- patch -i tools/patch/tfs/util/net_http/server/internal/evhttp_request.h \
- /workspace/serving/tensorflow_serving/util/net_http/server/internal/evhttp_request.h && \
- patch -i tools/patch/tfs/util/net_http/server/public/BUILD \
- /workspace/serving/tensorflow_serving/util/net_http/server/public/BUILD && \
- patch -i tools/patch/tfs/util/net_http/server/public/server_request_interface.h \
- /workspace/serving/tensorflow_serving/util/net_http/server/public/server_request_interface.h && \
- patch -i tools/patch/tfs/workspace.bzl \
- /workspace/serving/tensorflow_serving/workspace.bzl
-
-ENV TF_NEED_GCP 1
-ENV TF_NEED_S3 1
-
-# Build the server, clients and any testing artifacts
-RUN (cd /opt/tensorflow && ./nvbuild.sh --python$PYVER --configonly) && \
- (cd tools && mv bazel.rc bazel.orig && \
- cat bazel.orig /opt/tensorflow/.tf_configure.bazelrc > bazel.rc) && \
- bash -c 'if [ "$BUILD_CLIENTS_ONLY" != "1" ]; then \
- bazel build -c opt --config=cuda src/servers/trtserver src/clients/... src/test/...; \
- else \
- bazel build -c opt src/clients/...; \
- fi' && \
- (cd /opt/tensorrtserver && ln -s /workspace/qa qa) && \
- mkdir -p /opt/tensorrtserver/bin && \
- cp bazel-bin/src/clients/c++/image_client /opt/tensorrtserver/bin/. && \
- cp bazel-bin/src/clients/c++/perf_client /opt/tensorrtserver/bin/. && \
- cp bazel-bin/src/clients/c++/simple_client /opt/tensorrtserver/bin/. && \
- mkdir -p /opt/tensorrtserver/lib && \
- cp bazel-bin/src/clients/c++/librequest.so /opt/tensorrtserver/lib/. && \
- cp bazel-bin/src/clients/c++/librequest.a /opt/tensorrtserver/lib/. && \
- mkdir -p /opt/tensorrtserver/pip && \
- bazel-bin/src/clients/python/build_pip /opt/tensorrtserver/pip/. && \
- bash -c 'if [ "$BUILD_CLIENTS_ONLY" != "1" ]; then \
- cp bazel-bin/src/servers/trtserver /opt/tensorrtserver/bin/.; \
- cp bazel-bin/src/test/caffe2plan /opt/tensorrtserver/bin/.; \
- fi' && \
- bazel clean --expunge && \
- rm -rf /root/.cache/bazel && \
- rm -rf /tmp/*
-
-ENV TENSORRT_SERVER_VERSION ${TRTIS_VERSION}
-ENV NVIDIA_TENSORRT_SERVER_VERSION ${TRTIS_CONTAINER_VERSION}
-ENV PYVER ${PYVER}
-
-COPY nvidia_entrypoint.sh /opt/tensorrtserver
-ENTRYPOINT ["/opt/tensorrtserver/nvidia_entrypoint.sh"]
-
-############################################################################
-## Production stage: Create container with just inference server executable
-############################################################################
-FROM ${BASE_IMAGE}
-
-ARG TRTIS_VERSION=0.10.0dev
-ARG TRTIS_CONTAINER_VERSION=19.01dev
-ARG PYVER=3.5
-
-ENV TENSORRT_SERVER_VERSION ${TRTIS_VERSION}
-ENV NVIDIA_TENSORRT_SERVER_VERSION ${TRTIS_CONTAINER_VERSION}
-LABEL com.nvidia.tensorrtserver.version="${TENSORRT_SERVER_VERSION}"
-
-ENV LD_LIBRARY_PATH /opt/tensorrtserver/lib:${LD_LIBRARY_PATH}
-ENV PATH /opt/tensorrtserver/bin:${PATH}
-ENV PYVER ${PYVER}
-
-ENV TF_ADJUST_HUE_FUSED 1
-ENV TF_ADJUST_SATURATION_FUSED 1
-ENV TF_ENABLE_WINOGRAD_NONFUSED 1
-ENV TF_AUTOTUNE_THRESHOLD 2
-
-# Create a user that can be used to run the tensorrt-server as
-# non-root. Make sure that this user to given ID 1000.
-ENV TENSORRT_SERVER_USER=tensorrt-server
-RUN id -u $TENSORRT_SERVER_USER > /dev/null 2>&1 || \
- useradd $TENSORRT_SERVER_USER && \
- [ `id -u $TENSORRT_SERVER_USER` -eq 1000 ] && \
- [ `id -g $TENSORRT_SERVER_USER` -eq 1000 ]
-
-WORKDIR /opt/tensorrtserver
-RUN rm -fr /opt/tensorrtserver/*
-COPY LICENSE .
-COPY --from=trtserver_build /workspace/serving/LICENSE LICENSE.tfserving
-COPY --from=trtserver_build /opt/tensorflow/LICENSE LICENSE.tensorflow
-COPY --from=trtserver_caffe2 /opt/pytorch/pytorch/LICENSE LICENSE.pytorch
-COPY --from=trtserver_build /opt/tensorrtserver/bin/trtserver bin/
-COPY --from=trtserver_build /opt/tensorrtserver/lib lib
-
-COPY nvidia_entrypoint.sh /opt/tensorrtserver
-ENTRYPOINT ["/opt/tensorrtserver/nvidia_entrypoint.sh"]
-
-ARG NVIDIA_BUILD_ID
-ENV NVIDIA_BUILD_ID ${NVIDIA_BUILD_ID:-}
-LABEL com.nvidia.build.id="${NVIDIA_BUILD_ID}"
-ARG NVIDIA_BUILD_REF
-LABEL com.nvidia.build.ref="${NVIDIA_BUILD_REF}"
diff --git a/Dockerfile.QA b/Dockerfile.QA
index bb607c3ac0..3e986a9400 100644
--- a/Dockerfile.QA
+++ b/Dockerfile.QA
@@ -1,4 +1,4 @@
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -24,72 +24,374 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#
-# Multistage build.
-#
+ARG BASE_IMAGE=tritonserver
+ARG CIBASE_IMAGE=tritonserver_cibase
+ARG SDK_IMAGE=tritonserver_sdk
+ARG TRITON_REPO_ORGANIZATION=http://github.com/triton-inference-server
+ARG TRITON_COMMON_REPO_TAG=main
+ARG TRITON_CORE_REPO_TAG=main
+ARG TRITON_THIRD_PARTY_REPO_TAG=main
+ARG TRITON_BACKEND_REPO_TAG=main
+ARG TRITONTMP_DIR=/tmp
+ARG IGPU_BUILD=0
+
+############################################################################
+## Test artifacts built as part of the tritonserver build are
+## available in CIBASE_IMAGE. Copy these artifacts into the QA area.
+############################################################################
+FROM ${CIBASE_IMAGE} AS cibase
+
+ARG TRITONTMP_DIR
+ARG TRITON_REPO_ORGANIZATION
+ARG TRITON_COMMON_REPO_TAG
+ARG TRITON_CORE_REPO_TAG
+ARG TRITON_THIRD_PARTY_REPO_TAG
+ARG TRITON_BACKEND_REPO_TAG
+ARG IGPU_BUILD
+
+# Ensure apt-get won't prompt for selecting options
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update && \
+ apt-get install -y --no-install-recommends \
+ build-essential \
+ libarchive-dev \
+ libboost-dev \
+ python3-dev \
+ python3-pip \
+ rapidjson-dev \
+ software-properties-common && \
+ rm -rf /var/lib/apt/lists/*
+
+RUN pip3 install --upgrade pip && \
+ pip3 install --upgrade wheel setuptools
+
+RUN apt update -q=2 \
+ && apt install -y gpg wget \
+ && wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null \
+ && . /etc/os-release \
+ && echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $UBUNTU_CODENAME main" | tee /etc/apt/sources.list.d/kitware.list >/dev/null \
+ && apt-get update -q=2 \
+ && apt-get install -y --no-install-recommends cmake=3.27.7* cmake-data=3.27.7*
+
+# Add inception_graphdef model to example repo
+WORKDIR /workspace/docs/examples/model_repository
+RUN mkdir -p inception_graphdef/1 && \
+ wget -O ${TRITONTMP_DIR}/inception_v3_2016_08_28_frozen.pb.tar.gz \
+ https://storage.googleapis.com/download.tensorflow.org/models/inception_v3_2016_08_28_frozen.pb.tar.gz && \
+ (cd ${TRITONTMP_DIR} && tar xzf inception_v3_2016_08_28_frozen.pb.tar.gz) && \
+ mv ${TRITONTMP_DIR}/inception_v3_2016_08_28_frozen.pb inception_graphdef/1/model.graphdef
+
+# Update the qa/ directory with test executables, models, etc.
+WORKDIR /workspace
+RUN mkdir -p qa/common && \
+ cp -r /workspace/src/test/models/repeat_int32 qa/L0_decoupled/models/ && \
+ cp -r /workspace/src/test/models/square_int32 qa/L0_decoupled/models/ && \
+ mkdir qa/L0_simple_example/models && \
+ cp -r docs/examples/model_repository/simple qa/L0_simple_example/models/. && \
+ mkdir qa/L0_simple_go_client/models && \
+ cp -r docs/examples/model_repository/simple qa/L0_simple_go_client/models/. && \
+ mkdir qa/L0_backend_release/simple_models && \
+ cp -r docs/examples/model_repository/simple qa/L0_backend_release/simple_models/. && \
+ mkdir qa/L0_simple_nodejs_client/models && \
+ cp -r docs/examples/model_repository/simple qa/L0_simple_nodejs_client/models/. && \
+ mkdir qa/L0_backend_release/simple_seq_models && \
+ cp -r /workspace/docs/examples/model_repository/simple_sequence qa/L0_backend_release/simple_seq_models/. && \
+ mkdir qa/L0_shared_memory/models && \
+ cp -r docs/examples/model_repository/simple qa/L0_shared_memory/models/. && \
+ mkdir qa/L0_cuda_shared_memory/models && \
+ cp -r docs/examples/model_repository/simple qa/L0_cuda_shared_memory/models/. && \
+ mkdir qa/L0_client_java/models && \
+ cp -r docs/examples/model_repository/simple qa/L0_client_java/models && \
+ mkdir qa/L0_grpc/models && \
+ cp -r docs/examples/model_repository/simple qa/L0_grpc/models && \
+ cp -r docs/examples/model_repository/simple_dyna_sequence qa/L0_grpc/models && \
+ cp -r docs/examples/model_repository/simple_int8 qa/L0_grpc/models && \
+ cp -r docs/examples/model_repository/simple_identity qa/L0_grpc/models && \
+ cp -r docs/examples/model_repository/simple_sequence qa/L0_grpc/models && \
+ cp -r docs/examples/model_repository/simple_string qa/L0_grpc/models && \
+ cp -r docs/examples/model_repository/inception_graphdef qa/L0_grpc/models && \
+ mkdir qa/L0_grpc_state_cleanup/models && \
+ cp -r /workspace/src/test/models/repeat_int32 qa/L0_grpc_state_cleanup/models/ && \
+ mkdir qa/L0_http/models && \
+ cp -r docs/examples/model_repository/simple qa/L0_http/models && \
+ cp -r docs/examples/model_repository/simple_dyna_sequence qa/L0_http/models && \
+ cp -r docs/examples/model_repository/simple_identity qa/L0_http/models && \
+ cp -r docs/examples/model_repository/simple_sequence qa/L0_http/models && \
+ cp -r docs/examples/model_repository/simple_string qa/L0_http/models && \
+ cp -r docs/examples/model_repository/inception_graphdef qa/L0_http/models && \
+ mkdir qa/L0_https/models && \
+ cp -r docs/examples/model_repository/simple qa/L0_https/models/. && \
+ mkdir qa/L0_secure_grpc/models && \
+ cp -r docs/examples/model_repository/simple qa/L0_secure_grpc/models/. && \
+ cp bin/simple qa/L0_simple_lib/. && \
+ cp bin/memory_alloc qa/L0_io/. && \
+ cp bin/multi_server qa/L0_multi_server/. && \
+ cp bin/memory_test qa/L0_memory/. && \
+ cp bin/pinned_memory_manager_test qa/L0_memory/. && \
+ cp bin/repo_agent_test qa/L0_triton_repo_agent/. && \
+ cp lib/libtritonrepoagent_relocation.so qa/L0_triton_repo_agent/. && \
+ mkdir qa/L0_query/models/query/1 && \
+ cp tritonbuild/tritonserver/backends/query/libtriton_query.so qa/L0_query/models/query/1/. && \
+ cp bin/query_test qa/L0_query/. && \
+ mkdir qa/L0_iterative_sequence/models/iterative_sequence/1 && \
+ cp tritonbuild/tritonserver/backends/iterative_sequence/libtriton_iterative_sequence.so qa/L0_iterative_sequence/models/iterative_sequence/1/. && \
+ cp bin/register_api_test qa/L0_register/. && \
+ cp bin/async_work_queue_test qa/L0_async_work_queue/. && \
+ cp tritonbuild/tritonserver/backends/implicit_state/libtriton_implicit_state.so \
+ qa/L0_implicit_state/. && \
+ mkdir qa/L0_data_compression/models && \
+ cp -r docs/examples/model_repository/simple qa/L0_data_compression/models && \
+ cp bin/data_compressor_test qa/L0_data_compression/. && \
+ cp bin/metrics_api_test qa/L0_metrics/. && \
+ cp bin/response_cache_test qa/L0_response_cache/. && \
+ cp bin/request_cancellation_test qa/L0_request_cancellation/. && \
+ cp bin/triton_json_test qa/L0_json/. && \
+ cp bin/backend_output_detail_test qa/L0_backend_output_detail/. && \
+ cp -r deploy/mlflow-triton-plugin qa/L0_mlflow/.
+
+RUN mkdir -p qa/pkgs && \
+ cp python/triton*.whl qa/pkgs/. && \
+ cp -rf python/test/. qa/L0_python_api/.
+
+# caffe2plan will not exist if the build was done without TensorRT enabled
+RUN if [ -f bin/caffe2plan ]; then \
+ cp bin/caffe2plan qa/common/.; \
+ fi
-ARG BASE_IMAGE=tensorrtserver
-ARG BUILD_IMAGE=tensorrtserver_build
+RUN mkdir -p qa/L0_simple_ensemble/models/simple/1 && \
+ cp docs/examples/model_repository/simple/1/model.graphdef \
+ qa/L0_simple_ensemble/models/simple/1/. && \
+ mkdir -p qa/L0_simple_ensemble/models/simple/2 && \
+ cp docs/examples/model_repository/simple/1/model.graphdef \
+ qa/L0_simple_ensemble/models/simple/2/. && \
+ mkdir -p qa/L0_socket/models/simple/1 && \
+ cp docs/examples/model_repository/simple/1/model.graphdef \
+ qa/L0_socket/models/simple/1/.
+
+RUN mkdir -p qa/L0_backend_identity/models && \
+ cp -r src/test/models/identity_fp32 qa/L0_backend_identity/models/. && \
+ mkdir -p qa/L0_backend_identity/models/identity_fp32/1
+
+RUN mkdir -p qa/custom_models/custom_sequence_int32/1 && \
+ cp tritonbuild/tritonserver/backends/sequence/libtriton_sequence.so \
+ qa/custom_models/custom_sequence_int32/1/. && \
+ mkdir -p qa/custom_models/custom_dyna_sequence_int32/1 && \
+ cp tritonbuild/tritonserver/backends/dyna_sequence/libtriton_dyna_sequence.so \
+ qa/custom_models/custom_dyna_sequence_int32/1/.
+
+# L0_lifecycle needs a no-GPU build of the identity backend.
+RUN cd tritonbuild/identity && \
+ rm -rf install build && mkdir build && cd build && \
+ cmake -DTRITON_ENABLE_GPU=OFF \
+ -DCMAKE_INSTALL_PREFIX:PATH=/workspace/tritonbuild/identity/install \
+ -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \
+ -DTRITON_COMMON_REPO_TAG:STRING=${TRITON_COMMON_REPO_TAG} \
+ -DTRITON_CORE_REPO_TAG:STRING=${TRITON_CORE_REPO_TAG} \
+ -DTRITON_THIRD_PARTY_REPO_TAG:STRING=${TRITON_THIRD_PARTY_REPO_TAG} \
+ -DTRITON_BACKEND_REPO_TAG:STRING=${TRITON_BACKEND_REPO_TAG} .. && \
+ make -j16 install
+
+# L0_backend_python tests require triton_shm_monitor
+RUN cd tritonbuild/python && \
+ rm -rf install build && mkdir build && cd build && \
+ cmake -DCMAKE_INSTALL_PREFIX:PATH=/workspace/tritonbuild/python/install \
+ -DTRITON_REPO_ORGANIZATION:STRING=${TRITON_REPO_ORGANIZATION} \
+ -DTRITON_COMMON_REPO_TAG:STRING=${TRITON_COMMON_REPO_TAG} \
+ -DTRITON_CORE_REPO_TAG:STRING=${TRITON_CORE_REPO_TAG} \
+ -DTRITON_BACKEND_REPO_TAG:STRING=${TRITON_BACKEND_REPO_TAG} .. && \
+ make -j16 triton-shm-monitor install
+
+RUN cp tritonbuild/identity/install/backends/identity/libtriton_identity.so \
+ qa/L0_lifecycle/. && \
+ cp tritonbuild/python/install/backends/python/triton_shm_monitor*.so \
+ qa/common/. && \
+ mkdir -p qa/L0_perf_nomodel/custom_models/custom_zero_1_float32/1 && \
+ mkdir -p qa/L0_perf_pyclients/custom_models/custom_zero_1_int32/1 && \
+ mkdir -p qa/L0_infer_shm && \
+ cp -r qa/L0_infer/. qa/L0_infer_shm && \
+ mkdir -p qa/L0_infer_cudashm && \
+ cp -r qa/L0_infer/. qa/L0_infer_cudashm && \
+ mkdir -p qa/L0_infer_valgrind && \
+ cp -r qa/L0_infer/. qa/L0_infer_valgrind && \
+ mkdir -p qa/L0_trt_shape_tensors_shm && \
+ cp -r qa/L0_trt_shape_tensors/. qa/L0_trt_shape_tensors_shm && \
+ mkdir -p qa/L0_trt_shape_tensors_cudashm && \
+ cp -r qa/L0_trt_shape_tensors/. qa/L0_trt_shape_tensors_cudashm && \
+ mkdir -p qa/L0_batcher_shm && \
+ cp -r qa/L0_batcher/. qa/L0_batcher_shm && \
+ mkdir -p qa/L0_batcher_cudashm && \
+ cp -r qa/L0_batcher/. qa/L0_batcher_cudashm && \
+ mkdir -p qa/L0_batcher_valgrind && \
+ cp -r qa/L0_batcher/. qa/L0_batcher_valgrind && \
+ mkdir -p qa/L0_sequence_batcher_shm && \
+ cp -r qa/L0_sequence_batcher/. qa/L0_sequence_batcher_shm && \
+ mkdir -p qa/L0_sequence_batcher_cudashm && \
+ cp -r qa/L0_sequence_batcher/. qa/L0_sequence_batcher_cudashm && \
+ mkdir -p qa/L0_sequence_batcher_valgrind && \
+ cp -r qa/L0_sequence_batcher/. qa/L0_sequence_batcher_valgrind && \
+ mkdir -p qa/L0_perf_nomodel_shm && \
+ cp -r qa/L0_perf_nomodel/. qa/L0_perf_nomodel_shm && \
+ mkdir -p qa/L0_perf_nomodel_cudashm && \
+ cp -r qa/L0_perf_nomodel/. qa/L0_perf_nomodel_cudashm
+
+# L0_model_control_stress will not be present if GitLab tests are not available
+RUN if [ -d qa/L0_model_control_stress ]; then \
+ mkdir -p qa/L0_model_control_stress_valgrind && \
+ cp -r qa/L0_model_control_stress/. qa/L0_model_control_stress_valgrind && \
+ mkdir -p qa/L0_model_control_stress_valgrind_massif && \
+ cp -r qa/L0_model_control_stress/. qa/L0_model_control_stress_valgrind_massif; \
+ fi
+
+RUN mkdir -p qa/L0_decoupled/models/repeat_int32/1 && \
+ mkdir -p qa/L0_decoupled/models/square_int32/1 && \
+ mkdir -p qa/L0_decoupled/models/identity_int32/1 && \
+ mkdir -p qa/L0_decoupled/models/simple_repeat/1 && \
+ mkdir -p qa/L0_decoupled/models/fan_repeat/1 && \
+ mkdir -p qa/L0_decoupled/models/sequence_repeat/1 && \
+ mkdir -p qa/L0_decoupled/models/repeat_square/1 && \
+ mkdir -p qa/L0_decoupled/models/nested_square/1 && \
+ mkdir -p qa/L0_grpc_state_cleanup/models/repeat_int32/1
+
+RUN if [ "$IGPU_BUILD" == "0" ]; then \
+ cp backends/repeat/libtriton_repeat.so qa/L0_model_config && \
+ cp backends/repeat/libtriton_repeat.so qa/L0_decoupled/models/repeat_int32/1 && \
+ cp backends/repeat/libtriton_repeat.so qa/L0_grpc_state_cleanup/models/repeat_int32/1/. && \
+ cp backends/square/libtriton_square.so qa/L0_decoupled/models/square_int32/1; \
+ fi
+
+RUN cp -r qa/L0_decoupled/models qa/L0_decoupled/python_models/ && \
+ cp /workspace/tritonbuild/python/examples/decoupled/repeat_model.py \
+ qa/L0_decoupled/python_models/repeat_int32/1/. && \
+ cp /workspace/tritonbuild/python/examples/decoupled/repeat_config.pbtxt \
+ qa/L0_decoupled/python_models/repeat_int32/. && \
+ cp /workspace/tritonbuild/python/examples/decoupled/square_model.py \
+ qa/L0_decoupled/python_models/square_int32/1/. && \
+ cp /workspace/tritonbuild/python/examples/decoupled/square_config.pbtxt \
+ qa/L0_decoupled/python_models/square_int32/.
+
+RUN mkdir -p qa/L0_repoagent_checksum/models/identity_int32/1 && \
+ cp tritonbuild/identity/install/backends/identity/libtriton_identity.so \
+ qa/L0_repoagent_checksum/models/identity_int32/1/.
+RUN mkdir -p qa/L0_passive_instance/models/distributed_int32_int32_int32/1 && \
+ cp tritonbuild/tritonserver/backends/distributed_addsub/libtriton_distributed_addsub.so \
+ qa/L0_passive_instance/models/distributed_int32_int32_int32/1/.
############################################################################
-## Build necessary artifacts needed for CI and initialize the qa/ directory.
+## Copy artifacts from sdk container
############################################################################
-FROM ${BUILD_IMAGE} AS trtserver_qa
+FROM ${SDK_IMAGE} AS sdk
+ARG TARGETPLATFORM
WORKDIR /workspace
+COPY --from=cibase /workspace/qa/ qa/
RUN mkdir -p qa/clients && mkdir -p qa/pkgs && \
- cp src/clients/python/grpc_image_client.py qa/clients/. && \
- cp src/clients/python/image_client.py qa/clients/. && \
- cp src/clients/python/simple_client.py qa/clients/. && \
- cp /opt/tensorrtserver/bin/image_client qa/clients/. && \
- cp /opt/tensorrtserver/bin/perf_client qa/clients/. && \
- cp /opt/tensorrtserver/bin/simple_client qa/clients/. && \
- cp /opt/tensorrtserver/bin/caffe2plan qa/common/. && \
- cp /opt/tensorrtserver/pip/tensorrtserver*.whl qa/pkgs/. && \
- mkdir qa/L0_simple_example/models && \
- cp -r docs/examples/model_repository/simple qa/L0_simple_example/models/.
+ cp -a install/bin/* qa/clients/. && \
+ cp install/lib/libgrpcclient.so qa/clients/. && \
+ cp install/lib/libhttpclient.so qa/clients/. && \
+ cp install/python/*.py qa/clients/. && \
+ cp install/python/triton*.whl qa/pkgs/. && \
+ cp install/java/examples/*.jar qa/clients/.
+RUN cp client/src/grpc_generated/go/*.go qa/L0_simple_go_client/. && \
+ cp client/src/grpc_generated/javascript/*.js qa/L0_simple_nodejs_client/. && \
+ cp client/src/grpc_generated/javascript/*.json qa/L0_simple_nodejs_client/. && \
+ cp -r client/src/grpc_generated/java qa/L0_client_java/.
############################################################################
## Create CI enabled image
############################################################################
FROM $BASE_IMAGE
-ARG PYVER=3.5
+ARG TARGETPLATFORM
+
+# Ensure apt-get won't prompt for selecting options
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install platform-specific packages
+RUN if [ $(cat /etc/os-release | grep 'VERSION_ID="20.04"' | wc -l) -ne 0 ]; then \
+ apt-get update && \
+ apt-get install -y --no-install-recommends \
+ libpng-dev; \
+ elif [ $(cat /etc/os-release | grep 'VERSION_ID="22.04"' | wc -l) -ne 0 ]; then \
+ apt-get update && \
+ apt-get install -y --no-install-recommends \
+ libpng-dev; \
+ elif [ $(cat /etc/os-release | grep 'VERSION_ID="18.04"' | wc -l) -ne 0 ]; then \
+ apt-get update && \
+ apt-get install -y --no-install-recommends \
+ libpng-dev; \
+ else \
+ echo "Ubuntu version must be either 18.04, 20.04 or 22.04" && \
+ exit 1; \
+ fi
+# CI/QA for memcheck requires valgrind
+# libarchive-dev is required by the Python backend
RUN apt-get update && apt-get install -y --no-install-recommends \
- jmeter \
- jmeter-http \
- libcurl3 \
+ curl \
+ gdb \
libopencv-dev \
+ libarchive-dev \
libopencv-core-dev \
- libpng12-dev \
libzmq3-dev \
- python$PYVER \
- python$PYVER-dev \
- python$PYVER-numpy \
- python`echo $PYVER | cut -c1-1`-pil \
- python-protobuf \
- swig && \
+ maven \
+ openjdk-11-jdk \
+ nginx \
+ npm \
+ protobuf-compiler \
+ python3-dev \
+ python3-pip \
+ python3-protobuf \
+ python3-setuptools \
+ swig \
+ valgrind && \
rm -rf /var/lib/apt/lists/*
-# Use the PYVER version of python
+# CI/QA expects "python" executable (not python3).
RUN rm -f /usr/bin/python && \
- rm -f /usr/bin/python`echo $PYVER | cut -c1-1` && \
- ln -s /usr/bin/python$PYVER /usr/bin/python && \
- ln -s /usr/bin/python$PYVER /usr/bin/python`echo $PYVER | cut -c1-1`
+ ln -s /usr/bin/python3 /usr/bin/python
-RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
- python$PYVER get-pip.py && \
- rm get-pip.py
-RUN pip install --upgrade numpy future grpcio
+RUN pip3 install --upgrade wheel setuptools && \
+ pip3 install --upgrade numpy pillow attrdict future grpcio requests gsutil \
+ awscli six grpcio-channelz prettytable virtualenv \
+ check-jsonschema
-# CI expects tests in /opt/tensorrtserver/qa
-WORKDIR /opt/tensorrtserver
-COPY --from=trtserver_qa /workspace/qa/ qa/
+# Go is needed for the example Go client test.
+RUN if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+ wget https://golang.org/dl/go1.19.1.linux-arm64.tar.gz && \
+ rm -rf /usr/local/go && tar -C /usr/local -xzf go1.19.1.linux-arm64.tar.gz && \
+ rm -f go1.19.1.linux-arm64.tar.gz; \
+ else \
+ wget https://golang.org/dl/go1.19.1.linux-amd64.tar.gz && \
+ rm -rf /usr/local/go && tar -C /usr/local -xzf go1.19.1.linux-amd64.tar.gz && \
+ rm -f go1.19.1.linux-amd64.tar.gz; \
+ fi
+ENV GOPATH /root/go
+ENV PATH $PATH:/usr/local/go/bin:$GOPATH/bin
+RUN GO111MODULE=off go get github.com/golang/protobuf/protoc-gen-go && \
+ GO111MODULE=off go get google.golang.org/grpc
+
+# CI expects tests in /opt/tritonserver/qa. The triton-server (1000)
+# user should own all artifacts in case CI is run as the triton-server
+# user.
+WORKDIR /opt/tritonserver
+COPY --chown=1000:1000 --from=sdk /workspace/qa/ qa/
# Remove CI tests that are meant to run only on build image and
-# install the tensorrtserver python client APIs.
-RUN rm -fr qa/L0_copyrights qa/L0_unit_test qa/L1_tfs_unit_test && \
- pip install --upgrade qa/pkgs/tensorrtserver-*.whl
+# install the tritonserver/triton python client APIs.
+RUN rm -fr qa/L0_copyrights qa/L0_build_variants && \
+ find qa/pkgs/ -maxdepth 1 -type f -name \
+ "tritonclient-*linux*.whl" | xargs printf -- '%s[all]' | \
+ xargs pip3 install --upgrade
+
+# Install Triton Python API
+RUN find qa/pkgs/ -maxdepth 1 -type f -name \
+ "tritonserver-*.whl" | xargs -I {} pip3 install --upgrade {}[all]
+
+ENV LD_LIBRARY_PATH /opt/tritonserver/qa/clients:${LD_LIBRARY_PATH}
+
+# DLIS-3631: Needed to run Perf Analyzer CI tests correctly
+ENV LD_LIBRARY_PATH /opt/hpcx/ompi/lib:${LD_LIBRARY_PATH}
-ENV PYVER ${PYVER}
+# Required for PyTorch to pick up the correct HPCX libraries
+ENV LD_LIBRARY_PATH /opt/hpcx/ucc/lib/:/opt/hpcx/ucx/lib/:${LD_LIBRARY_PATH}
diff --git a/Dockerfile.sdk b/Dockerfile.sdk
new file mode 100644
index 0000000000..7ae8cf0ee8
--- /dev/null
+++ b/Dockerfile.sdk
@@ -0,0 +1,256 @@
+# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#
+# Multistage build.
+#
+
+# Base the image on the minimal Triton container
+ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.03-py3-min
+
+ARG TRITON_CLIENT_REPO_SUBDIR=clientrepo
+ARG TRITON_COMMON_REPO_TAG=main
+ARG TRITON_CORE_REPO_TAG=main
+ARG TRITON_THIRD_PARTY_REPO_TAG=main
+ARG TRITON_MODEL_ANALYZER_REPO_TAG=main
+ARG TRITON_ENABLE_GPU=ON
+ARG JAVA_BINDINGS_MAVEN_VERSION=3.8.4
+ARG JAVA_BINDINGS_JAVACPP_PRESETS_TAG=1.5.8
+
+# DCGM version to install for Model Analyzer
+ARG DCGM_VERSION=3.2.6
+
+ARG NVIDIA_TRITON_SERVER_SDK_VERSION=unknown
+ARG NVIDIA_BUILD_ID=unknown
+
+############################################################################
+## Build image
+############################################################################
+
+FROM ${BASE_IMAGE} AS sdk_build
+
+# Ensure apt-get won't prompt for selecting options
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update && \
+ apt-get install -y --no-install-recommends \
+ ca-certificates \
+ software-properties-common \
+ autoconf \
+ automake \
+ build-essential \
+ curl \
+ git \
+ gperf \
+ libb64-dev \
+ libgoogle-perftools-dev \
+ libopencv-dev \
+ libopencv-core-dev \
+ libssl-dev \
+ libtool \
+ pkg-config \
+ python3 \
+ python3-pip \
+ python3-dev \
+ rapidjson-dev \
+ vim \
+ wget \
+ python3-pdfkit \
+ openjdk-11-jdk \
+ maven && \
+ pip3 install --upgrade wheel setuptools && \
+ pip3 install --upgrade grpcio-tools && \
+ pip3 install --upgrade pip
+
+# The client build requires a recent version of CMake (FetchContent support is required)
+# Using the CMake installation instructions from: https://apt.kitware.com/
+RUN apt update -q=2 \
+ && apt install -y gpg wget \
+ && wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null \
+ && . /etc/os-release \
+ && echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $UBUNTU_CODENAME main" | tee /etc/apt/sources.list.d/kitware.list >/dev/null \
+ && apt-get update -q=2 \
+ && apt-get install -y --no-install-recommends cmake=3.27.7* cmake-data=3.27.7* \
+ && cmake --version
+
+# Build expects "python" executable (not python3).
+RUN rm -f /usr/bin/python && \
+ ln -s /usr/bin/python3 /usr/bin/python
+
+# Build the client library and examples
+ARG TRITON_REPO_ORGANIZATION
+ARG TRITON_CLIENT_REPO_SUBDIR
+ARG TRITON_COMMON_REPO_TAG
+ARG TRITON_CORE_REPO_TAG
+ARG TRITON_THIRD_PARTY_REPO_TAG
+ARG TRITON_ENABLE_GPU
+ARG JAVA_BINDINGS_MAVEN_VERSION
+ARG JAVA_BINDINGS_JAVACPP_PRESETS_TAG
+ARG TARGETPLATFORM
+
+WORKDIR /workspace
+COPY TRITON_VERSION .
+COPY ${TRITON_CLIENT_REPO_SUBDIR} client
+
+WORKDIR /workspace/build
+RUN cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \
+ -DTRITON_VERSION=`cat /workspace/TRITON_VERSION` \
+ -DTRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION} \
+ -DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \
+ -DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \
+ -DTRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG} \
+ -DTRITON_ENABLE_CC_HTTP=ON -DTRITON_ENABLE_CC_GRPC=ON \
+ -DTRITON_ENABLE_PYTHON_HTTP=ON -DTRITON_ENABLE_PYTHON_GRPC=ON \
+ -DTRITON_ENABLE_JAVA_HTTP=ON \
+ -DTRITON_ENABLE_PERF_ANALYZER=ON \
+ -DTRITON_ENABLE_PERF_ANALYZER_C_API=ON \
+ -DTRITON_ENABLE_PERF_ANALYZER_TFS=ON \
+ -DTRITON_ENABLE_PERF_ANALYZER_TS=ON \
+ -DTRITON_ENABLE_PERF_ANALYZER_OPENAI=ON \
+ -DTRITON_ENABLE_EXAMPLES=ON -DTRITON_ENABLE_TESTS=ON \
+ -DTRITON_ENABLE_GPU=${TRITON_ENABLE_GPU} /workspace/client
+RUN make -j16 cc-clients python-clients java-clients && \
+ rm -fr ~/.m2
+
+# Install Java API Bindings
+RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
+ source /workspace/client/src/java-api-bindings/scripts/install_dependencies_and_build.sh \
+ --maven-version ${JAVA_BINDINGS_MAVEN_VERSION} \
+ --core-tag ${TRITON_CORE_REPO_TAG} \
+ --javacpp-tag ${JAVA_BINDINGS_JAVACPP_PRESETS_TAG} \
+ --jar-install-path /workspace/install/java-api-bindings; \
+ fi
+
+RUN pip3 install build \
+ && cd /workspace/client/src/c++/perf_analyzer/genai-perf \
+ && python3 -m build --wheel --outdir /workspace/install/python
+############################################################################
+## Create sdk container
+############################################################################
+FROM ${BASE_IMAGE}
+
+# Ensure apt-get won't prompt for selecting options
+ENV DEBIAN_FRONTEND=noninteractive
+
+ARG DCGM_VERSION
+ARG TRITON_REPO_ORGANIZATION
+ARG TRITON_CORE_REPO_TAG
+ARG TARGETPLATFORM
+ARG TRITON_ENABLE_GPU
+
+RUN apt-get update && \
+ apt-get install -y --no-install-recommends \
+ software-properties-common \
+ curl \
+ git \
+ gperf \
+ libb64-dev \
+ libgoogle-perftools-dev \
+ libopencv-dev \
+ libopencv-core-dev \
+ libssl-dev \
+ libtool \
+ python3 \
+ python3-pip \
+ python3-dev \
+ vim \
+ wget \
+ python3-pdfkit \
+ maven \
+ default-jdk && \
+ pip3 install --upgrade wheel setuptools && \
+ pip3 install --upgrade grpcio-tools && \
+ pip3 install --upgrade pip
+
+WORKDIR /workspace
+COPY TRITON_VERSION .
+COPY NVIDIA_Deep_Learning_Container_License.pdf .
+COPY --from=sdk_build /workspace/client/ client/
+COPY --from=sdk_build /workspace/install/ install/
+RUN cd install && \
+ export VERSION=`cat /workspace/TRITON_VERSION` && \
+ tar zcf /workspace/v$VERSION.clients.tar.gz *
+
+# For CI testing we need to copy over the L0_sdk and L0_client_build_variants tests.
+RUN mkdir qa
+COPY qa/L0_sdk qa/L0_sdk
+COPY qa/L0_client_build_variants qa/L0_client_build_variants
+
+# Create a directory for all the Python client tests to enable unit testing
+RUN mkdir -p qa/python_client_unit_tests/
+COPY --from=sdk_build /workspace/client/src/python/library/tests/* qa/python_client_unit_tests/
+
+# Copy an image needed by the quickstart and other documentation.
+COPY qa/images/mug.jpg images/mug.jpg
+
+# Install the dependencies needed to run the client examples. These
+# are not needed for building but including them allows this image to
+# be used to run the client examples.
+RUN pip3 install --upgrade numpy pillow attrdict && \
+ find install/python/ -maxdepth 1 -type f -name \
+ "tritonclient-*linux*.whl" | xargs printf -- '%s[all]' | \
+ xargs pip3 install --upgrade
+
+RUN pip3 install install/python/genai_perf-*.whl
+
+# Install DCGM
+RUN if [ "$TRITON_ENABLE_GPU" = "ON" ]; then \
+ [ "$(uname -m)" != "x86_64" ] && arch="sbsa" || arch="x86_64" && \
+ curl -o /tmp/cuda-keyring.deb \
+ https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/$arch/cuda-keyring_1.0-1_all.deb \
+ && apt install /tmp/cuda-keyring.deb && rm /tmp/cuda-keyring.deb && \
+ apt-get update && apt-get install -y datacenter-gpu-manager=1:${DCGM_VERSION}; \
+ fi
+
+# Build expects "python" executable (not python3).
+RUN rm -f /usr/bin/python && \
+ ln -s /usr/bin/python3 /usr/bin/python
+
+# Install Model Analyzer
+ARG TRITON_MODEL_ANALYZER_REPO_TAG
+ARG TRITON_MODEL_ANALYZER_REPO="${TRITON_REPO_ORGANIZATION}/model_analyzer@${TRITON_MODEL_ANALYZER_REPO_TAG}"
+RUN pip3 install "git+${TRITON_MODEL_ANALYZER_REPO}"
+
+# Entrypoint Banner
+ENV NVIDIA_PRODUCT_NAME="Triton Server SDK"
+COPY docker/entrypoint.d/ /opt/nvidia/entrypoint.d/
+RUN sed 's/Server/Server SDK/' /opt/nvidia/entrypoint.d/10-banner.txt | \
+ sed 's/^===/=======/' > /opt/nvidia/entrypoint.d/10-banner.new && \
+ mv /opt/nvidia/entrypoint.d/10-banner.new /opt/nvidia/entrypoint.d/10-banner.txt
+
+ARG NVIDIA_TRITON_SERVER_SDK_VERSION
+ARG NVIDIA_BUILD_ID
+ENV NVIDIA_TRITON_SERVER_SDK_VERSION=${NVIDIA_TRITON_SERVER_SDK_VERSION}
+ENV NVIDIA_BUILD_ID=${NVIDIA_BUILD_ID}
+
+ENV PATH /workspace/install/bin:${PATH}
+ENV LD_LIBRARY_PATH /workspace/install/lib:${LD_LIBRARY_PATH}
+
+# DLIS-3631: Needed to run Perf Analyzer CI tests correctly
+ENV LD_LIBRARY_PATH /opt/hpcx/ompi/lib:${LD_LIBRARY_PATH}
+
+# Set TCMALLOC_RELEASE_RATE for users setting LD_PRELOAD with tcmalloc
+ENV TCMALLOC_RELEASE_RATE 200
diff --git a/Dockerfile.win10.min b/Dockerfile.win10.min
new file mode 100644
index 0000000000..107b2e8ac0
--- /dev/null
+++ b/Dockerfile.win10.min
@@ -0,0 +1,208 @@
+# Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# Windows min container for Triton build
+
+ARG BASE_IMAGE=mcr.microsoft.com/windows:10.0.19042.1889
+
+FROM ${BASE_IMAGE} as dependency_base
+
+RUN powershell.exe Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope LocalMachine
+RUN powershell.exe [Net.ServicePointManager]::Expect100Continue=$true;[Net.ServicePointManager]::SecurityProtocol=[Net.SecurityProtocolType]::Tls,[Net.SecurityProtocolType]::Tls11,[Net.SecurityProtocolType]::Tls12,[Net.SecurityProtocolType]::Ssl3;Invoke-Expression( New-Object System.Net.WebClient ).DownloadString('https://chocolatey.org/install.ps1')
+RUN choco install unzip -y
+
+#
+# Installing TensorRT
+#
+ARG TENSORRT_VERSION
+ARG TENSORRT_ZIP="TensorRT-${TENSORRT_VERSION}.Windows10.x86_64.cuda-12.0.zip"
+ARG TENSORRT_SOURCE=${TENSORRT_ZIP}
+# COPY ${TENSORRT_ZIP} /tmp/${TENSORRT_ZIP}
+ADD ${TENSORRT_SOURCE} /tmp/${TENSORRT_ZIP}
+RUN unzip /tmp/%TENSORRT_ZIP%
+RUN move TensorRT-* TensorRT
+
+LABEL TENSORRT_VERSION="${TENSORRT_VERSION}"
+
+
+#
+# Installing cuDNN
+#
+ARG CUDNN_VERSION
+ARG CUDNN_ZIP=cudnn-windows-x86_64-${CUDNN_VERSION}_cuda12-archive.zip
+ARG CUDNN_SOURCE=${CUDNN_ZIP}
+ADD ${CUDNN_SOURCE} /tmp/${CUDNN_ZIP}
+RUN unzip /tmp/%CUDNN_ZIP%
+RUN move cudnn-* cudnn
+
+LABEL CUDNN_VERSION="${CUDNN_VERSION}"
+
+
+FROM ${BASE_IMAGE} as build_base
+
+SHELL ["cmd", "/S", "/C"]
+
+RUN mkdir c:\tmp
+WORKDIR /tmp
+
+RUN powershell.exe Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope LocalMachine
+RUN powershell.exe [Net.ServicePointManager]::Expect100Continue=$true;[Net.ServicePointManager]::SecurityProtocol=[Net.SecurityProtocolType]::Tls,[Net.SecurityProtocolType]::Tls11,[Net.SecurityProtocolType]::Tls12,[Net.SecurityProtocolType]::Ssl3;Invoke-Expression( New-Object System.Net.WebClient ).DownloadString('https://chocolatey.org/install.ps1')
+RUN choco install git docker unzip -y
+
+#
+# Installing python
+#
+ARG PYTHON_VERSION=3.8.10
+ARG PYTHON_SOURCE=https://www.python.org/ftp/python/${PYTHON_VERSION}/python-${PYTHON_VERSION}-amd64.exe
+ADD ${PYTHON_SOURCE} python-${PYTHON_VERSION}-amd64.exe
+RUN python-%PYTHON_VERSION%-amd64.exe /quiet InstallAllUsers=1 PrependPath=1 Include_doc=0 TargetDir="C:\python%PYTHON_VERSION%"
+RUN mklink "C:\python%PYTHON_VERSION%\python3.exe" "C:\python%PYTHON_VERSION%\python.exe"
+RUN pip install --upgrade wheel setuptools docker
+RUN pip install grpcio-tools psutil
+
+LABEL PYTHON_VERSION=${PYTHON_VERSION}
+
+#
+# Installing CMake
+#
+ARG CMAKE_VERSION=3.27.1
+ARG CMAKE_FILE=cmake-${CMAKE_VERSION}-windows-x86_64
+ARG CMAKE_SOURCE=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/${CMAKE_FILE}.zip
+
+ADD ${CMAKE_SOURCE} ${CMAKE_FILE}.zip
+RUN unzip %CMAKE_FILE%.zip
+RUN move %CMAKE_FILE% "c:\CMake"
+RUN setx PATH "c:\CMake\bin;%PATH%"
+
+ENV CMAKE_TOOLCHAIN_FILE /vcpkg/scripts/buildsystems/vcpkg.cmake
+ENV VCPKG_TARGET_TRIPLET x64-windows
+
+LABEL CMAKE_VERSION=${CMAKE_VERSION}
+
+# Be aware that pip can interact badly with the VS cmd shell, so we need to pip install
+# before vsdevcmd.bat (see https://bugs.python.org/issue38989)
+
+
+#
+# Installing Visual Studio BuildTools: VS17 2022
+#
+ARG BUILDTOOLS_VERSION
+# Download collect.exe in case of an install failure.
+ADD https://aka.ms/vscollect.exe "C:\tmp\collect.exe"
+
+# Use the latest release channel. For more control, specify the location of an internal layout.
+ARG CHANNEL_URL=https://aka.ms/vs/17/release/channel
+ADD ${CHANNEL_URL} "C:\tmp\VisualStudio.chman"
+# Download the Build Tools bootstrapper.
+ARG BUILD_TOOLS_SOURCE=https://aka.ms/vs/17/release/vs_buildtools.exe
+ADD ${BUILD_TOOLS_SOURCE} vs_buildtools.exe
+# Install Build Tools with the Microsoft.VisualStudio.Workload.VCTools workload, including recommended components.
+ARG VS_INSTALL_PATH_WP="C:\BuildTools"
+RUN vs_buildtools.exe --quiet --wait --norestart --nocache install \
+ --installPath %VS_INSTALL_PATH_WP% \
+ --channelUri "C:\tmp\VisualStudio.chman" \
+ --installChannelUri "C:\tmp\VisualStudio.chman" \
+ --add Microsoft.VisualStudio.Workload.VCTools \
+ --includeRecommended \
+ --locale "En-us"
+
+LABEL BUILDTOOLS_VERSION=${BUILDTOOLS_VERSION}
+
+WORKDIR /
+
+#
+# Installing Vcpkg
+#
+ARG VCPGK_VERSION=2023.11.20
+RUN git clone --single-branch --depth=1 -b %VCPGK_VERSION% https://github.com/microsoft/vcpkg.git
+WORKDIR /vcpkg
+RUN bootstrap-vcpkg.bat
+RUN vcpkg.exe update
+RUN vcpkg.exe install \
+ b64:x64-windows \
+ boost-interprocess:x64-windows \
+ boost-stacktrace:x64-windows \
+ openssl-windows:x64-windows \
+ openssl:x64-windows \
+ pthread:x64-windows \
+ rapidjson:x64-windows \
+ zlib:x64-windows
+RUN vcpkg.exe integrate install
+
+LABEL VCPGK_VERSION=${VCPGK_VERSION}
+
+WORKDIR /
+
+#
+# Installing CUDA
+#
+ARG CUDA_MAJOR=12
+ARG CUDA_MINOR=3
+ARG CUDA_PATCH=2
+ARG CUDA_VERSION=${CUDA_MAJOR}.${CUDA_MINOR}.${CUDA_PATCH}
+ARG CUDA_PACKAGES="nvcc_${CUDA_MAJOR}.${CUDA_MINOR} \
+ cudart_${CUDA_MAJOR}.${CUDA_MINOR} \
+ nvml_dev_${CUDA_MAJOR}.${CUDA_MINOR} \
+ cublas_${CUDA_MAJOR}.${CUDA_MINOR} cublas_dev_${CUDA_MAJOR}.${CUDA_MINOR} \
+ cufft_${CUDA_MAJOR}.${CUDA_MINOR} cufft_dev_${CUDA_MAJOR}.${CUDA_MINOR} \
+ curand_${CUDA_MAJOR}.${CUDA_MINOR} curand_dev_${CUDA_MAJOR}.${CUDA_MINOR} \
+ cusolver_${CUDA_MAJOR}.${CUDA_MINOR} cusolver_dev_${CUDA_MAJOR}.${CUDA_MINOR} \
+ cusparse_${CUDA_MAJOR}.${CUDA_MINOR} cusparse_dev_${CUDA_MAJOR}.${CUDA_MINOR} \
+ cupti_${CUDA_MAJOR}.${CUDA_MINOR} \
+ thrust_${CUDA_MAJOR}.${CUDA_MINOR} \
+ visual_studio_integration_${CUDA_MAJOR}.${CUDA_MINOR}"
+ARG CUDA_INSTALL_ROOT_WP="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v${CUDA_MAJOR}.${CUDA_MINOR}"
+
+ARG CUDA_SOURCE=https://developer.download.nvidia.com/compute/cuda/${CUDA_VERSION}/network_installers/cuda_${CUDA_VERSION}_windows_network.exe
+ADD ${CUDA_SOURCE} cuda_${CUDA_VERSION}_windows_network.exe
+
+RUN cuda_%CUDA_VERSION%_windows_network.exe -s %CUDA_PACKAGES%
+# Copy the CUDA Visual Studio integration from where it was installed
+# into the appropriate place in BuildTools
+RUN copy "%CUDA_INSTALL_ROOT_WP%\extras\visual_studio_integration\MSBuildExtensions\*" "%VS_INSTALL_PATH_WP%\MSBuild\Microsoft\VC\v170\BuildCustomizations"
+
+RUN setx PATH "%CUDA_INSTALL_ROOT_WP%\bin;%PATH%"
+
+ARG CUDNN_VERSION
+ENV CUDNN_VERSION ${CUDNN_VERSION}
+COPY --from=dependency_base /cudnn /cudnn
+RUN copy cudnn\bin\cudnn*.dll "%CUDA_INSTALL_ROOT_WP%\bin\."
+RUN copy cudnn\lib\x64\cudnn*.lib "%CUDA_INSTALL_ROOT_WP%\lib\x64\."
+RUN copy cudnn\include\cudnn*.h "%CUDA_INSTALL_ROOT_WP%\include\."
+LABEL CUDNN_VERSION="${CUDNN_VERSION}"
+
+ARG TENSORRT_VERSION
+ENV TRT_VERSION ${TENSORRT_VERSION}
+COPY --from=dependency_base /TensorRT /TensorRT
+RUN setx PATH "c:\TensorRT\lib;%PATH%"
+LABEL TENSORRT_VERSION="${TENSORRT_VERSION}"
+
+LABEL CUDA_VERSION="${CUDA_VERSION}"
+# It is important that the entrypoint initializes the Visual Studio
+# environment, otherwise the build will fail. Also set
+# CMAKE_TOOLCHAIN_FILE and VCPKG_TARGET_TRIPLET so
+# that CMake can find the packages installed by vcpkg.
+ENTRYPOINT C:\BuildTools\VC\Auxiliary\Build\vcvars64.bat &&
diff --git a/LICENSE b/LICENSE
index 8d2301c1f9..5529809efc 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,25 +1,25 @@
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-# * Neither the name of NVIDIA CORPORATION nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of NVIDIA CORPORATION nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/NVIDIA_Deep_Learning_Container_License.pdf b/NVIDIA_Deep_Learning_Container_License.pdf
new file mode 100644
index 0000000000..bfdce390f3
Binary files /dev/null and b/NVIDIA_Deep_Learning_Container_License.pdf differ
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000..4783f8f1f7
--- /dev/null
+++ b/README.md
@@ -0,0 +1,277 @@
+
+
+# Triton Inference Server
+
+📣 **Triton Meetup at the NVIDIA Headquarters on April 30th 3:00 - 6:30 pm**
+
+We are excited to announce that we will be hosting our Triton user meetup at
+the NVIDIA Headquarters on April 30th 3:00 - 6:30 pm. Join us for this
+exclusive event where you will learn about the newest Triton features, get a
+glimpse into the roadmap, and connect with fellow users and the NVIDIA Triton
+engineering and product teams. Seating is limited and registration confirmation
+is required to attend - please register [here](https://lu.ma/tl06fqc1) to join
+the meetup. We can’t wait to welcome you and share what’s next for the Triton
+Inference Server.
+
+---
+
+[![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause)
+
+> [!WARNING]
+> ##### LATEST RELEASE
+> You are currently on the `main` branch which tracks under-development progress towards the next release.
+> The current release is version [2.44.0](https://github.com/triton-inference-server/server/releases/latest) and corresponds to the 24.03 container release on NVIDIA GPU Cloud (NGC).
+
+Triton Inference Server is an open source inference serving software that
+streamlines AI inferencing. Triton enables teams to deploy any AI model from
+multiple deep learning and machine learning frameworks, including TensorRT,
+TensorFlow, PyTorch, ONNX, OpenVINO, Python, RAPIDS FIL, and more. Triton
+Inference Server supports inference across cloud, data center, edge and embedded
+devices on NVIDIA GPUs, x86 and ARM CPU, or AWS Inferentia. Triton Inference
+Server delivers optimized performance for many query types, including real time,
+batched, ensembles and audio/video streaming. Triton Inference Server is part of
+[NVIDIA AI Enterprise](https://www.nvidia.com/en-us/data-center/products/ai-enterprise/),
+a software platform that accelerates the data science pipeline and streamlines
+the development and deployment of production AI.
+
+Major features include:
+
+- [Supports multiple deep learning
+ frameworks](https://github.com/triton-inference-server/backend#where-can-i-find-all-the-backends-that-are-available-for-triton)
+- [Supports multiple machine learning
+ frameworks](https://github.com/triton-inference-server/fil_backend)
+- [Concurrent model
+ execution](docs/user_guide/architecture.md#concurrent-model-execution)
+- [Dynamic batching](docs/user_guide/model_configuration.md#dynamic-batcher)
+- [Sequence batching](docs/user_guide/model_configuration.md#sequence-batcher) and
+ [implicit state management](docs/user_guide/architecture.md#implicit-state-management)
+ for stateful models
+- Provides [Backend API](https://github.com/triton-inference-server/backend) that
+ allows adding custom backends and pre/post processing operations
+- Supports writing custom backends in Python, a.k.a.
+ [Python-based backends.](https://github.com/triton-inference-server/backend/blob/main/docs/python_based_backends.md#python-based-backends)
+- Model pipelines using
+ [Ensembling](docs/user_guide/architecture.md#ensemble-models) or [Business
+ Logic Scripting
+ (BLS)](https://github.com/triton-inference-server/python_backend#business-logic-scripting)
+- [HTTP/REST and GRPC inference
+ protocols](docs/customization_guide/inference_protocols.md) based on the community
+ developed [KServe
+ protocol](https://github.com/kserve/kserve/tree/master/docs/predict-api/v2)
+- A [C API](docs/customization_guide/inference_protocols.md#in-process-triton-server-api) and
+ [Java API](docs/customization_guide/inference_protocols.md#java-bindings-for-in-process-triton-server-api)
+ allow Triton to link directly into your application for edge and other in-process use cases
+- [Metrics](docs/user_guide/metrics.md) indicating GPU utilization, server
+ throughput, server latency, and more
+
+**New to Triton Inference Server?** Make use of
+[these tutorials](https://github.com/triton-inference-server/tutorials)
+to begin your Triton journey!
+
+Join the [Triton and TensorRT community](https://www.nvidia.com/en-us/deep-learning-ai/triton-tensorrt-newsletter/) and
+stay current on the latest product updates, bug fixes, content, best practices,
+and more. Need enterprise support? NVIDIA global support is available for Triton
+Inference Server with the
+[NVIDIA AI Enterprise software suite](https://www.nvidia.com/en-us/data-center/products/ai-enterprise/).
+
+## Serve a Model in 3 Easy Steps
+
+```bash
+# Step 1: Create the example model repository
+git clone -b r24.03 https://github.com/triton-inference-server/server.git
+cd server/docs/examples
+./fetch_models.sh
+
+# Step 2: Launch triton from the NGC Triton container
+docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:24.03-py3 tritonserver --model-repository=/models
+
+# Step 3: Send an Inference Request
+# In a separate console, launch the image_client example from the NGC Triton SDK container
+docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:24.03-py3-sdk
+/workspace/install/bin/image_client -m densenet_onnx -c 3 -s INCEPTION /workspace/images/mug.jpg
+
+# Inference should return the following
+Image '/workspace/images/mug.jpg':
+ 15.346230 (504) = COFFEE MUG
+ 13.224326 (968) = CUP
+ 10.422965 (505) = COFFEEPOT
+```
+Please read the [QuickStart](docs/getting_started/quickstart.md) guide for additional information
+regarding this example. The quickstart guide also contains an example of how to launch Triton on [CPU-only systems](docs/getting_started/quickstart.md#run-on-cpu-only-system). New to Triton and wondering where to get started? Watch the [Getting Started video](https://youtu.be/NQDtfSi5QF4).
+
+## Examples and Tutorials
+
+Check out [NVIDIA LaunchPad](https://www.nvidia.com/en-us/data-center/products/ai-enterprise-suite/trial/)
+for free access to a set of hands-on labs with Triton Inference Server hosted on
+NVIDIA infrastructure.
+
+Specific end-to-end examples for popular models, such as ResNet, BERT, and DLRM
+are located in the
+[NVIDIA Deep Learning Examples](https://github.com/NVIDIA/DeepLearningExamples)
+page on GitHub. The
+[NVIDIA Developer Zone](https://developer.nvidia.com/nvidia-triton-inference-server)
+contains additional documentation, presentations, and examples.
+
+## Documentation
+
+### Build and Deploy
+
+The recommended way to build and use Triton Inference Server is with Docker
+images.
+
+- [Install Triton Inference Server with Docker containers](docs/customization_guide/build.md#building-with-docker) (*Recommended*)
+- [Install Triton Inference Server without Docker containers](docs/customization_guide/build.md#building-without-docker)
+- [Build a custom Triton Inference Server Docker container](docs/customization_guide/compose.md)
+- [Build Triton Inference Server from source](docs/customization_guide/build.md#building-on-unsupported-platforms)
+- [Build Triton Inference Server for Windows 10](docs/customization_guide/build.md#building-for-windows-10)
+- Examples for deploying Triton Inference Server with Kubernetes and Helm on [GCP](deploy/gcp/README.md),
+ [AWS](deploy/aws/README.md), and [NVIDIA FleetCommand](deploy/fleetcommand/README.md)
+- [Secure Deployment Considerations](docs/customization_guide/deploy.md)
+
+### Using Triton
+
+#### Preparing Models for Triton Inference Server
+
+The first step in using Triton to serve your models is to place one or
+more models into a [model repository](docs/user_guide/model_repository.md). Depending on
+the type of the model and on which Triton capabilities you want to enable
+for it, you may need to create a [model
+configuration](docs/user_guide/model_configuration.md) for the model; a minimal
+repository layout sketch follows the list below.
+
+- [Add custom operations to Triton if needed by your model](docs/user_guide/custom_operations.md)
+- Enable model pipelining with [Model Ensemble](docs/user_guide/architecture.md#ensemble-models)
+ and [Business Logic Scripting (BLS)](https://github.com/triton-inference-server/python_backend#business-logic-scripting)
+- Optimize your models by setting [scheduling and batching](docs/user_guide/architecture.md#models-and-schedulers)
+  parameters and [model instances](docs/user_guide/model_configuration.md#instance-groups).
+- Use the [Model Analyzer tool](https://github.com/triton-inference-server/model_analyzer)
+ to help optimize your model configuration with profiling
+- Learn how to [explicitly manage what models are available by loading and
+ unloading models](docs/user_guide/model_management.md)
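+
+The sketch below is a rough illustration of the repository layout described
+above; it creates a single-model repository on disk. The model name
+`my_model`, the `onnxruntime` backend, and the tensor names and shapes are
+placeholder assumptions, so substitute the values that match your model.
+
+```python
+from pathlib import Path
+
+# Layout: <repository>/<model-name>/<version>/<model file>, with a
+# config.pbtxt placed next to the version directories.
+version_dir = Path("model_repository/my_model/1")
+version_dir.mkdir(parents=True, exist_ok=True)
+# Copy your serialized model (e.g. model.onnx) into version_dir here.
+
+config = """
+name: "my_model"
+backend: "onnxruntime"
+max_batch_size: 8
+input [
+  {
+    name: "INPUT0"
+    data_type: TYPE_FP32
+    dims: [ 16 ]
+  }
+]
+output [
+  {
+    name: "OUTPUT0"
+    data_type: TYPE_FP32
+    dims: [ 16 ]
+  }
+]
+"""
+(version_dir.parent / "config.pbtxt").write_text(config)
+```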
+
+#### Configure and Use Triton Inference Server
+
+- Read the [Quick Start Guide](docs/getting_started/quickstart.md) to run Triton Inference
+ Server on both GPU and CPU
+- Triton supports multiple execution engines, called
+ [backends](https://github.com/triton-inference-server/backend#where-can-i-find-all-the-backends-that-are-available-for-triton), including
+ [TensorRT](https://github.com/triton-inference-server/tensorrt_backend),
+ [TensorFlow](https://github.com/triton-inference-server/tensorflow_backend),
+ [PyTorch](https://github.com/triton-inference-server/pytorch_backend),
+ [ONNX](https://github.com/triton-inference-server/onnxruntime_backend),
+ [OpenVINO](https://github.com/triton-inference-server/openvino_backend),
+ [Python](https://github.com/triton-inference-server/python_backend), and more
+- Not all the above backends are supported on every platform supported by Triton.
+ Look at the
+ [Backend-Platform Support Matrix](https://github.com/triton-inference-server/backend/blob/main/docs/backend_platform_support_matrix.md)
+ to learn which backends are supported on your target platform.
+- Learn how to [optimize performance](docs/user_guide/optimization.md) using the
+ [Performance Analyzer](https://github.com/triton-inference-server/client/blob/main/src/c++/perf_analyzer/README.md)
+ and
+ [Model Analyzer](https://github.com/triton-inference-server/model_analyzer)
+- Learn how to [manage loading and unloading models](docs/user_guide/model_management.md) in
+  Triton (a short client sketch follows this list)
+- Send requests directly to Triton with the [HTTP/REST JSON-based
+ or gRPC protocols](docs/customization_guide/inference_protocols.md#httprest-and-grpc-protocols)
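+
+As a minimal sketch of the model management workflow referenced above, the
+Python HTTP client can load and unload models explicitly. This assumes Triton
+was started with `--model-control-mode=explicit`; the model name `my_model`
+is a placeholder.
+
+```python
+import tritonclient.http as httpclient
+
+# Assumes tritonserver was started with --model-control-mode=explicit.
+client = httpclient.InferenceServerClient(url="localhost:8000")
+
+client.load_model("my_model")             # "my_model" is a placeholder name
+print(client.is_model_ready("my_model"))  # True once the model has loaded
+client.unload_model("my_model")
+```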
+
+#### Client Support and Examples
+
+A Triton *client* application sends inference and other requests to Triton. The
+[Python and C++ client libraries](https://github.com/triton-inference-server/client)
+provide APIs to simplify this communication.
+
+- Review client examples for [C++](https://github.com/triton-inference-server/client/blob/main/src/c%2B%2B/examples),
+ [Python](https://github.com/triton-inference-server/client/blob/main/src/python/examples),
+ and [Java](https://github.com/triton-inference-server/client/blob/main/src/java/src/main/java/triton/client/examples)
+- Configure [HTTP](https://github.com/triton-inference-server/client#http-options)
+ and [gRPC](https://github.com/triton-inference-server/client#grpc-options)
+ client options
+- Send input data (e.g. a jpeg image) directly to Triton in the [body of an HTTP
+ request without any additional metadata](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_binary_data.md#raw-binary-request)
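+
+The sketch below is a minimal example of the Python HTTP client described
+above. The model name `my_model` and the tensor names, shapes, and datatypes
+are placeholders; substitute the values from your own model configuration.
+
+```python
+import numpy as np
+import tritonclient.http as httpclient
+
+# Connect to a local Triton instance (the HTTP endpoint defaults to port 8000).
+client = httpclient.InferenceServerClient(url="localhost:8000")
+
+# Describe and fill the input tensor; names and shapes are placeholders.
+input0 = httpclient.InferInput("INPUT0", [1, 16], "FP32")
+input0.set_data_from_numpy(np.random.rand(1, 16).astype(np.float32))
+
+# Run inference and read the output back as a numpy array.
+response = client.infer(model_name="my_model", inputs=[input0])
+print(response.as_numpy("OUTPUT0"))
+```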
+
+### Extend Triton
+
+[Triton Inference Server's architecture](docs/user_guide/architecture.md) is specifically
+designed for modularity and flexibility.
+
+- [Customize Triton Inference Server container](docs/customization_guide/compose.md) for your use case
+- [Create custom backends](https://github.com/triton-inference-server/backend)
+  in either [C/C++](https://github.com/triton-inference-server/backend/blob/main/README.md#triton-backend-api)
+  or [Python](https://github.com/triton-inference-server/python_backend) (a minimal Python sketch follows this list)
+- Create [decoupled backends and models](docs/user_guide/decoupled_models.md) that can send
+  multiple responses for a request, or no responses at all
+- Use a [Triton repository agent](docs/customization_guide/repository_agents.md) to add functionality
+ that operates when a model is loaded and unloaded, such as authentication,
+ decryption, or conversion
+- Deploy Triton on [Jetson and JetPack](docs/user_guide/jetson.md)
+- [Use Triton on AWS
+ Inferentia](https://github.com/triton-inference-server/python_backend/tree/main/inferentia)
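+
+As a minimal sketch of the Python backend option referenced above, a
+Python-based model provides a `model.py` like the one below. The
+`triton_python_backend_utils` module is only available inside the Triton
+Python backend runtime, and the tensor names are placeholders that must match
+the model's configuration.
+
+```python
+import triton_python_backend_utils as pb_utils
+
+
+class TritonPythonModel:
+    """Toy model that doubles INPUT0 into OUTPUT0 (placeholder names)."""
+
+    def execute(self, requests):
+        responses = []
+        for request in requests:
+            # Read the input, compute the result, and wrap it in a response.
+            in0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")
+            out0 = pb_utils.Tensor("OUTPUT0", in0.as_numpy() * 2)
+            responses.append(pb_utils.InferenceResponse(output_tensors=[out0]))
+        return responses
+```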
+
+### Additional Documentation
+
+- [FAQ](docs/user_guide/faq.md)
+- [User Guide](docs/README.md#user-guide)
+- [Customization Guide](docs/README.md#customization-guide)
+- [Release Notes](https://docs.nvidia.com/deeplearning/triton-inference-server/release-notes/index.html)
+- [GPU, Driver, and CUDA Support
+Matrix](https://docs.nvidia.com/deeplearning/dgx/support-matrix/index.html)
+
+## Contributing
+
+Contributions to Triton Inference Server are more than welcome. To
+contribute please review the [contribution
+guidelines](CONTRIBUTING.md). If you have a backend, client,
+example or similar contribution that is not modifying the core of
+Triton, then you should file a PR in the [contrib
+repo](https://github.com/triton-inference-server/contrib).
+
+## Reporting problems, asking questions
+
+We appreciate any feedback, questions or bug reporting regarding this project.
+When posting [issues in GitHub](https://github.com/triton-inference-server/server/issues),
+follow the process outlined in the [Stack Overflow document](https://stackoverflow.com/help/mcve).
+Ensure posted examples are:
+- minimal – use as little code as possible that still produces the
+ same problem
+- complete – provide all parts needed to reproduce the problem. Check
+  if you can strip external dependencies and still show the problem. The
+  less time we spend on reproducing problems, the more time we have to
+  fix them
+- verifiable – test the code you're about to provide to make sure it
+ reproduces the problem. Remove all other problems that are not
+ related to your request/question.
+
+For issues, please use the provided bug report and feature request templates.
+
+For questions, we recommend posting in our community
+[GitHub Discussions.](https://github.com/triton-inference-server/server/discussions)
+
+## For more information
+
+Please refer to the [NVIDIA Developer Triton page](https://developer.nvidia.com/nvidia-triton-inference-server)
+for more information.
diff --git a/README.rst b/README.rst
deleted file mode 100644
index b8a516266d..0000000000
--- a/README.rst
+++ /dev/null
@@ -1,113 +0,0 @@
-..
- # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions
- # are met:
- # * Redistributions of source code must retain the above copyright
- # notice, this list of conditions and the following disclaimer.
- # * Redistributions in binary form must reproduce the above copyright
- # notice, this list of conditions and the following disclaimer in the
- # documentation and/or other materials provided with the distribution.
- # * Neither the name of NVIDIA CORPORATION nor the names of its
- # contributors may be used to endorse or promote products derived
- # from this software without specific prior written permission.
- #
- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
- # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
- # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-|License|
-
-NVIDIA TensorRT Inference Server
-================================
-
-
- **NOTE: You are currently on the master branch which tracks
- under-development progress towards the next release. The latest
- release of the TensorRT Inference Server is 0.8.0 beta and is
- available on branch** `r18.11
- `_.
-
-.. overview-begin-marker-do-not-remove
-
-The NVIDIA TensorRT Inference Server (TRTIS) provides a cloud
-inferencing solution optimized for NVIDIA GPUs. The server provides an
-inference service via an HTTP or gRPC endpoint, allowing remote
-clients to request inferencing for any model being managed by the
-server. TRTIS provides the following features:
-
-* `Multiple framework support `_. The server can manage any number and mix of
- models (limited by system disk and memory resources). Supports
- TensorRT, TensorFlow GraphDef, TensorFlow SavedModel and Caffe2
- NetDef model formats. Also supports TensorFlow-TensorRT integrated
- models.
-* Multi-GPU support. The server can distribute inferencing across all
- system GPUs.
-* `Concurrent model execution support `_. Multiple models (or multiple instances of the
- same model) can run simultaneously on the same GPU.
-* Batching support. For models that support batching, the server can
- accept requests for a batch of inputs and respond with the
- corresponding batch of outputs. The server also supports `dynamic
- batching `_ where individual inference requests are dynamically
- combined together to improve inference throughput. Dynamic batching
- is transparent to the client requesting inference.
-* `Model repositories `_ may reside on a locally accessible file system (e.g. NFS) or
- in Google Cloud Storage.
-* Readiness and liveness `health endpoints `_ suitable for any orchestration or deployment framework, such as Kubernetes.
-* `Metrics `_ indicating GPU utiliization, server throughput, and server
- latency.
-
-.. overview-end-marker-do-not-remove
-
-The current release of the TensorRT Inference Server is 0.8.0 beta and
-corresponds to the 18.11 release of the tensorrtserver container on
-`NVIDIA GPU Cloud (NGC) `_. The branch for
-this release is `r18.11
-`_. The
-User Guide, Developer Guide, and API Reference `documentation
-`_
-provide guidance on installing, building and running TRTIS.
-
-You can also view the documentation for the `master branch
-`_
-and for `earlier releases
-`_.
-
-Contributing
-------------
-
-Contributions to TensorRT Inference Server are more than welcome. To
-contribute make a pull request and follow the guidelines outlined in
-the `Contributing `_ document.
-
-Reporting problems, asking questions
-------------------------------------
-
-We appreciate any feedback, questions or bug reporting regarding this
-project. When help with code is needed, follow the process outlined in
-the Stack Overflow (https://stackoverflow.com/help/mcve)
-document. Ensure posted examples are:
-
-* minimal – use as little code as possible that still produces the
- same problem
-
-* complete – provide all parts needed to reproduce the problem. Check
- if you can strip external dependency and still show the problem. The
- less time we spend on reproducing problems the more time we have to
- fix it
-
-* verifiable – test the code you're about to provide to make sure it
- reproduces the problem. Remove all other problems that are not
- related to your request/question.
-
-.. |License| image:: https://img.shields.io/badge/License-BSD3-lightgrey.svg
- :target: https://opensource.org/licenses/BSD-3-Clause
diff --git a/SECURITY.md b/SECURITY.md
new file mode 100644
index 0000000000..7aa39f4e5d
--- /dev/null
+++ b/SECURITY.md
@@ -0,0 +1,44 @@
+
+
+# Report a Security Vulnerability
+
+To report a potential security vulnerability in any NVIDIA product, please use either:
+* This web form: [Security Vulnerability Submission Form](https://www.nvidia.com/object/submit-security-vulnerability.html), or
+* Send email to: [NVIDIA PSIRT](mailto:psirt@nvidia.com)
+
+**OEM Partners should contact their NVIDIA Customer Program Manager**
+
+If reporting a potential vulnerability via email, please encrypt it using NVIDIA’s public PGP key ([see PGP Key page](https://www.nvidia.com/en-us/security/pgp-key/)) and include the following information:
+1. Product/Driver name and version/branch that contains the vulnerability
+2. Type of vulnerability (code execution, denial of service, buffer overflow, etc.)
+3. Instructions to reproduce the vulnerability
+4. Proof-of-concept or exploit code
+5. Potential impact of the vulnerability, including how an attacker could exploit the vulnerability
+
+See https://www.nvidia.com/en-us/security/ for past NVIDIA Security Bulletins and Notices.
diff --git a/TRITON_VERSION b/TRITON_VERSION
new file mode 100644
index 0000000000..4cc09ac9dd
--- /dev/null
+++ b/TRITON_VERSION
@@ -0,0 +1 @@
+2.45.0dev
diff --git a/Triton-CCLA-v1.pdf b/Triton-CCLA-v1.pdf
new file mode 100644
index 0000000000..d08afc8183
Binary files /dev/null and b/Triton-CCLA-v1.pdf differ
diff --git a/VERSION b/VERSION
deleted file mode 100644
index 7382a313f5..0000000000
--- a/VERSION
+++ /dev/null
@@ -1 +0,0 @@
-0.10.0dev
diff --git a/WORKSPACE b/WORKSPACE
deleted file mode 100644
index afb9d3217c..0000000000
--- a/WORKSPACE
+++ /dev/null
@@ -1,118 +0,0 @@
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-# * Neither the name of NVIDIA CORPORATION nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-workspace(name = "inference_server")
-
-local_repository(
- name = "org_tensorflow",
- path = "/opt/tensorflow/",
-)
-
-local_repository(
- name = "tf_serving",
- path = __workspace_dir__ + "/serving/",
-)
-
-new_local_repository(
- name = "extern_lib",
- path = "/opt/tensorrtserver/lib",
- build_file_content = """
-cc_library(
- name = "libcaffe2",
- srcs = ["libcaffe2.so"],
- visibility = ["//visibility:public"],
-)
-cc_library(
- name = "libcaffe2_gpu",
- srcs = ["libcaffe2_gpu.so"],
- visibility = ["//visibility:public"],
-)
-cc_library(
- name = "libcaffe2_detectron_ops_gpu",
- srcs = ["libcaffe2_detectron_ops_gpu.so"],
- visibility = ["//visibility:public"],
-)
-cc_library(
- name = "libc10",
- srcs = ["libc10.so"],
- visibility = ["//visibility:public"],
-)
-cc_library(
- name = "libmkl_core",
- srcs = ["libmkl_core.so"],
- visibility = ["//visibility:public"],
-)
-cc_library(
- name = "libmkl_gnu_thread",
- srcs = ["libmkl_gnu_thread.so"],
- visibility = ["//visibility:public"],
-)
-cc_library(
- name = "libmkl_avx2",
- srcs = ["libmkl_avx2.so"],
- visibility = ["//visibility:public"],
-)
-cc_library(
- name = "libmkl_def",
- srcs = ["libmkl_def.so"],
- visibility = ["//visibility:public"],
-)
-cc_library(
- name = "libmkl_intel_lp64",
- srcs = ["libmkl_intel_lp64.so"],
- visibility = ["//visibility:public"],
-)
-""",
-)
-
-# Need prometheus for metrics
-http_archive(
- name = "prometheus",
- strip_prefix = "prometheus-cpp-0.5.0",
- urls = ["https://github.com/jupp0r/prometheus-cpp/archive/v0.5.0.tar.gz"],
-)
-load("@prometheus//:repositories.bzl", "load_civetweb")
-load_civetweb()
-
-# TensorFlow depends on "io_bazel_rules_closure" so we need this here.
-# Needs to be kept in sync with the same target in TensorFlow's WORKSPACE file.
-http_archive(
- name = "io_bazel_rules_closure",
- sha256 = "a38539c5b5c358548e75b44141b4ab637bba7c4dc02b46b1f62a96d6433f56ae",
- strip_prefix = "rules_closure-dbb96841cc0a5fb2664c37822803b06dab20c7d1",
- urls = [
- "https://mirror.bazel.build/github.com/bazelbuild/rules_closure/archive/dbb96841cc0a5fb2664c37822803b06dab20c7d1.tar.gz",
- "https://github.com/bazelbuild/rules_closure/archive/dbb96841cc0a5fb2664c37822803b06dab20c7d1.tar.gz", # 2018-04-13
- ],
-)
-
-load('@tf_serving//tensorflow_serving:workspace.bzl', 'tf_serving_workspace')
-tf_serving_workspace()
-
-# Specify the minimum required bazel version.
-load("@org_tensorflow//tensorflow:version_check.bzl", "check_bazel_version_at_least")
-
-check_bazel_version_at_least("0.15.0")
diff --git a/build.py b/build.py
new file mode 100755
index 0000000000..fde2b4ed2b
--- /dev/null
+++ b/build.py
@@ -0,0 +1,2747 @@
+#!/usr/bin/env python3
+# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import argparse
+import importlib.util
+import multiprocessing
+import os
+import os.path
+import pathlib
+import platform
+import stat
+import subprocess
+import sys
+from inspect import getsourcefile
+
+import requests
+
+#
+# Build Triton Inference Server.
+#
+
+# By default build.py builds the Triton Docker image, but can also be
+# used to build without Docker. See docs/build.md and --help for more
+# information.
+#
+# The TRITON_VERSION file indicates the Triton version and
+# TRITON_VERSION_MAP is used to determine the corresponding container
+# version and upstream container version (upstream containers are
+# dependencies required by Triton). These versions may be overridden.
+
+# Map from Triton version to corresponding container and component versions.
+#
+# triton version ->
+# (triton container version,
+# upstream container version,
+# ORT version,
+# ORT OpenVINO version (use None to disable OpenVINO in ORT),
+# Standalone OpenVINO version,
+#    DCGM version,
+#    vLLM version
+#   )
+#
+# Currently the OpenVINO versions used in ORT and standalone must
+# match because of the way dlopen works when loading the backends. If
+# different versions are used, then one backend or the other will
+# incorrectly load the other version of the OpenVINO libraries.
+#
+TRITON_VERSION_MAP = {
+ "2.45.0dev": (
+ "24.04dev", # triton container
+ "24.03", # upstream container
+ "1.17.2", # ORT
+ "2023.3.0", # ORT OpenVINO
+ "2023.3.0", # Standalone OpenVINO
+ "3.2.6", # DCGM version
+ "0.4.0.post1", # vLLM version
+ )
+}
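+
+# The tuple above is indexed positionally throughout this script; for example,
+# TRITON_VERSION_MAP[FLAGS.version][2] selects the ORT version and
+# TRITON_VERSION_MAP[FLAGS.version][6] selects the vLLM version.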
+
+CORE_BACKENDS = ["ensemble"]
+
+FLAGS = None
+EXTRA_CORE_CMAKE_FLAGS = {}
+OVERRIDE_CORE_CMAKE_FLAGS = {}
+EXTRA_BACKEND_CMAKE_FLAGS = {}
+OVERRIDE_BACKEND_CMAKE_FLAGS = {}
+
+THIS_SCRIPT_DIR = os.path.dirname(os.path.abspath(getsourcefile(lambda: 0)))
+
+
+def log(msg, force=False):
+ if force or not FLAGS.quiet:
+ try:
+ print(msg, file=sys.stderr)
+ except Exception:
+ print("", file=sys.stderr)
+
+
+def log_verbose(msg):
+ if FLAGS.verbose:
+ log(msg, force=True)
+
+
+def fail(msg):
+ fail_if(True, msg)
+
+
+def fail_if(p, msg):
+ if p:
+ print("error: {}".format(msg), file=sys.stderr)
+ sys.exit(1)
+
+
+def target_platform():
+ if FLAGS.target_platform is not None:
+ return FLAGS.target_platform
+ return platform.system().lower()
+
+
+def target_machine():
+ if FLAGS.target_machine is not None:
+ return FLAGS.target_machine
+ return platform.machine().lower()
+
+
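+# container_versions() falls back to TRITON_VERSION_MAP when explicit versions
+# are not given; e.g. with the map above, container_versions("2.45.0dev", None, None)
+# resolves to ("24.04dev", "24.03").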
+def container_versions(version, container_version, upstream_container_version):
+ if container_version is None:
+ if version not in TRITON_VERSION_MAP:
+ fail("container version not known for {}".format(version))
+ container_version = TRITON_VERSION_MAP[version][0]
+ if upstream_container_version is None:
+ if version not in TRITON_VERSION_MAP:
+ fail("upstream container version not known for {}".format(version))
+ upstream_container_version = TRITON_VERSION_MAP[version][1]
+ return container_version, upstream_container_version
+
+
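+# BuildScript accumulates bash (or PowerShell on Windows) commands into a
+# script file and is normally used as a context manager, e.g.:
+#
+#   with BuildScript("/tmp/cmake_build", desc="example", verbose=True) as s:
+#       s.mkdir("/tmp/tritonbuild")
+#       s.cwd("/tmp/tritonbuild")
+#
+# The paths above are illustrative. On close() the generated script is made
+# executable.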
+class BuildScript:
+ """Utility class for writing build scripts"""
+
+ def __init__(self, filepath, desc=None, verbose=False):
+ self._filepath = filepath
+ self._file = open(self._filepath, "w")
+ self._verbose = verbose
+ self.header(desc)
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, type, value, traceback):
+ self.close()
+
+ def __del__(self):
+ self.close()
+
+    def close(self):
+        """Close the script file and mark it executable."""
+ if self._file is not None:
+ if target_platform() == "windows":
+ self.blankln()
+ self._file.write("}\n")
+ self._file.write("catch {\n")
+ self._file.write(" $_;\n")
+ self._file.write(" ExitWithCode 1;\n")
+ self._file.write("}\n")
+ self._file.close()
+ self._file = None
+ st = os.stat(self._filepath)
+ os.chmod(self._filepath, st.st_mode | stat.S_IEXEC)
+
+ def blankln(self):
+ self._file.write("\n")
+
+ def commentln(self, cnt):
+ self._file.write("#" * cnt + "\n")
+
+ def comment(self, msg=""):
+ if not isinstance(msg, str):
+ try:
+ for m in msg:
+                    self._file.write(f"# {m}\n")
+ return
+ except TypeError:
+ pass
+ self._file.write(f"# {msg}\n")
+
+ def comment_verbose(self, msg=""):
+ if self._verbose:
+ self.comment(msg)
+
+ def header(self, desc=None):
+ if target_platform() != "windows":
+ self._file.write("#!/usr/bin/env bash\n\n")
+
+ if desc is not None:
+ self.comment()
+ self.comment(desc)
+ self.comment()
+ self.blankln()
+
+ self.comment("Exit script immediately if any command fails")
+ if target_platform() == "windows":
+ self._file.write("function ExitWithCode($exitcode) {\n")
+ self._file.write(" $host.SetShouldExit($exitcode)\n")
+ self._file.write(" exit $exitcode\n")
+ self._file.write("}\n")
+ self.blankln()
+ if self._verbose:
+ self._file.write("Set-PSDebug -Trace 1\n")
+ self.blankln()
+ self._file.write("try {\n")
+ else:
+ self._file.write("set -e\n")
+ if self._verbose:
+ self._file.write("set -x\n")
+ self.blankln()
+
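+    # envvar_ref() returns the platform-appropriate way to reference an
+    # environment variable in the generated script; e.g. envvar_ref("TRT_VERSION")
+    # yields "${env:TRT_VERSION}" for PowerShell and "${TRT_VERSION}" for bash.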
+ def envvar_ref(self, v):
+ if target_platform() == "windows":
+ return f"${{env:{v}}}"
+ return f"${{{v}}}"
+
+ def cmd(self, clist, check_exitcode=False):
+ if isinstance(clist, str):
+ self._file.write(f"{clist}\n")
+ else:
+ for c in clist:
+ self._file.write(f"{c} ")
+ self.blankln()
+
+ if check_exitcode:
+ if target_platform() == "windows":
+ self._file.write("if ($LASTEXITCODE -ne 0) {\n")
+ self._file.write(
+ ' Write-Output "exited with status code $LASTEXITCODE";\n'
+ )
+ self._file.write(" ExitWithCode 1;\n")
+ self._file.write("}\n")
+
+ def cwd(self, path):
+ if target_platform() == "windows":
+ self.cmd(f"Set-Location -EV Err -EA Stop {path}")
+ else:
+ self.cmd(f"cd {path}")
+
+ def cp(self, src, dest):
+ if target_platform() == "windows":
+ self.cmd(f"Copy-Item -EV Err -EA Stop {src} -Destination {dest}")
+ else:
+ self.cmd(f"cp {src} {dest}")
+
+ def mkdir(self, path):
+ if target_platform() == "windows":
+ self.cmd(
+ f"New-Item -EV Err -EA Stop -ItemType Directory -Force -Path {path}"
+ )
+ else:
+ self.cmd(f"mkdir -p {pathlib.Path(path)}")
+
+ def rmdir(self, path):
+ if target_platform() == "windows":
+ self.cmd(f"if (Test-Path -Path {path}) {{")
+ self.cmd(f" Remove-Item -EV Err -EA Stop -Recurse -Force {path}")
+ self.cmd("}")
+ else:
+ self.cmd(f"rm -fr {pathlib.Path(path)}")
+
+ def cpdir(self, src, dest):
+ if target_platform() == "windows":
+ self.cmd(f"Copy-Item -EV Err -EA Stop -Recurse {src} -Destination {dest}")
+ else:
+ self.cmd(f"cp -r {src} {dest}")
+
+ def tar(self, subdir, tar_filename):
+ if target_platform() == "windows":
+ fail("unsupported operation: tar")
+ else:
+ self.cmd(f"tar zcf {tar_filename} {subdir}")
+
+ def cmake(self, args):
+ # Pass some additional envvars into cmake...
+ env_args = []
+ for k in ("TRT_VERSION", "CMAKE_TOOLCHAIN_FILE", "VCPKG_TARGET_TRIPLET"):
+ env_args += [f'"-D{k}={self.envvar_ref(k)}"']
+ self.cmd(f'cmake {" ".join(env_args)} {" ".join(args)}', check_exitcode=True)
+
+ def makeinstall(self, target="install"):
+ verbose_flag = "-v" if self._verbose else ""
+ self.cmd(
+ f"cmake --build . --config {FLAGS.build_type} -j{FLAGS.build_parallel} {verbose_flag} -t {target}"
+ )
+
+ def gitclone(self, repo, tag, subdir, org):
+ clone_dir = subdir
+ if not FLAGS.no_force_clone:
+ self.rmdir(clone_dir)
+
+ if target_platform() == "windows":
+ self.cmd(f"if (-Not (Test-Path -Path {clone_dir})) {{")
+ else:
+ self.cmd(f"if [[ ! -e {clone_dir} ]]; then")
+
+ # FIXME [DLIS-4045 - Currently the tag starting with "pull/" is not
+ # working with "--repo-tag" as the option is not forwarded to the
+ # individual repo build correctly.]
+ # If 'tag' starts with "pull/" then it must be of form
+        # "pull/<pr>/head". We just clone at "main" and then fetch the
+ # reference onto a new branch we name "tritonbuildref".
+ if tag.startswith("pull/"):
+ self.cmd(
+ f" git clone --recursive --depth=1 {org}/{repo}.git {subdir};",
+ check_exitcode=True,
+ )
+ self.cmd("}" if target_platform() == "windows" else "fi")
+ self.cwd(subdir)
+ self.cmd(f"git fetch origin {tag}:tritonbuildref", check_exitcode=True)
+ self.cmd(f"git checkout tritonbuildref", check_exitcode=True)
+ else:
+ self.cmd(
+ f" git clone --recursive --single-branch --depth=1 -b {tag} {org}/{repo}.git {subdir};",
+ check_exitcode=True,
+ )
+ self.cmd("}" if target_platform() == "windows" else "fi")
+
+
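+# Helpers that render cmake -D arguments for the generated build scripts. For
+# example, cmake_core_arg("TRITON_VERSION", "STRING", "2.45.0dev") produces
+# '"-DTRITON_VERSION:STRING=2.45.0dev"' and cmake_core_enable("TRITON_ENABLE_GPU", True)
+# produces '"-DTRITON_ENABLE_GPU:BOOL=ON"', unless overridden from the command line.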
+def cmake_core_arg(name, type, value):
+ # Return cmake -D setting to set name=value for core build. Use
+ # command-line specified value if one is given.
+ if name in OVERRIDE_CORE_CMAKE_FLAGS:
+ value = OVERRIDE_CORE_CMAKE_FLAGS[name]
+ if type is None:
+ type = ""
+ else:
+ type = ":{}".format(type)
+ return '"-D{}{}={}"'.format(name, type, value)
+
+
+def cmake_core_enable(name, flag):
+ # Return cmake -D setting to set name=flag?ON:OFF for core
+ # build. Use command-line specified value for 'flag' if one is
+ # given.
+ if name in OVERRIDE_CORE_CMAKE_FLAGS:
+ value = OVERRIDE_CORE_CMAKE_FLAGS[name]
+ else:
+ value = "ON" if flag else "OFF"
+ return '"-D{}:BOOL={}"'.format(name, value)
+
+
+def cmake_core_extra_args():
+ args = []
+ for k, v in EXTRA_CORE_CMAKE_FLAGS.items():
+ args.append('"-D{}={}"'.format(k, v))
+ return args
+
+
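+# The backend-specific helpers below mirror the core helpers but consult the
+# per-backend override map first; e.g. a command-line override recorded in
+# OVERRIDE_BACKEND_CMAKE_FLAGS["onnxruntime"]["TRITON_BUILD_ONNXRUNTIME_VERSION"]
+# takes precedence over the value computed in onnxruntime_cmake_args().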
+def cmake_backend_arg(backend, name, type, value):
+ # Return cmake -D setting to set name=value for backend build. Use
+ # command-line specified value if one is given.
+ if backend in OVERRIDE_BACKEND_CMAKE_FLAGS:
+ if name in OVERRIDE_BACKEND_CMAKE_FLAGS[backend]:
+ value = OVERRIDE_BACKEND_CMAKE_FLAGS[backend][name]
+ if type is None:
+ type = ""
+ else:
+ type = ":{}".format(type)
+ return '"-D{}{}={}"'.format(name, type, value)
+
+
+def cmake_backend_enable(backend, name, flag):
+ # Return cmake -D setting to set name=flag?ON:OFF for backend
+ # build. Use command-line specified value for 'flag' if one is
+ # given.
+ value = None
+ if backend in OVERRIDE_BACKEND_CMAKE_FLAGS:
+ if name in OVERRIDE_BACKEND_CMAKE_FLAGS[backend]:
+ value = OVERRIDE_BACKEND_CMAKE_FLAGS[backend][name]
+ if value is None:
+ value = "ON" if flag else "OFF"
+ return '"-D{}:BOOL={}"'.format(name, value)
+
+
+def cmake_backend_extra_args(backend):
+ args = []
+ if backend in EXTRA_BACKEND_CMAKE_FLAGS:
+ for k, v in EXTRA_BACKEND_CMAKE_FLAGS[backend].items():
+ args.append('"-D{}={}"'.format(k, v))
+ return args
+
+
+def cmake_repoagent_arg(name, type, value):
+ # For now there is no override for repo-agents
+ if type is None:
+ type = ""
+ else:
+ type = ":{}".format(type)
+ return '"-D{}{}={}"'.format(name, type, value)
+
+
+def cmake_repoagent_enable(name, flag):
+ # For now there is no override for repo-agents
+ value = "ON" if flag else "OFF"
+ return '"-D{}:BOOL={}"'.format(name, value)
+
+
+def cmake_repoagent_extra_args():
+    # For now there are no extra args for repo-agents
+ args = []
+ return args
+
+
+def cmake_cache_arg(name, type, value):
+ # For now there is no override for caches
+ if type is None:
+ type = ""
+ else:
+ type = ":{}".format(type)
+ return '"-D{}{}={}"'.format(name, type, value)
+
+
+def cmake_cache_enable(name, flag):
+ # For now there is no override for caches
+ value = "ON" if flag else "OFF"
+ return '"-D{}:BOOL={}"'.format(name, value)
+
+
+def cmake_cache_extra_args():
+    # For now there are no extra args for caches
+ args = []
+ return args
+
+
+def core_cmake_args(components, backends, cmake_dir, install_dir):
+ cargs = [
+ cmake_core_arg("CMAKE_BUILD_TYPE", None, FLAGS.build_type),
+ cmake_core_arg("CMAKE_INSTALL_PREFIX", "PATH", install_dir),
+ cmake_core_arg("TRITON_VERSION", "STRING", FLAGS.version),
+ cmake_core_arg("TRITON_REPO_ORGANIZATION", "STRING", FLAGS.github_organization),
+ cmake_core_arg("TRITON_COMMON_REPO_TAG", "STRING", components["common"]),
+ cmake_core_arg("TRITON_CORE_REPO_TAG", "STRING", components["core"]),
+ cmake_core_arg("TRITON_BACKEND_REPO_TAG", "STRING", components["backend"]),
+ cmake_core_arg(
+ "TRITON_THIRD_PARTY_REPO_TAG", "STRING", components["thirdparty"]
+ ),
+ ]
+
+ cargs.append(cmake_core_enable("TRITON_ENABLE_LOGGING", FLAGS.enable_logging))
+ cargs.append(cmake_core_enable("TRITON_ENABLE_STATS", FLAGS.enable_stats))
+ cargs.append(cmake_core_enable("TRITON_ENABLE_METRICS", FLAGS.enable_metrics))
+ cargs.append(
+ cmake_core_enable("TRITON_ENABLE_METRICS_GPU", FLAGS.enable_gpu_metrics)
+ )
+ cargs.append(
+ cmake_core_enable("TRITON_ENABLE_METRICS_CPU", FLAGS.enable_cpu_metrics)
+ )
+ cargs.append(cmake_core_enable("TRITON_ENABLE_TRACING", FLAGS.enable_tracing))
+ cargs.append(cmake_core_enable("TRITON_ENABLE_NVTX", FLAGS.enable_nvtx))
+
+ cargs.append(cmake_core_enable("TRITON_ENABLE_GPU", FLAGS.enable_gpu))
+ cargs.append(
+ cmake_core_arg(
+ "TRITON_MIN_COMPUTE_CAPABILITY", None, FLAGS.min_compute_capability
+ )
+ )
+
+ cargs.append(cmake_core_enable("TRITON_ENABLE_MALI_GPU", FLAGS.enable_mali_gpu))
+
+ cargs.append(cmake_core_enable("TRITON_ENABLE_GRPC", "grpc" in FLAGS.endpoint))
+ cargs.append(cmake_core_enable("TRITON_ENABLE_HTTP", "http" in FLAGS.endpoint))
+ cargs.append(
+ cmake_core_enable("TRITON_ENABLE_SAGEMAKER", "sagemaker" in FLAGS.endpoint)
+ )
+ cargs.append(
+ cmake_core_enable("TRITON_ENABLE_VERTEX_AI", "vertex-ai" in FLAGS.endpoint)
+ )
+
+ cargs.append(cmake_core_enable("TRITON_ENABLE_GCS", "gcs" in FLAGS.filesystem))
+ cargs.append(cmake_core_enable("TRITON_ENABLE_S3", "s3" in FLAGS.filesystem))
+ cargs.append(
+ cmake_core_enable(
+ "TRITON_ENABLE_AZURE_STORAGE", "azure_storage" in FLAGS.filesystem
+ )
+ )
+
+ cargs.append(cmake_core_enable("TRITON_ENABLE_ENSEMBLE", "ensemble" in backends))
+ cargs.append(cmake_core_enable("TRITON_ENABLE_TENSORRT", "tensorrt" in backends))
+
+ cargs += cmake_core_extra_args()
+ cargs.append(cmake_dir)
+ return cargs
+
+
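+# Map a repository agent name to its git repository name,
+# e.g. repoagent_repo("checksum") -> "checksum_repository_agent".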
+def repoagent_repo(ra):
+ return "{}_repository_agent".format(ra)
+
+
+def repoagent_cmake_args(images, components, ra, install_dir):
+ args = []
+
+ cargs = args + [
+ cmake_repoagent_arg("CMAKE_BUILD_TYPE", None, FLAGS.build_type),
+ cmake_repoagent_arg("CMAKE_INSTALL_PREFIX", "PATH", install_dir),
+ cmake_repoagent_arg(
+ "TRITON_REPO_ORGANIZATION", "STRING", FLAGS.github_organization
+ ),
+ cmake_repoagent_arg("TRITON_COMMON_REPO_TAG", "STRING", components["common"]),
+ cmake_repoagent_arg("TRITON_CORE_REPO_TAG", "STRING", components["core"]),
+ ]
+
+ cargs.append(cmake_repoagent_enable("TRITON_ENABLE_GPU", FLAGS.enable_gpu))
+ cargs += cmake_repoagent_extra_args()
+ cargs.append("..")
+ return cargs
+
+
+def cache_repo(cache):
+ # example: "local", or "redis"
+ return "{}_cache".format(cache)
+
+
+def cache_cmake_args(images, components, cache, install_dir):
+ args = []
+
+ cargs = args + [
+ cmake_cache_arg("CMAKE_BUILD_TYPE", None, FLAGS.build_type),
+ cmake_cache_arg("CMAKE_INSTALL_PREFIX", "PATH", install_dir),
+ cmake_cache_arg(
+ "TRITON_REPO_ORGANIZATION", "STRING", FLAGS.github_organization
+ ),
+ cmake_cache_arg("TRITON_COMMON_REPO_TAG", "STRING", components["common"]),
+ cmake_cache_arg("TRITON_CORE_REPO_TAG", "STRING", components["core"]),
+ ]
+
+ cargs.append(cmake_cache_enable("TRITON_ENABLE_GPU", FLAGS.enable_gpu))
+ cargs += cmake_cache_extra_args()
+ cargs.append("..")
+ return cargs
+
+
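+# Map a backend name to its git repository name,
+# e.g. backend_repo("onnxruntime") -> "onnxruntime_backend".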
+def backend_repo(be):
+ return "{}_backend".format(be)
+
+
+def backend_cmake_args(images, components, be, install_dir, library_paths):
+ cmake_build_type = FLAGS.build_type
+
+ if be == "onnxruntime":
+ args = onnxruntime_cmake_args(images, library_paths)
+ elif be == "openvino":
+ args = openvino_cmake_args()
+ elif be == "tensorflow":
+ args = tensorflow_cmake_args(images, library_paths)
+ elif be == "python":
+ args = []
+ elif be == "dali":
+ args = dali_cmake_args()
+ elif be == "pytorch":
+ args = pytorch_cmake_args(images)
+ elif be == "armnn_tflite":
+ args = armnn_tflite_cmake_args()
+ elif be == "fil":
+ args = fil_cmake_args(images)
+ # DLIS-4618: FIL backend fails debug build, so override it for now.
+ cmake_build_type = "Release"
+ elif be == "fastertransformer":
+ args = fastertransformer_cmake_args()
+ elif be == "tensorrt":
+ args = tensorrt_cmake_args()
+ elif be == "tensorrtllm":
+ args = tensorrtllm_cmake_args(images)
+ else:
+ args = []
+
+ cargs = args + [
+ cmake_backend_arg(be, "CMAKE_BUILD_TYPE", None, cmake_build_type),
+ cmake_backend_arg(be, "CMAKE_INSTALL_PREFIX", "PATH", install_dir),
+ cmake_backend_arg(
+ be, "TRITON_REPO_ORGANIZATION", "STRING", FLAGS.github_organization
+ ),
+ cmake_backend_arg(be, "TRITON_COMMON_REPO_TAG", "STRING", components["common"]),
+ cmake_backend_arg(be, "TRITON_CORE_REPO_TAG", "STRING", components["core"]),
+ cmake_backend_arg(
+ be, "TRITON_BACKEND_REPO_TAG", "STRING", components["backend"]
+ ),
+ ]
+
+ cargs.append(cmake_backend_enable(be, "TRITON_ENABLE_GPU", FLAGS.enable_gpu))
+ cargs.append(
+ cmake_backend_enable(be, "TRITON_ENABLE_MALI_GPU", FLAGS.enable_mali_gpu)
+ )
+ cargs.append(cmake_backend_enable(be, "TRITON_ENABLE_STATS", FLAGS.enable_stats))
+ cargs.append(
+ cmake_backend_enable(be, "TRITON_ENABLE_METRICS", FLAGS.enable_metrics)
+ )
+
+ # [DLIS-4950] always enable below once Windows image is updated with CUPTI
+ # cargs.append(cmake_backend_enable(be, 'TRITON_ENABLE_MEMORY_TRACKER', True))
+ if (target_platform() == "windows") and (not FLAGS.no_container_build):
+ print(
+            "Warning: Docker build detected for Windows; the 'device memory tracker' backend utility will be disabled because the required library is missing from the CUDA Windows docker image."
+ )
+ cargs.append(cmake_backend_enable(be, "TRITON_ENABLE_MEMORY_TRACKER", False))
+ elif target_platform() == "igpu":
+ print(
+            "Warning: iGPU build detected; the 'device memory tracker' backend utility will be disabled because iGPU does not provide the required version of the library."
+ )
+ cargs.append(cmake_backend_enable(be, "TRITON_ENABLE_MEMORY_TRACKER", False))
+ elif FLAGS.enable_gpu:
+ cargs.append(cmake_backend_enable(be, "TRITON_ENABLE_MEMORY_TRACKER", True))
+
+ cargs += cmake_backend_extra_args(be)
+ if be == "tensorrtllm":
+ cargs.append("-S ../inflight_batcher_llm -B .")
+ else:
+ cargs.append("..")
+ return cargs
+
+
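+# With the 24.03 upstream container mapped above, the default PyTorch backend
+# build image resolves to "nvcr.io/nvidia/pytorch:24.03-py3" unless a "pytorch"
+# entry is provided in the images map.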
+def pytorch_cmake_args(images):
+ if "pytorch" in images:
+ image = images["pytorch"]
+ else:
+ image = "nvcr.io/nvidia/pytorch:{}-py3".format(FLAGS.upstream_container_version)
+ cargs = [
+ cmake_backend_arg("pytorch", "TRITON_PYTORCH_DOCKER_IMAGE", None, image),
+ ]
+
+ if FLAGS.enable_gpu:
+ cargs.append(
+ cmake_backend_enable("pytorch", "TRITON_PYTORCH_ENABLE_TORCHTRT", True)
+ )
+ cargs.append(
+ cmake_backend_enable("pytorch", "TRITON_ENABLE_NVTX", FLAGS.enable_nvtx)
+ )
+ return cargs
+
+
+def onnxruntime_cmake_args(images, library_paths):
+ cargs = [
+ cmake_backend_arg(
+ "onnxruntime",
+ "TRITON_BUILD_ONNXRUNTIME_VERSION",
+ None,
+ TRITON_VERSION_MAP[FLAGS.version][2],
+ )
+ ]
+
+ # TRITON_ENABLE_GPU is already set for all backends in backend_cmake_args()
+ if FLAGS.enable_gpu:
+ cargs.append(
+ cmake_backend_enable(
+ "onnxruntime", "TRITON_ENABLE_ONNXRUNTIME_TENSORRT", True
+ )
+ )
+
+ if target_platform() == "windows":
+ if "base" in images:
+ cargs.append(
+ cmake_backend_arg(
+ "onnxruntime", "TRITON_BUILD_CONTAINER", None, images["base"]
+ )
+ )
+ else:
+ if "base" in images:
+ cargs.append(
+ cmake_backend_arg(
+ "onnxruntime", "TRITON_BUILD_CONTAINER", None, images["base"]
+ )
+ )
+ else:
+ cargs.append(
+ cmake_backend_arg(
+ "onnxruntime",
+ "TRITON_BUILD_CONTAINER_VERSION",
+ None,
+ TRITON_VERSION_MAP[FLAGS.version][1],
+ )
+ )
+
+ if (target_machine() != "aarch64") and (
+ TRITON_VERSION_MAP[FLAGS.version][3] is not None
+ ):
+ cargs.append(
+ cmake_backend_enable(
+ "onnxruntime", "TRITON_ENABLE_ONNXRUNTIME_OPENVINO", True
+ )
+ )
+ cargs.append(
+ cmake_backend_arg(
+ "onnxruntime",
+ "TRITON_BUILD_ONNXRUNTIME_OPENVINO_VERSION",
+ None,
+ TRITON_VERSION_MAP[FLAGS.version][3],
+ )
+ )
+
+ if target_platform() == "igpu":
+ cargs.append(
+ cmake_backend_arg(
+ "onnxruntime",
+ "TRITON_BUILD_TARGET_PLATFORM",
+ None,
+ target_platform(),
+ )
+ )
+
+ return cargs
+
+
+def openvino_cmake_args():
+ cargs = [
+ cmake_backend_arg(
+ "openvino",
+ "TRITON_BUILD_OPENVINO_VERSION",
+ None,
+ TRITON_VERSION_MAP[FLAGS.version][4],
+ )
+ ]
+ if target_platform() == "windows":
+ if "base" in images:
+ cargs.append(
+ cmake_backend_arg(
+ "openvino", "TRITON_BUILD_CONTAINER", None, images["base"]
+ )
+ )
+ else:
+ if "base" in images:
+ cargs.append(
+ cmake_backend_arg(
+ "openvino", "TRITON_BUILD_CONTAINER", None, images["base"]
+ )
+ )
+ else:
+ cargs.append(
+ cmake_backend_arg(
+ "openvino",
+ "TRITON_BUILD_CONTAINER_VERSION",
+ None,
+ TRITON_VERSION_MAP[FLAGS.version][1],
+ )
+ )
+ return cargs
+
+
+def tensorrt_cmake_args():
+ cargs = [
+ cmake_backend_enable("tensorrt", "TRITON_ENABLE_NVTX", FLAGS.enable_nvtx),
+ ]
+ if target_platform() == "windows":
+ cargs.append(
+ cmake_backend_arg(
+ "tensorrt", "TRITON_TENSORRT_INCLUDE_PATHS", None, "c:/TensorRT/include"
+ )
+ )
+
+ return cargs
+
+
+def tensorflow_cmake_args(images, library_paths):
+ backend_name = "tensorflow"
+ extra_args = []
+
+ # If a specific TF image is specified use it, otherwise pull from NGC.
+ if backend_name in images:
+ image = images[backend_name]
+ else:
+ image = "nvcr.io/nvidia/tensorflow:{}-tf2-py3".format(
+ FLAGS.upstream_container_version
+ )
+ extra_args = [
+ cmake_backend_arg(backend_name, "TRITON_TENSORFLOW_DOCKER_IMAGE", None, image)
+ ]
+ return extra_args
+
+
+def dali_cmake_args():
+ return [
+ cmake_backend_enable("dali", "TRITON_DALI_SKIP_DOWNLOAD", False),
+ ]
+
+
+def fil_cmake_args(images):
+ cargs = [cmake_backend_enable("fil", "TRITON_FIL_DOCKER_BUILD", True)]
+ if "base" in images:
+ cargs.append(
+ cmake_backend_arg("fil", "TRITON_BUILD_CONTAINER", None, images["base"])
+ )
+ else:
+ cargs.append(
+ cmake_backend_arg(
+ "fil",
+ "TRITON_BUILD_CONTAINER_VERSION",
+ None,
+ TRITON_VERSION_MAP[FLAGS.version][1],
+ )
+ )
+
+ return cargs
+
+
+def armnn_tflite_cmake_args():
+ return [
+ cmake_backend_arg("armnn_tflite", "JOBS", None, multiprocessing.cpu_count()),
+ ]
+
+
+def fastertransformer_cmake_args():
+ print("Warning: FasterTransformer backend is not officially supported.")
+ cargs = [
+ cmake_backend_arg(
+ "fastertransformer", "CMAKE_EXPORT_COMPILE_COMMANDS", None, 1
+ ),
+ cmake_backend_arg("fastertransformer", "ENABLE_FP8", None, "OFF"),
+ ]
+ return cargs
+
+
+def tensorrtllm_cmake_args(images):
+ cargs = [
+ cmake_backend_arg(
+ "tensorrtllm",
+ "TRT_LIB_DIR",
+ None,
+ "${TRT_ROOT}/targets/${ARCH}-linux-gnu/lib",
+ ),
+ cmake_backend_arg(
+ "tensorrtllm", "TRT_INCLUDE_DIR", None, "${TRT_ROOT}/include"
+ ),
+ ]
+ cargs.append(cmake_backend_enable("tensorrtllm", "TRITON_BUILD", True))
+ return cargs
+
+
+def install_dcgm_libraries(dcgm_version, target_machine):
+ if dcgm_version == "":
+ fail(
+ "unable to determine default repo-tag, DCGM version not known for {}".format(
+ FLAGS.version
+ )
+ )
+ return ""
+ else:
+ if target_machine == "aarch64":
+ return """
+ENV DCGM_VERSION {}
+# Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads
+RUN curl -o /tmp/cuda-keyring.deb \\
+ https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/sbsa/cuda-keyring_1.0-1_all.deb \\
+ && apt install /tmp/cuda-keyring.deb \\
+ && rm /tmp/cuda-keyring.deb \\
+ && apt-get update \\
+ && apt-get install -y datacenter-gpu-manager=1:{}
+""".format(
+ dcgm_version, dcgm_version
+ )
+ else:
+ return """
+ENV DCGM_VERSION {}
+# Install DCGM. Steps from https://developer.nvidia.com/dcgm#Downloads
+RUN curl -o /tmp/cuda-keyring.deb \\
+ https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb \\
+ && apt install /tmp/cuda-keyring.deb \\
+ && rm /tmp/cuda-keyring.deb \\
+ && apt-get update \\
+ && apt-get install -y datacenter-gpu-manager=1:{}
+""".format(
+ dcgm_version, dcgm_version
+ )
+
+
+def create_dockerfile_buildbase(ddir, dockerfile_name, argmap):
+ df = """
+ARG TRITON_VERSION={}
+ARG TRITON_CONTAINER_VERSION={}
+ARG BASE_IMAGE={}
+""".format(
+ argmap["TRITON_VERSION"],
+ argmap["TRITON_CONTAINER_VERSION"],
+ argmap["BASE_IMAGE"],
+ )
+
+ df += """
+FROM ${BASE_IMAGE}
+
+ARG TRITON_VERSION
+ARG TRITON_CONTAINER_VERSION
+"""
+ # Install the windows- or linux-specific buildbase dependencies
+ if target_platform() == "windows":
+ df += """
+SHELL ["cmd", "/S", "/C"]
+"""
+ else:
+ df += """
+# Ensure apt-get won't prompt for selecting options
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install docker docker buildx
+RUN apt-get update \\
+ && apt-get install -y ca-certificates curl gnupg \\
+ && install -m 0755 -d /etc/apt/keyrings \\
+ && curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /etc/apt/keyrings/docker.gpg \\
+ && chmod a+r /etc/apt/keyrings/docker.gpg \\
+ && echo \\
+ "deb [arch="$(dpkg --print-architecture)" signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \\
+ "$(. /etc/os-release && echo "$VERSION_CODENAME")" stable" | \\
+ tee /etc/apt/sources.list.d/docker.list > /dev/null \\
+ && apt-get update \\
+ && apt-get install -y docker.io docker-buildx-plugin
+
+# libcurl4-openssl-dev is needed for GCS
+# python3-dev is needed by Torchvision
+# python3-pip and libarchive-dev are needed by the python backend
+# libxml2-dev is needed for Azure Storage
+# scons is needed as a build dependency of the armnn_tflite backend
+RUN apt-get update \\
+ && apt-get install -y --no-install-recommends \\
+ ca-certificates \\
+ autoconf \\
+ automake \\
+ build-essential \\
+ git \\
+ gperf \\
+ libre2-dev \\
+ libssl-dev \\
+ libtool \\
+ libcurl4-openssl-dev \\
+ libb64-dev \\
+ libgoogle-perftools-dev \\
+ patchelf \\
+ python3-dev \\
+ python3-pip \\
+ python3-setuptools \\
+ rapidjson-dev \\
+ scons \\
+ software-properties-common \\
+ pkg-config \\
+ unzip \\
+ wget \\
+ zlib1g-dev \\
+ libarchive-dev \\
+ libxml2-dev \\
+ libnuma-dev \\
+ wget \\
+ && rm -rf /var/lib/apt/lists/*
+
+RUN pip3 install --upgrade pip \\
+ && pip3 install --upgrade \\
+ wheel \\
+ setuptools \\
+ docker \\
+ virtualenv
+
+# Install boost version >= 1.78 for boost::span
+# Current libboost-dev apt packages are < 1.78, so install from tar.gz
+RUN wget -O /tmp/boost.tar.gz \\
+ https://archives.boost.io/release/1.80.0/source/boost_1_80_0.tar.gz \\
+ && (cd /tmp && tar xzf boost.tar.gz) \\
+ && mv /tmp/boost_1_80_0/boost /usr/include/boost
+
+# Server build requires recent version of CMake (FetchContent required)
+RUN apt update -q=2 \\
+ && apt install -y gpg wget \\
+ && wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null \\
+ && . /etc/os-release \\
+ && echo "deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ $UBUNTU_CODENAME main" | tee /etc/apt/sources.list.d/kitware.list >/dev/null \\
+ && apt-get update -q=2 \\
+ && apt-get install -y --no-install-recommends cmake=3.27.7* cmake-data=3.27.7*
+"""
+
+ if FLAGS.enable_gpu:
+ df += install_dcgm_libraries(argmap["DCGM_VERSION"], target_machine())
+
+ df += """
+ENV TRITON_SERVER_VERSION ${TRITON_VERSION}
+ENV NVIDIA_TRITON_SERVER_VERSION ${TRITON_CONTAINER_VERSION}
+"""
+
+ # Copy in the triton source. We remove existing contents first in
+ # case the FROM container has something there already.
+ if target_platform() == "windows":
+ df += """
+WORKDIR /workspace
+RUN rmdir /S/Q * || exit 0
+COPY . .
+"""
+ else:
+ df += """
+WORKDIR /workspace
+RUN rm -fr *
+COPY . .
+ENTRYPOINT []
+"""
+
+ with open(os.path.join(ddir, dockerfile_name), "w") as dfile:
+ dfile.write(df)
+
+
+def create_dockerfile_cibase(ddir, dockerfile_name, argmap):
+ df = """
+ARG TRITON_VERSION={}
+ARG TRITON_CONTAINER_VERSION={}
+ARG BASE_IMAGE={}
+""".format(
+ argmap["TRITON_VERSION"],
+ argmap["TRITON_CONTAINER_VERSION"],
+ argmap["BASE_IMAGE"],
+ )
+
+ df += """
+FROM ${BASE_IMAGE}
+
+ARG TRITON_VERSION
+ARG TRITON_CONTAINER_VERSION
+
+COPY build/ci /workspace
+
+WORKDIR /workspace
+
+ENV TRITON_SERVER_VERSION ${TRITON_VERSION}
+ENV NVIDIA_TRITON_SERVER_VERSION ${TRITON_CONTAINER_VERSION}
+"""
+
+ with open(os.path.join(ddir, dockerfile_name), "w") as dfile:
+ dfile.write(df)
+
+
+def create_dockerfile_linux(
+ ddir, dockerfile_name, argmap, backends, repoagents, caches, endpoints
+):
+ df = """
+ARG TRITON_VERSION={}
+ARG TRITON_CONTAINER_VERSION={}
+ARG BASE_IMAGE={}
+
+""".format(
+ argmap["TRITON_VERSION"],
+ argmap["TRITON_CONTAINER_VERSION"],
+ argmap["BASE_IMAGE"],
+ )
+
+ # PyTorch and TensorFlow backends need extra CUDA and other
+ # dependencies during runtime that are missing in the CPU-only base container.
+ # These dependencies must be copied from the Triton Min image.
+ if not FLAGS.enable_gpu and (("pytorch" in backends) or ("tensorflow" in backends)):
+ df += """
+############################################################################
+## Triton Min image
+############################################################################
+FROM {} AS min_container
+
+""".format(
+ argmap["GPU_BASE_IMAGE"]
+ )
+
+ df += """
+############################################################################
+## Production stage: Create container with just inference server executable
+############################################################################
+FROM ${BASE_IMAGE}
+"""
+
+ df += dockerfile_prepare_container_linux(
+ argmap, backends, FLAGS.enable_gpu, target_machine()
+ )
+
+ df += """
+WORKDIR /opt
+COPY --chown=1000:1000 build/install tritonserver
+
+WORKDIR /opt/tritonserver
+COPY --chown=1000:1000 NVIDIA_Deep_Learning_Container_License.pdf .
+
+"""
+ if not FLAGS.no_core_build:
+ # Add feature labels for SageMaker endpoint
+ if "sagemaker" in endpoints:
+ df += """
+LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true
+LABEL com.amazonaws.sagemaker.capabilities.multi-models=true
+COPY --chown=1000:1000 docker/sagemaker/serve /usr/bin/.
+"""
+
+ # This is required since libcublasLt.so is not present during the build
+ # stage of the PyTorch backend
+ if not FLAGS.enable_gpu and ("pytorch" in backends):
+ df += """
+RUN patchelf --add-needed /usr/local/cuda/lib64/stubs/libcublasLt.so.12 backends/pytorch/libtorch_cuda.so
+"""
+ if "tensorrtllm" in backends:
+ df += """
+# Remove TRT contents that are not needed at runtime
+RUN ARCH="$(uname -i)" \\
+ && rm -fr ${TRT_ROOT}/bin ${TRT_ROOT}/targets/${ARCH}-linux-gnu/bin ${TRT_ROOT}/data \\
+ && rm -fr ${TRT_ROOT}/doc ${TRT_ROOT}/onnx_graphsurgeon ${TRT_ROOT}/python \\
+ && rm -fr ${TRT_ROOT}/samples ${TRT_ROOT}/targets/${ARCH}-linux-gnu/samples
+
+# Install required packages for TRT-LLM models
+RUN python3 -m pip install --upgrade pip \\
+ && pip3 install transformers
+
+# Uninstall unused nvidia packages
+RUN if pip freeze | grep -q "nvidia.*"; then \\
+ pip freeze | grep "nvidia.*" | xargs pip uninstall -y; \\
+ fi
+RUN pip cache purge
+
+# Drop the static libs
+RUN ARCH="$(uname -i)" \\
+ && rm -f ${TRT_ROOT}/targets/${ARCH}-linux-gnu/lib/libnvinfer*.a \\
+ ${TRT_ROOT}/targets/${ARCH}-linux-gnu/lib/libnvonnxparser_*.a
+
+ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib/:/opt/tritonserver/backends/tensorrtllm:$LD_LIBRARY_PATH
+"""
+ with open(os.path.join(ddir, dockerfile_name), "w") as dfile:
+ dfile.write(df)
+
+
+def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_machine):
+ gpu_enabled = 1 if enable_gpu else 0
+ # Common steps to produce docker images shared by build.py and compose.py.
+ # Sets environment variables, installs dependencies and adds entrypoint
+ df = """
+ARG TRITON_VERSION
+ARG TRITON_CONTAINER_VERSION
+
+ENV TRITON_SERVER_VERSION ${TRITON_VERSION}
+ENV NVIDIA_TRITON_SERVER_VERSION ${TRITON_CONTAINER_VERSION}
+LABEL com.nvidia.tritonserver.version="${TRITON_SERVER_VERSION}"
+
+ENV PATH /opt/tritonserver/bin:${PATH}
+# Remove once https://github.com/openucx/ucx/pull/9148 is available
+# in the min container.
+ENV UCX_MEM_EVENTS no
+"""
+
+ # TODO Remove once the ORT-OpenVINO "Exception while Reading network" is fixed
+ if "onnxruntime" in backends:
+ df += """
+ENV LD_LIBRARY_PATH /opt/tritonserver/backends/onnxruntime:${LD_LIBRARY_PATH}
+"""
+
+ # Necessary for libtorch.so to find correct HPCX libraries
+ if "pytorch" in backends:
+ df += """
+ENV LD_LIBRARY_PATH /opt/hpcx/ucc/lib/:/opt/hpcx/ucx/lib/:${LD_LIBRARY_PATH}
+"""
+
+ backend_dependencies = ""
+ # libgomp1 is needed by both onnxruntime and pytorch backends
+ if ("onnxruntime" in backends) or ("pytorch" in backends):
+ backend_dependencies = "libgomp1"
+
+ # libgfortran5 is needed by pytorch backend on ARM
+ if ("pytorch" in backends) and (target_machine == "aarch64"):
+ backend_dependencies += " libgfortran5"
+ # openssh-server is needed for fastertransformer
+ if "fastertransformer" in backends:
+ backend_dependencies += " openssh-server"
+
+ df += """
+ENV TF_ADJUST_HUE_FUSED 1
+ENV TF_ADJUST_SATURATION_FUSED 1
+ENV TF_ENABLE_WINOGRAD_NONFUSED 1
+ENV TF_AUTOTUNE_THRESHOLD 2
+ENV TRITON_SERVER_GPU_ENABLED {gpu_enabled}
+
+# Create a user that can be used to run triton as
+# non-root. Make sure that this user is given ID 1000. All server
+# artifacts copied below are assigned to this user.
+ENV TRITON_SERVER_USER=triton-server
+RUN userdel tensorrt-server > /dev/null 2>&1 || true \\
+ && if ! id -u $TRITON_SERVER_USER > /dev/null 2>&1 ; then \\
+ useradd $TRITON_SERVER_USER; \\
+ fi \\
+ && [ `id -u $TRITON_SERVER_USER` -eq 1000 ] \\
+ && [ `id -g $TRITON_SERVER_USER` -eq 1000 ]
+
+# Ensure apt-get won't prompt for selecting options
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Common dependencies. FIXME (can any of these be conditional? For
+# example libcurl only needed for GCS?)
+RUN apt-get update \\
+ && apt-get install -y --no-install-recommends \\
+ clang \\
+ curl \\
+ dirmngr \\
+ git \\
+ gperf \\
+ libb64-0d \\
+ libcurl4-openssl-dev \\
+ libgoogle-perftools-dev \\
+ libjemalloc-dev \\
+ libnuma-dev \\
+ libre2-9 \\
+ software-properties-common \\
+ wget \\
+ {backend_dependencies} \\
+ && rm -rf /var/lib/apt/lists/*
+
+# Set TCMALLOC_RELEASE_RATE for users setting LD_PRELOAD with tcmalloc
+ENV TCMALLOC_RELEASE_RATE 200
+""".format(
+ gpu_enabled=gpu_enabled, backend_dependencies=backend_dependencies
+ )
+
+ if "fastertransformer" in backends:
+ be = "fastertransformer"
+ url = "https://raw.githubusercontent.com/triton-inference-server/fastertransformer_backend/{}/docker/create_dockerfile_and_build.py".format(
+ backends[be]
+ )
+ response = requests.get(url)
+ spec = importlib.util.spec_from_loader(
+ "fastertransformer_buildscript", loader=None, origin=url
+ )
+ fastertransformer_buildscript = importlib.util.module_from_spec(spec)
+ exec(response.content, fastertransformer_buildscript.__dict__)
+ df += fastertransformer_buildscript.create_postbuild(is_multistage_build=False)
+
+ if enable_gpu:
+ df += install_dcgm_libraries(argmap["DCGM_VERSION"], target_machine)
+ df += """
+# Extra defensive wiring for CUDA Compat lib
+RUN ln -sf ${_CUDA_COMPAT_PATH}/lib.real ${_CUDA_COMPAT_PATH}/lib \\
+ && echo ${_CUDA_COMPAT_PATH}/lib > /etc/ld.so.conf.d/00-cuda-compat.conf \\
+ && ldconfig \\
+ && rm -f ${_CUDA_COMPAT_PATH}/lib
+"""
+ else:
+ df += add_cpu_libs_to_linux_dockerfile(backends, target_machine)
+
+ # Add dependencies needed for python backend
+ if "python" in backends:
+ df += """
+# python3, python3-pip and some pip installs required for the python backend
+RUN apt-get update \\
+ && apt-get install -y --no-install-recommends \\
+ python3 \\
+ libarchive-dev \\
+ python3-pip \\
+ libpython3-dev \\
+ && pip3 install --upgrade pip \\
+ && pip3 install --upgrade \\
+ wheel \\
+ setuptools \\
+ numpy \\
+ virtualenv \\
+ && rm -rf /var/lib/apt/lists/*
+"""
+
+ if "vllm" in backends:
+ df += """
+# vLLM needed for vLLM backend
+RUN pip3 install vllm=={}
+""".format(
+ TRITON_VERSION_MAP[FLAGS.version][6]
+ )
+
+ df += """
+WORKDIR /opt/tritonserver
+RUN rm -fr /opt/tritonserver/*
+ENV NVIDIA_PRODUCT_NAME="Triton Server"
+COPY docker/entrypoint.d/ /opt/nvidia/entrypoint.d/
+"""
+
+    # The CPU-only build uses ubuntu as the base image, so the entrypoint
+    # files are not available in /opt/nvidia in the base image and we must
+    # provide them ourselves.
+ if not enable_gpu:
+ df += """
+COPY docker/cpu_only/ /opt/nvidia/
+ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
+"""
+
+ df += """
+ENV NVIDIA_BUILD_ID {}
+LABEL com.nvidia.build.id={}
+LABEL com.nvidia.build.ref={}
+""".format(
+ argmap["NVIDIA_BUILD_ID"], argmap["NVIDIA_BUILD_ID"], argmap["NVIDIA_BUILD_REF"]
+ )
+
+ return df
+
+
+def add_cpu_libs_to_linux_dockerfile(backends, target_machine):
+ df = ""
+ libs_arch = "aarch64" if target_machine == "aarch64" else "x86_64"
+ if "pytorch" in backends:
+ # Add extra dependencies for pytorch backend.
+ # Note: Even though the build is CPU-only, the version of pytorch
+        # we are using depends upon libraries like cuda and cudnn. Since
+ # these dependencies are not present in the ubuntu base image,
+ # we must copy these from the Triton min container ourselves.
+ cuda_arch = "sbsa" if target_machine == "aarch64" else "x86_64"
+ df += """
+RUN mkdir -p /usr/local/cuda/lib64/stubs
+COPY --from=min_container /usr/local/cuda/lib64/stubs/libcusparse.so /usr/local/cuda/lib64/stubs/libcusparse.so.12
+COPY --from=min_container /usr/local/cuda/lib64/stubs/libcusolver.so /usr/local/cuda/lib64/stubs/libcusolver.so.11
+COPY --from=min_container /usr/local/cuda/lib64/stubs/libcurand.so /usr/local/cuda/lib64/stubs/libcurand.so.10
+COPY --from=min_container /usr/local/cuda/lib64/stubs/libcufft.so /usr/local/cuda/lib64/stubs/libcufft.so.11
+COPY --from=min_container /usr/local/cuda/lib64/stubs/libcublas.so /usr/local/cuda/lib64/stubs/libcublas.so.12
+COPY --from=min_container /usr/local/cuda/lib64/stubs/libcublasLt.so /usr/local/cuda/lib64/stubs/libcublasLt.so.12
+COPY --from=min_container /usr/local/cuda/lib64/stubs/libcublasLt.so /usr/local/cuda/lib64/stubs/libcublasLt.so.11
+
+RUN mkdir -p /usr/local/cuda/targets/{cuda_arch}-linux/lib
+COPY --from=min_container /usr/local/cuda/lib64/libcudart.so.12 /usr/local/cuda/targets/{cuda_arch}-linux/lib/.
+COPY --from=min_container /usr/local/cuda/lib64/libcupti.so.12 /usr/local/cuda/targets/{cuda_arch}-linux/lib/.
+COPY --from=min_container /usr/local/cuda/lib64/libnvToolsExt.so.1 /usr/local/cuda/targets/{cuda_arch}-linux/lib/.
+COPY --from=min_container /usr/local/cuda/lib64/libnvJitLink.so.12 /usr/local/cuda/targets/{cuda_arch}-linux/lib/.
+
+RUN mkdir -p /opt/hpcx/ucc/lib/ /opt/hpcx/ucx/lib/
+COPY --from=min_container /opt/hpcx/ucc/lib/libucc.so.1 /opt/hpcx/ucc/lib/libucc.so.1
+COPY --from=min_container /opt/hpcx/ucx/lib/libucm.so.0 /opt/hpcx/ucx/lib/libucm.so.0
+COPY --from=min_container /opt/hpcx/ucx/lib/libucp.so.0 /opt/hpcx/ucx/lib/libucp.so.0
+COPY --from=min_container /opt/hpcx/ucx/lib/libucs.so.0 /opt/hpcx/ucx/lib/libucs.so.0
+COPY --from=min_container /opt/hpcx/ucx/lib/libuct.so.0 /opt/hpcx/ucx/lib/libuct.so.0
+
+COPY --from=min_container /usr/lib/{libs_arch}-linux-gnu/libcudnn.so.9 /usr/lib/{libs_arch}-linux-gnu/libcudnn.so.9
+
+# patchelf is needed to add deps of libcublasLt.so.12 to libtorch_cuda.so
+RUN apt-get update \\
+ && apt-get install -y --no-install-recommends openmpi-bin patchelf
+
+ENV LD_LIBRARY_PATH /usr/local/cuda/targets/{cuda_arch}-linux/lib:/usr/local/cuda/lib64/stubs:${{LD_LIBRARY_PATH}}
+""".format(
+ cuda_arch=cuda_arch, libs_arch=libs_arch
+ )
+
+ if ("pytorch" in backends) or ("tensorflow" in backends):
+ # Add NCCL dependency for tensorflow/pytorch backend.
+ # Note: Even though the build is CPU-only, the version of
+ # tensorflow/pytorch we are using depends upon the NCCL library.
+ # Since this dependency is not present in the ubuntu base image,
+ # we must copy it from the Triton min container ourselves.
+ df += """
+COPY --from=min_container /usr/lib/{libs_arch}-linux-gnu/libnccl.so.2 /usr/lib/{libs_arch}-linux-gnu/libnccl.so.2
+""".format(
+ libs_arch=libs_arch
+ )
+
+ return df
+
+
+def create_dockerfile_windows(
+ ddir, dockerfile_name, argmap, backends, repoagents, caches
+):
+ df = """
+ARG TRITON_VERSION={}
+ARG TRITON_CONTAINER_VERSION={}
+ARG BASE_IMAGE={}
+
+############################################################################
+## Production stage: Create container with just inference server executable
+############################################################################
+FROM ${{BASE_IMAGE}}
+
+ARG TRITON_VERSION
+ARG TRITON_CONTAINER_VERSION
+
+ENV TRITON_SERVER_VERSION ${{TRITON_VERSION}}
+ENV NVIDIA_TRITON_SERVER_VERSION ${{TRITON_CONTAINER_VERSION}}
+LABEL com.nvidia.tritonserver.version="${{TRITON_SERVER_VERSION}}"
+
+RUN setx path "%path%;C:\\opt\\tritonserver\\bin"
+
+""".format(
+ argmap["TRITON_VERSION"],
+ argmap["TRITON_CONTAINER_VERSION"],
+ argmap["BASE_IMAGE"],
+ )
+ df += """
+WORKDIR /opt
+RUN rmdir /S/Q tritonserver || exit 0
+COPY --chown=1000:1000 build/install tritonserver
+
+WORKDIR /opt/tritonserver
+COPY --chown=1000:1000 NVIDIA_Deep_Learning_Container_License.pdf .
+
+"""
+ df += """
+ENTRYPOINT []
+ENV NVIDIA_BUILD_ID {}
+LABEL com.nvidia.build.id={}
+LABEL com.nvidia.build.ref={}
+""".format(
+ argmap["NVIDIA_BUILD_ID"], argmap["NVIDIA_BUILD_ID"], argmap["NVIDIA_BUILD_REF"]
+ )
+
+ with open(os.path.join(ddir, dockerfile_name), "w") as dfile:
+ dfile.write(df)
+
+
+def create_build_dockerfiles(
+ container_build_dir, images, backends, repoagents, caches, endpoints
+):
+ if "base" in images:
+ base_image = images["base"]
+ elif target_platform() == "windows":
+ base_image = "mcr.microsoft.com/dotnet/framework/sdk:4.8"
+ elif FLAGS.enable_gpu:
+ base_image = "nvcr.io/nvidia/tritonserver:{}-py3-min".format(
+ FLAGS.upstream_container_version
+ )
+ else:
+ base_image = "ubuntu:22.04"
+
+ dockerfileargmap = {
+ "NVIDIA_BUILD_REF": "" if FLAGS.build_sha is None else FLAGS.build_sha,
+ "NVIDIA_BUILD_ID": "" if FLAGS.build_id is None else FLAGS.build_id,
+ "TRITON_VERSION": FLAGS.version,
+ "TRITON_CONTAINER_VERSION": FLAGS.container_version,
+ "BASE_IMAGE": base_image,
+ "DCGM_VERSION": ""
+ if FLAGS.version is None or FLAGS.version not in TRITON_VERSION_MAP
+ else TRITON_VERSION_MAP[FLAGS.version][5],
+ }
+
+    # For the CPU-only image we need to copy some CUDA libraries and dependencies
+    # since we are using PyTorch and TensorFlow containers that
+    # are not CPU-only.
+ if (
+ not FLAGS.enable_gpu
+ and (("pytorch" in backends) or ("tensorflow" in backends))
+ and (target_platform() != "windows")
+ ):
+ if "gpu-base" in images:
+ gpu_base_image = images["gpu-base"]
+ else:
+ gpu_base_image = "nvcr.io/nvidia/tritonserver:{}-py3-min".format(
+ FLAGS.upstream_container_version
+ )
+ dockerfileargmap["GPU_BASE_IMAGE"] = gpu_base_image
+
+ create_dockerfile_buildbase(
+ FLAGS.build_dir, "Dockerfile.buildbase", dockerfileargmap
+ )
+
+ if target_platform() == "windows":
+ create_dockerfile_windows(
+ FLAGS.build_dir,
+ "Dockerfile",
+ dockerfileargmap,
+ backends,
+ repoagents,
+ caches,
+ )
+ else:
+ create_dockerfile_linux(
+ FLAGS.build_dir,
+ "Dockerfile",
+ dockerfileargmap,
+ backends,
+ repoagents,
+ caches,
+ endpoints,
+ )
+
+    # Dockerfile used for creating the CI base image.
+ create_dockerfile_cibase(FLAGS.build_dir, "Dockerfile.cibase", dockerfileargmap)
+
+
+def create_docker_build_script(script_name, container_install_dir, container_ci_dir):
+ with BuildScript(
+ os.path.join(FLAGS.build_dir, script_name),
+ verbose=FLAGS.verbose,
+ desc=("Docker-based build script for Triton Inference Server"),
+ ) as docker_script:
+ #
+ # Build base image... tritonserver_buildbase
+ #
+ docker_script.commentln(8)
+ docker_script.comment("Create Triton base build image")
+ docker_script.comment(
+ "This image contains all dependencies necessary to build Triton"
+ )
+ docker_script.comment()
+
+ cachefrommap = [
+ "tritonserver_buildbase",
+ "tritonserver_buildbase_cache0",
+ "tritonserver_buildbase_cache1",
+ ]
+
+ baseargs = [
+ "docker",
+ "build",
+ "-t",
+ "tritonserver_buildbase",
+ "-f",
+ os.path.join(FLAGS.build_dir, "Dockerfile.buildbase"),
+ ]
+
+ if not FLAGS.no_container_pull:
+ baseargs += [
+ "--pull",
+ ]
+
+ # Windows docker runs in a VM and memory needs to be specified
+ # explicitly (at least for some configurations of docker).
+ if target_platform() == "windows":
+ if FLAGS.container_memory:
+ baseargs += ["--memory", FLAGS.container_memory]
+
+ baseargs += ["--cache-from={}".format(k) for k in cachefrommap]
+ baseargs += ["."]
+
+ docker_script.cwd(THIS_SCRIPT_DIR)
+ docker_script.cmd(baseargs, check_exitcode=True)
+
+ #
+ # Build...
+ #
+ docker_script.blankln()
+ docker_script.commentln(8)
+ docker_script.comment("Run build in tritonserver_buildbase container")
+ docker_script.comment("Mount a directory into the container where the install")
+ docker_script.comment("artifacts will be placed.")
+ docker_script.comment()
+
+ # Don't use '-v' to communicate the built artifacts out of the
+ # build, because we want this code to work even if run within
+ # Docker (i.e. docker-in-docker) and not just if run directly
+ # from host.
+ runargs = [
+ "docker",
+ "run",
+ "-w",
+ "/workspace/build",
+ "--name",
+ "tritonserver_builder",
+ ]
+
+ if not FLAGS.no_container_interactive:
+ runargs += ["-it"]
+
+ if target_platform() == "windows":
+ if FLAGS.container_memory:
+ runargs += ["--memory", FLAGS.container_memory]
+            runargs += ["-v", "\\\\.\\pipe\\docker_engine:\\\\.\\pipe\\docker_engine"]
+ else:
+ runargs += ["-v", "/var/run/docker.sock:/var/run/docker.sock"]
+
+ runargs += ["tritonserver_buildbase"]
+
+ if target_platform() == "windows":
+ runargs += ["powershell.exe", "-noexit", "-File", "./cmake_build.ps1"]
+ else:
+ runargs += ["./cmake_build"]
+
+ # Remove existing tritonserver_builder container...
+ if target_platform() == "windows":
+ docker_script.cmd(["docker", "rm", "tritonserver_builder"])
+ else:
+ docker_script._file.write(
+ 'if [ "$(docker ps -a | grep tritonserver_builder)" ]; then docker rm -f tritonserver_builder; fi\n'
+ )
+
+ docker_script.cmd(runargs, check_exitcode=True)
+
+ docker_script.cmd(
+ [
+ "docker",
+ "cp",
+ "tritonserver_builder:/tmp/tritonbuild/install",
+ FLAGS.build_dir,
+ ],
+ check_exitcode=True,
+ )
+ docker_script.cmd(
+ [
+ "docker",
+ "cp",
+ "tritonserver_builder:/tmp/tritonbuild/ci",
+ FLAGS.build_dir,
+ ],
+ check_exitcode=True,
+ )
+
+ #
+ # Final image... tritonserver
+ #
+ docker_script.blankln()
+ docker_script.commentln(8)
+ docker_script.comment("Create final tritonserver image")
+ docker_script.comment()
+
+ finalargs = [
+ "docker",
+ "build",
+ "-t",
+ "tritonserver",
+ "-f",
+ os.path.join(FLAGS.build_dir, "Dockerfile"),
+ ".",
+ ]
+
+ docker_script.cwd(THIS_SCRIPT_DIR)
+ docker_script.cmd(finalargs, check_exitcode=True)
+
+ #
+ # CI base image... tritonserver_cibase
+ #
+ docker_script.blankln()
+ docker_script.commentln(8)
+ docker_script.comment("Create CI base image")
+ docker_script.comment()
+
+ cibaseargs = [
+ "docker",
+ "build",
+ "-t",
+ "tritonserver_cibase",
+ "-f",
+ os.path.join(FLAGS.build_dir, "Dockerfile.cibase"),
+ ".",
+ ]
+
+ docker_script.cwd(THIS_SCRIPT_DIR)
+ docker_script.cmd(cibaseargs, check_exitcode=True)
+
+
+def core_build(
+ cmake_script, repo_dir, cmake_dir, build_dir, install_dir, components, backends
+):
+ repo_build_dir = os.path.join(build_dir, "tritonserver", "build")
+ repo_install_dir = os.path.join(build_dir, "tritonserver", "install")
+
+ cmake_script.commentln(8)
+ cmake_script.comment("Triton core library and tritonserver executable")
+ cmake_script.comment()
+ cmake_script.mkdir(repo_build_dir)
+ cmake_script.cwd(repo_build_dir)
+ cmake_script.cmake(
+ core_cmake_args(components, backends, cmake_dir, repo_install_dir)
+ )
+ cmake_script.makeinstall()
+
+ if target_platform() == "windows":
+ cmake_script.mkdir(os.path.join(install_dir, "bin"))
+ cmake_script.cp(
+ os.path.join(repo_install_dir, "bin", "tritonserver.exe"),
+ os.path.join(install_dir, "bin"),
+ )
+ cmake_script.cp(
+ os.path.join(repo_install_dir, "bin", "tritonserver.dll"),
+ os.path.join(install_dir, "bin"),
+ )
+ else:
+ cmake_script.mkdir(os.path.join(install_dir, "bin"))
+ cmake_script.cp(
+ os.path.join(repo_install_dir, "bin", "tritonserver"),
+ os.path.join(install_dir, "bin"),
+ )
+ cmake_script.mkdir(os.path.join(install_dir, "lib"))
+ cmake_script.cp(
+ os.path.join(repo_install_dir, "lib", "libtritonserver.so"),
+ os.path.join(install_dir, "lib"),
+ )
+        # [FIXME] Placing the Triton server wheel file in 'python' for now; it
+        # should be uploaded to a pip registry so it can be installed directly.
+ cmake_script.mkdir(os.path.join(install_dir, "python"))
+ cmake_script.cp(
+ os.path.join(repo_install_dir, "python", "tritonserver*.whl"),
+ os.path.join(install_dir, "python"),
+ )
+
+ cmake_script.mkdir(os.path.join(install_dir, "include", "triton"))
+ cmake_script.cpdir(
+ os.path.join(repo_install_dir, "include", "triton", "core"),
+ os.path.join(install_dir, "include", "triton", "core"),
+ )
+
+ cmake_script.cp(os.path.join(repo_dir, "LICENSE"), install_dir)
+ cmake_script.cp(os.path.join(repo_dir, "TRITON_VERSION"), install_dir)
+
+    # If requested, package the source code for all OSS used in the build.
+    # For Windows, Triton is not delivered as a container, so skip this
+    # step on the Windows platform.
+ if target_platform() != "windows":
+ if (
+ (not FLAGS.no_container_build)
+ and (not FLAGS.no_core_build)
+ and (not FLAGS.no_container_source)
+ ):
+ cmake_script.mkdir(os.path.join(install_dir, "third-party-src"))
+ cmake_script.cwd(repo_build_dir)
+ cmake_script.tar(
+ "third-party-src",
+ os.path.join(install_dir, "third-party-src", "src.tar.gz"),
+ )
+ cmake_script.cp(
+ os.path.join(repo_dir, "docker", "README.third-party-src"),
+ os.path.join(install_dir, "third-party-src", "README"),
+ )
+
+ cmake_script.comment()
+ cmake_script.comment("end Triton core library and tritonserver executable")
+ cmake_script.commentln(8)
+ cmake_script.blankln()
+
+
+def tensorrtllm_prebuild(cmake_script):
+ # Export the TRT_ROOT environment variable
+ cmake_script.cmd("export TRT_ROOT=/usr/local/tensorrt")
+ cmake_script.cmd("export ARCH=$(uname -m)")
+
+
+def backend_build(
+ be,
+ cmake_script,
+ tag,
+ build_dir,
+ install_dir,
+ github_organization,
+ images,
+ components,
+ library_paths,
+):
+ repo_build_dir = os.path.join(build_dir, be, "build")
+ repo_install_dir = os.path.join(build_dir, be, "install")
+
+ cmake_script.commentln(8)
+ cmake_script.comment(f"'{be}' backend")
+ cmake_script.comment("Delete this section to remove backend from build")
+ cmake_script.comment()
+ cmake_script.mkdir(build_dir)
+ cmake_script.cwd(build_dir)
+ cmake_script.gitclone(backend_repo(be), tag, be, github_organization)
+
+ if be == "tensorrtllm":
+ tensorrtllm_prebuild(cmake_script)
+
+ cmake_script.mkdir(repo_build_dir)
+ cmake_script.cwd(repo_build_dir)
+ cmake_script.cmake(
+ backend_cmake_args(images, components, be, repo_install_dir, library_paths)
+ )
+ cmake_script.makeinstall()
+
+ cmake_script.mkdir(os.path.join(install_dir, "backends"))
+ cmake_script.rmdir(os.path.join(install_dir, "backends", be))
+
+ cmake_script.cpdir(
+ os.path.join(repo_install_dir, "backends", be),
+ os.path.join(install_dir, "backends"),
+ )
+
+ cmake_script.comment()
+ cmake_script.comment(f"end '{be}' backend")
+ cmake_script.commentln(8)
+ cmake_script.blankln()
+
+
+def backend_clone(
+ be,
+ clone_script,
+ tag,
+ build_dir,
+ install_dir,
+ github_organization,
+):
+ clone_script.commentln(8)
+ clone_script.comment(f"'{be}' backend")
+ clone_script.comment("Delete this section to remove backend from build")
+ clone_script.comment()
+ clone_script.mkdir(build_dir)
+ clone_script.cwd(build_dir)
+ clone_script.gitclone(backend_repo(be), tag, be, github_organization)
+
+ repo_target_dir = os.path.join(install_dir, "backends")
+ clone_script.mkdir(repo_target_dir)
+ backend_dir = os.path.join(repo_target_dir, be)
+ clone_script.rmdir(backend_dir)
+ clone_script.mkdir(backend_dir)
+
+ clone_script.cp(
+ os.path.join(build_dir, be, "src", "model.py"),
+ backend_dir,
+ )
+
+ clone_script.comment()
+ clone_script.comment(f"end '{be}' backend")
+ clone_script.commentln(8)
+ clone_script.blankln()
+
+
+def repo_agent_build(
+ ra, cmake_script, build_dir, install_dir, repoagent_repo, repoagents
+):
+ repo_build_dir = os.path.join(build_dir, ra, "build")
+ repo_install_dir = os.path.join(build_dir, ra, "install")
+
+ cmake_script.commentln(8)
+ cmake_script.comment(f"'{ra}' repository agent")
+ cmake_script.comment("Delete this section to remove repository agent from build")
+ cmake_script.comment()
+ cmake_script.mkdir(build_dir)
+ cmake_script.cwd(build_dir)
+ cmake_script.gitclone(
+ repoagent_repo(ra), repoagents[ra], ra, FLAGS.github_organization
+ )
+
+ cmake_script.mkdir(repo_build_dir)
+ cmake_script.cwd(repo_build_dir)
+ cmake_script.cmake(repoagent_cmake_args(images, components, ra, repo_install_dir))
+ cmake_script.makeinstall()
+
+ cmake_script.mkdir(os.path.join(install_dir, "repoagents"))
+ cmake_script.rmdir(os.path.join(install_dir, "repoagents", ra))
+ cmake_script.cpdir(
+ os.path.join(repo_install_dir, "repoagents", ra),
+ os.path.join(install_dir, "repoagents"),
+ )
+ cmake_script.comment()
+ cmake_script.comment(f"end '{ra}' repository agent")
+ cmake_script.commentln(8)
+ cmake_script.blankln()
+
+
+def cache_build(cache, cmake_script, build_dir, install_dir, cache_repo, caches):
+ repo_build_dir = os.path.join(build_dir, cache, "build")
+ repo_install_dir = os.path.join(build_dir, cache, "install")
+
+ cmake_script.commentln(8)
+ cmake_script.comment(f"'{cache}' cache")
+ cmake_script.comment("Delete this section to remove cache from build")
+ cmake_script.comment()
+ cmake_script.mkdir(build_dir)
+ cmake_script.cwd(build_dir)
+ cmake_script.gitclone(
+ cache_repo(cache), caches[cache], cache, FLAGS.github_organization
+ )
+
+ cmake_script.mkdir(repo_build_dir)
+ cmake_script.cwd(repo_build_dir)
+ cmake_script.cmake(cache_cmake_args(images, components, cache, repo_install_dir))
+ cmake_script.makeinstall()
+
+ cmake_script.mkdir(os.path.join(install_dir, "caches"))
+ cmake_script.rmdir(os.path.join(install_dir, "caches", cache))
+ cmake_script.cpdir(
+ os.path.join(repo_install_dir, "caches", cache),
+ os.path.join(install_dir, "caches"),
+ )
+ cmake_script.comment()
+ cmake_script.comment(f"end '{cache}' cache")
+ cmake_script.commentln(8)
+ cmake_script.blankln()
+
+
+def cibase_build(
+ cmake_script, repo_dir, cmake_dir, build_dir, install_dir, ci_dir, backends
+):
+ repo_install_dir = os.path.join(build_dir, "tritonserver", "install")
+
+ cmake_script.commentln(8)
+ cmake_script.comment("Collect Triton CI artifacts")
+ cmake_script.comment()
+
+ cmake_script.mkdir(ci_dir)
+
+ # On windows we are not yet using a CI/QA docker image for
+ # testing, so don't do anything...
+ if target_platform() == "windows":
+ return
+
+ # The core build produces some artifacts that are needed for CI
+ # testing, so include those in the install.
+ cmake_script.cpdir(os.path.join(repo_dir, "qa"), ci_dir)
+ cmake_script.cpdir(os.path.join(repo_dir, "deploy"), ci_dir)
+ cmake_script.mkdir(os.path.join(ci_dir, "docs"))
+ cmake_script.cpdir(
+ os.path.join(repo_dir, "docs", "examples"), os.path.join(ci_dir, "docs")
+ )
+ cmake_script.mkdir(os.path.join(ci_dir, "src", "test"))
+ cmake_script.cpdir(
+ os.path.join(repo_dir, "src", "test", "models"),
+ os.path.join(ci_dir, "src", "test"),
+ )
+ # Skip copying the artifacts in the bin, lib, and python as those directories will
+ # be missing when the core build is not enabled.
+ if not FLAGS.no_core_build:
+ cmake_script.cpdir(os.path.join(repo_install_dir, "bin"), ci_dir)
+ cmake_script.mkdir(os.path.join(ci_dir, "lib"))
+ cmake_script.cp(
+ os.path.join(repo_install_dir, "lib", "libtritonrepoagent_relocation.so"),
+ os.path.join(ci_dir, "lib"),
+ )
+ cmake_script.cpdir(os.path.join(repo_install_dir, "python"), ci_dir)
+
+ # Some of the backends are needed for CI testing
+ cmake_script.mkdir(os.path.join(ci_dir, "backends"))
+ for be in ("identity", "repeat", "square"):
+ be_install_dir = os.path.join(build_dir, be, "install", "backends", be)
+ if target_platform() == "windows":
+ cmake_script.cmd(f"if (Test-Path -Path {be_install_dir}) {{")
+ else:
+ cmake_script.cmd(f"if [[ -e {be_install_dir} ]]; then")
+ cmake_script.cpdir(be_install_dir, os.path.join(ci_dir, "backends"))
+ cmake_script.cmd("}" if target_platform() == "windows" else "fi")
+
+ # Some of the unit-test built backends are needed for CI testing
+ cmake_script.mkdir(os.path.join(ci_dir, "tritonbuild", "tritonserver", "backends"))
+ for be in (
+ "query",
+ "implicit_state",
+ "sequence",
+ "dyna_sequence",
+ "distributed_addsub",
+ "iterative_sequence",
+ ):
+ be_install_dir = os.path.join(repo_install_dir, "backends", be)
+ if target_platform() == "windows":
+ cmake_script.cmd(f"if (Test-Path -Path {be_install_dir}) {{")
+ else:
+ cmake_script.cmd(f"if [[ -e {be_install_dir} ]]; then")
+ cmake_script.cpdir(
+ be_install_dir,
+ os.path.join(ci_dir, "tritonbuild", "tritonserver", "backends"),
+ )
+ cmake_script.cmd("}" if target_platform() == "windows" else "fi")
+
+ # The onnxruntime_backend build produces some artifacts that
+ # are needed for CI testing.
+ if "onnxruntime" in backends:
+ ort_install_dir = os.path.join(build_dir, "onnxruntime", "install")
+ cmake_script.mkdir(os.path.join(ci_dir, "qa", "L0_custom_ops"))
+ if target_platform() != "igpu":
+ cmake_script.cp(
+ os.path.join(ort_install_dir, "test", "libcustom_op_library.so"),
+ os.path.join(ci_dir, "qa", "L0_custom_ops"),
+ )
+ cmake_script.cp(
+ os.path.join(ort_install_dir, "test", "custom_op_test.onnx"),
+ os.path.join(ci_dir, "qa", "L0_custom_ops"),
+ )
+ # [WIP] other way than wildcard?
+ backend_tests = os.path.join(build_dir, "onnxruntime", "test", "*")
+ cmake_script.cpdir(backend_tests, os.path.join(ci_dir, "qa"))
+
+ # Need the build area for some backends so that they can be
+ # rebuilt with specific options.
+ cmake_script.mkdir(os.path.join(ci_dir, "tritonbuild"))
+ for be in ("identity", "python"):
+ if be in backends:
+ cmake_script.rmdir(os.path.join(build_dir, be, "build"))
+ cmake_script.rmdir(os.path.join(build_dir, be, "install"))
+ cmake_script.cpdir(
+ os.path.join(build_dir, be), os.path.join(ci_dir, "tritonbuild")
+ )
+
+ cmake_script.comment()
+ cmake_script.comment("end Triton CI artifacts")
+ cmake_script.commentln(8)
+ cmake_script.blankln()
+
+
+def finalize_build(cmake_script, install_dir, ci_dir):
+ cmake_script.cmd(f"chmod -R a+rw {install_dir}")
+ cmake_script.cmd(f"chmod -R a+rw {ci_dir}")
+
+
+def enable_all():
+ if target_platform() != "windows":
+ all_backends = [
+ "ensemble",
+ "identity",
+ "square",
+ "repeat",
+ "tensorflow",
+ "onnxruntime",
+ "python",
+ "dali",
+ "pytorch",
+ "openvino",
+ "fil",
+ "tensorrt",
+ ]
+ all_repoagents = ["checksum"]
+ all_caches = ["local", "redis"]
+ all_filesystems = ["gcs", "s3", "azure_storage"]
+ all_endpoints = ["http", "grpc", "sagemaker", "vertex-ai"]
+
+ FLAGS.enable_logging = True
+ FLAGS.enable_stats = True
+ FLAGS.enable_metrics = True
+ FLAGS.enable_gpu_metrics = True
+ FLAGS.enable_cpu_metrics = True
+ FLAGS.enable_tracing = True
+ FLAGS.enable_nvtx = True
+ FLAGS.enable_gpu = True
+ else:
+ all_backends = [
+ "ensemble",
+ "identity",
+ "square",
+ "repeat",
+ "onnxruntime",
+ "openvino",
+ "tensorrt",
+ ]
+ all_repoagents = ["checksum"]
+ all_caches = ["local", "redis"]
+ all_filesystems = []
+ all_endpoints = ["http", "grpc"]
+
+ FLAGS.enable_logging = True
+ FLAGS.enable_stats = True
+ FLAGS.enable_tracing = True
+ FLAGS.enable_gpu = True
+
+ requested_backends = []
+ for be in FLAGS.backend:
+ parts = be.split(":")
+ requested_backends += [parts[0]]
+ for be in all_backends:
+ if be not in requested_backends:
+ FLAGS.backend += [be]
+
+ requested_repoagents = []
+ for ra in FLAGS.repoagent:
+ parts = ra.split(":")
+ requested_repoagents += [parts[0]]
+ for ra in all_repoagents:
+ if ra not in requested_repoagents:
+ FLAGS.repoagent += [ra]
+
+ requested_caches = []
+ for cache in FLAGS.cache:
+ parts = cache.split(":")
+ requested_caches += [parts[0]]
+ for cache in all_caches:
+ if cache not in requested_caches:
+ FLAGS.cache += [cache]
+
+ for fs in all_filesystems:
+ if fs not in FLAGS.filesystem:
+ FLAGS.filesystem += [fs]
+
+ for ep in all_endpoints:
+ if ep not in FLAGS.endpoint:
+ FLAGS.endpoint += [ep]
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+
+ group_qv = parser.add_mutually_exclusive_group()
+ group_qv.add_argument(
+ "-q",
+ "--quiet",
+ action="store_true",
+ required=False,
+ help="Disable console output.",
+ )
+ group_qv.add_argument(
+ "-v",
+ "--verbose",
+ action="store_true",
+ required=False,
+ help="Enable verbose output.",
+ )
+
+ parser.add_argument(
+ "--dryrun",
+ action="store_true",
+ required=False,
+ help="Output the build scripts, but do not perform build.",
+ )
+ parser.add_argument(
+ "--no-container-build",
+ action="store_true",
+ required=False,
+ help="Do not use Docker container for build.",
+ )
+ parser.add_argument(
+ "--no-container-interactive",
+ action="store_true",
+ required=False,
+ help='Do not use -it argument to "docker run" when performing container build.',
+ )
+ parser.add_argument(
+ "--no-container-pull",
+ action="store_true",
+ required=False,
+ help="Do not use Docker --pull argument when building container.",
+ )
+ parser.add_argument(
+ "--container-memory",
+ default=None,
+ required=False,
+ help="Value for Docker --memory argument. Used only for windows builds.",
+ )
+ parser.add_argument(
+ "--target-platform",
+ required=False,
+ default=None,
+ help='Target platform for build, can be "linux", "windows" or "igpu". If not specified, build targets the current platform.',
+ )
+ parser.add_argument(
+ "--target-machine",
+ required=False,
+ default=None,
+ help="Target machine/architecture for build. If not specified, build targets the current machine/architecture.",
+ )
+
+ parser.add_argument(
+ "--build-id",
+ type=str,
+ required=False,
+ help="Build ID associated with the build.",
+ )
+ parser.add_argument(
+ "--build-sha", type=str, required=False, help="SHA associated with the build."
+ )
+ parser.add_argument(
+ "--build-dir",
+ type=str,
+ required=False,
+ help="Build directory. All repo clones and builds will be performed in this directory.",
+ )
+ parser.add_argument(
+ "--install-dir",
+ type=str,
+ required=False,
+ default=None,
+ help="Install directory, default is /opt/tritonserver.",
+ )
+ parser.add_argument(
+ "--cmake-dir",
+ type=str,
+ required=False,
+ help="Directory containing the CMakeLists.txt file for Triton server.",
+ )
+ parser.add_argument(
+ "--tmp-dir",
+ type=str,
+ required=False,
+ default="/tmp",
+ help="Temporary directory used for building inside docker. Default is /tmp.",
+ )
+ parser.add_argument(
+ "--library-paths",
+ action="append",
+ required=False,
+ default=None,
+        help="Specify library paths for respective backends in build as <backend-name>[:<library-path>].",
+ )
+ parser.add_argument(
+ "--build-type",
+ required=False,
+ default="Release",
+ help='Build type, one of "Release", "Debug", "RelWithDebInfo" or "MinSizeRel". Default is "Release".',
+ )
+ parser.add_argument(
+ "-j",
+ "--build-parallel",
+ type=int,
+ required=False,
+ default=None,
+ help="Build parallelism. Defaults to 2 * number-of-cores.",
+ )
+
+ parser.add_argument(
+ "--github-organization",
+ type=str,
+ required=False,
+ default="https://github.com/triton-inference-server",
+ help='The GitHub organization containing the repos used for the build. Defaults to "https://github.com/triton-inference-server".',
+ )
+ parser.add_argument(
+ "--version",
+ type=str,
+ required=False,
+ help="The Triton version. If not specified defaults to the value in the TRITON_VERSION file.",
+ )
+ parser.add_argument(
+ "--container-version",
+ type=str,
+ required=False,
+ help="The Triton container version to build. If not specified the container version will be chosen automatically based on --version value.",
+ )
+ parser.add_argument(
+ "--upstream-container-version",
+ type=str,
+ required=False,
+ help="The upstream container version to use for the build. If not specified the upstream container version will be chosen automatically based on --version value.",
+ )
+ parser.add_argument(
+ "--container-prebuild-command",
+ type=str,
+ required=False,
+        help="When performing a container build, this command will be executed within the container just before the build is performed.",
+ )
+ parser.add_argument(
+ "--no-container-source",
+ action="store_true",
+ required=False,
+ help="Do not include OSS source code in Docker container.",
+ )
+ parser.add_argument(
+ "--image",
+ action="append",
+ required=False,
+        help='Use specified Docker image in build as <image-name>,<image>. <image-name> can be "base", "gpu-base", "tensorflow", or "pytorch".',
+ )
+
+ parser.add_argument(
+ "--enable-all",
+ action="store_true",
+ required=False,
+ help="Enable all standard released Triton features, backends, repository agents, caches, endpoints and file systems.",
+ )
+ parser.add_argument(
+ "--enable-logging", action="store_true", required=False, help="Enable logging."
+ )
+ parser.add_argument(
+ "--enable-stats",
+ action="store_true",
+ required=False,
+ help="Enable statistics collection.",
+ )
+ parser.add_argument(
+ "--enable-metrics",
+ action="store_true",
+ required=False,
+ help="Enable metrics reporting.",
+ )
+ parser.add_argument(
+ "--enable-gpu-metrics",
+ action="store_true",
+ required=False,
+ help="Include GPU metrics in reported metrics.",
+ )
+ parser.add_argument(
+ "--enable-cpu-metrics",
+ action="store_true",
+ required=False,
+ help="Include CPU metrics in reported metrics.",
+ )
+ parser.add_argument(
+ "--enable-tracing", action="store_true", required=False, help="Enable tracing."
+ )
+ parser.add_argument(
+ "--enable-nvtx", action="store_true", required=False, help="Enable NVTX."
+ )
+ parser.add_argument(
+ "--enable-gpu", action="store_true", required=False, help="Enable GPU support."
+ )
+ parser.add_argument(
+ "--enable-mali-gpu",
+ action="store_true",
+ required=False,
+ help="Enable ARM MALI GPU support.",
+ )
+ parser.add_argument(
+ "--min-compute-capability",
+ type=str,
+ required=False,
+ default="6.0",
+ help="Minimum CUDA compute capability supported by server.",
+ )
+
+ parser.add_argument(
+ "--endpoint",
+ action="append",
+ required=False,
+ help='Include specified endpoint in build. Allowed values are "grpc", "http", "vertex-ai" and "sagemaker".',
+ )
+ parser.add_argument(
+ "--filesystem",
+ action="append",
+ required=False,
+ help='Include specified filesystem in build. Allowed values are "gcs", "azure_storage" and "s3".',
+ )
+ parser.add_argument(
+ "--no-core-build",
+ action="store_true",
+ required=False,
+ help="Do not build Triton core shared library or executable.",
+ )
+ parser.add_argument(
+ "--backend",
+ action="append",
+ required=False,
+        help='Include specified backend in build as <backend-name>[:<repo-tag>]. If <repo-tag> starts with "pull/" then it refers to a pull-request reference, otherwise <repo-tag> indicates the git tag/branch to use for the build. If the version is non-development then the default is the release branch matching the container version (e.g. version YY.MM -> branch rYY.MM); otherwise the default is "main" (e.g. version YY.MMdev -> branch main).',
+ )
+ parser.add_argument(
+ "--repo-tag",
+ action="append",
+ required=False,
+        help='The version of a component to use in the build as <component-name>:<repo-tag>. <component-name> can be "common", "core", "backend" or "thirdparty". <repo-tag> indicates the git tag/branch to use for the build. Currently <repo-tag> does not support pull-request reference. If the version is non-development then the default is the release branch matching the container version (e.g. version YY.MM -> branch rYY.MM); otherwise the default is "main" (e.g. version YY.MMdev -> branch main).',
+ )
+ parser.add_argument(
+ "--repoagent",
+ action="append",
+ required=False,
+        help='Include specified repo agent in build as <repoagent-name>[:<repo-tag>]. If <repo-tag> starts with "pull/" then it refers to a pull-request reference, otherwise <repo-tag> indicates the git tag/branch to use for the build. If the version is non-development then the default is the release branch matching the container version (e.g. version YY.MM -> branch rYY.MM); otherwise the default is "main" (e.g. version YY.MMdev -> branch main).',
+ )
+ parser.add_argument(
+ "--cache",
+ action="append",
+ required=False,
+        help='Include specified cache in build as <cache-name>[:<repo-tag>]. If <repo-tag> starts with "pull/" then it refers to a pull-request reference, otherwise <repo-tag> indicates the git tag/branch to use for the build. If the version is non-development then the default is the release branch matching the container version (e.g. version YY.MM -> branch rYY.MM); otherwise the default is "main" (e.g. version YY.MMdev -> branch main).',
+ )
+ parser.add_argument(
+ "--no-force-clone",
+ action="store_true",
+ default=False,
+ help="Do not create fresh clones of repos that have already been cloned.",
+ )
+ parser.add_argument(
+ "--extra-core-cmake-arg",
+ action="append",
+ required=False,
+        help="Extra CMake argument as <name>=<value>. The argument is passed to CMake as -D<name>=<value> and is included after all CMake arguments added by build.py for the core builds.",
+ )
+ parser.add_argument(
+ "--override-core-cmake-arg",
+ action="append",
+ required=False,
+        help="Override specified CMake argument in the build as <name>=<value>. The argument is passed to CMake as -D<name>=<value>. This flag only impacts CMake arguments that are used by build.py. To unconditionally add a CMake argument to the core build use --extra-core-cmake-arg.",
+ )
+ parser.add_argument(
+ "--extra-backend-cmake-arg",
+ action="append",
+ required=False,
+        help="Extra CMake argument for a backend build as <backend>:<name>=<value>. The argument is passed to CMake as -D<name>=<value> and is included after all CMake arguments added by build.py for the backend.",
+ )
+ parser.add_argument(
+ "--override-backend-cmake-arg",
+ action="append",
+ required=False,
+        help="Override specified backend CMake argument in the build as <backend>:<name>=<value>. The argument is passed to CMake as -D<name>=<value>. This flag only impacts CMake arguments that are used by build.py. To unconditionally add a CMake argument to the backend build use --extra-backend-cmake-arg.",
+ )
+
+ FLAGS = parser.parse_args()
+
+ if FLAGS.image is None:
+ FLAGS.image = []
+ if FLAGS.repo_tag is None:
+ FLAGS.repo_tag = []
+ if FLAGS.backend is None:
+ FLAGS.backend = []
+ if FLAGS.endpoint is None:
+ FLAGS.endpoint = []
+ if FLAGS.filesystem is None:
+ FLAGS.filesystem = []
+ if FLAGS.repoagent is None:
+ FLAGS.repoagent = []
+ if FLAGS.cache is None:
+ FLAGS.cache = []
+ if FLAGS.library_paths is None:
+ FLAGS.library_paths = []
+ if FLAGS.extra_core_cmake_arg is None:
+ FLAGS.extra_core_cmake_arg = []
+ if FLAGS.override_core_cmake_arg is None:
+ FLAGS.override_core_cmake_arg = []
+ if FLAGS.override_backend_cmake_arg is None:
+ FLAGS.override_backend_cmake_arg = []
+ if FLAGS.extra_backend_cmake_arg is None:
+ FLAGS.extra_backend_cmake_arg = []
+
+ # if --enable-all is specified, then update FLAGS to enable all
+ # settings, backends, repo-agents, caches, file systems, endpoints, etc.
+ if FLAGS.enable_all:
+ enable_all()
+
+ # When doing a docker build, --build-dir, --install-dir and
+ # --cmake-dir must not be set. We will use the build/ subdir
+ # within the server/ repo that contains this build.py script for
+ # --build-dir. If not doing a docker build, --build-dir must be
+ # set.
+ if FLAGS.no_container_build:
+ if FLAGS.build_dir is None:
+ fail("--no-container-build requires --build-dir")
+ if FLAGS.install_dir is None:
+ FLAGS.install_dir = os.path.join(FLAGS.build_dir, "opt", "tritonserver")
+ if FLAGS.cmake_dir is None:
+ FLAGS.cmake_dir = THIS_SCRIPT_DIR
+ else:
+ if FLAGS.build_dir is not None:
+ fail("--build-dir must not be set for container-based build")
+ if FLAGS.install_dir is not None:
+ fail("--install-dir must not be set for container-based build")
+ if FLAGS.cmake_dir is not None:
+ fail("--cmake-dir must not be set for container-based build")
+ FLAGS.build_dir = os.path.join(THIS_SCRIPT_DIR, "build")
+
+ # Determine the versions. Start with Triton version, if --version
+ # is not explicitly specified read from TRITON_VERSION file.
+ if FLAGS.version is None:
+ with open(os.path.join(THIS_SCRIPT_DIR, "TRITON_VERSION"), "r") as vfile:
+ FLAGS.version = vfile.readline().strip()
+
+ if FLAGS.build_parallel is None:
+ FLAGS.build_parallel = multiprocessing.cpu_count() * 2
+
+ log("Building Triton Inference Server")
+ log("platform {}".format(target_platform()))
+ log("machine {}".format(target_machine()))
+ log("version {}".format(FLAGS.version))
+ log("build dir {}".format(FLAGS.build_dir))
+ log("install dir {}".format(FLAGS.install_dir))
+ log("cmake dir {}".format(FLAGS.cmake_dir))
+
+ # Determine the default repo-tag that should be used for images,
+ # backends, repo-agents, and caches if a repo-tag is not given
+ # explicitly. For release branches we use the release branch as
+ # the default, otherwise we use 'main'.
+ default_repo_tag = "main"
+ cver = FLAGS.container_version
+ if cver is None:
+ if FLAGS.version not in TRITON_VERSION_MAP:
+ fail(
+ "unable to determine default repo-tag, container version not known for {}".format(
+ FLAGS.version
+ )
+ )
+ cver = TRITON_VERSION_MAP[FLAGS.version][0]
+ if not cver.endswith("dev"):
+ default_repo_tag = "r" + cver
+ log("default repo-tag: {}".format(default_repo_tag))
+
+ # For other versions use the TRITON_VERSION_MAP unless explicitly
+ # given.
+ FLAGS.container_version, FLAGS.upstream_container_version = container_versions(
+ FLAGS.version, FLAGS.container_version, FLAGS.upstream_container_version
+ )
+
+ log("container version {}".format(FLAGS.container_version))
+ log("upstream container version {}".format(FLAGS.upstream_container_version))
+
+ for ep in FLAGS.endpoint:
+ log(f'endpoint "{ep}"')
+ for fs in FLAGS.filesystem:
+ log(f'filesystem "{fs}"')
+
+ # Initialize map of backends to build and repo-tag for each.
+ backends = {}
+ for be in FLAGS.backend:
+ parts = be.split(":")
+ if len(parts) == 1:
+ parts.append(default_repo_tag)
+ if parts[0] == "tensorflow1":
+ fail(
+                "Starting from Triton version 23.04, support for TensorFlow 1 has been discontinued. Please switch to TensorFlow 2."
+ )
+ if parts[0] == "tensorflow2":
+ parts[0] = "tensorflow"
+ log('backend "{}" at tag/branch "{}"'.format(parts[0], parts[1]))
+ backends[parts[0]] = parts[1]
+
+ if "vllm" in backends:
+ if "python" not in backends:
+ log(
+ "vLLM backend requires Python backend, adding Python backend with tag {}".format(
+ backends["vllm"]
+ )
+ )
+ backends["python"] = backends["vllm"]
+
+ # Initialize map of repo agents to build and repo-tag for each.
+ repoagents = {}
+ for be in FLAGS.repoagent:
+ parts = be.split(":")
+ if len(parts) == 1:
+ parts.append(default_repo_tag)
+ log('repoagent "{}" at tag/branch "{}"'.format(parts[0], parts[1]))
+ repoagents[parts[0]] = parts[1]
+
+ # Initialize map of caches to build and repo-tag for each.
+ caches = {}
+ for be in FLAGS.cache:
+ parts = be.split(":")
+ if len(parts) == 1:
+ parts.append(default_repo_tag)
+ log('cache "{}" at tag/branch "{}"'.format(parts[0], parts[1]))
+ caches[parts[0]] = parts[1]
+
+ # Initialize map of docker images.
+ images = {}
+ for img in FLAGS.image:
+ parts = img.split(",")
+ fail_if(
+            len(parts) != 2, "--image must specify <image-name>,<image>"
+ )
+ fail_if(
+ parts[0]
+ not in ["base", "gpu-base", "pytorch", "tensorflow", "tensorflow2"],
+ "unsupported value for --image",
+ )
+ log('image "{}": "{}"'.format(parts[0], parts[1]))
+ if parts[0] == "tensorflow2":
+ parts[0] = "tensorflow"
+ images[parts[0]] = parts[1]
+
+ # Initialize map of library paths for each backend.
+ library_paths = {}
+ for lpath in FLAGS.library_paths:
+ parts = lpath.split(":")
+ if len(parts) == 2:
+ log('backend "{}" library path "{}"'.format(parts[0], parts[1]))
+ if parts[0] == "tensorflow2":
+ parts[0] = "tensorflow"
+ library_paths[parts[0]] = parts[1]
+
+ # Parse any explicitly specified cmake arguments
+ for cf in FLAGS.extra_core_cmake_arg:
+ parts = cf.split("=")
+        fail_if(len(parts) != 2, "--extra-core-cmake-arg must specify <name>=<value>")
+ log('CMake core extra "-D{}={}"'.format(parts[0], parts[1]))
+ EXTRA_CORE_CMAKE_FLAGS[parts[0]] = parts[1]
+
+ for cf in FLAGS.override_core_cmake_arg:
+ parts = cf.split("=")
+ fail_if(
+            len(parts) != 2, "--override-core-cmake-arg must specify <name>=<value>"
+ )
+ log('CMake core override "-D{}={}"'.format(parts[0], parts[1]))
+ OVERRIDE_CORE_CMAKE_FLAGS[parts[0]] = parts[1]
+
+ for cf in FLAGS.extra_backend_cmake_arg:
+ parts = cf.split(":", 1)
+ fail_if(
+ len(parts) != 2,
+            "--extra-backend-cmake-arg must specify <backend>:<name>=<value>",
+ )
+ be = parts[0]
+ parts = parts[1].split("=", 1)
+ fail_if(
+ len(parts) != 2,
+            "--extra-backend-cmake-arg must specify <backend>:<name>=<value>",
+ )
+ fail_if(
+ be not in backends,
+ '--extra-backend-cmake-arg specifies backend "{}" which is not included in build'.format(
+ be
+ ),
+ )
+ log('backend "{}" CMake extra "-D{}={}"'.format(be, parts[0], parts[1]))
+ if be not in EXTRA_BACKEND_CMAKE_FLAGS:
+ EXTRA_BACKEND_CMAKE_FLAGS[be] = {}
+ EXTRA_BACKEND_CMAKE_FLAGS[be][parts[0]] = parts[1]
+
+ for cf in FLAGS.override_backend_cmake_arg:
+ parts = cf.split(":", 1)
+ fail_if(
+ len(parts) != 2,
+            "--override-backend-cmake-arg must specify <backend>:<name>=<value>",
+ )
+ be = parts[0]
+ parts = parts[1].split("=", 1)
+ fail_if(
+ len(parts) != 2,
+            "--override-backend-cmake-arg must specify <backend>:<name>=<value>",
+ )
+ fail_if(
+ be not in backends,
+ '--override-backend-cmake-arg specifies backend "{}" which is not included in build'.format(
+ be
+ ),
+ )
+ log('backend "{}" CMake override "-D{}={}"'.format(be, parts[0], parts[1]))
+ if be not in OVERRIDE_BACKEND_CMAKE_FLAGS:
+ OVERRIDE_BACKEND_CMAKE_FLAGS[be] = {}
+ OVERRIDE_BACKEND_CMAKE_FLAGS[be][parts[0]] = parts[1]
+
+ # Initialize map of common components and repo-tag for each.
+ components = {
+ "common": default_repo_tag,
+ "core": default_repo_tag,
+ "backend": default_repo_tag,
+ "thirdparty": default_repo_tag,
+ }
+ for be in FLAGS.repo_tag:
+ parts = be.split(":")
+        fail_if(len(parts) != 2, "--repo-tag must specify <component-name>:<repo-tag>")
+ fail_if(
+ parts[0] not in components,
+            '--repo-tag <component-name> must be "common", "core", "backend", or "thirdparty"',
+ )
+ components[parts[0]] = parts[1]
+ for c in components:
+ log('component "{}" at tag/branch "{}"'.format(c, components[c]))
+
+ # Set the build, install, and cmake directories to use for the
+ # generated build scripts and Dockerfiles. If building without
+ # Docker, these are the directories specified on the cmdline. If
+ # building with Docker, we change these to be directories within
+ # FLAGS.tmp_dir inside the Docker container.
+ script_repo_dir = THIS_SCRIPT_DIR
+ script_build_dir = FLAGS.build_dir
+ script_install_dir = script_ci_dir = FLAGS.install_dir
+ script_cmake_dir = FLAGS.cmake_dir
+ if not FLAGS.no_container_build:
+ # FLAGS.tmp_dir may be specified with "\" on Windows, adjust
+ # to "/" for docker usage.
+ script_build_dir = os.path.normpath(
+ os.path.join(FLAGS.tmp_dir, "tritonbuild").replace("\\", "/")
+ )
+ script_install_dir = os.path.normpath(os.path.join(script_build_dir, "install"))
+ script_ci_dir = os.path.normpath(os.path.join(script_build_dir, "ci"))
+ if target_platform() == "windows":
+ script_repo_dir = script_cmake_dir = os.path.normpath("c:/workspace")
+ else:
+ script_repo_dir = script_cmake_dir = "/workspace"
+
+ script_name = "cmake_build"
+ if target_platform() == "windows":
+ script_name += ".ps1"
+
+ # Write the build script that invokes cmake for the core, backends, repo-agents, and caches.
+ pathlib.Path(FLAGS.build_dir).mkdir(parents=True, exist_ok=True)
+ with BuildScript(
+ os.path.join(FLAGS.build_dir, script_name),
+ verbose=FLAGS.verbose,
+ desc=("Build script for Triton Inference Server"),
+ ) as cmake_script:
+ # Run the container pre-build command if the cmake build is
+ # being done within the build container.
+ if not FLAGS.no_container_build and FLAGS.container_prebuild_command:
+ cmake_script.cmd(FLAGS.container_prebuild_command, check_exitcode=True)
+ cmake_script.blankln()
+
+ # Commands to build the core shared library and the server executable.
+ if not FLAGS.no_core_build:
+ core_build(
+ cmake_script,
+ script_repo_dir,
+ script_cmake_dir,
+ script_build_dir,
+ script_install_dir,
+ components,
+ backends,
+ )
+
+ # Commands to build each backend...
+ for be in backends:
+ # Core backends are not built separately from core so skip...
+ if be in CORE_BACKENDS:
+ continue
+
+ # If armnn_tflite backend, source from external repo for git clone
+ if be == "armnn_tflite":
+ github_organization = "https://gitlab.com/arm-research/smarter/"
+ else:
+ github_organization = FLAGS.github_organization
+
+ if be == "vllm":
+ backend_clone(
+ be,
+ cmake_script,
+ backends[be],
+ script_build_dir,
+ script_install_dir,
+ github_organization,
+ )
+ else:
+ backend_build(
+ be,
+ cmake_script,
+ backends[be],
+ script_build_dir,
+ script_install_dir,
+ github_organization,
+ images,
+ components,
+ library_paths,
+ )
+
+ # Commands to build each repo agent...
+ for ra in repoagents:
+ repo_agent_build(
+ ra,
+ cmake_script,
+ script_build_dir,
+ script_install_dir,
+ repoagent_repo,
+ repoagents,
+ )
+
+ # Commands to build each cache...
+ for cache in caches:
+ cache_build(
+ cache,
+ cmake_script,
+ script_build_dir,
+ script_install_dir,
+ cache_repo,
+ caches,
+ )
+
+ # Commands needed only when building with Docker...
+ if not FLAGS.no_container_build:
+ # Commands to collect all the build artifacts needed for CI
+ # testing.
+ cibase_build(
+ cmake_script,
+ script_repo_dir,
+ script_cmake_dir,
+ script_build_dir,
+ script_install_dir,
+ script_ci_dir,
+ backends,
+ )
+
+ # When building with Docker the install and ci artifacts
+ # written to the build-dir while running the docker container
+ # may have root ownership, so give them permissions to be
+ # managed by all users on the host system.
+ if target_platform() != "windows":
+ finalize_build(cmake_script, script_install_dir, script_ci_dir)
+
+ # If --no-container-build is not specified then we perform the
+ # actual build within a docker container and from that create the
+ # final tritonserver docker image. For the build we need to
+ # generate a few Dockerfiles and a top-level script that drives
+ # the build process.
+ if not FLAGS.no_container_build:
+ script_name = "docker_build"
+ if target_platform() == "windows":
+ script_name += ".ps1"
+
+ create_build_dockerfiles(
+ script_build_dir, images, backends, repoagents, caches, FLAGS.endpoint
+ )
+ create_docker_build_script(script_name, script_install_dir, script_ci_dir)
+
+    # If not a dry-run, execute the script to perform the build... If a
+    # container-based build is requested use the 'docker_build' script,
+    # otherwise build directly on this system using the cmake script.
+ if not FLAGS.dryrun:
+ if target_platform() == "windows":
+ p = subprocess.Popen(
+ ["powershell.exe", "-noexit", "-File", f"./{script_name}"],
+ cwd=FLAGS.build_dir,
+ )
+ else:
+ p = subprocess.Popen([f"./{script_name}"], cwd=FLAGS.build_dir)
+ p.wait()
+ fail_if(p.returncode != 0, "build failed")
diff --git a/compose.py b/compose.py
new file mode 100755
index 0000000000..14b58c93f6
--- /dev/null
+++ b/compose.py
@@ -0,0 +1,525 @@
+#!/usr/bin/env python3
+# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import argparse
+import os
+import platform
+import subprocess
+import sys
+
+FLAGS = None
+
+
+#### helper functions
+def log(msg, force=False):
+ if force or not FLAGS.quiet:
+ try:
+ print(msg, file=sys.stderr)
+ except Exception:
+ print("", file=sys.stderr)
+
+
+def log_verbose(msg):
+ if FLAGS.verbose:
+ log(msg, force=True)
+
+
+def fail(msg):
+ print("error: {}".format(msg), file=sys.stderr)
+ sys.exit(1)
+
+
+def fail_if(p, msg):
+ if p:
+ fail(msg)
+
+
+def start_dockerfile(ddir, images, argmap, dockerfile_name, backends):
+ # Set environment variables, set default user and install dependencies
+ df = """
+#
+# Multistage build.
+#
+ARG TRITON_VERSION={}
+ARG TRITON_CONTAINER_VERSION={}
+
+FROM {} AS full
+""".format(
+ argmap["TRITON_VERSION"], argmap["TRITON_CONTAINER_VERSION"], images["full"]
+ )
+
+ # PyTorch, TensorFlow backends need extra CUDA and other
+ # dependencies during runtime that are missing in the CPU-only base container.
+ # These dependencies must be copied from the Triton Min image.
+ if not FLAGS.enable_gpu and (
+ ("pytorch" in backends)
+ or ("tensorflow" in backends)
+ or ("tensorflow2" in backends)
+ ):
+ df += """
+FROM {} AS min_container
+
+""".format(
+ images["gpu-min"]
+ )
+
+ df += """
+FROM {}
+""".format(
+ images["min"]
+ )
+
+ import build
+
+ df += build.dockerfile_prepare_container_linux(
+ argmap, backends, FLAGS.enable_gpu, platform.machine().lower()
+ )
+ # Copy over files
+ df += """
+WORKDIR /opt/tritonserver
+COPY --chown=1000:1000 --from=full /opt/tritonserver/LICENSE .
+COPY --chown=1000:1000 --from=full /opt/tritonserver/TRITON_VERSION .
+COPY --chown=1000:1000 --from=full /opt/tritonserver/NVIDIA_Deep_Learning_Container_License.pdf .
+COPY --chown=1000:1000 --from=full /opt/tritonserver/bin bin/
+COPY --chown=1000:1000 --from=full /opt/tritonserver/lib lib/
+COPY --chown=1000:1000 --from=full /opt/tritonserver/include include/
+"""
+ with open(os.path.join(ddir, dockerfile_name), "w") as dfile:
+ dfile.write(df)
+
+
+def add_requested_backends(ddir, dockerfile_name, backends):
+ df = "# Copying over backends \n"
+ for backend in backends:
+ df += """COPY --chown=1000:1000 --from=full /opt/tritonserver/backends/{} /opt/tritonserver/backends/{}
+""".format(
+ backend, backend
+ )
+ if len(backends) > 0:
+ df += """
+# Top-level /opt/tritonserver/backends not copied so need to explicitly set permissions here
+RUN chown triton-server:triton-server /opt/tritonserver/backends
+"""
+ with open(os.path.join(ddir, dockerfile_name), "a") as dfile:
+ dfile.write(df)
+
+
+def add_requested_repoagents(ddir, dockerfile_name, repoagents):
+ df = "# Copying over repoagents \n"
+ for ra in repoagents:
+ df += """COPY --chown=1000:1000 --from=full /opt/tritonserver/repoagents/{} /opt/tritonserver/repoagents/{}
+""".format(
+ ra, ra
+ )
+ if len(repoagents) > 0:
+ df += """
+# Top-level /opt/tritonserver/repoagents not copied so need to explicitly set permissions here
+RUN chown triton-server:triton-server /opt/tritonserver/repoagents
+"""
+ with open(os.path.join(ddir, dockerfile_name), "a") as dfile:
+ dfile.write(df)
+
+
+def add_requested_caches(ddir, dockerfile_name, caches):
+ df = "# Copying over caches \n"
+ for cache in caches:
+ df += """COPY --chown=1000:1000 --from=full /opt/tritonserver/caches/{} /opt/tritonserver/caches/{}
+""".format(
+ cache, cache
+ )
+ if len(caches) > 0:
+ df += """
+# Top-level /opt/tritonserver/caches not copied so need to explicitly set permissions here
+RUN chown triton-server:triton-server /opt/tritonserver/caches
+"""
+ with open(os.path.join(ddir, dockerfile_name), "a") as dfile:
+ dfile.write(df)
+
+
+def end_dockerfile(ddir, dockerfile_name, argmap):
+ # Install additional dependencies
+ df = ""
+ if argmap["SAGEMAKER_ENDPOINT"]:
+ df += """
+LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true
+COPY --chown=1000:1000 --from=full /usr/bin/serve /usr/bin/.
+"""
+ with open(os.path.join(ddir, dockerfile_name), "a") as dfile:
+ dfile.write(df)
+
+
+def build_docker_image(ddir, dockerfile_name, container_name):
+ # Create container with docker build
+ p = subprocess.Popen(
+ [
+ "docker",
+ "build",
+ "-t",
+ container_name,
+ "-f",
+ os.path.join(ddir, dockerfile_name),
+ ".",
+ ]
+ )
+ p.wait()
+ fail_if(p.returncode != 0, "docker build {} failed".format(container_name))
+
+
+def get_container_version_if_not_specified():
+ if FLAGS.container_version is None:
+ # Read from TRITON_VERSION file in server repo to determine version
+ with open("TRITON_VERSION", "r") as vfile:
+ version = vfile.readline().strip()
+ import build
+
+ _, FLAGS.container_version = build.container_versions(
+ version, None, FLAGS.container_version
+ )
+ log("version {}".format(version))
+ log("using container version {}".format(FLAGS.container_version))
+
+
+def create_argmap(images, skip_pull):
+ # Extract information from upstream build and create map other functions can
+ # use
+ full_docker_image = images["full"]
+ min_docker_image = images["min"]
+ enable_gpu = FLAGS.enable_gpu
+ # Docker inspect environment variables
+ base_run_args = ["docker", "inspect", "-f"]
+    import re  # used to parse environment variables reported by docker inspect
+
+ # first pull docker images
+ if not skip_pull:
+ log("pulling container:{}".format(full_docker_image))
+ p = subprocess.run(["docker", "pull", full_docker_image])
+ fail_if(
+ p.returncode != 0,
+ "docker pull container {} failed, {}".format(full_docker_image, p.stderr),
+ )
+ if enable_gpu:
+ if not skip_pull:
+ pm = subprocess.run(["docker", "pull", min_docker_image])
+ fail_if(
+ pm.returncode != 0 and not skip_pull,
+ "docker pull container {} failed, {}".format(
+ min_docker_image, pm.stderr
+ ),
+ )
+ pm_path = subprocess.run(
+ base_run_args
+ + [
+ "{{range $index, $value := .Config.Env}}{{$value}} {{end}}",
+ min_docker_image,
+ ],
+ capture_output=True,
+ text=True,
+ )
+ fail_if(
+ pm_path.returncode != 0,
+ "docker inspect to find triton environment variables for min container failed, {}".format(
+ pm_path.stderr
+ ),
+ )
+        # The min container must have GPU support enabled if this is a GPU build
+        vars = pm_path.stdout
+        e = re.search("CUDA_VERSION", vars)
+        gpu_enabled = e is not None
+ fail_if(
+ not gpu_enabled,
+ "Composing container with gpu support enabled but min container provided does not have CUDA installed",
+ )
+
+ # Check full container environment variables
+ p_path = subprocess.run(
+ base_run_args
+ + [
+ "{{range $index, $value := .Config.Env}}{{$value}} {{end}}",
+ full_docker_image,
+ ],
+ capture_output=True,
+ text=True,
+ )
+ fail_if(
+ p_path.returncode != 0,
+ "docker inspect to find environment variables for full container failed, {}".format(
+ p_path.stderr
+ ),
+ )
+ vars = p_path.stdout
+ log_verbose("inspect args: {}".format(vars))
+
+    e0 = re.search(r"TRITON_SERVER_GPU_ENABLED=([\S]{1,}) ", vars)
+    e1 = re.search("CUDA_VERSION", vars)
+    gpu_enabled = False
+    if e0 is not None:
+        gpu_enabled = e0.group(1) == "1"
+    elif e1 is not None:
+        gpu_enabled = True
+ fail_if(
+ gpu_enabled != enable_gpu,
+        "Error: full container provided was built with "
+        "'TRITON_SERVER_GPU_ENABLED' as {} and you are composing container "
+        "with 'TRITON_SERVER_GPU_ENABLED' as {}".format(gpu_enabled, enable_gpu),
+ )
+    e = re.search(r"TRITON_SERVER_VERSION=([\S]{6,}) ", vars)
+ version = "" if e is None else e.group(1)
+ fail_if(
+ len(version) == 0,
+ "docker inspect to find triton server version failed, {}".format(p_path.stderr),
+ )
+    e = re.search(r"NVIDIA_TRITON_SERVER_VERSION=([\S]{5,}) ", vars)
+ container_version = "" if e is None else e.group(1)
+ fail_if(
+ len(container_version) == 0,
+ "docker inspect to find triton container version failed, {}".format(vars),
+ )
+    dcgm_ver = re.search(r"DCGM_VERSION=([\S]{4,}) ", vars)
+ dcgm_version = ""
+ if dcgm_ver is None:
+ dcgm_version = "2.2.3"
+ log(
+            "WARNING: DCGM version not found from image, installing the earliest version {}".format(
+ dcgm_version
+ )
+ )
+ else:
+ dcgm_version = dcgm_ver.group(1)
+ fail_if(
+ len(dcgm_version) == 0,
+ "docker inspect to find DCGM version failed, {}".format(vars),
+ )
+
+ p_sha = subprocess.run(
+ base_run_args
+ + ['{{ index .Config.Labels "com.nvidia.build.ref"}}', full_docker_image],
+ capture_output=True,
+ text=True,
+ )
+ fail_if(
+ p_sha.returncode != 0,
+ "docker inspect of upstream docker image build sha failed, {}".format(
+ p_sha.stderr
+ ),
+ )
+ p_build = subprocess.run(
+ base_run_args
+ + ['{{ index .Config.Labels "com.nvidia.build.id"}}', full_docker_image],
+ capture_output=True,
+ text=True,
+ )
+ fail_if(
+ p_build.returncode != 0,
+ "docker inspect of upstream docker image build sha failed, {}".format(
+ p_build.stderr
+ ),
+ )
+
+ p_find = subprocess.run(
+ ["docker", "run", full_docker_image, "bash", "-c", "ls /usr/bin/"],
+ capture_output=True,
+ text=True,
+ )
+ f = re.search("serve", p_find.stdout)
+ fail_if(
+ p_find.returncode != 0,
+ "Cannot search for 'serve' in /usr/bin, {}".format(p_find.stderr),
+ )
+ argmap = {
+ "NVIDIA_BUILD_REF": p_sha.stdout.rstrip(),
+ "NVIDIA_BUILD_ID": p_build.stdout.rstrip(),
+ "TRITON_VERSION": version,
+ "TRITON_CONTAINER_VERSION": container_version,
+ "DCGM_VERSION": dcgm_version,
+ "SAGEMAKER_ENDPOINT": f is not None,
+ }
+ return argmap
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ group_qv = parser.add_mutually_exclusive_group()
+ group_qv.add_argument(
+ "-q",
+ "--quiet",
+ action="store_true",
+ required=False,
+ help="Disable console output.",
+ )
+ group_qv.add_argument(
+ "-v",
+ "--verbose",
+ action="store_true",
+ required=False,
+ help="Enable verbose output.",
+ )
+ parser.add_argument(
+ "--output-name",
+ type=str,
+ required=False,
+ help='Name for the generated Docker image. Default is "tritonserver".',
+ )
+ parser.add_argument(
+ "--work-dir",
+ type=str,
+ required=False,
+ help="Generated dockerfiles are placed here. Default to current directory.",
+ )
+ parser.add_argument(
+ "--container-version",
+ type=str,
+ required=False,
+ help="The version to use for the generated Docker image. If not specified "
+ "the container version will be chosen automatically based on the "
+ "repository branch.",
+ )
+ parser.add_argument(
+ "--image",
+ action="append",
+ required=False,
+        help="Use specified Docker image to generate Docker image. Specified as "
+        '<image-name>,<image>. <image-name> can be "min", "gpu-min" '
+        'or "full". Both "min" and "full" need to be specified at the same time. '
+        'This will override "--container-version". "gpu-min" is needed for '
+        "CPU-only container to copy TensorFlow and PyTorch deps.",
+ )
+ parser.add_argument(
+ "--enable-gpu",
+ nargs="?",
+ type=lambda x: (str(x).lower() == "true"),
+ const=True,
+ default=True,
+ required=False,
+ help=argparse.SUPPRESS,
+ )
+ parser.add_argument(
+ "--backend",
+ action="append",
+ required=False,
+        help="Include <backend-name> in the generated Docker image. The flag may be "
+ "specified multiple times.",
+ )
+ parser.add_argument(
+ "--repoagent",
+ action="append",
+ required=False,
+        help="Include <repoagent-name> in the generated Docker image. The flag may "
+ "be specified multiple times.",
+ )
+ parser.add_argument(
+ "--cache",
+ action="append",
+ required=False,
+        help="Include <cache-name> in the generated Docker image. The flag may "
+ "be specified multiple times.",
+ )
+ parser.add_argument(
+ "--skip-pull",
+ action="store_true",
+ required=False,
+ help="Do not pull the required docker images. The user is responsible "
+ "for pulling the upstream images needed to compose the image.",
+ )
+ parser.add_argument(
+ "--dry-run",
+ action="store_true",
+ required=False,
+ help="Only creates Dockerfile.compose, does not build the Docker image.",
+ )
+
+ FLAGS = parser.parse_args()
+
+ if FLAGS.work_dir is None:
+ FLAGS.work_dir = "."
+ if FLAGS.output_name is None:
+ FLAGS.output_name = "tritonserver"
+
+ dockerfile_name = "Dockerfile.compose"
+
+ if FLAGS.backend is None:
+ FLAGS.backend = []
+ if FLAGS.repoagent is None:
+ FLAGS.repoagent = []
+ if FLAGS.cache is None:
+ FLAGS.cache = []
+
+ # Initialize map of docker images.
+ images = {}
+ if FLAGS.image:
+ for img in FLAGS.image:
+ parts = img.split(",")
+ fail_if(
+ len(parts) != 2,
+                "--image must specify <image-name>,<image>",
+ )
+ fail_if(
+ parts[0] not in ["min", "full", "gpu-min"],
+ "unsupported image-name '{}' for --image".format(parts[0]),
+ )
+ log('image "{}": "{}"'.format(parts[0], parts[1]))
+ images[parts[0]] = parts[1]
+ else:
+ get_container_version_if_not_specified()
+ if FLAGS.enable_gpu:
+ images = {
+ "full": "nvcr.io/nvidia/tritonserver:{}-py3".format(
+ FLAGS.container_version
+ ),
+ "min": "nvcr.io/nvidia/tritonserver:{}-py3-min".format(
+ FLAGS.container_version
+ ),
+ }
+ else:
+ images = {
+ "full": "nvcr.io/nvidia/tritonserver:{}-cpu-only-py3".format(
+ FLAGS.container_version
+ ),
+ "min": "ubuntu:22.04",
+ }
+    fail_if(len(images) < 2, "Need to specify both 'full' and 'min' images if any are given")
+
+ # For CPU-only image we need to copy some cuda libraries and dependencies
+ # since we are using PyTorch, TensorFlow 1, TensorFlow 2 containers that
+ # are not CPU-only.
+ if (
+ ("pytorch" in FLAGS.backend)
+ or ("tensorflow" in FLAGS.backend)
+ or ("tensorflow2" in FLAGS.backend)
+ ) and ("gpu-min" not in images):
+ images["gpu-min"] = "nvcr.io/nvidia/tritonserver:{}-py3-min".format(
+ FLAGS.container_version
+ )
+
+ argmap = create_argmap(images, FLAGS.skip_pull)
+
+ start_dockerfile(FLAGS.work_dir, images, argmap, dockerfile_name, FLAGS.backend)
+ add_requested_backends(FLAGS.work_dir, dockerfile_name, FLAGS.backend)
+ add_requested_repoagents(FLAGS.work_dir, dockerfile_name, FLAGS.repoagent)
+ add_requested_caches(FLAGS.work_dir, dockerfile_name, FLAGS.cache)
+ end_dockerfile(FLAGS.work_dir, dockerfile_name, argmap)
+
+ if not FLAGS.dry_run:
+ build_docker_image(FLAGS.work_dir, dockerfile_name, FLAGS.output_name)
diff --git a/deploy/alibaba-cloud/README.md b/deploy/alibaba-cloud/README.md
new file mode 100644
index 0000000000..98f914a693
--- /dev/null
+++ b/deploy/alibaba-cloud/README.md
@@ -0,0 +1,180 @@
+
+
+# Deploy Triton Inference Server on PAI-EAS
+* Table Of Contents
+ - [Description](https://yuque.alibaba-inc.com/pai/blade/mtptqc#Description)
+ - [Prerequisites](https://yuque.alibaba-inc.com/pai/blade/mtptqc#Prerequisites)
+ - [Demo Instruction](https://yuque.alibaba-inc.com/pai/blade/mtptqc#31bb94ef)
+ - [Additional Resources](https://yuque.alibaba-inc.com/pai/blade/mtptqc#89d5e680)
+ - [Known Issues](https://yuque.alibaba-inc.com/pai/blade/mtptqc#558ab0be)
+
+# Description
+This repository contains information about how to deploy NVIDIA Triton Inference Server in EAS (Elastic Algorithm Service) of Alibaba Cloud.
+- EAS provides a simple way for deep learning developers to deploy their models in Alibaba Cloud.
+- Using the **Triton Processor** is the recommended way to deploy Triton Inference Server on EAS. Users can deploy a Triton Server simply by preparing their models and creating an EAS service with the processor type set to `triton`.
+- Models should be uploaded to Alibaba Cloud's OSS (Object Storage Service). The user's model repository in OSS will be mounted onto a local path visible to Triton Server.
+- This documentation uses Triton's own example models for the demo. The TensorFlow Inception model can be downloaded by the `fetch_models.sh` script.
+
+# Prerequisites
+- You should register an Alibaba Cloud account and be able to use EAS via [eascmd](https://help.aliyun.com/document_detail/111031.html?spm=a2c4g.11186623.6.752.42356f46FN5fU1), a command line tool to create, stop, or scale services on EAS.
+- Before creating an EAS service, you should buy dedicated resource groups (CPU or GPU) on EAS following this [document](https://www.alibabacloud.com/help/doc-detail/120122.htm).
+- Make sure you can use OSS (Object Storage Service); the models should be uploaded into your own OSS bucket.
+
+# Demo Instruction
+## Prepare a model repo directory in OSS
+Download the TensorFlow Inception model via [fetch_model.sh](https://github.com/triton-inference-server/server/blob/main/docs/examples/fetch_models.sh). Then use [ossutil](https://help.aliyun.com/document_detail/50452.html?spm=a2c4g.11186623.6.833.26d66d51dPEytI), a command line tool for OSS, to upload the model to an OSS directory of your choice.
+
+```
+./ossutil cp inception_graphdef/ oss://triton-model-repo/models
+```
+## Create Triton Service with JSON config by eascmd
+The following is the JSON we use when creating a Triton Server on EAS.
+```
+{
+ "name": "",
+ "processor": "triton",
+ "processor_params": [
+ "--model-repository=oss://triton-model-repo/models",
+ "--allow-grpc=true",
+ "--allow-http=true"
+ ],
+ "metadata": {
+ "instance": 1,
+ "cpu": 4,
+ "gpu": 1,
+ "memory": 10000,
+ "resource": "",
+ "rpc.keepalive": 3000
+ }
+}
+```
+Only `processor` and `processor_params` need to differ from a normal EAS service.
+|params|details|
+|--------|-------|
+|processor|Name should be **triton** to use Triton on EAS|
+|processor_params|List of strings, every element is a param for tritonserver |
+
+```
+./eascmd create triton.config
+[RequestId]: AECDB6A4-CB69-4688-AA35-BA1E020C39E6
++-------------------+------------------------------------------------------------------------------------------------+
+| Internet Endpoint | http://1271520832287160.cn-shanghai.pai-eas.aliyuncs.com/api/predict/test_triton_processor |
+| Intranet Endpoint | http://1271520832287160.vpc.cn-shanghai.pai-eas.aliyuncs.com/api/predict/test_triton_processor |
+| Token | MmY3M2ExZGYwYjZiMTQ5YTRmZWE3MDAzNWM1ZTBiOWQ3MGYxZGNkZQ== |
++-------------------+------------------------------------------------------------------------------------------------+
+[OK] Service is now deploying
+[OK] Successfully synchronized resources
+[OK] Waiting [Total: 1, Pending: 1, Running: 0]
+[OK] Waiting [Total: 1, Pending: 1, Running: 0]
+[OK] Running [Total: 1, Pending: 0, Running: 1]
+[OK] Service is running
+```
+## Query Triton service by python client
+### Install triton's python client
+```
+pip install tritonclient[all]
+```
+### A demo to query inception model
+```
+import numpy as np
+import time
+from PIL import Image
+
+import tritonclient.http as httpclient
+from tritonclient.utils import InferenceServerException
+
+URL = ""
+HEADERS = {"Authorization": ""}
+input_img = httpclient.InferInput("input", [1, 299, 299, 3], "FP32")
+# Use one of the cat images from ImageNet or any cat image you like
+img = Image.open('./cat.png').resize((299, 299))
+img = np.asarray(img).astype('float32') / 255.0
+input_img.set_data_from_numpy(img.reshape([1, 299, 299, 3]), binary_data=True)
+
+output = httpclient.InferRequestedOutput(
+ "InceptionV3/Predictions/Softmax", binary_data=True
+)
+triton_client = httpclient.InferenceServerClient(url=URL, verbose=False)
+
+start = time.time()
+for i in range(10):
+ results = triton_client.infer(
+ "inception_graphdef", inputs=[input_img], outputs=[output], headers=HEADERS
+ )
+ res_body = results.get_response()
+ elapsed_ms = (time.time() - start) * 1000
+ if i == 0:
+ print("model name: ", res_body["model_name"])
+ print("model version: ", res_body["model_version"])
+ print("output name: ", res_body["outputs"][0]["name"])
+ print("output shape: ", res_body["outputs"][0]["shape"])
+ print("[{}] Avg rt(ms): {:.2f}".format(i, elapsed_ms))
+ start = time.time()
+```
+You will get a result like the following by running the Python script:
+```
+[0] Avg rt(ms): 86.05
+[1] Avg rt(ms): 52.35
+[2] Avg rt(ms): 50.56
+[3] Avg rt(ms): 43.45
+[4] Avg rt(ms): 41.19
+[5] Avg rt(ms): 40.55
+[6] Avg rt(ms): 37.24
+[7] Avg rt(ms): 37.16
+[8] Avg rt(ms): 36.68
+[9] Avg rt(ms): 34.24
+[10] Avg rt(ms): 34.27
+```
+# Additional Resources
+See the following resources to learn more about how to use Alibaba Cloud's OSS or EAS.
+- [Alibaba Cloud OSS's Document](https://help.aliyun.com/product/31815.html?spm=a2c4g.11186623.6.540.3c0f62e7q3jw8b)
+
+
+# Known Issues
+- [Binary Tensor Data Extension](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_binary_data.md) is not fully supported yet. For users who want a service with the binary extension, it is currently only available in the cn-shanghai region of PAI-EAS.
+- Currently only HTTP/1 is supported, hence gRPC cannot be used when querying Triton servers on EAS. HTTP/2 will be officially supported in a short time.
+- Users should not mount a whole OSS bucket when launching the Triton processor, but rather an arbitrarily deep sub-directory of the bucket. Otherwise the mounted path will not be as expected.
+- Not all Triton Server parameters are supported on EAS; only the following parameters are supported (see the sketch after this list for how they can be passed via `processor_params`):
+```
+model-repository
+log-verbose
+log-info
+log-warning
+log-error
+exit-on-error
+strict-model-config
+strict-readiness
+allow-http
+http-thread-count
+pinned-memory-pool-byte-size
+cuda-memory-pool-byte-size
+min-supported-compute-capability
+buffer-manager-thread-count
+backend-config
+```
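+
+As a quick illustration of how these parameters are passed through `processor_params`, the sketch below assembles a service config in Python. It is only a sketch: the service name, OSS path, and resource group are placeholder values, and the chosen parameters are simply examples taken from the list above.
+
+```
+import json
+
+# Placeholder values -- substitute your own service name, OSS model
+# repository path, and EAS resource group before use.
+service = {
+    "name": "test_triton_processor",
+    "processor": "triton",
+    "processor_params": [
+        # Each entry is a tritonserver argument from the supported list above.
+        "--model-repository=oss://triton-model-repo/models",
+        "--strict-model-config=false",
+        "--http-thread-count=8",
+        "--allow-http=true",
+    ],
+    "metadata": {
+        "instance": 1,
+        "cpu": 4,
+        "gpu": 1,
+        "memory": 10000,
+        "resource": "",
+        "rpc.keepalive": 3000,
+    },
+}
+
+# Write the config file, then create the service with: ./eascmd create triton.config
+with open("triton.config", "w") as f:
+    json.dump(service, f, indent=2)
+```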
diff --git a/deploy/aws/Chart.yaml b/deploy/aws/Chart.yaml
new file mode 100644
index 0000000000..2b7541bee6
--- /dev/null
+++ b/deploy/aws/Chart.yaml
@@ -0,0 +1,31 @@
+# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+apiVersion: v1
+appVersion: "1.0"
+description: Triton Inference Server
+name: triton-inference-server
+version: 1.0.0
diff --git a/deploy/aws/README.md b/deploy/aws/README.md
new file mode 100644
index 0000000000..4e60fdd65b
--- /dev/null
+++ b/deploy/aws/README.md
@@ -0,0 +1,262 @@
+
+
+[![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause)
+
+# Kubernetes Deploy: Triton Inference Server Cluster
+
+A helm chart for installing a single cluster of Triton Inference
+Server is provided. By default the cluster contains a single instance
+of the inference server but the *replicaCount* configuration parameter
+can be set to create a cluster of any size, as described below.
+
+This guide assumes you already have a functional Kubernetes cluster
+and helm installed (see below for instructions on installing
+helm). Note the following requirements:
+
+* The helm chart deploys Prometheus and Grafana to collect and display Triton metrics. To use this helm chart you must install Prometheus and Grafana in your cluster as described below, and your cluster must contain sufficient CPU resources to support these services.
+
+* If you want Triton Server to use GPUs for inferencing, your cluster
+must be configured to contain the desired number of GPU nodes (EC2 G4 instances recommended)
+with support for the NVIDIA driver and CUDA version required by the version
+of the inference server you are using.
+
+The steps below describe how to set up a model repository, use helm to
+launch the inference server, and then send inference requests to the
+running server. You can access a Grafana endpoint to see real-time
+metrics reported by the inference server.
+
+## Installing Helm
+
+### Helm v3
+
+If you do not already have Helm installed in your Kubernetes cluster,
+executing the following steps from the [official helm install
+guide](https://helm.sh/docs/intro/install/) will
+give you a quick setup.
+
+If you're currently using Helm v2 and would like to migrate to Helm v3,
+please see the [official migration guide](https://helm.sh/docs/topics/v2_v3_migration/).
+
+### Helm v2
+
+> **NOTE**: Moving forward this chart will only be tested and maintained for Helm v3.
+
+Below are example instructions for installing Helm v2.
+
+```
+$ curl https://raw.githubusercontent.com/helm/helm/master/scripts/get | bash
+$ kubectl create serviceaccount -n kube-system tiller
+serviceaccount/tiller created
+$ kubectl create clusterrolebinding tiller-cluster-rule --clusterrole=cluster-admin --serviceaccount=kube-system:tiller
+$ helm init --service-account tiller --wait
+```
+
+If you run into any issues, you can refer to the official installation guide [here](https://v2.helm.sh/docs/install/).
+
+## Model Repository
+
+If you already have a model repository you may use that with this helm
+chart. If you do not have a model repository, you can check out a local
+copy of the inference server source repository to create an example
+model repository:
+
+```
+$ git clone https://github.com/triton-inference-server/server.git
+```
+
+Triton Server needs a repository of models that it will make available
+for inferencing. For this example you will place the model repository
+in an AWS S3 Storage bucket.
+
+```
+$ aws s3 mb s3://triton-inference-server-repository
+```
+
+Following the [QuickStart](../../docs/getting_started/quickstart.md) download the
+example model repository to your system and copy it into the AWS S3
+bucket.
+
+```
+$ aws s3 cp --recursive docs/examples/model_repository s3://triton-inference-server-repository/model_repository
+```
+
+### AWS Model Repository
+To load the model from AWS S3, you need to convert the following AWS credentials to base64 format and add them to the values.yaml (a Python equivalent is sketched after these commands).
+
+```
+echo -n 'REGION' | base64
+```
+```
+echo -n 'SECRET_KEY_ID' | base64
+```
+```
+echo -n 'SECRET_ACCESS_KEY' | base64
+```
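+
+If you prefer to do the conversion in one step, the minimal Python sketch below produces the same base64 strings for pasting into values.yaml. The key names and values are placeholders mirroring the shell commands above; substitute your real region and credentials.
+
+```
+import base64
+
+# Placeholder credentials -- replace with your actual AWS values.
+credentials = {
+    "REGION": "us-west-2",
+    "SECRET_KEY_ID": "REPLACE_WITH_KEY_ID",
+    "SECRET_ACCESS_KEY": "REPLACE_WITH_SECRET_ACCESS_KEY",
+}
+
+for name, value in credentials.items():
+    # Equivalent of: echo -n '<value>' | base64
+    print(name, base64.b64encode(value.encode("utf-8")).decode("utf-8"))
+```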
+
+## Deploy Prometheus and Grafana
+
+The inference server metrics are collected by Prometheus and viewable
+by Grafana. The inference server helm chart assumes that Prometheus
+and Grafana are available so this step must be followed even if you
+don't want to use Grafana.
+
+Use the [kube-prometheus-stack](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack) to install these components. The
+*serviceMonitorSelectorNilUsesHelmValues* flag is needed so that
+Prometheus can find the inference server metrics in the *example*
+release deployed below.
+
+```
+$ helm install example-metrics --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false prometheus-community/kube-prometheus-stack
+```
+
+Then port-forward to the Grafana service so you can access it from
+your local browser.
+
+```
+$ kubectl port-forward service/example-metrics-grafana 8080:80
+```
+
+Now you should be able to navigate in your browser to localhost:8080
+and see the Grafana login page. Use username=admin and
+password=prom-operator to login.
+
+An example Grafana dashboard is available in dashboard.json. Use the
+import function in Grafana to import and view this dashboard.
+
+## Deploy the Inference Server
+
+Deploy the inference server using the default configuration with the
+following commands.
+
+```
+$ cd
+$ helm install example .
+```
+
+Use kubectl to see status and wait until the inference server pods are
+running.
+
+```
+$ kubectl get pods
+NAME READY STATUS RESTARTS AGE
+example-triton-inference-server-5f74b55885-n6lt7 1/1 Running 0 2m21s
+```
+
+There are several ways of overriding the default configuration as
+described in this [helm
+documentation](https://helm.sh/docs/using_helm/#customizing-the-chart-before-installing).
+
+You can edit the values.yaml file directly or you can use the *--set*
+option to override a single parameter with the CLI. For example, to
+deploy a cluster of four inference servers use *--set* to set the
+replicaCount parameter.
+
+```
+$ helm install example --set replicaCount=4 .
+```
+
+You can also write your own "config.yaml" file with the values you
+want to override and pass it to helm.
+
+```
+$ cat << EOF > config.yaml
+namespace: MyCustomNamespace
+image:
+ imageName: nvcr.io/nvidia/tritonserver:custom-tag
+  modelRepositoryPath: s3://my_model_repository
+EOF
+$ helm install example -f config.yaml .
+```
+
+## Using Triton Inference Server
+
+Now that the inference server is running you can send HTTP or GRPC
+requests to it to perform inferencing. By default, the inferencing
+service is exposed with a LoadBalancer service type. Use the following
+to find the external IP for the inference server. In this case it is
+34.83.9.133.
+
+```
+$ kubectl get services
+NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
+...
+example-triton-inference-server LoadBalancer 10.18.13.28 34.83.9.133 8000:30249/TCP,8001:30068/TCP,8002:32723/TCP 47m
+```
+
+The inference server exposes an HTTP endpoint on port 8000, a GRPC
+endpoint on port 8001, and a Prometheus metrics endpoint on
+port 8002. You can use curl to get the meta-data of the inference server
+from the HTTP endpoint.
+
+```
+$ curl 34.83.9.133:8000/v2
+```
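+
+If you would rather use Python than curl, the short sketch below queries the same HTTP endpoint with the `tritonclient` package (`pip install tritonclient[all]`). The IP address is the example external IP from above and will be different in your cluster.
+
+```
+import tritonclient.http as httpclient
+
+# Replace with the EXTERNAL-IP reported by `kubectl get services`.
+client = httpclient.InferenceServerClient(url="34.83.9.133:8000")
+
+# Liveness and readiness checks against the HTTP endpoint on port 8000.
+print("live: ", client.is_server_live())
+print("ready:", client.is_server_ready())
+
+# Equivalent of `curl 34.83.9.133:8000/v2` -- server metadata.
+print(client.get_server_metadata())
+```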
+
+Follow the [QuickStart](../../docs/getting_started/quickstart.md) to get the example
+image classification client that can be used to perform inferencing
+using image classification models being served by the inference
+server. For example,
+
+```
+$ image_client -u 34.83.9.133:8000 -m inception_graphdef -s INCEPTION -c3 mug.jpg
+Request 0, batch size 1
+Image 'images/mug.jpg':
+ 504 (COFFEE MUG) = 0.723992
+ 968 (CUP) = 0.270953
+ 967 (ESPRESSO) = 0.00115997
+```
+
+## Cleanup
+
+Once you've finished using the inference server, you should use helm to
+delete the deployment.
+
+```
+$ helm list
+NAME REVISION UPDATED STATUS CHART APP VERSION NAMESPACE
+example 1 Wed Feb 27 22:16:55 2019 DEPLOYED triton-inference-server-1.0.0 1.0 default
+example-metrics 1 Tue Jan 21 12:24:07 2020 DEPLOYED prometheus-operator-6.18.0 0.32.0 default
+
+$ helm uninstall example
+$ helm uninstall example-metrics
+```
+
+For the Prometheus and Grafana services, you should [explicitly delete
+CRDs](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack#uninstall-helm-chart):
+
+```
+$ kubectl delete crd alertmanagerconfigs.monitoring.coreos.com alertmanagers.monitoring.coreos.com podmonitors.monitoring.coreos.com probes.monitoring.coreos.com prometheuses.monitoring.coreos.com prometheusrules.monitoring.coreos.com servicemonitors.monitoring.coreos.com thanosrulers.monitoring.coreos.com
+```
+
+You may also want to delete the AWS bucket you created to hold the
+model repository.
+
+```
+$ aws s3 rb s3://triton-inference-server-repository --force
+```
diff --git a/deploy/aws/dashboard.json b/deploy/aws/dashboard.json
new file mode 100644
index 0000000000..8960b41d35
--- /dev/null
+++ b/deploy/aws/dashboard.json
@@ -0,0 +1,411 @@
+{
+ "__inputs": [
+ {
+ "name": "DS_PROMETHEUS",
+ "label": "Prometheus",
+ "description": "",
+ "type": "datasource",
+ "pluginId": "prometheus",
+ "pluginName": "Prometheus"
+ }
+ ],
+ "__requires": [
+ {
+ "type": "grafana",
+ "id": "grafana",
+ "name": "Grafana",
+ "version": "6.3.5"
+ },
+ {
+ "type": "panel",
+ "id": "graph",
+ "name": "Graph",
+ "version": ""
+ },
+ {
+ "type": "panel",
+ "id": "heatmap",
+ "name": "Heatmap",
+ "version": ""
+ },
+ {
+ "type": "datasource",
+ "id": "prometheus",
+ "name": "Prometheus",
+ "version": "1.0.0"
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "id": null,
+ "links": [],
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 0,
+ "y": 0
+ },
+ "id": 2,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "nv_inference_request_success",
+ "legendFormat": "Success {{instance}}",
+ "refId": "A"
+ },
+ {
+ "expr": "nv_inference_request_failure",
+ "legendFormat": "Failure {{instance}}",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Cumulative Inference Requests",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "cards": {
+ "cardPadding": null,
+ "cardRound": null
+ },
+ "color": {
+ "cardColor": "#b4ff00",
+ "colorScale": "sqrt",
+ "colorScheme": "interpolateReds",
+ "exponent": 0.5,
+ "mode": "spectrum"
+ },
+ "dataFormat": "timeseries",
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 12,
+ "y": 0
+ },
+ "heatmap": {},
+ "hideZeroBuckets": false,
+ "highlightCards": true,
+ "id": 7,
+ "legend": {
+ "show": false
+ },
+ "options": {},
+ "reverseYBuckets": false,
+ "targets": [
+ {
+ "expr": "sum(increase(nv_inference_load_ratio_bucket[1m])) by (le)",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Load Ratio (Total Time / Compute Time)",
+ "tooltip": {
+ "show": true,
+ "showHistogram": false
+ },
+ "type": "heatmap",
+ "xAxis": {
+ "show": true
+ },
+ "xBucketNumber": null,
+ "xBucketSize": null,
+ "yAxis": {
+ "decimals": null,
+ "format": "short",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true,
+ "splitFactor": null
+ },
+ "yBucketBound": "auto",
+ "yBucketNumber": null,
+ "yBucketSize": null
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 9
+ },
+ "id": 4,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(nv_inference_queue_duration_us[30s]) / 1000",
+ "legendFormat": "{{instance}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Queue Time (milliseconds)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "Queue Time (ms)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 9
+ },
+ "id": 5,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(nv_inference_compute_duration_us[30s]) / 1000",
+ "legendFormat": "{{instance}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Compute Time (milliseconds)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "Compute Time (ms)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "refresh": "5s",
+ "schemaVersion": 19,
+ "style": "dark",
+ "tags": [],
+ "templating": {
+ "list": []
+ },
+ "time": {
+ "from": "now-15m",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ]
+ },
+ "timezone": "",
+ "title": "Triton Inference Server",
+ "uid": "slEY4dsZk",
+ "version": 8
+}
diff --git a/deploy/aws/templates/_helpers.tpl b/deploy/aws/templates/_helpers.tpl
new file mode 100644
index 0000000000..6dba910012
--- /dev/null
+++ b/deploy/aws/templates/_helpers.tpl
@@ -0,0 +1,92 @@
+{{/*
+# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/}}
+
+{{/* vim: set filetype=mustache: */}}
+{{/*
+Create inference server name.
+*/}}
+{{- define "triton-inference-server.name" -}}
+{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+
+{{/*
+Create a default fully qualified app name.
+We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
+If release name contains chart name it will be used as a full name.
+*/}}
+{{- define "triton-inference-server.fullname" -}}
+{{- if .Values.fullnameOverride -}}
+{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
+{{- else -}}
+{{- $name := default .Chart.Name .Values.nameOverride -}}
+{{- if contains $name .Release.Name -}}
+{{- .Release.Name | trunc 63 | trimSuffix "-" -}}
+{{- else -}}
+{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+{{- end -}}
+{{- end -}}
+
+{{/*
+ Create inference server metrics service name and fullname derived from above and
+ truncated appropriately.
+*/}}
+{{- define "triton-inference-server-metrics.name" -}}
+{{- $basename := include "triton-inference-server.name" . -}}
+{{- $basename_trimmed := $basename | trunc 55 | trimSuffix "-" -}}
+{{- printf "%s-%s" $basename_trimmed "metrics" -}}
+{{- end -}}
+
+{{- define "triton-inference-server-metrics.fullname" -}}
+{{- $basename := include "triton-inference-server.fullname" . -}}
+{{- $basename_trimmed := $basename | trunc 55 | trimSuffix "-" -}}
+{{- printf "%s-%s" $basename_trimmed "metrics" -}}
+{{- end -}}
+
+{{/*
+ Create inference server metrics monitor name and fullname derived from
+ above and truncated appropriately.
+*/}}
+{{- define "triton-inference-server-metrics-monitor.name" -}}
+{{- $basename := include "triton-inference-server.name" . -}}
+{{- $basename_trimmed := $basename | trunc 47 | trimSuffix "-" -}}
+{{- printf "%s-%s" $basename_trimmed "metrics-monitor" -}}
+{{- end -}}
+
+{{- define "triton-inference-server-metrics-monitor.fullname" -}}
+{{- $basename := include "triton-inference-server.fullname" . -}}
+{{- $basename_trimmed := $basename | trunc 47 | trimSuffix "-" -}}
+{{- printf "%s-%s" $basename_trimmed "metrics-monitor" -}}
+{{- end -}}
+
+{{/*
+Create chart name and version as used by the chart label.
+*/}}
+{{- define "triton-inference-server.chart" -}}
+{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
diff --git a/deploy/aws/templates/deployment.yaml b/deploy/aws/templates/deployment.yaml
new file mode 100644
index 0000000000..d90e51b113
--- /dev/null
+++ b/deploy/aws/templates/deployment.yaml
@@ -0,0 +1,100 @@
+# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: {{ template "triton-inference-server.fullname" . }}
+ namespace: {{ .Release.Namespace }}
+ labels:
+ app: {{ template "triton-inference-server.name" . }}
+ chart: {{ template "triton-inference-server.chart" . }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+spec:
+ replicas: {{ .Values.replicaCount }}
+ selector:
+ matchLabels:
+ app: {{ template "triton-inference-server.name" . }}
+ release: {{ .Release.Name }}
+ template:
+ metadata:
+ labels:
+ app: {{ template "triton-inference-server.name" . }}
+ release: {{ .Release.Name }}
+
+ spec:
+ containers:
+ - name: {{ .Chart.Name }}
+ image: "{{ .Values.image.imageName }}"
+ imagePullPolicy: {{ .Values.image.pullPolicy }}
+
+ resources:
+ limits:
+ nvidia.com/gpu: {{ .Values.image.numGpus }}
+
+ args: ["tritonserver", "--model-store={{ .Values.image.modelRepositoryPath }}",
+ "--model-control-mode=poll",
+ "--repository-poll-secs=5"]
+
+ env:
+ - name: AWS_DEFAULT_REGION
+ valueFrom:
+ secretKeyRef:
+ name: aws-credentials
+ key: AWS_DEFAULT_REGION
+ - name: AWS_ACCESS_KEY_ID
+ valueFrom:
+ secretKeyRef:
+ name: aws-credentials
+ key: AWS_ACCESS_KEY_ID
+ - name: AWS_SECRET_ACCESS_KEY
+ valueFrom:
+ secretKeyRef:
+ name: aws-credentials
+ key: AWS_SECRET_ACCESS_KEY
+
+ ports:
+ - containerPort: 8000
+ name: http
+ - containerPort: 8001
+ name: grpc
+ - containerPort: 8002
+ name: metrics
+ livenessProbe:
+ httpGet:
+ path: /v2/health/live
+ port: http
+ readinessProbe:
+ initialDelaySeconds: 5
+ periodSeconds: 5
+ httpGet:
+ path: /v2/health/ready
+ port: http
+
+ securityContext:
+ runAsUser: 1000
+ fsGroup: 1000
diff --git a/deploy/aws/templates/secrets.yaml b/deploy/aws/templates/secrets.yaml
new file mode 100644
index 0000000000..d113214ee0
--- /dev/null
+++ b/deploy/aws/templates/secrets.yaml
@@ -0,0 +1,35 @@
+# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+apiVersion: v1
+kind: Secret
+metadata:
+ name: aws-credentials
+type: Opaque
+data:
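+  # NOTE: Kubernetes Secret `data` values must be base64-encoded, so the
+  # values supplied in values.yaml are expected to already be in base64.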
+ AWS_DEFAULT_REGION: {{ .Values.secret.region }}
+ AWS_ACCESS_KEY_ID: {{ .Values.secret.id }}
+ AWS_SECRET_ACCESS_KEY: {{ .Values.secret.key }}
diff --git a/deploy/aws/templates/service.yaml b/deploy/aws/templates/service.yaml
new file mode 100644
index 0000000000..3315fd77db
--- /dev/null
+++ b/deploy/aws/templates/service.yaml
@@ -0,0 +1,91 @@
+# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+apiVersion: v1
+kind: Service
+metadata:
+ name: {{ template "triton-inference-server.fullname" . }}
+ namespace: {{ .Release.Namespace }}
+ labels:
+ app: {{ template "triton-inference-server.name" . }}
+ chart: {{ template "triton-inference-server.chart" . }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+spec:
+ type: {{ .Values.service.type }}
+ ports:
+ - port: 8000
+ targetPort: http
+ name: http-inference-server
+ - port: 8001
+ targetPort: grpc
+ name: grpc-inference-server
+ - port: 8002
+ targetPort: metrics
+ name: metrics-inference-server
+ selector:
+ app: {{ template "triton-inference-server.name" . }}
+ release: {{ .Release.Name }}
+---
+apiVersion: v1
+kind: Service
+metadata:
+ name: {{ template "triton-inference-server-metrics.fullname" . }}
+ namespace: {{ .Release.Namespace }}
+ labels:
+ app: {{ template "triton-inference-server-metrics.name" . }}
+ chart: {{ template "triton-inference-server.chart" . }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+ annotations:
+ alpha.monitoring.coreos.com/non-namespaced: "true"
+spec:
+ ports:
+ - name: metrics
+ port: 8080
+ targetPort: metrics
+ protocol: TCP
+ selector:
+ app: {{ template "triton-inference-server.name" . }}
+ release: {{ .Release.Name }}
+---
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+ name: {{ template "triton-inference-server-metrics-monitor.fullname" . }}
+ namespace: {{ .Release.Namespace }}
+ labels:
+ app: {{ template "triton-inference-server-metrics-monitor.name" . }}
+ chart: {{ template "triton-inference-server.chart" . }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+spec:
+ selector:
+ matchLabels:
+ app: {{ template "triton-inference-server-metrics.name" . }}
+ endpoints:
+ - port: metrics
+ interval: 15s
diff --git a/deploy/aws/values.yaml b/deploy/aws/values.yaml
new file mode 100644
index 0000000000..e915da138b
--- /dev/null
+++ b/deploy/aws/values.yaml
@@ -0,0 +1,41 @@
+# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+replicaCount: 1
+
+image:
+ imageName: nvcr.io/nvidia/tritonserver:24.03-py3
+ pullPolicy: IfNotPresent
+ modelRepositoryPath: s3://triton-inference-server-repository/model_repository
+ numGpus: 1
+
+service:
+ type: LoadBalancer
+
+secret:
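+  # update the following with base64 encoded parameters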
+ region: AWS_REGION
+ id: AWS_SECRET_KEY_ID
+ key: AWS_SECRET_ACCESS_KEY
\ No newline at end of file
diff --git a/deploy/fleetcommand/Chart.yaml b/deploy/fleetcommand/Chart.yaml
new file mode 100644
index 0000000000..b7acfe729c
--- /dev/null
+++ b/deploy/fleetcommand/Chart.yaml
@@ -0,0 +1,38 @@
+# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+apiVersion: v1
+# appVersion is the Triton version; update when changing release
+appVersion: "2.44.0"
+description: Triton Inference Server (Fleet Command)
+name: triton-inference-server
+# version is the Chart version; update when changing anything in the chart
+# This follows semantic versioning, i.e.:
+# Given version X.Y.Z
+# When making fixes to the chart, increment Z
+# When making functional changes to the chart (including updating the Triton version, above), increment Y and reset Z to 0
+# When making breaking changes to the chart (e.g. user must take action before deploying), increment X and reset Y and Z to 0
+version: 1.4.0
diff --git a/deploy/fleetcommand/README.md b/deploy/fleetcommand/README.md
new file mode 100644
index 0000000000..217162279c
--- /dev/null
+++ b/deploy/fleetcommand/README.md
@@ -0,0 +1,150 @@
+
+
+[![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause)
+
+# Fleet Command Deploy: NVIDIA Triton Inference Server
+
+A helm chart for installing a single cluster of NVIDIA Triton Inference Server
+on Fleet Command is provided. By default the cluster contains a single instance
+of Triton, but the *replicaCount* configuration parameter can be set to create
+a cluster of any size, as described below.
+
+This guide assumes you already have a functional Fleet Command location
+deployed. Please refer to the [Fleet Command
+Documentation](https://docs.nvidia.com/fleet-command/prod_fleet-command/prod_fleet-command/overview.html)
+for more information.
+
+The steps below describe how to set up a model repository, use helm to launch
+Triton, and then send inference requests to the running Triton Inference
+Server. You can optionally scrape metrics with Prometheus and access a Grafana
+endpoint to see real-time metrics reported by Triton.
+
+## Model Repository
+
+If you already have a model repository you may use that with this helm chart.
+If you do not have a model repository, you can check out a local copy of the
+Triton Inference Server source repository to create an example model repository:
+
+```
+$ git clone https://github.com/triton-inference-server/server.git
+```
+
+Triton needs a repository of models that it will make available for inferencing.
+For this example you will place the model repository in an S3 storage bucket
+(either in AWS or another S3 API-compatible on-premises object store).
+
+```
+$ aws s3 mb s3://triton-inference-server-repository
+```
+
+Following the [QuickStart](../../docs/getting_started/quickstart.md), download the example model
+repository to your system and copy it into the AWS S3 bucket.
+
+```
+$ aws s3 cp --recursive docs/examples/model_repository s3://triton-inference-server-repository/model_repository
+```
+
+### AWS Model Repository
+
+To load models from AWS S3, you need to convert the following AWS credentials
+to base64 format and add them to the Application Configuration section when
+creating the Fleet Command Deployment.
+
+```
+echo -n 'REGION' | base64
+echo -n 'SECRET_KEY_ID' | base64
+echo -n 'SECRET_ACCESS_KEY' | base64
+# Optional for using session token
+echo -n 'AWS_SESSION_TOKEN' | base64
+```
+
+## Deploy the Triton Inference Server
+
+Deploy the Triton Inference Server to your Location in Fleet Command by creating
+a Deployment. You can specify configuration parameters to override the default
+[values.yaml](values.yaml) in the Application Configuration section.
+
+*Note:* You _must_ provide a `--model-repository` parameter with a path to your
+prepared model repository in your S3 bucket. Otherwise, Triton will not
+start.
+
+An example Application Configuration for Triton on Fleet Command:
+```yaml
+image:
+ serverArgs:
+ - --model-repository=s3://triton-inference-server-repository
+
+secret:
+ region:
+ id:
+ key:
+ token:
+```
+
+See [Fleet Command documentation](https://docs.nvidia.com/fleet-command/prod_fleet-command/prod_fleet-command/ug-deploying-to-the-edge.html)
+for more info.
+
+### Prometheus ServiceMonitor Support
+
+If you have `prometheus-operator` deployed, you can enable the ServiceMonitor
+for the Triton Inference Server by setting `serviceMonitor.enabled: true` in
+Application Configuration. This will also deploy a Grafana dashboard for Triton
+as a ConfigMap.
+
+Otherwise, metrics can be scraped by pointing an external Prometheus
+instance at the `metricsNodePort` in the values.
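+
+For example, enabling it in the Application Configuration mirrors the
+`serviceMonitor` block at the bottom of [values.yaml](values.yaml):
+
+```yaml
+serviceMonitor:
+  enabled: true
+```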
+
+## Using Triton Inference Server
+
+Now that the Triton Inference Server is running, you can send HTTP or GRPC
+requests to it to perform inferencing. By default, the service is exposed with a
+NodePort service type, where the same port is opened on all systems in a
+Location.
+
+Triton exposes an HTTP endpoint on port 30343, a GRPC endpoint on port 30344,
+and a Prometheus metrics endpoint on port 30345. These ports can be overridden
+in the application configuration when deploying. You can use curl to get the
+metadata of Triton from the HTTP endpoint. For example, if a system in your
+location has the IP `34.83.9.133`:
+
+```
+$ curl 34.83.9.133:30343/v2
+```
+
+Follow the [QuickStart](../../docs/getting_started/quickstart.md) to get the example image
+classification client, which can be used to perform inferencing with image
+classification models served by Triton. For example,
+
+```
+$ image_client -u 34.83.9.133:30343 -m densenet_onnx -s INCEPTION -c 3 mug.jpg
+Request 0, batch size 1
+Image '/workspace/images/mug.jpg':
+ 15.349568 (504) = COFFEE MUG
+ 13.227468 (968) = CUP
+ 10.424893 (505) = COFFEEPOT
+```
diff --git a/deploy/fleetcommand/dashboard.json b/deploy/fleetcommand/dashboard.json
new file mode 100644
index 0000000000..5868176cbe
--- /dev/null
+++ b/deploy/fleetcommand/dashboard.json
@@ -0,0 +1,419 @@
+{
+ "__requires": [
+ {
+ "type": "grafana",
+ "id": "grafana",
+ "name": "Grafana",
+ "version": "6.3.5"
+ },
+ {
+ "type": "panel",
+ "id": "graph",
+ "name": "Graph",
+ "version": ""
+ },
+ {
+ "type": "panel",
+ "id": "heatmap",
+ "name": "Heatmap",
+ "version": ""
+ },
+ {
+ "type": "datasource",
+ "id": "prometheus",
+ "name": "Prometheus",
+ "version": "1.0.0"
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "id": null,
+ "links": [],
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 0,
+ "y": 0
+ },
+ "id": 2,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "nv_inference_request_success",
+ "legendFormat": "Success {{instance}}",
+ "refId": "A"
+ },
+ {
+ "expr": "nv_inference_request_failure",
+ "legendFormat": "Failure {{instance}}",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Cumulative Inference Requests",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "cards": {
+ "cardPadding": null,
+ "cardRound": null
+ },
+ "color": {
+ "cardColor": "#b4ff00",
+ "colorScale": "sqrt",
+ "colorScheme": "interpolateReds",
+ "exponent": 0.5,
+ "mode": "spectrum"
+ },
+ "dataFormat": "timeseries",
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 12,
+ "y": 0
+ },
+ "heatmap": {},
+ "hideZeroBuckets": false,
+ "highlightCards": true,
+ "id": 7,
+ "legend": {
+ "show": false
+ },
+ "options": {},
+ "reverseYBuckets": false,
+ "targets": [
+ {
+ "expr": "sum(increase(nv_inference_load_ratio_bucket[1m])) by (le)",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Load Ratio (Total Time / Compute Time)",
+ "tooltip": {
+ "show": true,
+ "showHistogram": false
+ },
+ "type": "heatmap",
+ "xAxis": {
+ "show": true
+ },
+ "xBucketNumber": null,
+ "xBucketSize": null,
+ "yAxis": {
+ "decimals": null,
+ "format": "short",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true,
+ "splitFactor": null
+ },
+ "yBucketBound": "auto",
+ "yBucketNumber": null,
+ "yBucketSize": null
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 9
+ },
+ "id": 4,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(nv_inference_queue_duration_us[30s]) / 1000",
+ "legendFormat": "{{instance}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Queue Time (milliseconds)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "Queue Time (ms)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 9
+ },
+ "id": 5,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(nv_inference_compute_duration_us[30s]) / 1000",
+ "legendFormat": "{{instance}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Compute Time (milliseconds)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "Compute Time (ms)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "refresh": "5s",
+ "schemaVersion": 19,
+ "style": "dark",
+ "tags": [],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "text": "Prometheus",
+ "value": "Prometheus"
+ },
+ "hide": 0,
+ "includeAll": false,
+ "label": null,
+ "multi": false,
+ "name": "datasource",
+ "options": [],
+ "query": "prometheus",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "type": "datasource"
+ }
+ ]
+ },
+ "time": {
+ "from": "now-15m",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ]
+ },
+ "timezone": "",
+ "title": "Triton Inference Server",
+ "uid": "slEY4dsZk",
+ "version": 8
+}
diff --git a/deploy/fleetcommand/templates/_helpers.tpl b/deploy/fleetcommand/templates/_helpers.tpl
new file mode 100644
index 0000000000..6dba910012
--- /dev/null
+++ b/deploy/fleetcommand/templates/_helpers.tpl
@@ -0,0 +1,92 @@
+{{/*
+# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/}}
+
+{{/* vim: set filetype=mustache: */}}
+{{/*
+Create inference server name.
+*/}}
+{{- define "triton-inference-server.name" -}}
+{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+
+{{/*
+Create a default fully qualified app name.
+We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
+If release name contains chart name it will be used as a full name.
+*/}}
+{{- define "triton-inference-server.fullname" -}}
+{{- if .Values.fullnameOverride -}}
+{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
+{{- else -}}
+{{- $name := default .Chart.Name .Values.nameOverride -}}
+{{- if contains $name .Release.Name -}}
+{{- .Release.Name | trunc 63 | trimSuffix "-" -}}
+{{- else -}}
+{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+{{- end -}}
+{{- end -}}
+
+{{/*
+ Create inference server metrics service name and fullname derived from above and
+ truncated appropriately.
+*/}}
+{{- define "triton-inference-server-metrics.name" -}}
+{{- $basename := include "triton-inference-server.name" . -}}
+{{- $basename_trimmed := $basename | trunc 55 | trimSuffix "-" -}}
+{{- printf "%s-%s" $basename_trimmed "metrics" -}}
+{{- end -}}
+
+{{- define "triton-inference-server-metrics.fullname" -}}
+{{- $basename := include "triton-inference-server.fullname" . -}}
+{{- $basename_trimmed := $basename | trunc 55 | trimSuffix "-" -}}
+{{- printf "%s-%s" $basename_trimmed "metrics" -}}
+{{- end -}}
+
+{{/*
+ Create inference server metrics monitor name and fullname derived from
+ above and truncated appropriately.
+*/}}
+{{- define "triton-inference-server-metrics-monitor.name" -}}
+{{- $basename := include "triton-inference-server.name" . -}}
+{{- $basename_trimmed := $basename | trunc 47 | trimSuffix "-" -}}
+{{- printf "%s-%s" $basename_trimmed "metrics-monitor" -}}
+{{- end -}}
+
+{{- define "triton-inference-server-metrics-monitor.fullname" -}}
+{{- $basename := include "triton-inference-server.fullname" . -}}
+{{- $basename_trimmed := $basename | trunc 47 | trimSuffix "-" -}}
+{{- printf "%s-%s" $basename_trimmed "metrics-monitor" -}}
+{{- end -}}
+
+{{/*
+Create chart name and version as used by the chart label.
+*/}}
+{{- define "triton-inference-server.chart" -}}
+{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
diff --git a/deploy/fleetcommand/templates/configmap-grafana-dashboard.yaml b/deploy/fleetcommand/templates/configmap-grafana-dashboard.yaml
new file mode 100644
index 0000000000..782b1f85e6
--- /dev/null
+++ b/deploy/fleetcommand/templates/configmap-grafana-dashboard.yaml
@@ -0,0 +1,37 @@
+# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+{{- if .Values.serviceMonitor.enabled }}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: {{ .Release.Name }}-dashboard-configmap
+ labels:
+ grafana_dashboard: "1"
+data:
+ dashboard.json: |-
+{{ .Files.Get "dashboard.json" | indent 4}}
+{{- end }}
diff --git a/deploy/fleetcommand/templates/deployment.yaml b/deploy/fleetcommand/templates/deployment.yaml
new file mode 100644
index 0000000000..5d7af7023d
--- /dev/null
+++ b/deploy/fleetcommand/templates/deployment.yaml
@@ -0,0 +1,112 @@
+# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: {{ template "triton-inference-server.fullname" . }}
+ namespace: {{ .Release.Namespace }}
+ labels:
+ app: {{ template "triton-inference-server.name" . }}
+ chart: {{ template "triton-inference-server.chart" . }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+spec:
+ replicas: {{ .Values.replicaCount }}
+ selector:
+ matchLabels:
+ app: {{ template "triton-inference-server.name" . }}
+ release: {{ .Release.Name }}
+ template:
+ metadata:
+ labels:
+ app: {{ template "triton-inference-server.name" . }}
+ release: {{ .Release.Name }}
+
+ spec:
+ containers:
+ - name: {{ .Chart.Name }}
+ image: "{{ .Values.image.imageName }}"
+ imagePullPolicy: {{ .Values.image.pullPolicy }}
+
+ resources:
+ limits:
+ nvidia.com/gpu: {{ .Values.image.numGpus }}
+
+ args:
+ - {{ .Values.image.serverCommand }}
+ {{- $args := required "image.serverArgs, at least --model-repository, is required!" .Values.image.serverArgs }}
+ {{- range $args }}
+ - {{ . -}}
+ {{ end }}
+
+{{ if .Values.secret }}
+ env:
+ - name: AWS_DEFAULT_REGION
+ valueFrom:
+ secretKeyRef:
+ name: aws-credentials
+ key: AWS_DEFAULT_REGION
+ - name: AWS_ACCESS_KEY_ID
+ valueFrom:
+ secretKeyRef:
+ name: aws-credentials
+ key: AWS_ACCESS_KEY_ID
+ - name: AWS_SECRET_ACCESS_KEY
+ valueFrom:
+ secretKeyRef:
+ name: aws-credentials
+ key: AWS_SECRET_ACCESS_KEY
+{{- if .Values.secret.token }}
+ - name: AWS_SESSION_TOKEN
+ valueFrom:
+ secretKeyRef:
+ name: aws-credentials
+ key: AWS_SESSION_TOKEN
+{{- end }}
+{{- end }}
+
+ ports:
+ - containerPort: 8000
+ name: http
+ - containerPort: 8001
+ name: grpc
+ - containerPort: 8002
+ name: metrics
+ livenessProbe:
+ httpGet:
+ path: /v2/health/live
+ port: http
+ readinessProbe:
+ initialDelaySeconds: 5
+ periodSeconds: 5
+ httpGet:
+ path: /v2/health/ready
+ port: http
+
+ securityContext:
+ runAsUser: 1000
+ fsGroup: 1000
diff --git a/deploy/fleetcommand/templates/secrets.yaml b/deploy/fleetcommand/templates/secrets.yaml
new file mode 100644
index 0000000000..9c7dcd404d
--- /dev/null
+++ b/deploy/fleetcommand/templates/secrets.yaml
@@ -0,0 +1,40 @@
+# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+{{- if .Values.secret }}
+apiVersion: v1
+kind: Secret
+metadata:
+ name: aws-credentials
+type: Opaque
+data:
+ AWS_DEFAULT_REGION: {{ .Values.secret.region }}
+ AWS_ACCESS_KEY_ID: {{ .Values.secret.id }}
+ AWS_SECRET_ACCESS_KEY: {{ .Values.secret.key }}
+{{- if .Values.secret.token }}
+ AWS_SESSION_TOKEN: {{ .Values.secret.token }}
+{{- end }}
+{{- end }}
diff --git a/deploy/fleetcommand/templates/service.yaml b/deploy/fleetcommand/templates/service.yaml
new file mode 100644
index 0000000000..4f12205902
--- /dev/null
+++ b/deploy/fleetcommand/templates/service.yaml
@@ -0,0 +1,102 @@
+# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+apiVersion: v1
+kind: Service
+metadata:
+ name: {{ template "triton-inference-server.fullname" . }}
+ namespace: {{ .Release.Namespace }}
+ labels:
+ app: {{ template "triton-inference-server.name" . }}
+ chart: {{ template "triton-inference-server.chart" . }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+spec:
+ type: {{ .Values.service.type }}
+ ports:
+ - port: 8000
+ targetPort: http
+ name: http-inference-server
+ {{- if .Values.service.httpNodePort }}
+ nodePort: {{ .Values.service.httpNodePort }}
+ {{- end }}
+ - port: 8001
+ targetPort: grpc
+ name: grpc-inference-server
+ {{- if .Values.service.grpcNodePort }}
+ nodePort: {{ .Values.service.grpcNodePort }}
+ {{- end }}
+ - port: 8002
+ targetPort: metrics
+ name: metrics-inference-server
+ {{- if .Values.service.metricsNodePort }}
+ nodePort: {{ .Values.service.metricsNodePort }}
+ {{- end }}
+ selector:
+ app: {{ template "triton-inference-server.name" . }}
+ release: {{ .Release.Name }}
+---
+apiVersion: v1
+kind: Service
+metadata:
+ name: {{ template "triton-inference-server-metrics.fullname" . }}
+ namespace: {{ .Release.Namespace }}
+ labels:
+ app: {{ template "triton-inference-server-metrics.name" . }}
+ chart: {{ template "triton-inference-server.chart" . }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+ annotations:
+ alpha.monitoring.coreos.com/non-namespaced: "true"
+spec:
+ ports:
+ - name: metrics
+ port: 8080
+ targetPort: metrics
+ protocol: TCP
+ selector:
+ app: {{ template "triton-inference-server.name" . }}
+ release: {{ .Release.Name }}
+---
+{{- if .Values.serviceMonitor.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+ name: {{ template "triton-inference-server-metrics-monitor.fullname" . }}
+ namespace: {{ .Release.Namespace }}
+ labels:
+ app: {{ template "triton-inference-server-metrics-monitor.name" . }}
+ chart: {{ template "triton-inference-server.chart" . }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+spec:
+ selector:
+ matchLabels:
+ app: {{ template "triton-inference-server-metrics.name" . }}
+ endpoints:
+ - port: metrics
+ interval: 15s
+{{- end }}
diff --git a/deploy/fleetcommand/values.yaml b/deploy/fleetcommand/values.yaml
new file mode 100644
index 0000000000..ca00a2acf1
--- /dev/null
+++ b/deploy/fleetcommand/values.yaml
@@ -0,0 +1,78 @@
+# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+replicaCount: 1
+
+image:
+ imageName: nvcr.io/nvidia/tritonserver:24.03-py3
+ pullPolicy: IfNotPresent
+ numGpus: 1
+ serverCommand: tritonserver
+ serverArgs:
+ # Model Repository Configuration (REQUIRED)
+ #
+ # Configure sources for model repository below. Multiple repositories
+ # can be specified
+ #
+ # To download models from an S3 bucket, uncomment and configure below
+ # To specify a non-AWS S3 endpoint, use the form
+ # s3://https://your-s3-endpoint:443/bucket/model_repository
+ #
+ #- --model-repository=s3://triton-inference-server-repository/model_repository
+ #
+ # Model Control Mode (Optional, default: none)
+ #
+ # To set model control mode, uncomment and configure below
+ # See https://github.com/triton-inference-server/server/blob/r24.03/docs/model_management.md
+ # for more details
+ #- --model-control-mode=explicit|poll|none
+ #
+ # Additional server args
+ #
+ # see https://github.com/triton-inference-server/server/blob/r24.03/README.md
+ # for more details
+
+service:
+ # for Fleet Command, type should be NodePort
+ type: NodePort
+ # the following ports will be the external port opened for each service
+ httpNodePort: 30343
+ grpcNodePort: 30344
+ metricsNodePort: 30345
+
+# AWS
+#secret:
+ # update the following with base64 encoded parameters
+# region: AWS_REGION
+# id: AWS_SECRET_KEY_ID
+# key: AWS_SECRET_ACCESS_KEY
+# token: AWS_SESSION_TOKEN
+
+# Prometheus-Operator ServiceMonitor support
+# change enabled to 'true' to enable a ServiceMonitor if your cluster has
+# Prometheus-Operator installed
+serviceMonitor:
+ enabled: false
diff --git a/deploy/gcp/Chart.yaml b/deploy/gcp/Chart.yaml
new file mode 100644
index 0000000000..2b7541bee6
--- /dev/null
+++ b/deploy/gcp/Chart.yaml
@@ -0,0 +1,31 @@
+# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+apiVersion: v1
+appVersion: "1.0"
+description: Triton Inference Server
+name: triton-inference-server
+version: 1.0.0
diff --git a/deploy/gcp/README.md b/deploy/gcp/README.md
new file mode 100644
index 0000000000..dc80cc77de
--- /dev/null
+++ b/deploy/gcp/README.md
@@ -0,0 +1,300 @@
+
+
+[![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause)
+
+# Kubernetes Deploy: Triton Inference Server Cluster
+
+A helm chart for installing a single cluster of Triton Inference
+Server is provided. By default the cluster contains a single instance
+of the inference server but the *replicaCount* configuration parameter
+can be set to create a cluster of any size, as described below.
+
+This guide assumes you already have a functional Kubernetes cluster
+and helm installed (see below for instructions on installing
+helm). Note the following requirements:
+
+* The helm chart deploys Prometheus and Grafana to collect and display Triton metrics. Your cluster must contain sufficient CPU resources to support these services. At a minimum you will likely require 2 CPU nodes with machine type of n1-standard-2 or greater.
+
+* If you want Triton Server to use GPUs for inferencing, your cluster
+must be configured to contain the desired number of GPU nodes with
+support for the NVIDIA driver and CUDA version required by the version
+of the inference server you are using.
+
+This helm chart is available from [Triton Inference Server
+GitHub](https://github.com/triton-inference-server/server) or from the
+[NVIDIA GPU Cloud (NGC)](https://ngc.nvidia.com).
+
+The steps below describe how to set up a model repository, use helm to
+launch the inference server, and then send inference requests to the
+running server. You can access a Grafana endpoint to see real-time
+metrics reported by the inference server.
+
+
+## Installing Helm
+
+### Helm v3
+
+If you do not already have Helm installed in your Kubernetes cluster,
+executing the following steps from the [official helm install
+guide](https://helm.sh/docs/intro/install/) will
+give you a quick setup.
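+
+Once installed, you can confirm the client version (this guide assumes a
+Helm v3 client):
+
+```
+$ helm version
+```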
+
+If you're currently using Helm v2 and would like to migrate to Helm v3,
+please see the [official migration guide](https://helm.sh/docs/topics/v2_v3_migration/).
+
+### Helm v2
+
+> **NOTE**: Moving forward this chart will only be tested and maintained for Helm v3.
+
+Below are example instructions for installing Helm v2.
+
+```
+$ curl https://raw.githubusercontent.com/helm/helm/master/scripts/get | bash
+$ kubectl create serviceaccount -n kube-system tiller
+serviceaccount/tiller created
+$ kubectl create clusterrolebinding tiller-cluster-rule --clusterrole=cluster-admin --serviceaccount=kube-system:tiller
+$ helm init --service-account tiller --wait
+```
+
+If you run into any issues, you can refer to the official installation guide [here](https://v2.helm.sh/docs/install/).
+
+## Model Repository
+
+If you already have a model repository you may use that with this helm
+chart. If you do not have a model repository, you can check out a local
+copy of the inference server source repository to create an example
+model repository:
+
+```
+$ git clone https://github.com/triton-inference-server/server.git
+```
+
+Triton Server needs a repository of models that it will make available
+for inferencing. For this example you will place the model repository
+in a Google Cloud Storage bucket.
+
+```
+$ gsutil mb gs://triton-inference-server-repository
+```
+
+Following the [QuickStart](../../docs/getting_started/quickstart.md), download the
+example model repository to your system and copy it into the GCS
+bucket.
+
+```
+$ gsutil cp -r docs/examples/model_repository gs://triton-inference-server-repository/model_repository
+```
+
+### GCS Permissions
+
+Make sure the bucket permissions are set so that the inference server
+can access the model repository. If the bucket is public then no
+additional changes are needed and you can proceed to the "Deploy
+Prometheus and Grafana" section.
+
+If bucket permissions need to be set with the
+GOOGLE_APPLICATION_CREDENTIALS environment variable then perform the
+following steps:
+
+* Generate Google service account JSON with proper permissions called
+ *gcp-creds.json*.
+
+* Create a Kubernetes secret from *gcp-creds.json*:
+
+```
+ $ kubectl create configmap gcpcreds --from-literal "project-id=myproject"
+ $ kubectl create secret generic gcpcreds --from-file gcp-creds.json
+```
+
+* Modify templates/deployment.yaml to include the
+ GOOGLE_APPLICATION_CREDENTIALS environment variable:
+
+```
+ env:
+ - name: GOOGLE_APPLICATION_CREDENTIALS
+ value: /secret/gcp-creds.json
+```
+
+* Modify templates/deployment.yaml to mount the secret in a volume at
+ /secret:
+
+```
+ volumeMounts:
+ - name: vsecret
+ mountPath: "/secret"
+ readOnly: true
+ ...
+ volumes:
+ - name: vsecret
+ secret:
+ secretName: gcpcreds
+```
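+
+As an optional sanity check (illustrative only), confirm that the configmap
+and secret created above exist before deploying the chart:
+
+```
+$ kubectl get configmap gcpcreds
+$ kubectl get secret gcpcreds
+```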
+
+
+## Deploy Prometheus and Grafana
+
+The inference server metrics are collected by Prometheus and viewable
+by Grafana. The inference server helm chart assumes that Prometheus
+and Grafana are available so this step must be followed even if you
+don't want to use Grafana.
+
+Use the [kube-prometheus-stack](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack) to install these components. The
+*serviceMonitorSelectorNilUsesHelmValues* flag is needed so that
+Prometheus can find the inference server metrics in the *example*
+release deployed below.
+
+```
+$ helm install example-metrics --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false prometheus-community/kube-prometheus-stack
+```
+
+Then port-forward to the Grafana service so you can access it from
+your local browser.
+
+```
+$ kubectl port-forward service/example-metrics-grafana 8080:80
+```
+
+Now you should be able to navigate in your browser to localhost:8080
+and see the Grafana login page. Use username=admin and
+password=prom-operator to login.
+
+An example Grafana dashboard is available in dashboard.json. Use the
+import function in Grafana to import and view this dashboard.
+
+## Deploy the Inference Server
+
+Deploy the inference server using the default configuration with the
+following commands.
+
+```
+$ cd <directory containing Chart.yaml>
+$ helm install example .
+```
+
+Use kubectl to see status and wait until the inference server pods are
+running.
+
+```
+$ kubectl get pods
+NAME READY STATUS RESTARTS AGE
+example-triton-inference-server-5f74b55885-n6lt7 1/1 Running 0 2m21s
+```
+
+There are several ways of overriding the default configuration as
+described in this [helm
+documentation](https://helm.sh/docs/using_helm/#customizing-the-chart-before-installing).
+
+You can edit the values.yaml file directly or you can use the *--set*
+option to override a single parameter with the CLI. For example, to
+deploy a cluster of four inference servers use *--set* to set the
+replicaCount parameter.
+
+```
+$ helm install example --set replicaCount=4 .
+```
+
+You can also write your own "config.yaml" file with the values you
+want to override and pass it to helm.
+
+```
+$ cat << EOF > config.yaml
+namespace: MyCustomNamespace
+image:
+ imageName: nvcr.io/nvidia/tritonserver:custom-tag
+ modelRepositoryPath: gs://my_model_repository
+EOF
+$ helm install example -f config.yaml .
+```
+
+## Using Triton Inference Server
+
+Now that the inference server is running you can send HTTP or GRPC
+requests to it to perform inferencing. By default, the inferencing
+service is exposed with a LoadBalancer service type. Use the following
+to find the external IP for the inference server. In this case it is
+34.83.9.133.
+
+```
+$ kubectl get services
+NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
+...
+example-triton-inference-server LoadBalancer 10.18.13.28 34.83.9.133 8000:30249/TCP,8001:30068/TCP,8002:32723/TCP 47m
+```
+
+The inference server exposes an HTTP endpoint on port 8000, a GRPC
+endpoint on port 8001, and a Prometheus metrics endpoint on
+port 8002. You can use curl to get the metadata of the inference server
+from the HTTP endpoint.
+
+```
+$ curl 34.83.9.133:8000/v2
+```
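+
+You can also verify that the server is live and ready using the health
+endpoints exposed on the same HTTP port (shown here against the example
+external IP; substitute your own):
+
+```
+$ curl -v 34.83.9.133:8000/v2/health/ready
+```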
+
+Follow the [QuickStart](../../docs/getting_started/quickstart.md) to get the example
+image classification client that can be used to perform inferencing
+using image classification models being served by the inference
+server. For example,
+
+```
+$ image_client -u 34.83.9.133:8000 -m inception_graphdef -s INCEPTION -c3 mug.jpg
+Request 0, batch size 1
+Image 'images/mug.jpg':
+ 504 (COFFEE MUG) = 0.723992
+ 968 (CUP) = 0.270953
+ 967 (ESPRESSO) = 0.00115997
+```
+
+## Cleanup
+
+Once you've finished using the inference server you should use helm to
+delete the deployment.
+
+```
+$ helm list
+NAME REVISION UPDATED STATUS CHART APP VERSION NAMESPACE
+example 1 Wed Feb 27 22:16:55 2019 DEPLOYED triton-inference-server-1.0.0 1.0 default
+example-metrics 1 Tue Jan 21 12:24:07 2020 DEPLOYED prometheus-operator-6.18.0 0.32.0 default
+
+$ helm uninstall example
+$ helm uninstall example-metrics
+```
+
+For the Prometheus and Grafana services, you should [explicitly delete
+CRDs](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack#uninstall-helm-chart):
+
+```
+$ kubectl delete crd alertmanagerconfigs.monitoring.coreos.com alertmanagers.monitoring.coreos.com podmonitors.monitoring.coreos.com probes.monitoring.coreos.com prometheuses.monitoring.coreos.com prometheusrules.monitoring.coreos.com servicemonitors.monitoring.coreos.com thanosrulers.monitoring.coreos.com
+```
+
+You may also want to delete the GCS bucket you created to hold the
+model repository.
+
+```
+$ gsutil rm -r gs://triton-inference-server-repository
+```
diff --git a/deploy/gcp/dashboard.json b/deploy/gcp/dashboard.json
new file mode 100644
index 0000000000..8960b41d35
--- /dev/null
+++ b/deploy/gcp/dashboard.json
@@ -0,0 +1,411 @@
+{
+ "__inputs": [
+ {
+ "name": "DS_PROMETHEUS",
+ "label": "Prometheus",
+ "description": "",
+ "type": "datasource",
+ "pluginId": "prometheus",
+ "pluginName": "Prometheus"
+ }
+ ],
+ "__requires": [
+ {
+ "type": "grafana",
+ "id": "grafana",
+ "name": "Grafana",
+ "version": "6.3.5"
+ },
+ {
+ "type": "panel",
+ "id": "graph",
+ "name": "Graph",
+ "version": ""
+ },
+ {
+ "type": "panel",
+ "id": "heatmap",
+ "name": "Heatmap",
+ "version": ""
+ },
+ {
+ "type": "datasource",
+ "id": "prometheus",
+ "name": "Prometheus",
+ "version": "1.0.0"
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "id": null,
+ "links": [],
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 0,
+ "y": 0
+ },
+ "id": 2,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "nv_inference_request_success",
+ "legendFormat": "Success {{instance}}",
+ "refId": "A"
+ },
+ {
+ "expr": "nv_inference_request_failure",
+ "legendFormat": "Failure {{instance}}",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Cumulative Inference Requests",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "cards": {
+ "cardPadding": null,
+ "cardRound": null
+ },
+ "color": {
+ "cardColor": "#b4ff00",
+ "colorScale": "sqrt",
+ "colorScheme": "interpolateReds",
+ "exponent": 0.5,
+ "mode": "spectrum"
+ },
+ "dataFormat": "timeseries",
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 12,
+ "y": 0
+ },
+ "heatmap": {},
+ "hideZeroBuckets": false,
+ "highlightCards": true,
+ "id": 7,
+ "legend": {
+ "show": false
+ },
+ "options": {},
+ "reverseYBuckets": false,
+ "targets": [
+ {
+ "expr": "sum(increase(nv_inference_load_ratio_bucket[1m])) by (le)",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Load Ratio (Total Time / Compute Time)",
+ "tooltip": {
+ "show": true,
+ "showHistogram": false
+ },
+ "type": "heatmap",
+ "xAxis": {
+ "show": true
+ },
+ "xBucketNumber": null,
+ "xBucketSize": null,
+ "yAxis": {
+ "decimals": null,
+ "format": "short",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true,
+ "splitFactor": null
+ },
+ "yBucketBound": "auto",
+ "yBucketNumber": null,
+ "yBucketSize": null
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 9
+ },
+ "id": 4,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(nv_inference_queue_duration_us[30s]) / 1000",
+ "legendFormat": "{{instance}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Queue Time (milliseconds)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "Queue Time (ms)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 9
+ },
+ "id": 5,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(nv_inference_compute_duration_us[30s]) / 1000",
+ "legendFormat": "{{instance}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Compute Time (milliseconds)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "Compute Time (ms)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "refresh": "5s",
+ "schemaVersion": 19,
+ "style": "dark",
+ "tags": [],
+ "templating": {
+ "list": []
+ },
+ "time": {
+ "from": "now-15m",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ]
+ },
+ "timezone": "",
+ "title": "Triton Inference Server",
+ "uid": "slEY4dsZk",
+ "version": 8
+}
diff --git a/deploy/gcp/templates/_helpers.tpl b/deploy/gcp/templates/_helpers.tpl
new file mode 100644
index 0000000000..6dba910012
--- /dev/null
+++ b/deploy/gcp/templates/_helpers.tpl
@@ -0,0 +1,92 @@
+{{/*
+# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/}}
+
+{{/* vim: set filetype=mustache: */}}
+{{/*
+Create inference server name.
+*/}}
+{{- define "triton-inference-server.name" -}}
+{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+
+{{/*
+Create a default fully qualified app name.
+We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
+If release name contains chart name it will be used as a full name.
+*/}}
+{{- define "triton-inference-server.fullname" -}}
+{{- if .Values.fullnameOverride -}}
+{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
+{{- else -}}
+{{- $name := default .Chart.Name .Values.nameOverride -}}
+{{- if contains $name .Release.Name -}}
+{{- .Release.Name | trunc 63 | trimSuffix "-" -}}
+{{- else -}}
+{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+{{- end -}}
+{{- end -}}
+
+{{/*
+ Create inference server metrics service name and fullname derived from above and
+ truncated appropriately.
+*/}}
+{{- define "triton-inference-server-metrics.name" -}}
+{{- $basename := include "triton-inference-server.name" . -}}
+{{- $basename_trimmed := $basename | trunc 55 | trimSuffix "-" -}}
+{{- printf "%s-%s" $basename_trimmed "metrics" -}}
+{{- end -}}
+
+{{- define "triton-inference-server-metrics.fullname" -}}
+{{- $basename := include "triton-inference-server.fullname" . -}}
+{{- $basename_trimmed := $basename | trunc 55 | trimSuffix "-" -}}
+{{- printf "%s-%s" $basename_trimmed "metrics" -}}
+{{- end -}}
+
+{{/*
+ Create inference server metrics monitor name and fullname derived from
+ above and truncated appropriately.
+*/}}
+{{- define "triton-inference-server-metrics-monitor.name" -}}
+{{- $basename := include "triton-inference-server.name" . -}}
+{{- $basename_trimmed := $basename | trunc 47 | trimSuffix "-" -}}
+{{- printf "%s-%s" $basename_trimmed "metrics-monitor" -}}
+{{- end -}}
+
+{{- define "triton-inference-server-metrics-monitor.fullname" -}}
+{{- $basename := include "triton-inference-server.fullname" . -}}
+{{- $basename_trimmed := $basename | trunc 47 | trimSuffix "-" -}}
+{{- printf "%s-%s" $basename_trimmed "metrics-monitor" -}}
+{{- end -}}
+
+{{/*
+Create chart name and version as used by the chart label.
+*/}}
+{{- define "triton-inference-server.chart" -}}
+{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
diff --git a/deploy/gcp/templates/deployment.yaml b/deploy/gcp/templates/deployment.yaml
new file mode 100644
index 0000000000..b7592c7043
--- /dev/null
+++ b/deploy/gcp/templates/deployment.yaml
@@ -0,0 +1,81 @@
+# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: {{ template "triton-inference-server.fullname" . }}
+ namespace: {{ .Release.Namespace }}
+ labels:
+ app: {{ template "triton-inference-server.name" . }}
+ chart: {{ template "triton-inference-server.chart" . }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+spec:
+ replicas: {{ .Values.replicaCount }}
+ selector:
+ matchLabels:
+ app: {{ template "triton-inference-server.name" . }}
+ release: {{ .Release.Name }}
+ template:
+ metadata:
+ labels:
+ app: {{ template "triton-inference-server.name" . }}
+ release: {{ .Release.Name }}
+
+ spec:
+ containers:
+ - name: {{ .Chart.Name }}
+ image: "{{ .Values.image.imageName }}"
+ imagePullPolicy: {{ .Values.image.pullPolicy }}
+
+ resources:
+ limits:
+ nvidia.com/gpu: {{ .Values.image.numGpus }}
+
+ args: ["tritonserver", "--model-store={{ .Values.image.modelRepositoryPath }}"]
+
+ ports:
+ - containerPort: 8000
+ name: http
+ - containerPort: 8001
+ name: grpc
+ - containerPort: 8002
+ name: metrics
+ livenessProbe:
+ httpGet:
+ path: /v2/health/live
+ port: http
+ readinessProbe:
+ initialDelaySeconds: 5
+ periodSeconds: 5
+ httpGet:
+ path: /v2/health/ready
+ port: http
+
+ securityContext:
+ runAsUser: 1000
+ fsGroup: 1000
diff --git a/deploy/gcp/templates/service.yaml b/deploy/gcp/templates/service.yaml
new file mode 100644
index 0000000000..3315fd77db
--- /dev/null
+++ b/deploy/gcp/templates/service.yaml
@@ -0,0 +1,91 @@
+# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+apiVersion: v1
+kind: Service
+metadata:
+ name: {{ template "triton-inference-server.fullname" . }}
+ namespace: {{ .Release.Namespace }}
+ labels:
+ app: {{ template "triton-inference-server.name" . }}
+ chart: {{ template "triton-inference-server.chart" . }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+spec:
+ type: {{ .Values.service.type }}
+ ports:
+ - port: 8000
+ targetPort: http
+ name: http-inference-server
+ - port: 8001
+ targetPort: grpc
+ name: grpc-inference-server
+ - port: 8002
+ targetPort: metrics
+ name: metrics-inference-server
+ selector:
+ app: {{ template "triton-inference-server.name" . }}
+ release: {{ .Release.Name }}
+---
+apiVersion: v1
+kind: Service
+metadata:
+ name: {{ template "triton-inference-server-metrics.fullname" . }}
+ namespace: {{ .Release.Namespace }}
+ labels:
+ app: {{ template "triton-inference-server-metrics.name" . }}
+ chart: {{ template "triton-inference-server.chart" . }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+ annotations:
+ alpha.monitoring.coreos.com/non-namespaced: "true"
+spec:
+ ports:
+ - name: metrics
+ port: 8080
+ targetPort: metrics
+ protocol: TCP
+ selector:
+ app: {{ template "triton-inference-server.name" . }}
+ release: {{ .Release.Name }}
+---
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+ name: {{ template "triton-inference-server-metrics-monitor.fullname" . }}
+ namespace: {{ .Release.Namespace }}
+ labels:
+ app: {{ template "triton-inference-server-metrics-monitor.name" . }}
+ chart: {{ template "triton-inference-server.chart" . }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+spec:
+ selector:
+ matchLabels:
+ app: {{ template "triton-inference-server-metrics.name" . }}
+ endpoints:
+ - port: metrics
+ interval: 15s
diff --git a/deploy/gcp/values.yaml b/deploy/gcp/values.yaml
new file mode 100644
index 0000000000..0173f37b6f
--- /dev/null
+++ b/deploy/gcp/values.yaml
@@ -0,0 +1,36 @@
+# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+replicaCount: 1
+
+image:
+ imageName: nvcr.io/nvidia/tritonserver:24.03-py3
+ pullPolicy: IfNotPresent
+ modelRepositoryPath: gs://triton-inference-server-repository/model_repository
+ numGpus: 1
+
+service:
+ type: LoadBalancer
\ No newline at end of file
diff --git a/deploy/gke-marketplace-app/README.md b/deploy/gke-marketplace-app/README.md
new file mode 100644
index 0000000000..e99b9efbae
--- /dev/null
+++ b/deploy/gke-marketplace-app/README.md
@@ -0,0 +1,201 @@
+
+
+# NVIDIA Triton Inference Server GKE Marketplace Application
+
+**Table Of Contents**
+- [NVIDIA Triton Inference Server GKE Marketplace Application](#nvidia-triton-inference-server-gke-marketplace-application)
+ - [Description](#description)
+ - [Prerequisites](#prerequisites)
+ - [Demo Instruction](#demo-instruction)
+ - [Additional Resources](#additional-resources)
+ - [Known Issues](#known-issues)
+
+## Description
+
+This repository contains the Google Kubernetes Engine (GKE) Marketplace Application for the NVIDIA Triton Inference Server deployer.
+
+ - Triton GKE deployer is a helm chart deployer recommended by GKE Marketplace
+ - Triton GKE deployer deploys a GKE ingress which accepts public inference requests
+ - Triton GKE deployer includes a horizontal pod autoscaler (HPA) which relies on the [Stackdriver custom metrics adapter](https://github.com/GoogleCloudPlatform/k8s-stackdriver/tree/master/custom-metrics-stackdriver-adapter) to monitor GPU duty cycle and automatically scale GPU nodes.
+ - This repo also contains a sample that generates a BERT model with TensorRT and uses Locust to experiment with GPU node autoscaling and to monitor client latency/throughput.
+
+![Cloud Architecture Diagram](diagram.png)
+
+## Prerequisites
+
+ - [Install the Google Cloud SDK on your laptop/client workstation](https://cloud.google.com/sdk/docs/install), so that the `gcloud` CLI can be run on the client, and sign in with your GCP credentials.
+ - Alternatively, you can use [Google Cloud Shell](https://cloud.google.com/shell/docs/launching-cloud-shell).
+
+## Demo Instruction
+
+First, install this Triton GKE app into an existing GKE cluster with a GPU node pool; Google Cloud Marketplace currently doesn't support automatic creation of GPU clusters. Users have to run the following commands to create a compatible cluster (GKE version >= 1.18.7) with GPU node pools. We recommend selecting T4 or A100 (MIG) instance types and choosing the CPU ratio based on profiling of the actual inference workload.
+
+Users need to follow these [instructions](https://cloud.google.com/kubernetes-engine/docs/how-to/kubernetes-service-accounts#creating_a_kubernetes_service_account) to create a Kubernetes service account. In this example, we use `gke-test@k80-exploration.iam.gserviceaccount.com`. Make sure it has access to Artifact Registry and the Monitoring Viewer role. For example, to grant access to the custom metrics required for the HPA to work:
+```
+gcloud iam service-accounts add-iam-policy-binding --role \
+ roles/iam.workloadIdentityUser --member \
+ "serviceAccount:.svc.id.goog[custom-metrics/custom-metrics-stackdriver-adapter]" \
+ @.iam.gserviceaccount.com
+
+kubectl annotate serviceaccount --namespace custom-metrics \
+ custom-metrics-stackdriver-adapter \
+  iam.gke.io/gcp-service-account=<google-service-account>@<project-id>.iam.gserviceaccount.com
+```
+
+Currently, GKE >= 1.18.7 is only supported in the GKE rapid channel; to find the latest version, please visit the [GKE release notes](https://cloud.google.com/kubernetes-engine/docs/release-notes).
+```
+export PROJECT_ID=
+export ZONE=
+export REGION=
+export DEPLOYMENT_NAME=
+# example: export SERVICE_ACCOUNT="gke-test@k80-exploration.iam.gserviceaccount.com"
+export SERVICE_ACCOUNT=
+
+gcloud beta container clusters create ${DEPLOYMENT_NAME} \
+--addons=HorizontalPodAutoscaling,HttpLoadBalancing \
+--service-account=${SERVICE_ACCOUNT} \
+--machine-type=n1-standard-8 \
+--node-locations=${ZONE} \
+--monitoring=SYSTEM \
+--zone=${ZONE} \
+--subnetwork=default \
+--scopes cloud-platform \
+--num-nodes 1 \
+--project ${PROJECT_ID}
+
+# add GPU node pools, user can modify number of node based on workloads
+gcloud container node-pools create accel \
+ --project ${PROJECT_ID} \
+ --zone ${ZONE} \
+ --cluster ${DEPLOYMENT_NAME} \
+ --service-account=${SERVICE_ACCOUNT} \
+ --num-nodes 2 \
+ --accelerator type=nvidia-tesla-t4,count=1 \
+ --enable-autoscaling --min-nodes 2 --max-nodes 3 \
+ --machine-type n1-standard-4 \
+ --disk-size=100 \
+ --scopes cloud-platform \
+ --verbosity error
+
+# so that you can run kubectl locally to the cluster
+gcloud container clusters get-credentials ${DEPLOYMENT_NAME} --project ${PROJECT_ID} --zone ${ZONE}
+
+# deploy NVIDIA device plugin for GKE to prepare GPU nodes for driver install
+kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded-latest.yaml
+
+# make sure you can run kubectl locally to access the cluster
+kubectl create clusterrolebinding cluster-admin-binding --clusterrole cluster-admin --user "$(gcloud config get-value account)"
+
+# enable stackdriver custom metrics adaptor
+kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/k8s-stackdriver/master/custom-metrics-stackdriver-adapter/deploy/production/adapter_new_resource_model.yaml
+
+# create an ip for ingress traffic
+gcloud compute addresses create ingress-triton --global
+```
+
+Creating a cluster and adding GPU nodes could take up to 10 minutes. Please be patient after executing these commands. GPU resources in GCP can be fully utilized, so please try a different zone in case compute resources cannot be allocated. After the GKE cluster is running, run `kubectl get pods --all-namespaces` to make sure the client can access the cluster correctly.
+
+If you would like to experiment with A100 MIG-partitioned GPUs in GKE, please create the node pool with the following command:
+```
+gcloud beta container node-pools create accel \
+ --project ${PROJECT_ID} \
+ --zone ${ZONE} \
+ --cluster ${DEPLOYMENT_NAME} \
+ --service-account=${SERVICE_ACCOUNT} \
+ --num-nodes 1 \
+ --accelerator type=nvidia-tesla-a100,count=1,gpu-partition-size=1g.5gb \
+ --enable-autoscaling --min-nodes 1 --max-nodes 2 \
+ --machine-type=a2-highgpu-1g \
+ --disk-size=100 \
+ --scopes cloud-platform \
+ --verbosity error
+```
+
+Please note that A100 MIG in GKE does not support GPU metrics yet, and Triton GPU metrics are not compatible with A100 MIG. Hence, please disable GPU metrics by unselecting allowGPUMetrics when deploying the Triton GKE app. For the same reason, this deployer does not support inference workload auto-scaling on A100 MIG either.
+
+Second, go to this [GKE Marketplace link](https://console.cloud.google.com/marketplace/details/nvidia-ngc-public/triton-inference-server) to deploy Triton application.
+
+Users can leave everything as default if their models have already been tested/validated with Triton. They can provide a GCS path pointing to the model repository containing their models. By default, we provide a BERT large model optimized by TensorRT in a public demo GCS bucket that is compatible with the `xx.yy` release of Triton Server in `gs://triton_sample_models/xx_yy`. However, please take note of the following about this demo bucket:
+- The TensorRT engine provided in the demo bucket is only compatible with Tesla T4 GPUs.
+- This bucket is located in `us-central1`, so loading from this bucket into Triton in other regions may be affected.
+- The first deployment of this Triton GKE application will be slower than consecutive runs because the image needs to be pulled into the GKE cluster.
+- You can find an example of how this model is generated and uploaded [here](trt-engine/README.md).
+
+Here, `xx.yy` is the version of the NGC Triton container needed.
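+
+For example (illustrative only), you can list the demo repository for a
+given release before pointing the deployer at it, substituting the release
+version for `xx_yy`:
+
+```
+gsutil ls gs://triton_sample_models/xx_yy/
+```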
+
+![GKE Marketplace Application UI](ui.png)
+
+We want to discuss the HPA autoscaling metrics users can leverage. GPU power (percentage of power) tends to be a reliable metric, especially for larger GPUs like V100 and A100. GKE currently natively supports GPU duty cycle, which corresponds to GPU utilization in `nvidia-smi`. We ask users to always profile their model to determine the autoscaling target and metrics. When selecting the right metrics for autoscaling, the goal should be to: 1) meet the SLA requirement, 2) give consideration to transient request load, and 3) keep the GPU as fully utilized as possible. Profiling helps in two ways. First, if users decide to use duty cycle or another GPU metric, it is recommended to establish a baseline linking SLA requirements such as latency with the GPU metric; for example, for model A, latency stays below 10ms 99% of the time when duty cycle is below 80%. Second, profiling also provides insight for model optimization for inference, with tools like [Nsight](https://developer.nvidia.com/nsight-systems).
+
+Once the application is deployed successfully, get the public ip from ingress:
+```
+> kubectl get ingress
+NAME CLASS HOSTS ADDRESS PORTS AGE
+triton-external * 35.186.215.182 80 107s
+```
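+
+As a quick smoke test (illustrative; substitute your own ingress address),
+you can check that the ingress reaches a live and ready Triton server:
+
+```
+> curl http://35.186.215.182/v2/health/ready
+```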
+
+Third, we will try sending requests to the server with the provided client examples.
+
+If you selected to deploy Triton to accept HTTP requests, please launch [Locust](https://docs.locust.io/en/stable/installation.html) with the ingress host and port to query Triton Inference Server. In this [example script](https://github.com/triton-inference-server/server/tree/master/deploy/gke-marketplace-app/client-sample/locustfile_bert.py), we send requests to a Triton server that has loaded, from a GCP bucket, a BERT large TensorRT engine with a sequence length of 128. We simulate 1000 concurrent users as the target and spawn users at a rate of 50 users per second.
+```
+locust -f locustfile_bert.py -H http://${INGRESS_HOST}:${INGRESS_PORT}
+```
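+
+If Locust is not already installed on the client machine, it can typically
+be installed with pip, per the installation guide linked above:
+
+```
+pip install locust
+```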
+
+The client example pushes about ~650 QPS (queries per second) to the Triton server and will trigger auto-scaling of T4 GPU nodes (we recommend using T4 and A100 [MIG] for inference). From the Locust UI, we will observe a drop in the mean and variance of request latency. In the end, after autoscaling, we see the latency stabilize at ~200 ms end to end, from a US client to a Europe server, which is excellent for a model that has 345 million parameters. Since each node uses one T4 plus an n1-standard-4 instance and can handle ~450 QPS, at on-demand prices it costs ($0.35+$0.19)=$0.54/hr, which translates to 3 million inferences per dollar for the BERT large model at batch size 1. Furthermore, with the 3-year commitment price, the hourly rate is ($0.16+$0.08)=$0.24/hr, which translates to 6.75 million inferences per dollar.
+
+![Locust Client Chart](client.png)
+
+Alternatively, users can opt to use
+[Perf Analyzer](https://github.com/triton-inference-server/client/blob/main/src/c++/perf_analyzer/README.md)
+to profile and study the performance of Triton Inference Server. Here we also
+provide a
+[client script](https://github.com/triton-inference-server/server/tree/master/deploy/gke-marketplace-app/client-sample/perf_analyzer_grpc.sh)
+that uses Perf Analyzer to send gRPC requests to the Triton Server GKE
+deployment. The Perf Analyzer client requires the NGC Triton Client container.
+
+```
+bash perf_analyzer_grpc.sh ${INGRESS_HOST}:${INGRESS_PORT}
+```
+
+## Additional Resources
+
+See the following resources to learn more about NVIDIA Triton Inference Server and GKE GPU capabilities.
+
+**Documentation**
+
+- [GPU in Google Kubernetes Engine](https://cloud.google.com/kubernetes-engine/docs/how-to/gpus)
+- [Optimize GPU Performance in Google Cloud Platform](https://cloud.google.com/compute/docs/gpus/optimize-gpus)
+- [Triton Inference Server](https://github.com/triton-inference-server/server)
+- [AI Platform Prediction: Custom container concepts with Triton Server](https://cloud.google.com/solutions/ai-platform-prediction-custom-container-concepts) by [Kevin Tsai](https://github.com/merlin1649)
+- [AI Platform Prediction: Direct model server setup for NVIDIA Triton Inference Server](https://cloud.google.com/solutions/ai-platform-prediction-direct-model-server-nvidia) by [Kevin Tsai](https://github.com/merlin1649)
+
+## Known Issues
+
+- GKE one-click cluster creation doesn't support GPU node pools at the moment; users have to manually create a compatible (>=1.18.7) cluster and attach a node pool (T4 and A100 MIG recommended).
+- When the Horizontal Pod Autoscaler (HPA) scales out and all GPU node pools are already fully utilized, GKE will request a new GPU node, which can take 4-7 minutes; this can be a long wait on top of GPU driver installation and image pulling. We recommend leveraging multi-tier model serving and Triton's priority feature to create a cushion for latency-critical models, and allocating an active standby GPU node for request spikes.
diff --git a/deploy/gke-marketplace-app/benchmark/README.md b/deploy/gke-marketplace-app/benchmark/README.md
new file mode 100644
index 0000000000..5138148035
--- /dev/null
+++ b/deploy/gke-marketplace-app/benchmark/README.md
@@ -0,0 +1,95 @@
+
+
+# Benchmarking with NVIDIA Triton Inference Server GKE Marketplace Application
+
+**Table Of Contents**
+- [Models](#models)
+- [Performance](#performance)
+
+## Models
+
+First, we collect a set of TensorFlow and TensorRT models to compare:
+
+- Get [Distill Bert fine-tuned with Squad Q&A task](https://huggingface.co/distilbert-base-cased-distilled-squad/tree/main) from Huggingface. `wget https://huggingface.co/distilbert-base-cased-distilled-squad/blob/main/saved_model.tar.gz`
+- Get [Bert base fine-tuned with Squad Q&A task](https://huggingface.co/deepset/bert-base-cased-squad2/tree/main) from Huggingface `wget https://huggingface.co/deepset/bert-base-cased-squad2/blob/main/saved_model.tar.gz`
+- Follow [TensorRT Demo BERT](https://github.com/NVIDIA/TensorRT/tree/master/demo/BERT) to convert the BERT base model to a TensorRT engine, choosing a sequence length of 384 to match the previous two TensorFlow models. As the last step, we create the TensorRT engine with two optimization profiles, profile 0 for batch size 1 and profile 1 for batch size 4, by running: `python3 builder.py -m models/fine-tuned/bert_tf_ckpt_base_qa_squad2_amp_384_v19.03.1/model.ckpt -o engines/model.plan -b 8 -s 384 --fp16 --int8 --strict -c models/fine-tuned/bert_tf_ckpt_base_qa_squad2_amp_384_v19.03.1 --squad-json ./squad/train-v2.0.json -v models/fine-tuned/bert_tf_ckpt_base_qa_squad2_amp_384_v19.03.1/vocab.txt --calib-num 100 -iln -imh`. This needs to be run on the respective inference GPU (an engine optimized on A100 cannot be used for inference on T4).
+
+We then place the models into a GCS bucket with the following structure; the `config.pbtxt` files are provided.
+```
+ ├── bert_base_trt_gpu
+ │ ├── 1
+ │ │ └── model.plan
+ │ └── config.pbtxt
+ ├── bert_base_trt_gpu_seqlen128
+ │ ├── 1
+ │ │ └── model.plan
+ │ └── config.pbtxt
+ ├── bert_base_tf_gpu
+ │ ├── 1
+ │ │ └── model.savedmodel
+ │ └── config.pbtxt
+ ├── bert_base_tf_cpu
+ │ ├── 1
+ │ │ └── model.savedmodel
+ │ └── config.pbtxt
+ ├── bert_distill_tf_gpu
+ │ ├── 1
+ │ │ └── model.savedmodel
+ │ └── config.pbtxt
+ └── bert_distill_tf_cpu
+ ├── 1
+ │ └── model.savedmodel
+ └── config.pbtxt
+```
+
+When deploying the Triton GKE application, point the model repository to the directory that contains the structure above with the actual models.
+
+## Performance
+
+We use Triton's Perf Analyzer to benchmark the performance of each model; the Perf Analyzer resides in another pod of the GKE cluster.
+```bash
+export INGRESS_HOST=$(kubectl -n istio-system get service istio-ingressgateway -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
+export INGRESS_PORT=$(kubectl -n istio-system get service istio-ingressgateway -o jsonpath='{.spec.ports[?(@.name=="http2")].port}')
+bash perf_query.sh ${INGRESS_HOST}:${INGRESS_PORT} bert_base_trt_gpu 384
+```
+
+We deploy the models on n1-standard-96 for CPU BERT BASE and Distill BERT and on (n1-standard-4 + T4) for the GPU BERT models. The sequence length of the BERT models is 384 tokens, and we measure latency/throughput with a concurrency sweep using Triton's Perf Analyzer. The latency includes Istio ingress/load balancing and reflects the true round-trip cost within the same GCP zone.
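+
+A concurrency sweep of this kind can also be run directly with Perf Analyzer;
+the command below is an illustrative sketch (model name, endpoint, and sweep
+range are placeholders, and models with dynamic input shapes may additionally
+need `--shape` or `--input-data` arguments):
+
+```bash
+perf_analyzer -m bert_base_trt_gpu \
+    -u ${INGRESS_HOST}:${INGRESS_PORT} \
+    --concurrency-range 1:16:4 \
+    --percentile=95
+```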
+
+For all models with a sequence length of 384:
+
+| Model                  | Latency | Throughput |
+| ---------------------- | ------- | ---------- |
+| CPU BERT BASE          | 700 ms  | 12 QPS     |
+| CPU Distill BERT       | 369 ms  | 24 QPS     |
+| GPU BERT BASE          | 230 ms  | 34.7 QPS   |
+| GPU Distill BERT       | 118 ms  | 73.3 QPS   |
+| GPU TensorRT BERT BASE | 50 ms   | 465 QPS    |
+
+The n1-standard-96 is priced at $4.56/hr, while the n1-standard-4 at $0.19/hr plus a T4 at $0.35/hr totals $0.54/hr. While achieving much lower latency, BERT inference with TensorRT on T4 therefore delivers over 163 times more inferences per dollar than Distill BERT inference on n1-standard-96.
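+
+As a rough check of that figure, inferences per dollar can be computed as
+throughput x 3600 / hourly price:
+
+```
+TensorRT BERT BASE on T4: 465 * 3600 / 0.54 ~= 3,100,000 inferences per dollar
+Distill BERT on CPU:       24 * 3600 / 4.56 ~=    18,900 inferences per dollar
+ratio: 3,100,000 / 18,900 ~= 163
+```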
+
+
+
diff --git a/deploy/gke-marketplace-app/benchmark/model-store/bert_base_tf_cpu/config.pbtxt b/deploy/gke-marketplace-app/benchmark/model-store/bert_base_tf_cpu/config.pbtxt
new file mode 100644
index 0000000000..3bfccb5c45
--- /dev/null
+++ b/deploy/gke-marketplace-app/benchmark/model-store/bert_base_tf_cpu/config.pbtxt
@@ -0,0 +1,35 @@
+# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+max_batch_size: 4
+dynamic_batching {
+ preferred_batch_size: 1
+ max_queue_delay_microseconds: 2000000
+}
+instance_group {
+ count: 2
+ kind: KIND_CPU
+}
diff --git a/deploy/gke-marketplace-app/benchmark/model-store/bert_base_tf_gpu/config.pbtxt b/deploy/gke-marketplace-app/benchmark/model-store/bert_base_tf_gpu/config.pbtxt
new file mode 100644
index 0000000000..b6ca32f9a2
--- /dev/null
+++ b/deploy/gke-marketplace-app/benchmark/model-store/bert_base_tf_gpu/config.pbtxt
@@ -0,0 +1,35 @@
+# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+max_batch_size: 4
+dynamic_batching {
+ preferred_batch_size: 4
+ max_queue_delay_microseconds: 200000
+}
+instance_group {
+ count: 2
+ kind: KIND_GPU
+}
diff --git a/deploy/gke-marketplace-app/benchmark/model-store/bert_base_trt_gpu/config.pbtxt b/deploy/gke-marketplace-app/benchmark/model-store/bert_base_trt_gpu/config.pbtxt
new file mode 100644
index 0000000000..acbd124bf2
--- /dev/null
+++ b/deploy/gke-marketplace-app/benchmark/model-store/bert_base_trt_gpu/config.pbtxt
@@ -0,0 +1,38 @@
+# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+platform: "tensorrt_plan"
+max_batch_size: 4
+dynamic_batching {
+ preferred_batch_size: 4
+ max_queue_delay_microseconds: 200000
+}
+instance_group {
+ count: 2
+ profile: "1"
+ kind: KIND_GPU
+}
+
diff --git a/deploy/gke-marketplace-app/benchmark/model-store/bert_base_trt_gpu_seqlen128/config.pbtxt b/deploy/gke-marketplace-app/benchmark/model-store/bert_base_trt_gpu_seqlen128/config.pbtxt
new file mode 100644
index 0000000000..2ee39e7dbc
--- /dev/null
+++ b/deploy/gke-marketplace-app/benchmark/model-store/bert_base_trt_gpu_seqlen128/config.pbtxt
@@ -0,0 +1,37 @@
+# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+platform: "tensorrt_plan"
+max_batch_size: 8
+dynamic_batching {
+ preferred_batch_size: 8
+ max_queue_delay_microseconds: 200000
+}
+instance_group {
+ count: 2
+ kind: KIND_GPU
+}
+
diff --git a/deploy/gke-marketplace-app/benchmark/model-store/bert_distill_tf_cpu/config.pbtxt b/deploy/gke-marketplace-app/benchmark/model-store/bert_distill_tf_cpu/config.pbtxt
new file mode 100644
index 0000000000..c8e8074309
--- /dev/null
+++ b/deploy/gke-marketplace-app/benchmark/model-store/bert_distill_tf_cpu/config.pbtxt
@@ -0,0 +1,35 @@
+# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+max_batch_size: 4
+dynamic_batching {
+ preferred_batch_size: 1
+ max_queue_delay_microseconds: 2000000
+}
+instance_group {
+ count: 2
+ kind: KIND_CPU
+}
diff --git a/deploy/gke-marketplace-app/benchmark/model-store/bert_distill_tf_gpu/config.pbtxt b/deploy/gke-marketplace-app/benchmark/model-store/bert_distill_tf_gpu/config.pbtxt
new file mode 100644
index 0000000000..b6ca32f9a2
--- /dev/null
+++ b/deploy/gke-marketplace-app/benchmark/model-store/bert_distill_tf_gpu/config.pbtxt
@@ -0,0 +1,35 @@
+# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+max_batch_size: 4
+dynamic_batching {
+ preferred_batch_size: 4
+ max_queue_delay_microseconds: 200000
+}
+instance_group {
+ count: 2
+ kind: KIND_GPU
+}
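Once Triton has loaded this model, the effective batching settings can be read back over the HTTP API. A minimal check, assuming the server is reachable at the ingress address used elsewhere in this deployment (exported here as `INGRESS_HOST`, an assumption):

```bash
# The returned JSON should echo the dynamic_batching and instance_group
# values from the config.pbtxt above.
curl -s "http://${INGRESS_HOST}/v2/models/bert_distill_tf_gpu/config" | python3 -m json.tool
```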
diff --git a/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/perf_query.sh b/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/perf_query.sh
new file mode 100755
index 0000000000..0ce6e120b7
--- /dev/null
+++ b/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/perf_query.sh
@@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+SERVER_HOST=${1:-"${INGRESS_HOST}:${INGRESS_PORT}"} # update with the public ingress IP:port if not exported in the environment
+MODEL_NAME=${2:-"${MODEL_NAME}"}
+SEQ_LENGTH=${3:-"${SEQ_LEN}"}
+BATCH_SIZE=${4:-2}
+MAX_LATENCY=${5:-5000}
+MAX_CLIENT_THREADS=${6:-20}
+MAX_CONCURRENCY=${7:-24}
+MODEL_VERSION=${8:-1}
+precision=${9:-"fp32"}
+PERFCLIENT_PERCENTILE=${10:-90}
+MAX_TRIALS=${12:-40}
+
+ARGS="\
+ --max-threads ${MAX_CLIENT_THREADS} \
+ -m ${MODEL_NAME} \
+ -x ${MODEL_VERSION} \
+ -p 3000 \
+ --async \
+ --concurrency-range 4:${MAX_CONCURRENCY}:2 \
+ -r ${MAX_TRIALS} \
+ -v \
+ -i HTTP \
+ -u ${SERVER_HOST} \
+ -b ${BATCH_SIZE} \
+ -l ${MAX_LATENCY} \
+ -z \
+ --percentile=${PERFCLIENT_PERCENTILE}"
+
+echo "Using args: $(echo "$ARGS" | sed -e 's/ -/\n-/g')"
+
+/workspace/install/bin/perf_client $ARGS -f perf.csv
\ No newline at end of file
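The script reads its settings positionally; a sketch of an invocation from inside the client pod, with illustrative values (the host and model name are assumptions based on the rest of this deployment):

```bash
# host:port           model name                   seq_len batch max_latency_ms
bash perf_query.sh 34.83.65.105:80 bert_base_trt_gpu_seqlen128 128 2 5000
```

Unset trailing arguments fall back to the defaults above.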
diff --git a/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml b/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml
new file mode 100644
index 0000000000..7339361528
--- /dev/null
+++ b/deploy/gke-marketplace-app/benchmark/perf-analyzer-script/triton_client.yaml
@@ -0,0 +1,42 @@
+# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+apiVersion: v1
+kind: Pod
+metadata:
+ labels:
+ app: nv-triton-client
+ name: nv-triton-client
+ namespace: default
+spec:
+ containers:
+ - image: nvcr.io/nvidia/tritonserver:24.03-py3-sdk
+ imagePullPolicy: Always
+ name: nv-triton-client
+ securityContext:
+ privileged: true
+ command: [ "/bin/bash", "-c", "--" ]
+ args: [ "while true; do sleep 30; done;" ]
diff --git a/deploy/gke-marketplace-app/client-sample/bert_request.json b/deploy/gke-marketplace-app/client-sample/bert_request.json
new file mode 100644
index 0000000000..ce4b956db6
--- /dev/null
+++ b/deploy/gke-marketplace-app/client-sample/bert_request.json
@@ -0,0 +1,27 @@
+{
+ "inputs": [{
+ "name": "input_ids",
+ "shape": [1, 128],
+ "datatype": "INT32",
+ "parameters": {},
+ "data": [101, 2054, 2003, 23435, 5339, 1029, 102, 23435, 5339, 2003, 1037, 2152, 2836, 2784, 4083, 28937, 4132, 2008, 18058, 2659, 2397, 9407, 1998, 2152, 2083, 18780, 2005, 18726, 2107, 2004, 16755, 2545, 1010, 4613, 1998, 3746, 1013, 2678, 2006, 1050, 17258, 2401, 14246, 2271, 1012, 2009, 2950, 11968, 8043, 2015, 2000, 12324, 4275, 1010, 1998, 13354, 7076, 2000, 2490, 3117, 23092, 1998, 9014, 2077, 11243, 20600, 2015, 2005, 28937, 1012, 2651, 1050, 17258, 2401, 2003, 2330, 1011, 14768, 6129, 11968, 8043, 2015, 1998, 13354, 7076, 1999, 23435, 5339, 2061, 2008, 1996, 2784, 4083, 2451, 2064, 7661, 4697, 1998, 7949, 2122, 6177, 2000, 2202, 5056, 1997, 3928, 23435, 5339, 20600, 2015, 2005, 2115, 18726, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+ }, {
+ "name": "input_mask",
+ "shape": [1, 128],
+ "datatype": "INT32",
+ "parameters": {},
+ "data": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+ }, {
+ "name": "segment_ids",
+ "shape": [1, 128],
+ "datatype": "INT32",
+ "parameters": {},
+ "data": [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+ }],
+ "outputs": [{
+ "name": "cls_squad_logits",
+ "parameters": {
+ "binary_data": false
+ }
+ }]
+}
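The payload follows the KServe/Triton v2 inference protocol, so it can be posted directly to the model's infer endpoint. A sketch, assuming the ingress address is exported as `INGRESS_HOST` and the model is served under the name `bert` (as in the Locust script below):

```bash
curl -s -X POST "http://${INGRESS_HOST}/v2/models/bert/infer" \
     -H "Content-Type: application/json" \
     -d @bert_request.json
```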
diff --git a/deploy/gke-marketplace-app/client-sample/locustfile_bert.py b/deploy/gke-marketplace-app/client-sample/locustfile_bert.py
new file mode 100755
index 0000000000..aae8c69f43
--- /dev/null
+++ b/deploy/gke-marketplace-app/client-sample/locustfile_bert.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+
+# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import json
+
+from locust import HttpUser, LoadTestShape, between, task
+
+
+class ProfileLoad(LoadTestShape):
+ """
+ This load profile starts at 0 and steps up by step_users
+ increments every tick, up to target_users. After reaching
+ target_user level, load will stay at target_user level
+ until time_limit is reached.
+ """
+
+ target_users = 1000
+ step_users = 50 # ramp users each step
+ time_limit = 3600 # seconds
+
+ def tick(self):
+ num_steps = self.target_users / self.step_users
+ run_time = round(self.get_run_time())
+
+ if run_time < self.time_limit:
+ if run_time < num_steps:
+ # still ramping: add step_users for each elapsed tick (roughly one per second)
+ user_count = run_time * self.step_users
+ else:
+ user_count = self.target_users
+ return (user_count, self.step_users)
+ else:
+ return None
+
+
+class TritonUser(HttpUser):
+ wait_time = between(0.2, 0.2)
+
+ @task()
+ def bert(self):
+ response = self.client.post(self.url1, data=json.dumps(self.data))
+
+ def on_start(self):
+ with open("bert_request.json") as f:
+ self.data = json.load(f)
+
+ self.url1 = "{}/v2/models/{}/infer".format(self.environment.host, "bert")
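Because the LoadTestShape controls the user count, Locust can be started headless without specifying users or a spawn rate. A sketch, assuming `bert_request.json` is in the working directory and the ingress host is known:

```bash
pip install locust   # if not already installed
locust -f locustfile_bert.py --headless --host "http://${INGRESS_HOST}"
```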
diff --git a/deploy/gke-marketplace-app/client-sample/perf_analyzer_grpc.sh b/deploy/gke-marketplace-app/client-sample/perf_analyzer_grpc.sh
new file mode 100755
index 0000000000..ae5476f338
--- /dev/null
+++ b/deploy/gke-marketplace-app/client-sample/perf_analyzer_grpc.sh
@@ -0,0 +1,62 @@
+#!/usr/bin/env bash
+# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+SERVER_HOST=${1:-"${INGRESS_HOST}:${INGRESS_PORT}"} # update with the public ingress IP:port if not exported in the environment
+MODEL_VERSION=${2:-1}
+precision=${3:-"int8"}
+BATCH_SIZE=${4:-1}
+MAX_LATENCY=${5:-500}
+MAX_CLIENT_THREADS=${6:-6}
+MAX_CONCURRENCY=${7:-20}
+MODEL_NAME=${8:-"bert"}
+SEQ_LENGTH=${9:-"128"}
+PERFCLIENT_PERCENTILE=${10:-90}
+STABILITY_PERCENTAGE=${11:-0.01}
+MAX_TRIALS=${12:-1000000}
+
+ARGS="\
+ --max-threads ${MAX_CLIENT_THREADS} \
+ -m ${MODEL_NAME} \
+ -x ${MODEL_VERSION} \
+ -p 1000 \
+ -t ${MAX_CONCURRENCY} \
+ -s ${STABILITY_PERCENTAGE} \
+ -r ${MAX_TRIALS} \
+ -v \
+ -i gRPC \
+ -u ${SERVER_HOST} \
+ -b ${BATCH_SIZE} \
+ -l ${MAX_LATENCY} \
+ -z \
+ --shape=input_ids:${SEQ_LENGTH} \
+ --shape=segment_ids:${SEQ_LENGTH} \
+ --shape=input_mask:${SEQ_LENGTH} \
+ --percentile=${PERFCLIENT_PERCENTILE}"
+
+echo "Using args: $(echo "$ARGS" | sed -e 's/ -/\n-/g')"
+
+/workspace/install/bin/perf_client $ARGS
diff --git a/deploy/gke-marketplace-app/client.png b/deploy/gke-marketplace-app/client.png
new file mode 100644
index 0000000000..1fe3dbe7d5
Binary files /dev/null and b/deploy/gke-marketplace-app/client.png differ
diff --git a/deploy/gke-marketplace-app/diagram.png b/deploy/gke-marketplace-app/diagram.png
new file mode 100644
index 0000000000..7592672e94
Binary files /dev/null and b/deploy/gke-marketplace-app/diagram.png differ
diff --git a/deploy/gke-marketplace-app/server-deployer/Dockerfile b/deploy/gke-marketplace-app/server-deployer/Dockerfile
new file mode 100644
index 0000000000..5bb34adc65
--- /dev/null
+++ b/deploy/gke-marketplace-app/server-deployer/Dockerfile
@@ -0,0 +1,28 @@
+# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+FROM gcr.io/cloud-marketplace-tools/k8s/deployer_helm/onbuild
+
diff --git a/deploy/gke-marketplace-app/server-deployer/build_and_push.sh b/deploy/gke-marketplace-app/server-deployer/build_and_push.sh
new file mode 100755
index 0000000000..8114dbe6f8
--- /dev/null
+++ b/deploy/gke-marketplace-app/server-deployer/build_and_push.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+export REGISTRY=gcr.io/$(gcloud config get-value project | tr ':' '/')
+export APP_NAME=tritonserver
+export MAJOR_VERSION=2.41
+export MINOR_VERSION=2.44.0
+export NGC_VERSION=24.03-py3
+
+docker pull nvcr.io/nvidia/$APP_NAME:$NGC_VERSION
+
+docker tag nvcr.io/nvidia/$APP_NAME:$NGC_VERSION $REGISTRY/$APP_NAME:$MAJOR_VERSION
+docker tag nvcr.io/nvidia/$APP_NAME:$NGC_VERSION $REGISTRY/$APP_NAME:$MINOR_VERSION
+docker tag nvcr.io/nvidia/$APP_NAME:$NGC_VERSION $REGISTRY/$APP_NAME:$NGC_VERSION
+
+docker push $REGISTRY/$APP_NAME:$MINOR_VERSION
+docker push $REGISTRY/$APP_NAME:$MAJOR_VERSION
+docker push $REGISTRY/$APP_NAME:$NGC_VERSION
+
+docker build --tag $REGISTRY/$APP_NAME/deployer .
+
+docker tag $REGISTRY/$APP_NAME/deployer $REGISTRY/$APP_NAME/deployer:$MAJOR_VERSION
+docker tag $REGISTRY/$APP_NAME/deployer $REGISTRY/$APP_NAME/deployer:$MINOR_VERSION
+docker push $REGISTRY/$APP_NAME/deployer:$MAJOR_VERSION
+docker push $REGISTRY/$APP_NAME/deployer:$MINOR_VERSION
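The script assumes Docker is already authorized against both registries; a hedged pre-flight sketch (credentials and project are whatever your environment uses):

```bash
gcloud auth login                # or activate a service account
gcloud auth configure-docker     # allow docker to push to gcr.io
docker login nvcr.io             # NGC credentials, if required to pull the Triton image
```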
diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml
new file mode 100644
index 0000000000..73590f2ea0
--- /dev/null
+++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/Chart.yaml
@@ -0,0 +1,31 @@
+# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+apiVersion: v1
+appVersion: "2.41"
+description: Triton Inference Server
+name: triton-inference-server
+version: 2.44.0
diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/logo.png b/deploy/gke-marketplace-app/server-deployer/chart/triton/logo.png
new file mode 100644
index 0000000000..9c70ab77fb
Binary files /dev/null and b/deploy/gke-marketplace-app/server-deployer/chart/triton/logo.png differ
diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/_helpers.tpl b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/_helpers.tpl
new file mode 100644
index 0000000000..cd4ef9264a
--- /dev/null
+++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/_helpers.tpl
@@ -0,0 +1,60 @@
+{{/*
+# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/}}
+
+{{/* vim: set filetype=mustache: */}}
+{{/*
+Expand the name of the chart.
+*/}}
+{{- define "triton-inference-server.name" -}}
+{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+
+{{/*
+Create a default fully qualified app name.
+We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
+If release name contains chart name it will be used as a full name.
+*/}}
+{{- define "triton-inference-server.fullname" -}}
+{{- if .Values.fullnameOverride -}}
+{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
+{{- else -}}
+{{- $name := default .Chart.Name .Values.nameOverride -}}
+{{- if contains $name .Release.Name -}}
+{{- .Release.Name | trunc 63 | trimSuffix "-" -}}
+{{- else -}}
+{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+{{- end -}}
+{{- end -}}
+
+{{/*
+Create chart name and version as used by the chart label.
+*/}}
+{{- define "triton-inference-server.chart" -}}
+{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/application.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/application.yaml
new file mode 100644
index 0000000000..28bfbf08c4
--- /dev/null
+++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/application.yaml
@@ -0,0 +1,68 @@
+# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+{{ if and .Values.gcpMarketplace (eq .Values.gcpMarketplace true) }}
+---
+apiVersion: app.k8s.io/v1beta1
+kind: Application
+metadata:
+ name: "{{ .Release.Name }}"
+ annotations:
+ kubernetes-engine.cloud.google.com/icon: >-
+ data:image/png;base64,{{ .Files.Get "logo.png" | b64enc }}
+ marketplace.cloud.google.com/deploy-info: '{"partner_id": "nvidia", "product_id": "triton", "partner_name": "NVIDIA"}'
+ labels:
+ app.kubernetes.io/name: "{{ .Release.Name }}"
+spec:
+ descriptor:
+ type: Triton
+ version: "{{ .Values.publishedVersion }}"
+ description: |-
+ Triton Inference Server provides a cloud and edge inferencing solution
+ optimized for both CPUs and GPUs. Triton supports an HTTP/REST and GRPC
+ protocol that allows remote clients to request inferencing for any model
+ being managed by the server.
+
+ notes: |-
+
+ Send requests to the Triton server through the external IP address of the
+ "ingress-triton" ingress, i.e. http://IP:80/v2/models/{model}/infer
+
+ Links:
+ - [NVIDIA Triton page](https://developer.nvidia.com/nvidia-triton-inference-server)
+ - [Documentation](https://github.com/triton-inference-server/server)
+
+ selector:
+ matchLabels:
+ app.kubernetes.io/name: "{{ .Release.Name }}"
+ componentKinds:
+ - group: apps/v1
+ kind: Deployment
+ - group: v1
+ kind: Service
+ - group: autoscaling/v2
+ kind: HorizontalPodAutoscaler
+{{ end }}
diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/deployment.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/deployment.yaml
new file mode 100644
index 0000000000..75ac1aee81
--- /dev/null
+++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/deployment.yaml
@@ -0,0 +1,93 @@
+# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: {{ template "triton-inference-server.name" . }}
+ namespace: {{ .Release.Namespace }}
+ labels:
+ app: {{ template "triton-inference-server.name" . }}
+ chart: {{ template "triton-inference-server.chart" . }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+spec:
+ replicas: {{ .Values.initReplicaCount }}
+ selector:
+ matchLabels:
+ app: {{ template "triton-inference-server.name" . }}
+ release: {{ .Release.Name }}
+ template:
+ metadata:
+ labels:
+ app: {{ template "triton-inference-server.name" . }}
+ release: {{ .Release.Name }}
+
+ spec:
+ containers:
+ - name: {{ .Chart.Name }}
+ image: "{{ .Values.image.registry }}/{{ .Values.image.repository }}:{{ .Values.image.tag }}"
+ imagePullPolicy: {{ .Values.image.pullPolicy }}
+
+ resources:
+ limits:
+ nvidia.com/gpu: {{ .Values.image.numGpus }}
+ env:
+ - name: LD_PRELOAD
+ value: {{ .Values.image.ldPreloadPath }}
+ args: ["tritonserver", "--model-store={{ .Values.modelRepositoryPath }}",
+ "--strict-model-config={{ .Values.image.strictModelConfig }}",
+ "--log-verbose={{ .Values.image.logVerboseLevel }}",
+ "--allow-gpu-metrics={{ .Values.image.allowGPUMetrics }}"]
+
+ ports:
+ - containerPort: 8000
+ name: http
+ - containerPort: 8001
+ name: grpc
+ - containerPort: 8002
+ name: metrics
+ livenessProbe:
+ httpGet:
+ path: /v2/health/live
+ port: http
+ initialDelaySeconds: {{ .Values.deployment.livenessProbe.initialDelaySeconds }}
+ periodSeconds: {{ .Values.deployment.livenessProbe.periodSeconds }}
+ timeoutSeconds: {{ .Values.deployment.livenessProbe.timeoutSeconds }}
+ successThreshold: {{ .Values.deployment.livenessProbe.successThreshold }}
+ failureThreshold: {{ .Values.deployment.livenessProbe.failureThreshold }}
+ readinessProbe:
+ httpGet:
+ path: /v2/health/ready
+ port: http
+ initialDelaySeconds: {{ .Values.deployment.readinessProbe.initialDelaySeconds }}
+ periodSeconds: {{ .Values.deployment.readinessProbe.periodSeconds }}
+ timeoutSeconds: {{ .Values.deployment.readinessProbe.timeoutSeconds }}
+ successThreshold: {{ .Values.deployment.readinessProbe.successThreshold }}
+ failureThreshold: {{ .Values.deployment.readinessProbe.failureThreshold }}
+
+ securityContext:
+ runAsUser: 1000
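The probes hit Triton's standard health endpoints, so the same checks can be run by hand once a pod is up; a sketch, assuming the service is reachable at `INGRESS_HOST`:

```bash
# 200 indicates live/ready; anything else means the server is still loading or unhealthy
curl -s -o /dev/null -w "%{http_code}\n" "http://${INGRESS_HOST}/v2/health/live"
curl -s -o /dev/null -w "%{http_code}\n" "http://${INGRESS_HOST}/v2/health/ready"
```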
diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/hpa.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/hpa.yaml
new file mode 100644
index 0000000000..89275ea7de
--- /dev/null
+++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/hpa.yaml
@@ -0,0 +1,49 @@
+# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+ name: triton-hpa
+ namespace: {{ .Release.Namespace }}
+ labels:
+ app: triton-hpa
+spec:
+ minReplicas: {{ .Values.minReplicaCount }}
+ maxReplicas: {{ .Values.maxReplicaCount }}
+ metrics:
+ - type: External
+ external:
+ metric:
+ name: kubernetes.io|container|accelerator|duty_cycle
+ target:
+ type: AverageValue
+ averageValue: {{ .Values.HPATargetAverageValue }}
+
+ scaleTargetRef:
+ apiVersion: apps/v1
+ kind: Deployment
+ name: {{ template "triton-inference-server.name" . }}
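After installation, the autoscaler's view of the duty-cycle metric can be inspected with kubectl; a sketch, assuming the release was deployed into the default namespace:

```bash
kubectl get hpa triton-hpa --namespace default
kubectl describe hpa triton-hpa --namespace default   # shows current vs. target metric value
```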
diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/ingress.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/ingress.yaml
new file mode 100644
index 0000000000..2b6da5fe18
--- /dev/null
+++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/ingress.yaml
@@ -0,0 +1,48 @@
+# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+ name: triton-external
+ annotations:
+ kubernetes.io/ingress.class: "gce"
+ kubernetes.io/ingress.global-static-ip-name: "ingress-triton"
+spec:
+ rules:
+ - http:
+ paths:
+ - path: "/"
+ pathType: Prefix
+ backend:
+ service:
+ name: triton-inference-server
+ port:
+ {{ if eq .Values.tritonProtocol "gRPC" }}
+ number: 8001
+ {{ else }}
+ number: 8000
+ {{ end }}
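The `kubernetes.io/ingress.global-static-ip-name` annotation assumes a global static IP named `ingress-triton` has already been reserved; it could be created along these lines:

```bash
gcloud compute addresses create ingress-triton --global
gcloud compute addresses describe ingress-triton --global --format='value(address)'
```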
diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/service.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/service.yaml
new file mode 100644
index 0000000000..93ef6f9da3
--- /dev/null
+++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/templates/service.yaml
@@ -0,0 +1,55 @@
+# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+apiVersion: v1
+kind: Service
+metadata:
+ name: {{ template "triton-inference-server.name" . }}
+ namespace: {{ .Release.Namespace }}
+ annotations:
+ cloud.google.com/neg: '{"ingress": true}'
+ labels:
+ app: {{ template "triton-inference-server.name" . }}
+ chart: {{ template "triton-inference-server.chart" . }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+spec:
+ type: {{ .Values.service.type }}
+ ports:
+ - port: 8000
+ targetPort: http
+ name: http-inference-server
+ - port: 8001
+ targetPort: grpc
+ name: grpc-inference-server
+ - port: 8002
+ targetPort: metrics
+ name: metrics-inference-server
+ selector:
+ app: {{ template "triton-inference-server.name" . }}
+ release: {{ .Release.Name }}
+
+
diff --git a/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml b/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml
new file mode 100644
index 0000000000..3e5eac70b5
--- /dev/null
+++ b/deploy/gke-marketplace-app/server-deployer/chart/triton/values.yaml
@@ -0,0 +1,66 @@
+# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+initReplicaCount: 1
+minReplicaCount: 1
+maxReplicaCount: 3
+# choose between gRPC and HTTP
+tritonProtocol: HTTP
+# HPA GPU utilization autoscaling target
+HPATargetAverageValue: 85
+modelRepositoryPath: gs://triton_sample_models/24_03
+publishedVersion: '2.44.0'
+gcpMarketplace: true
+
+image:
+ registry: gcr.io
+ repository: nvidia-ngc-public/tritonserver
+ tag: 24.03-py3
+ pullPolicy: IfNotPresent
+ # modify the model repository here to match your GCP storage bucket
+ numGpus: 1
+ strictModelConfig: False
+ # optional custom library (e.g. custom ops used by the model) to preload via LD_PRELOAD
+ ldPreloadPath: ''
+ logVerboseLevel: 0
+ allowGPUMetrics: True
+
+service:
+ type: NodePort
+
+deployment:
+ livenessProbe:
+ failureThreshold: 60
+ initialDelaySeconds: 10
+ periodSeconds: 5
+ successThreshold: 1
+ timeoutSeconds: 1
+ readinessProbe:
+ failureThreshold: 60
+ initialDelaySeconds: 10
+ periodSeconds: 5
+ successThreshold: 1
+ timeoutSeconds: 1
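Outside the GCP Marketplace flow, these values can also be overridden directly with Helm. A sketch, assuming it is run from the repository root and `gs://my-bucket/models` is a placeholder for your own bucket:

```bash
helm install triton ./deploy/gke-marketplace-app/server-deployer/chart/triton \
  --set gcpMarketplace=false \
  --set modelRepositoryPath=gs://my-bucket/models
```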
diff --git a/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml b/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml
new file mode 100644
index 0000000000..9fd8cbe1c4
--- /dev/null
+++ b/deploy/gke-marketplace-app/server-deployer/data-test/schema.yaml
@@ -0,0 +1,123 @@
+# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+x-google-marketplace:
+ schemaVersion: v2
+ applicationApiVersion: v1beta1
+ publishedVersion: '2.44.0'
+ publishedVersionMetadata:
+ releaseNote: >-
+ Initial release.
+ releaseTypes:
+ - Feature
+ recommended: true
+
+ clusterConstraints:
+ k8sVersion: ">=1.18.7"
+ assistedClusterCreation:
+ type: DISABLED
+ creationGuidance: GKE currently doesn't support auto-creating GPU clusters; please refer to the Triton GKE Marketplace Deployer instructions to manually create a GKE cluster >= 1.18.7 and add GPU node pools
+ resources:
+ - requests:
+ gpu:
+ nvidia.com/gpu: {}
+ istio:
+ type: REQUIRED
+
+ images:
+ '':
+ properties:
+ triton.image.registry:
+ type: REGISTRY
+ triton.image.repository:
+ type: REPO_WITHOUT_REGISTRY
+ triton.image.tag:
+ type: TAG
+
+properties:
+ name:
+ type: string
+ x-google-marketplace:
+ type: NAME
+ namespace:
+ type: string
+ x-google-marketplace:
+ type: NAMESPACE
+ initReplicaCount:
+ title: Initial number of Triton pod instances to deploy.
+ type: integer
+ default: 1
+ minReplicaCount:
+ title: Minimum number of Triton pod instances in the deployment for autoscaling.
+ type: integer
+ default: 1
+ maxReplicaCount:
+ title: Maximum number of Triton pod instances in the deployment for autoscaling.
+ type: integer
+ default: 3
+ tritonProtocol:
+ title: Request protocol for sending data to Triton; choose either gRPC or HTTP.
+ type: string
+ default: HTTP
+ HPATargetAverageValue:
+ title: HPA autoscaling target. GKE currently supports Duty Cycle (GPU utilization); when the target is exceeded, the Triton service adds another pod instance. Analyze your model's inference profile to pick a GPU metric target that meets your latency requirement, and leave some headroom to absorb transient load. If you want to customize the autoscaling metric, GPU power (percentage of power), queue time, or SLA measurements such as latency are recommended alternatives.
+ type: integer
+ default: 85
+ modelRepositoryPath:
+ type: string
+ title: Bucket where models are stored. Make sure the user/service account that creates the GKE app has permission to access this GCS bucket. See the Triton documentation for model configuration and repository layout details; TensorRT, TensorFlow, PyTorch, ONNX, and other backends are supported.
+ default: gs://triton_sample_models/models
+ image.ldPreloadPath:
+ type: string
+ title: Leave this empty by default. Triton lets users add custom backend layers such as TensorRT plugins or TensorFlow custom ops; the compiled shared library must be provided via the LD_PRELOAD environment variable.
+ default: ''
+ image.logVerboseLevel:
+ type: integer
+ title: Set the verbose logging level. Zero (0) disables verbose logging and values >= 1 enable it; this is helpful when you are unsure whether a model is compatible with Triton, or for general debugging.
+ default: 0
+ image.strictModelConfig:
+ type: boolean
+ title: Leave this unchecked by default. When strictModelConfig is unchecked (False), Triton tries to infer the model configuration from the model file; when checked (True), you must provide a config.pbtxt in the model repository.
+ default: False
+ image.allowGPUMetrics:
+ type: boolean
+ title: Selected by default. When using A100 MIG, unselect this to disable the GPU memory metrics reported by Triton, since GPU metrics are not currently supported on A100 MIG.
+ default: True
+ istioEnabled:
+ type: boolean
+ x-google-marketplace:
+ type: ISTIO_ENABLED
+ default: True
+
+
+required:
+- name
+- namespace
+- modelRepositoryPath
+
+form:
+- widget: help
+ description: GKE currently doesn't support auto-creating GPU clusters; please refer to the Triton GKE Marketplace Deployer instructions to manually create a GKE cluster >= 1.18.7 and add GPU node pools. Also refer to the Triton GitHub page for product information.
diff --git a/deploy/gke-marketplace-app/server-deployer/schema.yaml b/deploy/gke-marketplace-app/server-deployer/schema.yaml
new file mode 100644
index 0000000000..0efdef3e72
--- /dev/null
+++ b/deploy/gke-marketplace-app/server-deployer/schema.yaml
@@ -0,0 +1,123 @@
+# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+x-google-marketplace:
+ schemaVersion: v2
+ applicationApiVersion: v1beta1
+ publishedVersion: '2.44.0'
+ publishedVersionMetadata:
+ releaseNote: >-
+ Initial release.
+ releaseTypes:
+ - Feature
+ recommended: true
+
+ clusterConstraints:
+ k8sVersion: ">=1.18.7"
+ assistedClusterCreation:
+ type: DISABLED
+ creationGuidance: GKE currently doesn't support auto-creating GPU clusters; please refer to the Triton GKE Marketplace Deployer instructions to manually create a GKE cluster >= 1.18.7 and add GPU node pools
+ resources:
+ - requests:
+ gpu:
+ nvidia.com/gpu: {}
+ istio:
+ type: REQUIRED
+
+ images:
+ '':
+ properties:
+ triton.image.registry:
+ type: REGISTRY
+ triton.image.repository:
+ type: REPO_WITHOUT_REGISTRY
+ triton.image.tag:
+ type: TAG
+
+properties:
+ name:
+ type: string
+ x-google-marketplace:
+ type: NAME
+ namespace:
+ type: string
+ x-google-marketplace:
+ type: NAMESPACE
+ initReplicaCount:
+ title: Initial number of Triton pod instances to deploy.
+ type: integer
+ default: 1
+ minReplicaCount:
+ title: Minimum number of Triton pod instances in the deployment for autoscaling.
+ type: integer
+ default: 1
+ maxReplicaCount:
+ title: Maximum number of Triton pod instances in the deployment for autoscaling.
+ type: integer
+ default: 3
+ tritonProtocol:
+ title: Request protocol for sending data to Triton; choose either gRPC or HTTP.
+ type: string
+ default: HTTP
+ HPATargetAverageValue:
+ title: HPA autoscaling target. GKE currently supports Duty Cycle (GPU utilization); when the target is exceeded, the Triton service adds another pod instance. Analyze your model's inference profile to pick a GPU metric target that meets your latency requirement, and leave some headroom to absorb transient load. If you want to customize the autoscaling metric, GPU power (percentage of power), queue time, or SLA measurements such as latency are recommended alternatives.
+ type: integer
+ default: 85
+ modelRepositoryPath:
+ type: string
+ title: Bucket where models are stored. Make sure the user/service account that creates the GKE app has permission to access this GCS bucket. See the Triton documentation for model configuration and repository layout details; TensorRT, TensorFlow, PyTorch, ONNX, and other backends are supported.
+ default: gs://triton_sample_models/24_03
+ image.ldPreloadPath:
+ type: string
+ title: Leave this empty by default. Triton lets users add custom backend layers such as TensorRT plugins or TensorFlow custom ops; the compiled shared library must be provided via the LD_PRELOAD environment variable.
+ default: ''
+ image.logVerboseLevel:
+ type: integer
+ title: Set the verbose logging level. Zero (0) disables verbose logging and values >= 1 enable it; this is helpful when you are unsure whether a model is compatible with Triton, or for general debugging.
+ default: 0
+ image.strictModelConfig:
+ type: boolean
+ title: Leave this unchecked by default. When strictModelConfig is unchecked (False), Triton tries to infer the model configuration from the model file; when checked (True), you must provide a config.pbtxt in the model repository.
+ default: False
+ image.allowGPUMetrics:
+ type: boolean
+ title: Selected by default. When using A100 MIG, unselect this to disable the GPU memory metrics reported by Triton, since GPU metrics are not currently supported on A100 MIG.
+ default: True
+ istioEnabled:
+ type: boolean
+ x-google-marketplace:
+ type: ISTIO_ENABLED
+ default: True
+
+
+required:
+- name
+- namespace
+- modelRepositoryPath
+
+form:
+- widget: help
+ description: GKE currently doesn't support auto-creating GPU clusters; please refer to the Triton GKE Marketplace Deployer instructions to manually create a GKE cluster >= 1.18.7 and add GPU node pools. Also refer to the Triton GitHub page for product information.
diff --git a/deploy/gke-marketplace-app/trt-engine/README.md b/deploy/gke-marketplace-app/trt-engine/README.md
new file mode 100644
index 0000000000..fd9ad2e0a5
--- /dev/null
+++ b/deploy/gke-marketplace-app/trt-engine/README.md
@@ -0,0 +1,63 @@
+
+
+# Instructions to create the BERT engine for each Triton update
+
+## Description
+
+The commands below build an INT8 BERT-large TensorRT engine inside the TensorRT
+container and copy it into the GCS model repository used by the GKE app.
+
+```
+docker run --gpus all -it --network host \
+ --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 \
+ -v ~:/scripts nvcr.io/nvidia/tensorrt:24.03-py3
+
+pip install onnx six torch tf2onnx tensorflow
+
+git clone -b main https://github.com/NVIDIA/TensorRT.git
+cd TensorRT
+git submodule update --init --recursive
+
+export TRT_OSSPATH=/workspace/TensorRT
+export TRT_LIBPATH=/lib/x86_64-linux-gnu
+
+pushd /usr/local/bin && wget https://ngc.nvidia.com/downloads/ngccli_cat_linux.zip && unzip ngccli_cat_linux.zip && chmod u+x ngc-cli/ngc && rm ngccli_cat_linux.zip ngc-cli.md5 && ln -s ngc-cli/ngc ngc && echo -e "no-apikey\nascii\n" | ngc config set
+
+popd
+
+cd /workspace/TensorRT/demo/BERT
+bash ./scripts/download_squad.sh
+bash ./scripts/download_model.sh large 128
+# bash ./scripts/download_model.sh large 384
+
+mkdir -p engines
+
+python3 builder.py -m models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/model.ckpt -o engines/bert_large_int8_bs1_s128.engine -b 1 -s 128 -c models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/ -v models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/vocab.txt --int8 --fp16 --strict --calib-num 1 -iln -imh
+
+gsutil cp engines/bert_large_int8_bs1_s128.engine gs://triton_sample_models/24_03/bert/1/model.plan
+```
+
+For each Triton upgrade, the container version used to generate the model and the model path in GCS (`gs://triton_sample_models/24_03/`) should be updated to match the new version.
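Triton expects one directory per model with numeric version subdirectories, so a quick sanity check of the bucket layout after the copy above might look like this (a sketch):

```bash
gsutil ls -r gs://triton_sample_models/24_03/bert/
# expected to include:
# gs://triton_sample_models/24_03/bert/1/model.plan
```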
diff --git a/deploy/gke-marketplace-app/ui.png b/deploy/gke-marketplace-app/ui.png
new file mode 100644
index 0000000000..7afec326ee
Binary files /dev/null and b/deploy/gke-marketplace-app/ui.png differ
diff --git a/deploy/k8s-onprem/Chart.yaml b/deploy/k8s-onprem/Chart.yaml
new file mode 100644
index 0000000000..92830bc297
--- /dev/null
+++ b/deploy/k8s-onprem/Chart.yaml
@@ -0,0 +1,44 @@
+# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+apiVersion: v2
+appVersion: "1.0"
+description: Triton Inference Server
+name: triton-inference-server
+version: 1.0.0
+dependencies:
+ - name: traefik
+ version: "~10.6.2"
+ repository: "https://helm.traefik.io/traefik"
+ tags:
+ - loadBalancing
+ - name: prometheus-adapter
+ version: "~3.0.0"
+ repository: "https://prometheus-community.github.io/helm-charts"
+ tags:
+ - autoscaling
+
+
diff --git a/deploy/k8s-onprem/README.md b/deploy/k8s-onprem/README.md
new file mode 100644
index 0000000000..4287b23c35
--- /dev/null
+++ b/deploy/k8s-onprem/README.md
@@ -0,0 +1,329 @@
+
+
+[![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause)
+
+# Kubernetes Deploy: NVIDIA Triton Inference Server Cluster
+
+This repository includes a Helm chart and instructions for installing NVIDIA Triton
+Inference Server in an on-premises or AWS EC2 Kubernetes cluster. You can also use this
+repository to enable load balancing and autoscaling for your Triton cluster.
+
+This guide assumes you already have a functional Kubernetes cluster with support for GPUs.
+See the [NVIDIA GPU Operator documentation](https://docs.nvidia.com/datacenter/cloud-native/kubernetes/install-k8s.html)
+for instructions on how to install Kubernetes and enable GPU access in your Kubernetes cluster.
+You must also have Helm installed (see [Installing Helm](#installing-helm) for instructions). Note the following requirements:
+
+* To deploy Prometheus and Grafana to collect and display Triton metrics, your cluster must contain sufficient CPU resources to support these services.
+
+* To use GPUs for inferencing, your cluster must be configured to contain the desired number of GPU nodes, with
+support for the NVIDIA driver and CUDA version required by the version
+of the inference server you are using.
+
+* To enable autoscaling, your cluster's kube-apiserver must have the [aggregation layer
+enabled](https://kubernetes.io/docs/tasks/extend-kubernetes/configure-aggregation-layer/).
+This allows the horizontal pod autoscaler to read custom metrics from the Prometheus adapter (see the checks after this list).
+
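+The GPU and aggregation layer requirements above can be sanity-checked with generic kubectl queries (standard kubectl commands, not specific to this chart):
+
+```
+# Nodes should advertise allocatable nvidia.com/gpu resources.
+kubectl describe nodes | grep -i "nvidia.com/gpu"
+
+# The aggregation layer should be serving aggregated metrics API services.
+kubectl get apiservices | grep metrics
+```
+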
+This Helm chart is available from [Triton Inference Server
+GitHub.](https://github.com/triton-inference-server/server)
+
+For more information on Helm and Helm charts, visit the [Helm documentation](https://helm.sh/docs/).
+
+## Quickstart
+
+First, clone this repository to a local machine. Then, execute the following commands:
+
+Install Helm
+
+```
+$ curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3
+$ chmod 700 get_helm.sh
+$ ./get_helm.sh
+```
+
+Deploy Prometheus and Grafana
+
+```
+$ helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+$ helm repo update
+$ helm install example-metrics --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false prometheus-community/kube-prometheus-stack
+```
+
+Deploy Triton with default settings
+
+```
+helm install example ./deploy/k8s-onprem
+```
+
+
+
+
+
+## Installing Helm
+
+### Helm v3
+
+If you do not already have Helm installed in your Kubernetes cluster,
+executing the following steps from the [official Helm install
+guide](https://helm.sh/docs/intro/install/) will
+give you a quick setup.
+
+If you are currently using Helm v2 and would like to migrate to Helm v3,
+see the [official migration guide](https://helm.sh/docs/topics/v2_v3_migration/).
+
+## Model Repository
+If you already have a model repository, you may use that with this Helm
+chart. If you do not have a model repository, you can check out a local
+copy of the server source repository to create an example
+model repository:
+
+```
+$ git clone https://github.com/triton-inference-server/server.git
+```
+
+Triton Server needs a repository of models that it will make available
+for inferencing. For this example, we are using an existing NFS server and
+placing our model files there. See the
+[Model Repository documentation](../../docs/user_guide/model_repository.md) for other
+supported locations.
+
+Following the [QuickStart](../../docs/getting_started/quickstart.md), download the
+example model repository to your system and copy it onto your NFS server.
+Then, add the URL or IP address of your NFS server and the server path of your
+model repository to `values.yaml`.
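+
+As a minimal sketch of that step (following the QuickStart; the NFS host below is a placeholder, and `/srv/models` matches the chart's default `modelRepositoryPath`):
+
+```
+# Fetch the example models and stage them on the NFS export used by the chart.
+cd server/docs/examples
+./fetch_models.sh
+scp -r model_repository/* <nfs-server>:/srv/models/
+```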
+
+
+## Deploy Prometheus and Grafana
+
+The inference server metrics are collected by Prometheus and viewable
+through Grafana. The inference server Helm chart assumes that Prometheus
+and Grafana are available, so this step must be followed even if you
+do not want to use Grafana.
+
+Use the [kube-prometheus-stack](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack) Helm chart to install these components. The
+*serviceMonitorSelectorNilUsesHelmValues* flag is needed so that
+Prometheus can find the inference server metrics in the *example*
+release deployed in a later section.
+
+```
+$ helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+$ helm repo update
+$ helm install example-metrics --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false prometheus-community/kube-prometheus-stack
+```
+
+Then port-forward to the Grafana service so you can access it from
+your local browser.
+
+```
+$ kubectl port-forward service/example-metrics-grafana 8080:80
+```
+
+Now you should be able to navigate in your browser to localhost:8080
+and see the Grafana login page. Use username=admin and
+password=prom-operator to log in.
+
+An example Grafana dashboard is available in dashboard.json. Use the
+import function in Grafana to import and view this dashboard.
+
+## Enable Autoscaling
+To enable autoscaling, ensure that the autoscaling tag in `values.yaml` is set to `true`.
+This will do two things:
+
+1. Deploy a Horizontal Pod Autoscaler that will scale replicas of the triton-inference-server
+based on the information included in `values.yaml`.
+
+2. Install the [prometheus-adapter](https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-adapter) Helm chart, allowing the Horizontal Pod Autoscaler to scale
+based on custom metrics from Prometheus.
+
+The included configuration will scale Triton pods based on the average queue time,
+as described in [this blog post](https://developer.nvidia.com/blog/deploying-nvidia-triton-at-scale-with-mig-and-kubernetes/#:~:text=Query%20NVIDIA%20Triton%20metrics%20using%20Prometheus). To customize this,
+you may replace or add to the list of custom rules in `values.yaml`. If you change
+the custom metric, be sure to also change the values in `autoscaling.metrics`.
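+
+For example, a hedged sketch of raising the scaling target at install time; the value is the average queue time per request in microseconds, matching the `avg_time_queue_us` metric defined in `values.yaml` (100 is only an illustration):
+
+```
+# Scale out only when the average queue time per request exceeds 100 microseconds.
+helm install example ./deploy/k8s-onprem \
+    --set 'autoscaling.metrics[0].pods.target.averageValue=100'
+```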
+
+If autoscaling is disabled, the number of Triton server pods is fixed at the
+`autoscaling.minReplicas` value in `values.yaml`.
+
+## Enable Load Balancing
+To enable load balancing, ensure that the loadBalancing tag in `values.yaml`
+is set to `true`. This will do two things:
+
+1. Deploy a Traefik reverse proxy through the [Traefik Helm Chart](https://github.com/traefik/traefik-helm-chart).
+
+2. Configure two Traefik [IngressRoutes](https://doc.traefik.io/traefik/providers/kubernetes-crd/),
+one for http and one for grpc. This will allow the Traefik service to expose two
+ports that will be forwarded to and balanced across the Triton pods.
+
+To choose the port numbers exposed, or to disable either http or grpc, edit the
+configured variables in `values.yaml`.
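+
+For example, assuming the default Traefik settings shipped in this chart's `values.yaml`, the ports can also be overridden at install time instead of editing the file:
+
+```
+# Expose HTTP on port 9000 and disable the gRPC entry point.
+helm install example ./deploy/k8s-onprem \
+    --set traefik.ports.triton-http.exposedPort=9000 \
+    --set traefik.ports.triton-grpc.expose=false
+```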
+
+## Deploy the Inference Server
+
+Deploy the inference server, autoscaler, and load balancer using the default
+configuration with the following commands.
+
+Here, and in the following commands, we use the name `example` for our chart.
+This name will be added to the beginning of all resources created during the Helm
+installation.
+
+```
+$ cd deploy/k8s-onprem
+$ helm install example .
+```
+
+Use kubectl to see status and wait until the inference server pods are
+running.
+
+```
+$ kubectl get pods
+NAME READY STATUS RESTARTS AGE
+example-triton-inference-server-5f74b55885-n6lt7 1/1 Running 0 2m21s
+```
+
+There are several ways of overriding the default configuration as
+described in this [Helm
+documentation](https://helm.sh/docs/using_helm/#customizing-the-chart-before-installing).
+
+You can edit the values.yaml file directly or you can use the *--set*
+option to override a single parameter with the CLI. For example, to
+deploy a cluster with a minimum of two inference servers, use *--set* to
+set the `autoscaling.minReplicas` parameter.
+
+```
+$ helm install example --set autoscaling.minReplicas=2 .
+```
+
+You can also write your own `config.yaml` file with the values you
+want to override and pass it to Helm. If you specify a `config.yaml` file, the
+values set there will override those in `values.yaml`.
+
+```
+$ cat << EOF > config.yaml
+namespace: MyCustomNamespace
+image:
+ imageName: nvcr.io/nvidia/tritonserver:custom-tag
+ modelRepositoryPath: gs://my_model_repository
+EOF
+$ helm install example -f config.yaml .
+```
+
+## Probe Configuration
+
+The file `templates/deployment.yaml` contains the configuration of the `livenessProbe`, `readinessProbe`, and `startupProbe` for the Triton server container.
+By default, Triton loads all models before starting the HTTP server that responds to the probes. This can take several minutes, depending on the sizes of the models.
+If loading does not complete within `startupProbe.failureThreshold * startupProbe.periodSeconds` seconds, Kubernetes treats the pod as failed and restarts it,
+which can result in an endless loop of restarting pods, so make sure these values are large enough for your use case.
+The liveness and readiness probes are only sent after the first success of the startup probe.
+
+For more details, see the [Kubernetes probe documentation](https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/) and the [feature page of the startup probe](https://github.com/kubernetes/enhancements/blob/master/keps/sig-node/950-liveness-probe-holdoff/README.md).
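+
+For example, with the defaults in `templates/deployment.yaml` (`periodSeconds: 10`, `failureThreshold: 30`), Triton has 10 * 30 = 300 seconds to finish loading models. If pods keep restarting during startup, the probe events can help confirm the cause (the pod name below is a placeholder):
+
+```
+# Look for "Startup probe failed" events on a restarting Triton pod.
+kubectl describe pod <triton-pod-name> | grep -i -A 2 probe
+kubectl get events --sort-by=.lastTimestamp | grep -i probe
+```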
+
+## Using Triton Inference Server
+
+Now that the inference server is running, you can send HTTP or gRPC
+requests to it to perform inferencing. By default, this chart deploys [Traefik](https://traefik.io/)
+and uses [IngressRoutes](https://doc.traefik.io/traefik/providers/kubernetes-crd/)
+to balance requests across all available nodes.
+
+To send requests through the Traefik proxy, use the Cluster IP of the
+traefik service deployed by the Helm chart. In this case, it is 10.111.128.124.
+
+```
+$ kubectl get services
+NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
+...
+example-traefik LoadBalancer 10.111.128.124 8001:31752/TCP,8000:31941/TCP,80:30692/TCP,443:30303/TCP 74m
+example-triton-inference-server ClusterIP None 8000/TCP,8001/TCP,8002/TCP 74m
+```
+
+Use the following command to refer to the Cluster IP:
+```
+cluster_ip=`kubectl get svc -l app.kubernetes.io/name=traefik -o=jsonpath='{.items[0].spec.clusterIP}'`
+```
+
+
+The Traefik reverse proxy exposes an HTTP endpoint on port 8000, a gRPC
+endpoint on port 8001, and a Prometheus metrics endpoint on
+port 8002. You can use curl to get the metadata of the inference server
+from the HTTP endpoint.
+
+```
+$ curl $cluster_ip:8000/v2
+```
+
+Follow the [QuickStart](../../docs/getting_started/quickstart.md) to get the example
+image classification client that can be used to perform inferencing
+using image classification models on the inference
+server. For example,
+
+```
+$ image_client -u $cluster_ip:8000 -m inception_graphdef -s INCEPTION -c3 mug.jpg
+Request 0, batch size 1
+Image 'images/mug.jpg':
+ 504 (COFFEE MUG) = 0.723992
+ 968 (CUP) = 0.270953
+ 967 (ESPRESSO) = 0.00115997
+```
+
+## Testing Load Balancing and Autoscaling
+After you have confirmed that your Triton cluster is operational and can perform inference,
+you can test the load balancing and autoscaling features by sending a heavy load of requests.
+One option for doing this is using the
+[perf_analyzer](https://github.com/triton-inference-server/client/blob/main/src/c++/perf_analyzer/README.md)
+application.
+
+You can apply a progressively increasing load with a command like:
+```
+perf_analyzer -m simple -u $cluster_ip:8000 --concurrency-range 1:10
+```
+
+From your Grafana dashboard, you should be able to see the number of pods increase
+as the load increases, with requests being routed evenly to the new pods.
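+
+You can also watch the autoscaler react from the command line; the HPA created by this chart is named `triton-hpa`, and with the default chart name the pods carry the `app=triton-inference-server` label:
+
+```
+# Watch replica counts and pod churn while perf_analyzer drives the load.
+kubectl get hpa triton-hpa --watch
+kubectl get pods -l app=triton-inference-server --watch
+```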
+
+## Cleanup
+
+After you have finished using the inference server, you should use Helm to
+delete the deployment.
+
+```
+$ helm list
+NAME REVISION UPDATED STATUS CHART APP VERSION NAMESPACE
+example 1 Wed Feb 27 22:16:55 2019 DEPLOYED triton-inference-server-1.0.0 1.0 default
+example-metrics 1 Tue Jan 21 12:24:07 2020 DEPLOYED prometheus-operator-6.18.0 0.32.0 default
+
+$ helm uninstall example
+$ helm uninstall example-metrics
+```
+
+For the Prometheus and Grafana services, you should [explicitly delete
+CRDs](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack#uninstall-helm-chart):
+
+```
+$ kubectl delete crd alertmanagerconfigs.monitoring.coreos.com alertmanagers.monitoring.coreos.com podmonitors.monitoring.coreos.com probes.monitoring.coreos.com prometheuses.monitoring.coreos.com prometheusrules.monitoring.coreos.com servicemonitors.monitoring.coreos.com thanosrulers.monitoring.coreos.com
+```
diff --git a/deploy/k8s-onprem/dashboard.json b/deploy/k8s-onprem/dashboard.json
new file mode 100644
index 0000000000..9c99a2751c
--- /dev/null
+++ b/deploy/k8s-onprem/dashboard.json
@@ -0,0 +1,1172 @@
+{
+ "__inputs": [
+ {
+ "name": "DS_PROMETHEUS",
+ "label": "Prometheus",
+ "description": "",
+ "type": "datasource",
+ "pluginId": "prometheus",
+ "pluginName": "Prometheus"
+ }
+ ],
+ "__elements": {},
+ "__requires": [
+ {
+ "type": "panel",
+ "id": "gauge",
+ "name": "Gauge",
+ "version": ""
+ },
+ {
+ "type": "grafana",
+ "id": "grafana",
+ "name": "Grafana",
+ "version": "10.0.1"
+ },
+ {
+ "type": "datasource",
+ "id": "prometheus",
+ "name": "Prometheus",
+ "version": "1.0.0"
+ },
+ {
+ "type": "panel",
+ "id": "stat",
+ "name": "Stat",
+ "version": ""
+ },
+ {
+ "type": "panel",
+ "id": "timeseries",
+ "name": "Time series",
+ "version": ""
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": {
+ "type": "datasource",
+ "uid": "grafana"
+ },
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "target": {
+ "limit": 100,
+ "matchAny": false,
+ "tags": [],
+ "type": "dashboard"
+ },
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "fiscalYearStartMonth": 0,
+ "graphTooltip": 0,
+ "id": null,
+ "links": [],
+ "liveNow": false,
+ "panels": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 8,
+ "x": 0,
+ "y": 0
+ },
+ "id": 9,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "lastNotNull"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "text": {},
+ "textMode": "auto"
+ },
+ "pluginVersion": "10.0.1",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "exemplar": true,
+ "expr": "count(count(nv_inference_count) by (instance))",
+ "interval": "",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "title": "Active Triton Instances",
+ "type": "stat"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 50,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineStyle": {
+ "fill": "solid"
+ },
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "auto",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "percent"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "mappings": [],
+ "max": 1,
+ "min": 0,
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ }
+ },
+ "overrides": [
+ {
+ "__systemRef": "hideSeriesFrom",
+ "matcher": {
+ "id": "byNames",
+ "options": {
+ "mode": "exclude",
+ "names": [
+ "example-triton-inference-server-6784d84f5d-v9scn"
+ ],
+ "prefix": "All except:",
+ "readOnly": true
+ }
+ },
+ "properties": [
+ {
+ "id": "custom.hideFrom",
+ "value": {
+ "legend": false,
+ "tooltip": false,
+ "viz": true
+ }
+ }
+ ]
+ }
+ ]
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 16,
+ "x": 8,
+ "y": 0
+ },
+ "id": 11,
+ "interval": "15s",
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "exemplar": true,
+ "expr": "sum by (pod) (rate(nv_inference_count[1m])) / ignoring(pod) group_left sum (rate(nv_inference_count[1m]))",
+ "instant": false,
+ "interval": "",
+ "legendFormat": "{{pod}}",
+ "refId": "A"
+ }
+ ],
+ "title": "Proportion of Requests by Pod",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "description": "",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": true,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "links": [],
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "short"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 0,
+ "y": 8
+ },
+ "id": 2,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "pluginVersion": "8.2.3",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "exemplar": true,
+ "expr": "sum(nv_inference_request_success) by (pod)",
+ "interval": "",
+ "legendFormat": "Success {{pod}}",
+ "refId": "A"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "exemplar": true,
+ "expr": "sum(nv_inference_request_failure) by (pod)",
+ "interval": "",
+ "legendFormat": "Failure {{pod}}",
+ "refId": "B"
+ }
+ ],
+ "title": "Cumulative Inference Requests by Pod",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "Compute Time (ms)",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": true,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "links": [],
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "ms"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 17,
+ "w": 12,
+ "x": 12,
+ "y": 8
+ },
+ "id": 5,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "pluginVersion": "8.2.3",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "exemplar": true,
+ "expr": "sum(rate(nv_inference_compute_infer_duration_us[30s])) by (model) / 1000",
+ "interval": "",
+ "legendFormat": "{{model}}",
+ "refId": "A"
+ }
+ ],
+ "title": "Compute Time by Model (milliseconds)",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "Queue Time (ms)",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 1,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": true,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "links": [],
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "µs"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 17
+ },
+ "id": 4,
+ "options": {
+ "legend": {
+ "calcs": [],
+ "displayMode": "list",
+ "placement": "bottom",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "single",
+ "sort": "none"
+ }
+ },
+ "pluginVersion": "8.2.3",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "exemplar": true,
+ "expr": "avg(rate(nv_inference_queue_duration_us[30s])/(1+rate(nv_inference_request_success[30s]))) by (pod)",
+ "interval": "",
+ "legendFormat": "{{instance}}",
+ "refId": "A"
+ }
+ ],
+ "title": "Average Queue Time by Pod (microseconds)",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 2,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "links": [],
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "watt"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 18,
+ "x": 0,
+ "y": 25
+ },
+ "id": 10,
+ "options": {
+ "legend": {
+ "calcs": [
+ "mean",
+ "lastNotNull",
+ "max"
+ ],
+ "displayMode": "table",
+ "placement": "right",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "none"
+ }
+ },
+ "pluginVersion": "10.0.1",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr": "nv_gpu_power_usage",
+ "interval": "",
+ "legendFormat": "GPU {{ gpu_uuid }}",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "GPU Power Usage",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "max": 2400,
+ "min": 0,
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "#EAB839",
+ "value": 1800
+ },
+ {
+ "color": "red",
+ "value": 2200
+ }
+ ]
+ },
+ "unit": "watt"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 6,
+ "x": 18,
+ "y": 25
+ },
+ "id": 16,
+ "links": [],
+ "options": {
+ "orientation": "horizontal",
+ "reduceOptions": {
+ "calcs": [
+ "sum"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showThresholdLabels": false,
+ "showThresholdMarkers": true
+ },
+ "pluginVersion": "10.0.1",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr": "sum(nv_gpu_power_usage)",
+ "interval": "",
+ "legendFormat": "",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "GPU Power Total",
+ "type": "gauge"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 2,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "links": [],
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "bytes"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 33
+ },
+ "id": 18,
+ "options": {
+ "legend": {
+ "calcs": [
+ "mean",
+ "max"
+ ],
+ "displayMode": "list",
+ "placement": "right",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "none"
+ }
+ },
+ "pluginVersion": "10.0.1",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "exemplar": false,
+ "expr": "nv_gpu_memory_used_bytes",
+ "interval": "",
+ "legendFormat": "GPU {{gpu_uuid}}",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "GPU Framebuffer Mem Used",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 2,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "links": [],
+ "mappings": [],
+ "max": 100,
+ "min": 0,
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "percent"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 33
+ },
+ "id": 6,
+ "options": {
+ "legend": {
+ "calcs": [
+ "mean",
+ "lastNotNull",
+ "max"
+ ],
+ "displayMode": "table",
+ "placement": "right",
+ "showLegend": true,
+ "sortBy": "Max",
+ "sortDesc": true
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "none"
+ }
+ },
+ "pluginVersion": "10.0.1",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr": "nv_gpu_utilization * 100",
+ "interval": "",
+ "legendFormat": "GPU {{gpu_uuid}}",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "GPU Utilization",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 2,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "links": [],
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "bytes"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 41
+ },
+ "id": 19,
+ "options": {
+ "legend": {
+ "calcs": [
+ "mean",
+ "max"
+ ],
+ "displayMode": "list",
+ "placement": "right",
+ "showLegend": true
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "none"
+ }
+ },
+ "pluginVersion": "10.0.1",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr": "nv_cpu_memory_used_bytes",
+ "hide": false,
+ "instant": false,
+ "legendFormat": "Memory",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "Memory Used",
+ "type": "timeseries"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "palette-classic"
+ },
+ "custom": {
+ "axisCenteredZero": false,
+ "axisColorMode": "text",
+ "axisLabel": "",
+ "axisPlacement": "auto",
+ "barAlignment": 0,
+ "drawStyle": "line",
+ "fillOpacity": 10,
+ "gradientMode": "none",
+ "hideFrom": {
+ "legend": false,
+ "tooltip": false,
+ "viz": false
+ },
+ "lineInterpolation": "linear",
+ "lineWidth": 2,
+ "pointSize": 5,
+ "scaleDistribution": {
+ "type": "linear"
+ },
+ "showPoints": "never",
+ "spanNulls": false,
+ "stacking": {
+ "group": "A",
+ "mode": "none"
+ },
+ "thresholdsStyle": {
+ "mode": "off"
+ }
+ },
+ "links": [],
+ "mappings": [],
+ "max": 100,
+ "min": 0,
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "percent"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 41
+ },
+ "id": 20,
+ "options": {
+ "legend": {
+ "calcs": [
+ "mean",
+ "lastNotNull",
+ "max"
+ ],
+ "displayMode": "table",
+ "placement": "right",
+ "showLegend": true,
+ "sortBy": "Max",
+ "sortDesc": true
+ },
+ "tooltip": {
+ "mode": "multi",
+ "sort": "none"
+ }
+ },
+ "pluginVersion": "10.0.1",
+ "targets": [
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "editorMode": "code",
+ "expr": "nv_cpu_utilization * 100",
+ "interval": "",
+ "legendFormat": "CPU",
+ "range": true,
+ "refId": "A"
+ }
+ ],
+ "title": "CPU Utilization",
+ "type": "timeseries"
+ }
+ ],
+ "refresh": "5s",
+ "schemaVersion": 38,
+ "style": "dark",
+ "tags": [],
+ "templating": {
+ "list": []
+ },
+ "time": {
+ "from": "now-15m",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ]
+ },
+ "timezone": "",
+ "title": "Triton Inference Server",
+ "uid": "slEY4dsZk",
+ "version": 5,
+ "weekStart": ""
+}
\ No newline at end of file
diff --git a/deploy/k8s-onprem/templates/_helpers.tpl b/deploy/k8s-onprem/templates/_helpers.tpl
new file mode 100644
index 0000000000..a65331e0f0
--- /dev/null
+++ b/deploy/k8s-onprem/templates/_helpers.tpl
@@ -0,0 +1,111 @@
+{{/*
+# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/}}
+
+# Defines a set of helper functions that produce templated values for other files,
+# mostly names and labels. This file does not produce any
+# Kubernetes resources by itself.
+
+{{/* vim: set filetype=mustache: */}}
+{{/*
+Create inference server name.
+*/}}
+{{- define "triton-inference-server.name" -}}
+{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+
+{{/*
+Create a default fully qualified app name.
+We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
+If release name contains chart name it will be used as a full name.
+*/}}
+{{- define "triton-inference-server.fullname" -}}
+{{- if .Values.fullnameOverride -}}
+{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
+{{- else -}}
+{{- $name := default .Chart.Name .Values.nameOverride -}}
+{{- if contains $name .Release.Name -}}
+{{- .Release.Name | trunc 63 | trimSuffix "-" -}}
+{{- else -}}
+{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+{{- end -}}
+{{- end -}}
+
+{{/*
+ Create inference server metrics service name and fullname derived from above and
+ truncated appropriately.
+*/}}
+{{- define "triton-inference-server-metrics.name" -}}
+{{- $basename := include "triton-inference-server.name" . -}}
+{{- $basename_trimmed := $basename | trunc 55 | trimSuffix "-" -}}
+{{- printf "%s-%s" $basename_trimmed "metrics" -}}
+{{- end -}}
+
+{{- define "triton-inference-server-metrics.fullname" -}}
+{{- $basename := include "triton-inference-server.fullname" . -}}
+{{- $basename_trimmed := $basename | trunc 55 | trimSuffix "-" -}}
+{{- printf "%s-%s" $basename_trimmed "metrics" -}}
+{{- end -}}
+
+{{/*
+ Create inference server metrics monitor name and fullname derived from
+ above and truncated appropriately.
+*/}}
+{{- define "triton-inference-server-metrics-monitor.name" -}}
+{{- $basename := include "triton-inference-server.name" . -}}
+{{- $basename_trimmed := $basename | trunc 47 | trimSuffix "-" -}}
+{{- printf "%s-%s" $basename_trimmed "metrics-monitor" -}}
+{{- end -}}
+
+{{- define "triton-inference-server-metrics-monitor.fullname" -}}
+{{- $basename := include "triton-inference-server.fullname" . -}}
+{{- $basename_trimmed := $basename | trunc 47 | trimSuffix "-" -}}
+{{- printf "%s-%s" $basename_trimmed "metrics-monitor" -}}
+{{- end -}}
+
+{{/*
+ Create ingressroute names derived from above and truncated appropriately
+*/}}
+{{- define "triton-inference-server-ingressroute-http.name" -}}
+{{- $basename := include "triton-inference-server.name" . -}}
+{{- $basename_trimmed := $basename | trunc 50 | trimSuffix "-" -}}
+{{- printf "%s-%s" $basename_trimmed "ingress-http" -}}
+{{- end -}}
+
+{{- define "triton-inference-server-ingressroute-grpc.name" -}}
+{{- $basename := include "triton-inference-server.name" . -}}
+{{- $basename_trimmed := $basename | trunc 50 | trimSuffix "-" -}}
+{{- printf "%s-%s" $basename_trimmed "ingress-grpc" -}}
+{{- end -}}
+
+{{/*
+Create chart name and version as used by the chart label.
+*/}}
+{{- define "triton-inference-server.chart" -}}
+{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
diff --git a/deploy/k8s-onprem/templates/deployment.yaml b/deploy/k8s-onprem/templates/deployment.yaml
new file mode 100644
index 0000000000..8c3a19d136
--- /dev/null
+++ b/deploy/k8s-onprem/templates/deployment.yaml
@@ -0,0 +1,111 @@
+# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# Creates a deployment for the Triton Inference Server pods
+# Each pod contains a Triton container and an nfs mount as specified in
+# values.yaml for the model repository
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: {{ template "triton-inference-server.fullname" . }}
+ namespace: {{ .Release.Namespace }}
+ labels:
+ app: {{ template "triton-inference-server.name" . }}
+ chart: {{ template "triton-inference-server.chart" . }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+spec:
+ replicas: {{ .Values.autoscaling.minReplicas }}
+ selector:
+ matchLabels:
+ app: {{ template "triton-inference-server.name" . }}
+ release: {{ .Release.Name }}
+ template:
+ metadata:
+ labels:
+ app: {{ template "triton-inference-server.name" . }}
+ release: {{ .Release.Name }}
+
+ spec:
+ volumes:
+ - name: models
+ nfs:
+ server: {{ .Values.image.modelRepositoryServer }}
+ path: {{ .Values.image.modelRepositoryPath }}
+ readOnly: false
+ containers:
+ - name: {{ .Chart.Name }}
+ image: "{{ .Values.image.imageName }}"
+ imagePullPolicy: {{ .Values.image.pullPolicy }}
+ volumeMounts:
+ - mountPath: /models
+ name: models
+
+ resources:
+ limits:
+ nvidia.com/gpu: {{ .Values.image.numGpus }}
+
+ args:
+ - tritonserver
+ {{- range .Values.serverArgs }}
+ - {{ . }}
+ {{- end }}
+
+ ports:
+ - containerPort: 8000
+ name: http
+ - containerPort: 8001
+ name: grpc
+ - containerPort: 8002
+ name: metrics
+ livenessProbe:
+ initialDelaySeconds: 15
+ failureThreshold: 3
+ periodSeconds: 10
+ httpGet:
+ path: /v2/health/live
+ port: http
+ readinessProbe:
+ initialDelaySeconds: 5
+ periodSeconds: 5
+ failureThreshold: 3
+ httpGet:
+ path: /v2/health/ready
+ port: http
+ startupProbe:
+ # allows Triton to load the models during 30*10 = 300 sec = 5 min
+ # starts checking the other probes only after the success of this one
+ # for details, see https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/#define-startup-probes
+ periodSeconds: 10
+ failureThreshold: 30
+ httpGet:
+ path: /v2/health/ready
+ port: http
+
+ securityContext:
+ runAsUser: 1000
+ fsGroup: 1000
diff --git a/deploy/k8s-onprem/templates/hpa.yaml b/deploy/k8s-onprem/templates/hpa.yaml
new file mode 100644
index 0000000000..4a4afa48d9
--- /dev/null
+++ b/deploy/k8s-onprem/templates/hpa.yaml
@@ -0,0 +1,52 @@
+# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# Creates the horizontal pod autoscaler for the Triton pod deployment.
+# In order to use custom metrics (i.e., metrics other than CPU usage) with this
+# autoscaler, you must have enabled installation of the prometheus adapter.
+# This autoscaler (and the prometheus adapter) will only be installed if the
+# autoscaling tag is set to true.
+
+{{- if .Values.tags.autoscaling }}
+apiVersion: autoscaling/v2beta2
+kind: HorizontalPodAutoscaler
+metadata:
+ name: triton-hpa
+ namespace: {{ .Release.Namespace }}
+ labels:
+ app: {{ template "triton-inference-server.name" . }}
+ chart: {{ template "triton-inference-server.chart" . }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+spec:
+ scaleTargetRef:
+ apiVersion: apps/v1
+ kind: Deployment
+ name: {{ template "triton-inference-server.fullname" . }}
+ minReplicas: {{ .Values.autoscaling.minReplicas }}
+ maxReplicas: {{ .Values.autoscaling.maxReplicas }}
+ metrics: {{ toYaml .Values.autoscaling.metrics | nindent 2}}
+{{- end -}}
diff --git a/deploy/k8s-onprem/templates/ingressroute.yaml b/deploy/k8s-onprem/templates/ingressroute.yaml
new file mode 100644
index 0000000000..ee1cbee76f
--- /dev/null
+++ b/deploy/k8s-onprem/templates/ingressroute.yaml
@@ -0,0 +1,69 @@
+# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# Creates the traefik IngressRoutes that allow for external access to the
+# triton service. Two routes are created, one for gRPC and one for HTTP.
+# Requires deployment of the traefik IngressRoute CRD, along with various roles
+# and permissions, most easily accomplished through the referenced traefik
+# helm chart. Will only be installed if the loadBalancing tag is set to true.
+
+{{- if .Values.tags.loadBalancing }}
+apiVersion: traefik.containo.us/v1alpha1
+kind: IngressRoute
+metadata:
+ name: {{ template "triton-inference-server-ingressroute-http.name" . }}
+ namespace: {{ .Release.Namespace }}
+ labels:
+ app: {{ template "triton-inference-server.name" . }}
+ chart: {{ template "triton-inference-server.chart" . }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+spec:
+ entryPoints:
+ - triton-http
+ routes:
+ - match: PathPrefix(`/`)
+ kind: Rule
+ services:
+ - name: {{ template "triton-inference-server.fullname" . }}
+ port: 8000
+---
+apiVersion: traefik.containo.us/v1alpha1
+kind: IngressRoute
+metadata:
+ name: {{ template "triton-inference-server-ingressroute-grpc.name" . }}
+ namespace: {{ .Release.Namespace }}
+spec:
+ entryPoints:
+ - triton-grpc
+ routes:
+ - match: PathPrefix(`/`)
+ kind: Rule
+ services:
+ - name: {{ template "triton-inference-server.fullname" . }}
+ port: 8001
+ scheme: h2c
+{{- end -}}
diff --git a/deploy/k8s-onprem/templates/service.yaml b/deploy/k8s-onprem/templates/service.yaml
new file mode 100644
index 0000000000..6d5bf2cb00
--- /dev/null
+++ b/deploy/k8s-onprem/templates/service.yaml
@@ -0,0 +1,94 @@
+# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# Defines the services for triton and the triton metrics service.
+# Also creates a ServiceMonitor for the triton metrics service.
+
+apiVersion: v1
+kind: Service
+metadata:
+ name: {{ template "triton-inference-server.fullname" . }}
+ namespace: {{ .Release.Namespace }}
+ labels:
+ app: {{ template "triton-inference-server.name" . }}
+ chart: {{ template "triton-inference-server.chart" . }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+spec:
+ clusterIP: None
+ ports:
+ - port: 8000
+ targetPort: http
+ name: http-inference-server
+ - port: 8001
+ targetPort: grpc
+ name: grpc-inference-server
+ - port: 8002
+ targetPort: metrics
+ name: metrics-inference-server
+ selector:
+ app: {{ template "triton-inference-server.name" . }}
+ release: {{ .Release.Name }}
+---
+apiVersion: v1
+kind: Service
+metadata:
+ name: {{ template "triton-inference-server-metrics.fullname" . }}
+ namespace: {{ .Release.Namespace }}
+ labels:
+ app: {{ template "triton-inference-server-metrics.name" . }}
+ chart: {{ template "triton-inference-server.chart" . }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+ annotations:
+ alpha.monitoring.coreos.com/non-namespaced: "true"
+spec:
+ ports:
+ - name: metrics
+ port: 8080
+ targetPort: metrics
+ protocol: TCP
+ selector:
+ app: {{ template "triton-inference-server.name" . }}
+ release: {{ .Release.Name }}
+---
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+ name: {{ template "triton-inference-server-metrics-monitor.fullname" . }}
+ namespace: {{ .Release.Namespace }}
+ labels:
+ app: {{ template "triton-inference-server-metrics-monitor.name" . }}
+ chart: {{ template "triton-inference-server.chart" . }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+spec:
+ selector:
+ matchLabels:
+ app: {{ template "triton-inference-server-metrics.name" . }}
+ endpoints:
+ - port: metrics
+ interval: 15s
diff --git a/deploy/k8s-onprem/values.yaml b/deploy/k8s-onprem/values.yaml
new file mode 100644
index 0000000000..6bdf2e3cde
--- /dev/null
+++ b/deploy/k8s-onprem/values.yaml
@@ -0,0 +1,83 @@
+# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+tags:
+ autoscaling: true
+ loadBalancing: true
+
+image:
+ imageName: nvcr.io/nvidia/tritonserver:24.03-py3
+ pullPolicy: IfNotPresent
+ modelRepositoryServer: < Replace with the IP Address of your file server >
+ modelRepositoryPath: /srv/models
+ numGpus: 1
+
+# Add server args here, e.g. --grpc-use-ssl, --grpc-server-cert, --repository-poll-secs, etc.
+serverArgs:
+ - '--model-repository=/models'
+
+traefik:
+ ports:
+ triton-http:
+ port: 18000
+ exposedPort: 8000
+ expose: true
+ protocol: TCP
+ triton-grpc:
+ port: 18001
+ exposedPort: 8001
+ expose: true
+ protocol: TCP
+
+autoscaling:
+ minReplicas: 1
+ maxReplicas: 3
+ metrics:
+ - type: Pods
+ pods:
+ metric:
+ name: avg_time_queue_us
+ target:
+ type: AverageValue
+ averageValue: 50
+
+prometheus-adapter:
+ prometheus:
+ url: http://example-metrics-kube-prome-prometheus.default.svc.cluster.local
+ port: 9090
+ rules:
+ custom:
+ - seriesQuery: 'nv_inference_queue_duration_us{namespace="default",pod!=""}'
+ resources:
+ overrides:
+ namespace:
+ resource: "namespace"
+ pod:
+ resource: "pod"
+ name:
+ matches: "nv_inference_queue_duration_us"
+ as: "avg_time_queue_us"
+ metricsQuery: 'avg(delta(nv_inference_queue_duration_us{<<.LabelMatchers>>}[30s])/(1+delta(nv_inference_request_success{<<.LabelMatchers>>}[30s]))) by (<<.GroupBy>>)'
\ No newline at end of file
diff --git a/deploy/mlflow-triton-plugin/README.md b/deploy/mlflow-triton-plugin/README.md
new file mode 100644
index 0000000000..c011194299
--- /dev/null
+++ b/deploy/mlflow-triton-plugin/README.md
@@ -0,0 +1,255 @@
+
+# MLflow Triton
+
+MLflow plugin for deploying your models from MLflow to Triton Inference Server.
+Scripts are included for publishing models that are already in the Triton-recognized
+structure to your MLflow Model Registry.
+
+### Supported flavors
+
+The MLflow Triton plugin currently supports the following flavors; you may
+substitute the flavor specification in the examples below according to the model
+to be deployed.
+
+* onnx
+* triton
+
+## Requirements
+
+* MLflow
+* Triton Python HTTP client
+* Triton Inference Server
+
+## Installation
+
+The plugin can be installed from source using the following command:
+
+```
+python setup.py install
+```
+
+## Quick Start
+
+In this documentation, we will use the files in `examples` to showcase how
+the plugin interacts with Triton Inference Server. The `onnx_float32_int32_int32`
+model in `examples` is a simple model that takes two float32 inputs, INPUT0 and
+INPUT1, with shape [-1, 16], and produces two int32 outputs, OUTPUT0 and
+OUTPUT1, where OUTPUT0 is the element-wise summation of INPUT0 and INPUT1 and
+OUTPUT1 is the element-wise subtraction of INPUT0 and INPUT1.
+
+### Start Triton Inference Server in EXPLICIT mode
+
+The MLflow Triton plugin requires a running Triton server; see the Triton
+Inference Server
+[documentation](https://github.com/triton-inference-server/server/blob/main/docs/getting_started/quickstart.md)
+for how to start the server. Note that the server should be run in EXPLICIT
+mode (`--model-control-mode=explicit`) to use the deployment features of the
+plugin.
+
+Once the server has started, the following environment variables must be set
+so that the plugin can interact with the server properly (a minimal example
+follows the list):
+* `TRITON_URL`: The address of the Triton HTTP endpoint
+* `TRITON_MODEL_REPO`: The path to the Triton model repository. It can be an s3
+URI, but keep in mind that the env vars AWS_ACCESS_KEY_ID and
+AWS_SECRET_ACCESS_KEY are then needed.
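+
+For example, from Python (the values below are illustrative, not required
+defaults):
+
+```
+import os
+
+# Illustrative values -- point these at your own server and model repository.
+os.environ["TRITON_URL"] = "localhost:8000"
+os.environ["TRITON_MODEL_REPO"] = "/opt/triton/model_repository"
+```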
+
+### Publish models to MLflow
+
+#### ONNX flavor
+
+The MLFlow ONNX built-in functionality can be used to publish `onnx` flavor
+models to MLFlow directly, and the MLFlow Triton plugin will prepare the model
+in the format expected by Triton. You may also log
+[`config.pbtxt`](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_model_configuration.md)
+as an additional artifact which Triton will use to serve the model. Otherwise,
+the server should be run with the auto-complete feature enabled
+(`--strict-model-config=false`) to generate the model configuration.
+
+```
+import mlflow.onnx
+import onnx
+model = onnx.load("examples/onnx_float32_int32_int32/1/model.onnx")
+mlflow.onnx.log_model(model, "triton", registered_model_name="onnx_float32_int32_int32")
+```
+
+#### Triton flavor
+
+For other model frameworks that Triton supports but that are not yet recognized
+by the MLFlow Triton plugin, the `publish_model_to_mlflow.py` script can be used
+to publish `triton` flavor models to MLflow. A `triton` flavor model is a directory
+containing the model files following the
+[model layout](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_repository.md#repository-layout).
+Below is an example usage:
+
+```
+cd /scripts
+
+python publish_model_to_mlflow.py --model_name onnx_float32_int32_int32 --model_directory /onnx_float32_int32_int32 --flavor triton
+```
+
+### Deploy models tracked in MLflow to Triton
+
+Once a model is published and tracked in MLflow, it can be deployed to Triton
+via MLflow's deployments command. The following command will download the model
+to Triton's model repository and request Triton to load the model.
+
+```
+mlflow deployments create -t triton --flavor triton --name onnx_float32_int32_int32 -m models:/onnx_float32_int32_int32/1
+```
+
+### Perform inference
+
+After the model is deployed, the following command shows the CLI usage for
+sending an inference request to a deployment.
+
+```
+mlflow deployments predict -t triton --name onnx_float32_int32_int32 --input-path /input.json --output-path output.json
+```
+
+The inference result will be written to `output.json`; you may compare it
+with the results in `expected_output.json`.
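+
+The same request can be sent through the Python API. The sketch below is
+illustrative and assumes the `onnx_float32_int32_int32` example model is
+already deployed; inputs are passed as a dictionary of NumPy arrays keyed by
+input name:
+
+```
+import numpy as np
+from mlflow.deployments import get_deploy_client
+
+client = get_deploy_client('triton')
+
+# Two float32 inputs of shape [1, 16], as required by the example model.
+inputs = {
+    "INPUT0": np.arange(1, 17, dtype=np.float32).reshape(1, 16),
+    "INPUT1": np.arange(1, 17, dtype=np.float32).reshape(1, 16),
+}
+
+result = client.predict("onnx_float32_int32_int32", inputs)
+print(result["outputs"]["OUTPUT0"])  # element-wise sum of INPUT0 and INPUT1
+```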
+
+## MLflow Deployments
+
+"MLflow Deployments" is a set of MLflow APIs for deploying MLflow models to
+custom serving tools. The MLflow Triton plugin implements the following
+deployment functions to support interacting with the Triton server from MLflow.
+
+### Create Deployment
+
+The MLflow deployments create API deploys a model to the Triton target, which will
+download the model to Triton's model repository and request Triton to load the
+model.
+
+To create an MLflow deployment using the CLI:
+
+```
+mlflow deployments create -t triton --flavor triton --name model_name -m models:/model_name/1
+```
+
+To create an MLflow deployment using the Python API:
+
+```
+from mlflow.deployments import get_deploy_client
+client = get_deploy_client('triton')
+client.create_deployment("model_name", "models:/model_name/1", flavor="triton")
+```
+
+### Delete Deployment
+
+The MLflow deployments delete API removes an existing deployment from the Triton
+target, which will remove the model in Triton's model repository and request
+Triton to unload the model.
+
+To delete an MLflow deployment using the CLI:
+
+```
+mlflow deployments delete -t triton --name model_name
+```
+
+To delete an MLflow deployment using the Python API:
+
+```
+from mlflow.deployments import get_deploy_client
+client = get_deploy_client('triton')
+client.delete_deployment("model_name")
+```
+
+### Update Deployment
+
+The MLflow deployments update API updates an existing deployment with another model
+(version) tracked in MLflow, which will overwrite the model in Triton's model
+repository and request Triton to reload the model.
+
+To update an MLflow deployment using the CLI:
+
+```
+mlflow deployments update -t triton --flavor triton --name model_name -m models:/model_name/2
+```
+
+To update an MLflow deployment using the Python API:
+
+```
+from mlflow.deployments import get_deploy_client
+client = get_deploy_client('triton')
+client.update_deployment("model_name", "models:/model_name/2", flavor="triton")
+```
+
+### List Deployments
+
+The MLflow deployments list API lists all existing deployments in the Triton target.
+
+To list all MLflow deployments using the CLI:
+
+```
+mlflow deployments list -t triton
+```
+
+To list all MLflow deployments using the Python API:
+
+```
+from mlflow.deployments import get_deploy_client
+client = get_deploy_client('triton')
+client.list_deployments()
+```
+
+### Get Deployment
+
+The MLflow deployments get API returns information regarding a specific
+deployment in the Triton target.
+
+To get a specific MLflow deployment using the CLI:
+```
+mlflow deployments get -t triton --name model_name
+```
+
+To get a specific MLflow deployment using the Python API:
+```
+from mlflow.deployments import get_deploy_client
+client = get_deploy_client('triton')
+client.get_deployment("model_name")
+```
+
+### Run Inference on Deployments
+
+The MLflow deployments predict API runs inference by preparing and sending a
+request to Triton, and returns the Triton response.
+
+To run inference using the CLI:
+
+```
+mlflow deployments predict -t triton --name model_name --input-path input_file --output-path output_file
+```
+
+To run inference using the Python API:
+
+```
+from mlflow.deployments import get_deploy_client
+client = get_deploy_client('triton')
+client.predict("model_name", inputs)
+```
diff --git a/deploy/mlflow-triton-plugin/examples/expected_output.json b/deploy/mlflow-triton-plugin/examples/expected_output.json
new file mode 100644
index 0000000000..320f8f4815
--- /dev/null
+++ b/deploy/mlflow-triton-plugin/examples/expected_output.json
@@ -0,0 +1,6 @@
+{"outputs":
+ {
+ "OUTPUT0": [[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32]],
+ "OUTPUT1": [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
+ }
+}
\ No newline at end of file
diff --git a/deploy/mlflow-triton-plugin/examples/input.json b/deploy/mlflow-triton-plugin/examples/input.json
new file mode 100644
index 0000000000..418396ccf0
--- /dev/null
+++ b/deploy/mlflow-triton-plugin/examples/input.json
@@ -0,0 +1,6 @@
+{"inputs":
+ {
+ "INPUT0": [[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0]],
+ "INPUT1": [[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0]]
+ }
+}
\ No newline at end of file
diff --git a/deploy/mlflow-triton-plugin/examples/onnx_float32_int32_int32/1/model.onnx b/deploy/mlflow-triton-plugin/examples/onnx_float32_int32_int32/1/model.onnx
new file mode 100755
index 0000000000..f12d500597
Binary files /dev/null and b/deploy/mlflow-triton-plugin/examples/onnx_float32_int32_int32/1/model.onnx differ
diff --git a/deploy/mlflow-triton-plugin/examples/onnx_float32_int32_int32/config.pbtxt b/deploy/mlflow-triton-plugin/examples/onnx_float32_int32_int32/config.pbtxt
new file mode 100644
index 0000000000..75ea016cfa
--- /dev/null
+++ b/deploy/mlflow-triton-plugin/examples/onnx_float32_int32_int32/config.pbtxt
@@ -0,0 +1,57 @@
+
+# Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+platform: "onnxruntime_onnx"
+max_batch_size: 8
+version_policy: { latest { num_versions: 1 }}
+input [
+ {
+ name: "INPUT0"
+ data_type: TYPE_FP32
+ dims: [ 16 ]
+ }
+]
+input [
+ {
+ name: "INPUT1"
+ data_type: TYPE_FP32
+ dims: [ 16 ]
+ }
+]
+output [
+ {
+ name: "OUTPUT0"
+ data_type: TYPE_INT32
+ dims: [ 16 ]
+ }
+]
+output [
+ {
+ name: "OUTPUT1"
+ data_type: TYPE_INT32
+ dims: [ 16 ]
+ }
+]
\ No newline at end of file
diff --git a/deploy/mlflow-triton-plugin/mlflow_triton/__init__.py b/deploy/mlflow-triton-plugin/mlflow_triton/__init__.py
new file mode 100755
index 0000000000..0b73b537d4
--- /dev/null
+++ b/deploy/mlflow-triton-plugin/mlflow_triton/__init__.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python3
+
+# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/deploy/mlflow-triton-plugin/mlflow_triton/config.py b/deploy/mlflow-triton-plugin/mlflow_triton/config.py
new file mode 100755
index 0000000000..0a381fd407
--- /dev/null
+++ b/deploy/mlflow-triton-plugin/mlflow_triton/config.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+
+# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import os
+import re
+from collections import namedtuple
+
+from mlflow.exceptions import MlflowException
+
+
+class Config(dict):
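+    """Plugin settings read from environment variables.
+
+    ``TRITON_URL`` and ``TRITON_MODEL_REPO`` are read from the environment.
+    When the model repository is an s3:// URI, a boto3 client plus the bucket
+    and prefix are derived from it and stored alongside the other settings.
+    """
+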
+ def __init__(self):
+ super().__init__()
+ self["triton_url"] = os.environ.get("TRITON_URL")
+ self["triton_model_repo"] = os.environ.get("TRITON_MODEL_REPO")
+
+ if self["triton_model_repo"].startswith("s3://"):
+ self.s3_regex = re.compile(
+ "s3://(http://|https://|)([0-9a-zA-Z\\-.]+):([0-9]+)/"
+ "([0-9a-z.\\-]+)(((/[0-9a-zA-Z.\\-_]+)*)?)"
+ )
+
+ uri = self.parse_path(self["triton_model_repo"])
+ if uri.protocol == "https://":
+ protocol = "https://"
+ else:
+ protocol = "http://"
+ endpoint_url = None
+ if uri.host_name != "" and uri.host_port != "":
+ endpoint_url = "{}{}:{}".format(protocol, uri.host_name, uri.host_port)
+
+ import boto3
+
+ # boto3 handles AWS credentials
+ self["s3"] = boto3.client("s3", endpoint_url=endpoint_url)
+ self["s3_bucket"] = uri.bucket
+ self["s3_prefix"] = uri.prefix
+ self["triton_model_repo"] = "s3://{}".format(
+ os.path.join(uri.bucket, uri.prefix)
+ )
+
+ def parse_path(self, path):
+ # Cleanup extra slashes
+ clean_path = self.clean_path(path)
+
+ # Get the bucket name and the object path. Return error if path is malformed
+ match = self.s3_regex.fullmatch(clean_path)
+ S3URI = namedtuple(
+ "S3URI", ["protocol", "host_name", "host_port", "bucket", "prefix"]
+ )
+ if match:
+ uri = S3URI(*match.group(1, 2, 3, 4, 5))
+ if uri.prefix and uri.prefix[0] == "/":
+ uri = uri._replace(prefix=uri.prefix[1:])
+ else:
+ bucket_start = clean_path.find("s3://") + len("s3://")
+ bucket_end = clean_path.find("/", bucket_start)
+
+ # If there isn't a slash, the address has only the bucket
+ if bucket_end > bucket_start:
+ bucket = clean_path[bucket_start:bucket_end]
+ prefix = clean_path[bucket_end + 1 :]
+ else:
+ bucket = clean_path[bucket_start:]
+ prefix = ""
+ uri = S3URI("", "", "", bucket, prefix)
+
+ if uri.bucket == "":
+ raise MlflowException("No bucket name found in path: " + path)
+
+ return uri
+
+ def clean_path(self, s3_path):
+ # Must handle paths with s3 prefix
+ start = s3_path.find("s3://")
+ path = ""
+ if start != -1:
+ path = s3_path[start + len("s3://") :]
+ clean_path = "s3://"
+ else:
+ path = s3_path
+ clean_path = ""
+
+ # Must handle paths with https:// or http:// prefix
+ https_start = path.find("https://")
+ if https_start != -1:
+ path = path[https_start + len("https://") :]
+ clean_path += "https://"
+ else:
+ http_start = path.find("http://")
+ if http_start != -1:
+ path = path[http_start + len("http://") :]
+ clean_path += "http://"
+
+ # Remove trailing slashes
+ rtrim_length = len(path.rstrip("/"))
+ if rtrim_length == 0:
+ raise MlflowException("Invalid bucket name: '" + path + "'")
+
+ # Remove leading slashes
+ ltrim_length = len(path) - len(path.lstrip("/"))
+ if ltrim_length == len(path):
+ raise MlflowException("Invalid bucket name: '" + path + "'")
+
+ # Remove extra internal slashes
+ true_path = path[ltrim_length : rtrim_length + 1]
+ previous_slash = False
+ for i in range(len(true_path)):
+ if true_path[i] == "/":
+ if not previous_slash:
+ clean_path += true_path[i]
+ previous_slash = True
+ else:
+ clean_path += true_path[i]
+ previous_slash = False
+
+ return clean_path
diff --git a/deploy/mlflow-triton-plugin/mlflow_triton/deployments.py b/deploy/mlflow-triton-plugin/mlflow_triton/deployments.py
new file mode 100755
index 0000000000..bebe559b9e
--- /dev/null
+++ b/deploy/mlflow-triton-plugin/mlflow_triton/deployments.py
@@ -0,0 +1,540 @@
+#!/usr/bin/env python3
+
+# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import ast
+import glob
+import json
+import logging
+import os
+import shutil
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import tritonclient.http as tritonhttpclient
+from mlflow.deployments import BaseDeploymentClient
+from mlflow.exceptions import MlflowException
+from mlflow.models import Model
+from mlflow.tracking.artifact_utils import _download_artifact_from_uri
+from mlflow_triton.config import Config
+from tritonclient.utils import (
+ InferenceServerException,
+ np_to_triton_dtype,
+ triton_to_np_dtype,
+)
+
+logger = logging.getLogger(__name__)
+
+_MLFLOW_META_FILENAME = "mlflow-meta.json"
+
+
+class TritonPlugin(BaseDeploymentClient):
+ def __init__(self, uri):
+ """
+ Initializes the deployment plugin, sets the triton model repo
+ """
+ super(TritonPlugin, self).__init__(target_uri=uri)
+ self.server_config = Config()
+ triton_url, self.triton_model_repo = self._get_triton_server_config()
+ # need to add other flavors
+ self.supported_flavors = ["triton", "onnx"]
+ # URL cleaning for constructing Triton client
+ ssl = False
+ if triton_url.startswith("http://"):
+ triton_url = triton_url[len("http://") :]
+ elif triton_url.startswith("https://"):
+ triton_url = triton_url[len("https://") :]
+ ssl = True
+ self.triton_client = tritonhttpclient.InferenceServerClient(
+ url=triton_url, ssl=ssl
+ )
+
+ def _get_triton_server_config(self):
+ triton_url = "localhost:8000"
+ if self.server_config["triton_url"]:
+ triton_url = self.server_config["triton_url"]
+ logger.info("Triton url = {}".format(triton_url))
+
+ if not self.server_config["triton_model_repo"]:
+ raise Exception("Check that environment variable TRITON_MODEL_REPO is set")
+ triton_model_repo = self.server_config["triton_model_repo"]
+ logger.info("Triton model repo = {}".format(triton_model_repo))
+
+ return triton_url, triton_model_repo
+
+ def create_deployment(self, name, model_uri, flavor=None, config=None):
+ """
+ Deploy the model at the model_uri to the Triton model repo. Associated config.pbtxt and *labels* files will be deployed.
+
+        :param name: Name of the model
+ :param model_uri: Model uri in format model://
+ :param flavor: Flavor of the deployed model
+ :param config: Configuration parameters
+
+ :return: Model flavor and name
+ """
+ self._validate_flavor(flavor)
+
+ # Verify model does not already exist in Triton
+ if self._model_exists(name):
+ raise Exception(
+ "Unable to create deployment for name %s because it already exists."
+ % (name)
+ )
+
+ # Get the path of the artifact
+ path = Path(_download_artifact_from_uri(model_uri))
+ self._copy_files_to_triton_repo(path, name, flavor)
+ self._generate_mlflow_meta_file(name, flavor, model_uri)
+
+ try:
+ self.triton_client.load_model(name)
+ except InferenceServerException as ex:
+ raise MlflowException(str(ex))
+
+ return {"name": name, "flavor": flavor}
+
+ def delete_deployment(self, name):
+ """
+ Delete the deployed model in Triton with the provided model name
+
+        :param name: Name of the model with version number. For ex: "densenet_onnx/2"
+
+ :return: None
+ """
+ # Verify model is already deployed to Triton
+ if not self._model_exists(name):
+ raise Exception(
+ "Unable to delete deployment for name %s because it does not exist."
+ % (name)
+ )
+
+ try:
+ self.triton_client.unload_model(name)
+ except InferenceServerException as ex:
+ raise MlflowException(str(ex))
+
+ self._delete_deployment_files(name)
+
+ return None
+
+ def update_deployment(self, name, model_uri=None, flavor=None, config=None):
+ """
+ Update the model deployment in triton with the provided name
+
+        :param name: Name and version number of the model, in the form model_name/version.
+ :param model_uri: Model uri models:/model_name/version
+ :param flavor: The flavor of the model
+ :param config: Configuration parameters
+
+ :return: Returns the flavor of the model
+ """
+ # TODO: Update this function with a warning. If config and label files associated with this
+ # updated model are different than the ones already deployed to triton, issue a warning to the user.
+ self._validate_flavor(flavor)
+
+ # Verify model is already deployed to Triton
+ if not self._model_exists(name):
+ raise Exception(
+ "Unable to update deployment for name %s because it does not exist."
+ % (name)
+ )
+
+ self.get_deployment(name)
+
+ # Get the path of the artifact
+ path = Path(_download_artifact_from_uri(model_uri))
+
+ self._copy_files_to_triton_repo(path, name, flavor)
+
+ self._generate_mlflow_meta_file(name, flavor, model_uri)
+
+ try:
+ self.triton_client.load_model(name)
+ except InferenceServerException as ex:
+ raise MlflowException(str(ex))
+
+ return {"flavor": flavor}
+
+ def list_deployments(self):
+ """
+ List models deployed to Triton.
+
+        :return: A list of dicts describing the models currently deployed to Triton
+ """
+ resp = self.triton_client.get_model_repository_index()
+ actives = []
+ for d in resp:
+ if "state" in d and d["state"] == "READY":
+ mlflow_meta_path = os.path.join(
+ self.triton_model_repo, d["name"], _MLFLOW_META_FILENAME
+ )
+ if "s3" in self.server_config:
+ meta_dict = ast.literal_eval(
+ self.server_config["s3"]
+ .get_object(
+ Bucket=self.server_config["s3_bucket"],
+ Key=os.path.join(
+ self.server_config["s3_prefix"],
+ d["name"],
+ _MLFLOW_META_FILENAME,
+ ),
+ )["Body"]
+ .read()
+ .decode("utf-8")
+ )
+ elif os.path.isfile(mlflow_meta_path):
+ meta_dict = self._get_mlflow_meta_dict(d["name"])
+ else:
+ continue
+
+ d["triton_model_path"] = meta_dict["triton_model_path"]
+ d["mlflow_model_uri"] = meta_dict["mlflow_model_uri"]
+ d["flavor"] = meta_dict["flavor"]
+ actives.append(d)
+
+ return actives
+
+ def get_deployment(self, name):
+ """
+ Get deployment from Triton.
+
+        :param name: Name of the model.
+            Ex: "mini_bert_onnx" - gets the details of the active version of this model
+
+ :return: output - Returns a dict with model info
+ """
+ deployments = self.list_deployments()
+ for d in deployments:
+ if d["name"] == name:
+ return d
+ raise ValueError(f"Unable to get deployment with name {name}")
+
+ def predict(self, deployment_name, df):
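+        """
+        Run inference on the named Triton deployment.
+
+        :param deployment_name: Name of the deployed model in Triton.
+        :param df: Either a pandas DataFrame with a single column whose rows
+                   are indexed by input name, or a dict mapping input names to
+                   numpy arrays. Unnamed numpy array input is not supported.
+
+        :return: pandas DataFrame with an "outputs" column keyed by output name
+        """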
+ single_input_np = None
+ if isinstance(df, np.ndarray):
+ single_input_np = df
+
+ inputs = []
+ if single_input_np is not None:
+ raise MlflowException("Unnamed input is not currently supported")
+ else:
+ if isinstance(df, pd.DataFrame):
+ model_metadata = self.triton_client.get_model_metadata(deployment_name)
+ input_dtype = {}
+ for input in model_metadata["inputs"]:
+ input_dtype[input["name"]] = triton_to_np_dtype(input["datatype"])
+ # Sanity check
+ if len(df.columns) != 1:
+ raise MlflowException("Expect Pandas DataFrame has only 1 column")
+ col = df.columns[0]
+ for row in df.index:
+ val = df[col][row]
+ # Need to form numpy array of the data type expected
+ if type(df[col][row]) != np.ndarray:
+ val = np.array(val, dtype=input_dtype[row])
+ inputs.append(
+ tritonhttpclient.InferInput(
+ row, val.shape, np_to_triton_dtype(val.dtype)
+ )
+ )
+ inputs[-1].set_data_from_numpy(val)
+ else:
+ for key, val in df.items():
+ inputs.append(
+ tritonhttpclient.InferInput(
+ key, val.shape, np_to_triton_dtype(val.dtype)
+ )
+ )
+ inputs[-1].set_data_from_numpy(val)
+
+ try:
+ resp = self.triton_client.infer(model_name=deployment_name, inputs=inputs)
+ res = {}
+ for output in resp.get_response()["outputs"]:
+ res[output["name"]] = resp.as_numpy(output["name"])
+ return pd.DataFrame.from_dict({"outputs": res})
+ except InferenceServerException as ex:
+ raise MlflowException(str(ex))
+
+ def _generate_mlflow_meta_file(self, name, flavor, model_uri):
+ triton_deployment_dir = os.path.join(self.triton_model_repo, name)
+ meta_dict = {
+ "name": name,
+ "triton_model_path": triton_deployment_dir,
+ "mlflow_model_uri": model_uri,
+ "flavor": flavor,
+ }
+
+ if "s3" in self.server_config:
+ self.server_config["s3"].put_object(
+ Body=json.dumps(meta_dict, indent=4).encode("utf-8"),
+ Bucket=self.server_config["s3_bucket"],
+ Key=os.path.join(
+ self.server_config["s3_prefix"], name, _MLFLOW_META_FILENAME
+ ),
+ )
+ else:
+ with open(
+ os.path.join(triton_deployment_dir, _MLFLOW_META_FILENAME), "w"
+ ) as outfile:
+ json.dump(meta_dict, outfile, indent=4)
+
+ print("Saved", _MLFLOW_META_FILENAME, "to", triton_deployment_dir)
+
+ def _get_mlflow_meta_dict(self, name):
+ mlflow_meta_path = os.path.join(
+ self.triton_model_repo, name, _MLFLOW_META_FILENAME
+ )
+
+ if "s3" in self.server_config:
+ mlflow_meta_dict = ast.literal_eval(
+ self.server_config["s3"]
+ .get_object(
+ Bucket=self.server_config["s3_bucket"],
+ Key=os.path.join(
+ self.server_config["s3_prefix"], name, _MLFLOW_META_FILENAME
+ ),
+ )["Body"]
+ .read()
+ .decode("utf-8")
+ )
+ else:
+ with open(mlflow_meta_path, "r") as metafile:
+ mlflow_meta_dict = json.load(metafile)
+
+ return mlflow_meta_dict
+
+ def _get_copy_paths(self, artifact_path, name, flavor):
+ copy_paths = {}
+ copy_paths["model_path"] = {}
+ triton_deployment_dir = os.path.join(self.triton_model_repo, name)
+ if flavor == "triton":
+ # When flavor is 'triton', the model is assumed to be preconfigured
+ # with proper model versions and version strategy, which may differ from
+ # the versioning in MLFlow
+ for file in artifact_path.iterdir():
+ if file.is_dir():
+ copy_paths["model_path"]["from"] = file
+ break
+ copy_paths["model_path"]["to"] = triton_deployment_dir
+ elif flavor == "onnx":
+ # Look for model file via MLModel metadata or iterating dir
+ model_file = None
+ config_file = None
+ for file in artifact_path.iterdir():
+ if file.name == "MLmodel":
+ mlmodel = Model.load(file)
+ onnx_meta_data = mlmodel.flavors.get("onnx", None)
+ if onnx_meta_data is not None:
+ model_file = onnx_meta_data.get("data", None)
+ elif file.name == "config.pbtxt":
+ config_file = file.name
+ copy_paths["config_path"] = {}
+ elif file.suffix == ".txt" and file.stem != "requirements":
+ copy_paths[file.stem] = {"from": file, "to": triton_deployment_dir}
+ if model_file is None:
+ for file in artifact_path.iterdir():
+ if file.suffix == ".onnx":
+ model_file = file.name
+ break
+ copy_paths["model_path"]["from"] = os.path.join(artifact_path, model_file)
+ copy_paths["model_path"]["to"] = os.path.join(triton_deployment_dir, "1")
+
+ if config_file is not None:
+ copy_paths["config_path"]["from"] = os.path.join(
+ artifact_path, config_file
+ )
+ copy_paths["config_path"]["to"] = triton_deployment_dir
+ else:
+ # Make sure the directory has been created for config.pbtxt
+ os.makedirs(triton_deployment_dir, exist_ok=True)
+ # Provide a minimum config file so Triton knows what backend
+ # should be performing the auto-completion
+ config = """
+backend: "onnxruntime"
+default_model_filename: "{}"
+""".format(
+ model_file
+ )
+ with open(
+ os.path.join(triton_deployment_dir, "config.pbtxt"), "w"
+ ) as cfile:
+ cfile.write(config)
+ return copy_paths
+
+ def _walk(self, path):
+ """Walk a path like os.walk() if path is dir,
+ return file in the expected format otherwise.
+ :param path: dir or file path
+
+ :return: root, dirs, files
+ """
+ if os.path.isfile(path):
+ return [(os.path.dirname(path), [], [os.path.basename(path)])]
+ elif os.path.isdir(path):
+ return list(os.walk(path))
+ else:
+ raise Exception(f"path: {path} is not a valid path to a file or dir.")
+
+ def _copy_files_to_triton_repo(self, artifact_path, name, flavor):
+ copy_paths = self._get_copy_paths(artifact_path, name, flavor)
+ for key in copy_paths:
+ if "s3" in self.server_config:
+ # copy model dir to s3 recursively
+ for root, dirs, files in self._walk(copy_paths[key]["from"]):
+ for filename in files:
+ local_path = os.path.join(root, filename)
+
+ if flavor == "onnx":
+ s3_path = os.path.join(
+ self.server_config["s3_prefix"],
+ copy_paths[key]["to"]
+ .replace(self.server_config["triton_model_repo"], "")
+ .strip("/"),
+ filename,
+ )
+
+ elif flavor == "triton":
+ rel_path = os.path.relpath(
+ local_path,
+ copy_paths[key]["from"],
+ )
+ s3_path = os.path.join(
+ self.server_config["s3_prefix"], name, rel_path
+ )
+
+ self.server_config["s3"].upload_file(
+ local_path,
+ self.server_config["s3_bucket"],
+ s3_path,
+ )
+ else:
+ if os.path.isdir(copy_paths[key]["from"]):
+ if os.path.isdir(copy_paths[key]["to"]):
+ shutil.rmtree(copy_paths[key]["to"])
+ shutil.copytree(copy_paths[key]["from"], copy_paths[key]["to"])
+ else:
+ if not os.path.isdir(copy_paths[key]["to"]):
+ os.makedirs(copy_paths[key]["to"])
+ shutil.copy(copy_paths[key]["from"], copy_paths[key]["to"])
+
+ if "s3" not in self.server_config:
+ triton_deployment_dir = os.path.join(self.triton_model_repo, name)
+ version_folder = os.path.join(triton_deployment_dir, "1")
+ os.makedirs(version_folder, exist_ok=True)
+
+ return copy_paths
+
+ def _delete_mlflow_meta(self, filepath):
+ if "s3" in self.server_config:
+ self.server_config["s3"].delete_object(
+ Bucket=self.server_config["s3_bucket"],
+ Key=filepath,
+ )
+ elif os.path.isfile(filepath):
+ os.remove(filepath)
+
+ def _delete_deployment_files(self, name):
+ triton_deployment_dir = os.path.join(self.triton_model_repo, name)
+
+ if "s3" in self.server_config:
+ objs = self.server_config["s3"].list_objects(
+ Bucket=self.server_config["s3_bucket"],
+ Prefix=os.path.join(self.server_config["s3_prefix"], name),
+ )
+
+ for key in objs["Contents"]:
+ key = key["Key"]
+ try:
+ self.server_config["s3"].delete_object(
+ Bucket=self.server_config["s3_bucket"],
+ Key=key,
+ )
+ except Exception as e:
+ raise Exception(f"Could not delete {key}: {e}")
+
+ else:
+ # Check if the deployment directory exists
+ if not os.path.isdir(triton_deployment_dir):
+ raise Exception(
+ "A deployment does not exist for this model in directory {} for model name {}".format(
+ triton_deployment_dir, name
+ )
+ )
+
+ model_file = glob.glob("{}/model*".format(triton_deployment_dir))
+ for file in model_file:
+ print("Model directory found: {}".format(file))
+ os.remove(file)
+ print("Model directory removed: {}".format(file))
+
+ # Delete mlflow meta file
+ mlflow_meta_path = os.path.join(
+ self.triton_model_repo, name, _MLFLOW_META_FILENAME
+ )
+ self._delete_mlflow_meta(mlflow_meta_path)
+
+ def _validate_config_args(self, config):
+ if not config["version"]:
+ raise Exception("Please provide the version as a config argument")
+ if not config["version"].isdigit():
+ raise ValueError(
+ "Please make sure version is a number. version = {}".format(
+ config["version"]
+ )
+ )
+
+ def _validate_flavor(self, flavor):
+ if flavor not in self.supported_flavors:
+ raise Exception("{} model flavor not supported by Triton".format(flavor))
+
+ def _model_exists(self, name):
+ deploys = self.list_deployments()
+ exists = False
+ for d in deploys:
+ if d["name"] == name:
+ exists = True
+ return exists
+
+
+def run_local(name, model_uri, flavor=None, config=None):
+ raise NotImplementedError("run_local has not been implemented yet")
+
+
+def target_help():
+ help_msg = (
+ "\nmlflow-triton plugin integrates the Triton Inference Server to the mlflow deployment pipeline. \n\n "
+ "Example command: \n\n"
+ ' mlflow deployments create -t triton --name mymodel --flavor onnx -m models:/mymodel/Production -C "version=1" \n\n'
+ "The environment variable TRITON_MODEL_REPO must be set to the location that the Triton"
+ "Inference Server is storing its models\n\n"
+ "export TRITON_MODEL_REPO = /path/to/triton/model/repo\n\n"
+ "Use the following config options:\n\n"
+ "- version: The version of the model to be released. This config will be used by Triton to create a new model sub-directory.\n"
+ )
+ return help_msg
diff --git a/deploy/mlflow-triton-plugin/scripts/publish_model_to_mlflow.py b/deploy/mlflow-triton-plugin/scripts/publish_model_to_mlflow.py
new file mode 100755
index 0000000000..779d393020
--- /dev/null
+++ b/deploy/mlflow-triton-plugin/scripts/publish_model_to_mlflow.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+
+# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+import os
+
+import click
+import mlflow
+import triton_flavor
+
+
+@click.command()
+@click.option(
+ "--model_name",
+ help="Model name",
+)
+@click.option(
+ "--model_directory",
+ type=click.Path(exists=True, readable=True),
+ required=True,
+ help="Model filepath",
+)
+@click.option(
+ "--flavor",
+ type=click.Choice(["triton"], case_sensitive=True),
+ required=True,
+ help="Model flavor",
+)
+def publish_to_mlflow(model_name, model_directory, flavor):
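+    # MLFLOW_TRACKING_URI must be set in the environment before running this
+    # script; it points at the MLflow tracking server to publish to.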
+ mlflow_tracking_uri = os.environ["MLFLOW_TRACKING_URI"]
+ artifact_path = "triton"
+
+ mlflow.set_tracking_uri(uri=mlflow_tracking_uri)
+
+ with mlflow.start_run() as run:
+ if flavor == "triton":
+ triton_flavor.log_model(
+ model_directory,
+ artifact_path=artifact_path,
+ registered_model_name=model_name,
+ )
+ else:
+ # Enhancement, for model in other flavor (framework) that Triton
+ # supports, try to format it in Triton style and provide
+ # config.pbtxt file. Should this be done in the plugin?
+ raise Exception("Other flavor is not supported")
+
+ print(mlflow.get_artifact_uri())
+
+
+if __name__ == "__main__":
+ publish_to_mlflow()
diff --git a/deploy/mlflow-triton-plugin/scripts/triton_flavor.py b/deploy/mlflow-triton-plugin/scripts/triton_flavor.py
new file mode 100755
index 0000000000..7b0f61630d
--- /dev/null
+++ b/deploy/mlflow-triton-plugin/scripts/triton_flavor.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+
+# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+"""
+The ``triton`` module provides APIs for logging and loading Triton-recognized
+models in the MLflow Model format. This module exports MLflow Models with the following
+flavors:
+
+Triton format
+ model files in the structure that Triton can load the model from.
+
+"""
+import os
+import shutil
+import sys
+
+from mlflow.exceptions import MlflowException
+from mlflow.models import Model
+from mlflow.models.model import MLMODEL_FILE_NAME
+from mlflow.protos.databricks_pb2 import RESOURCE_ALREADY_EXISTS
+from mlflow.tracking._model_registry import DEFAULT_AWAIT_MAX_SLEEP_SECONDS
+from mlflow.utils.annotations import experimental
+
+FLAVOR_NAME = "triton"
+
+
+@experimental
+def save_model(
+ triton_model_path,
+ path,
+ mlflow_model=None,
+):
+ """
+    Save a Triton model to a path on the local file system.
+
+ :param triton_model_path: File path to Triton model to be saved.
+ :param path: Local path where the model is to be saved.
+ :param mlflow_model: :py:mod:`mlflow.models.Model` this flavor is being added to.
+
+ """
+
+ path = os.path.abspath(path)
+ if os.path.exists(path):
+ raise MlflowException(
+ message="Path '{}' already exists".format(path),
+ error_code=RESOURCE_ALREADY_EXISTS,
+ )
+ os.makedirs(path)
+ triton_model_path = os.path.normpath(triton_model_path)
+ model_data_subpath = os.path.basename(triton_model_path)
+ model_data_path = os.path.join(path, model_data_subpath)
+
+ # Save Triton model
+ shutil.copytree(triton_model_path, model_data_path)
+
+ mlflow_model.add_flavor(FLAVOR_NAME, data=model_data_subpath)
+ mlflow_model.save(os.path.join(path, MLMODEL_FILE_NAME))
+
+
+@experimental
+def log_model(
+ triton_model_path,
+ artifact_path,
+ registered_model_name=None,
+ await_registration_for=DEFAULT_AWAIT_MAX_SLEEP_SECONDS,
+):
+ """
+    Log a Triton model as an MLflow artifact for the current run.
+
+ :param triton_model_path: File path to Triton model.
+ :param artifact_path: Run-relative artifact path.
+ :param registered_model_name: (Experimental) If given, create a model version under
+ ``registered_model_name``, also creating a registered model if one
+ with the given name does not exist.
+
+ :param await_registration_for: Number of seconds to wait for the model version to finish
+ being created and is in ``READY`` status. By default, the function
+ waits for five minutes. Specify 0 or None to skip waiting.
+
+ """
+ Model.log(
+ artifact_path=artifact_path,
+ flavor=sys.modules[__name__],
+ triton_model_path=triton_model_path,
+ registered_model_name=registered_model_name,
+ await_registration_for=await_registration_for,
+ )
diff --git a/deploy/mlflow-triton-plugin/setup.py b/deploy/mlflow-triton-plugin/setup.py
new file mode 100755
index 0000000000..65b8e0df1e
--- /dev/null
+++ b/deploy/mlflow-triton-plugin/setup.py
@@ -0,0 +1,39 @@
+#!/usr/bin/env python3
+
+# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+from setuptools import find_packages, setup
+
+setup(
+ name="mlflow-triton",
+ version="0.2.0",
+ description="Triton Mlflow Deployment",
+ long_description=open("README.md").read(),
+ long_description_content_type="text/markdown",
+ packages=find_packages(),
+ install_requires=["mlflow>=2.2.1,<3.0", "tritonclient[all]", "boto3"],
+ entry_points={"mlflow.deployments": "triton=mlflow_triton.deployments"},
+)
diff --git a/deploy/oci/Chart.yaml b/deploy/oci/Chart.yaml
new file mode 100644
index 0000000000..2b7541bee6
--- /dev/null
+++ b/deploy/oci/Chart.yaml
@@ -0,0 +1,31 @@
+# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+apiVersion: v1
+appVersion: "1.0"
+description: Triton Inference Server
+name: triton-inference-server
+version: 1.0.0
diff --git a/deploy/oci/README.md b/deploy/oci/README.md
new file mode 100644
index 0000000000..dc293c7378
--- /dev/null
+++ b/deploy/oci/README.md
@@ -0,0 +1,306 @@
+
+
+[![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause)
+
+# Kubernetes Deploy: Triton Inference Server Cluster
+
+A helm chart for installing a single cluster of Triton Inference
+Server is provided. By default the cluster contains a single instance
+of the inference server but the *replicaCount* configuration parameter
+can be set to create a cluster of any size, as described below.
+
+This guide assumes you already have a functional Kubernetes cluster
+and helm installed (see below for instructions on installing
+helm). Note the following requirements:
+
+* The helm chart deploys Prometheus and Grafana to collect and display Triton metrics. To use this helm chart you must install Prometheus and Grafana in your cluster as described below and your cluster must contain sufficient CPU resources to support these services.
+
+* If you want Triton Server to use GPUs for inferencing, your cluster
+must be configured to contain the desired number of GPU nodes (A10 GPU instances recommended)
+with support for the NVIDIA driver and CUDA version required by the version
+of the inference server you are using.
+
+The steps below describe how to set-up a model repository, use helm to
+launch the inference server, and then send inference requests to the
+running server. You can access a Grafana endpoint to see real-time
+metrics reported by the inference server.
+
+## Notes for OKE cluster
+
+When creating your node pool, the default value for the boot volume is 46.6GB.
+Due to the size of the server container, it is recommended to increase this value
+to 150GB and set a [cloud-init script to increase the partition](https://blogs.oracle.com/ateam/post/oke-node-sizing-for-very-large-container-images):
+
+```
+#!/bin/bash
+curl --fail -H "Authorization: Bearer Oracle" -L0 http://169.254.169.254/opc/v2/instance/metadata/oke_init_script | base64 --decode >/var/run/oke-init.sh
+bash /var/run/oke-init.sh
+sudo /usr/libexec/oci-growfs -y
+```
+
+
+## Installing Helm
+
+### Using Cloud Shell from OCI Web Console
+
+It is possible to access your OKE Cluster [directly from the OCI Web Console](https://docs.oracle.com/en-us/iaas/Content/ContEng/Tasks/contengaccessingclusterkubectl.htm).
+Helm v3 is already available from the Cloud Shell.
+
+### Helm v3
+
+If you do not already have Helm installed in your Kubernetes cluster,
+executing the following steps from the [official helm install
+guide](https://helm.sh/docs/intro/install/) will
+give you a quick setup.
+
+If you're currently using Helm v2 and would like to migrate to Helm v3,
+please see the [official migration guide](https://helm.sh/docs/topics/v2_v3_migration/).
+
+### Helm v2
+
+> **NOTE**: Moving forward this chart will only be tested and maintained for Helm v3.
+
+Below are example instructions for installing Helm v2.
+
+```
+$ curl https://raw.githubusercontent.com/helm/helm/master/scripts/get | bash
+$ kubectl create serviceaccount -n kube-system tiller
+serviceaccount/tiller created
+$ kubectl create clusterrolebinding tiller-cluster-rule --clusterrole=cluster-admin --serviceaccount=kube-system:tiller
+$ helm init --service-account tiller --wait
+```
+
+If you run into any issues, you can refer to the official installation guide [here](https://v2.helm.sh/docs/install/).
+
+## Model Repository
+
+If you already have a model repository you may use that with this helm
+chart. If you do not have a model repository, you can checkout a local
+copy of the inference server source repository to create an example
+model repository:
+
+```
+$ git clone https://github.com/triton-inference-server/server.git
+```
+
+Triton Server needs a repository of models that it will make available
+for inferencing. For this example, you will place the model repository
+in an S3-compatible OCI Object Storage bucket.
+
+```
+$ oci os bucket create --compartment-id --name triton-inference-server-repository
+```
+
+Following the [QuickStart](../../docs/getting_started/quickstart.md), download the
+example model repository to your system and copy it into the OCI
+Bucket.
+
+```
+$ oci os object bulk-upload -bn triton-inference-server-repository --src-dir docs/examples/model_repository/
+```
+
+### OCI Model Repository
+
+To load the model from the OCI Object Storage bucket, you need to convert the following OCI credentials to base64 format and add them to the values.yaml:
+
+```
+echo -n 'REGION' | base64
+```
+```
+echo -n 'SECRET_KEY_ID' | base64
+```
+```
+echo -n 'SECRET_ACCESS_KEY' | base64
+```
+
+You also need to adapt _modelRepositoryPath_ in values.yaml to your [namespace](https://docs.oracle.com/en-us/iaas/Content/Object/Tasks/understandingnamespaces.htm) and [OCI region](https://docs.oracle.com/en-us/iaas/Content/General/Concepts/regions.htm).
+
+```
+s3://https://.compat.objectstorage..oraclecloud.com:443/triton-inference-server-repository
+```
+
+## Deploy Prometheus and Grafana
+
+The inference server metrics are collected by Prometheus and viewable
+by Grafana. The inference server helm chart assumes that Prometheus
+and Grafana are available, so this step must be followed even if you
+don't want to use Grafana.
+
+Use the [kube-prometheus-stack](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack) to install these components. The
+*serviceMonitorSelectorNilUsesHelmValues* flag is needed so that
+Prometheus can find the inference server metrics in the *example*
+release deployed below.
+
+```
+$ helm install example-metrics --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false prometheus-community/kube-prometheus-stack
+```
+
+Then port-forward to the Grafana service so you can access it from
+your local browser.
+
+```
+$ kubectl port-forward service/example-metrics-grafana 8080:80
+```
+
+Now you should be able to navigate in your browser to localhost:8080
+and see the Grafana login page. Use username=admin and
+password=prom-operator to log in.
+
+Note that it is also possible to set up a LoadBalancer service for the Grafana
+dashboard by running:
+
+```
+$ helm install example-metrics --set prometheus.prometheusSpec.serviceMonitorSelectorNilUsesHelmValues=false --set grafana.service.type=LoadBalancer prometheus-community/kube-prometheus-stack
+```
+
+You can then see the public IP of your Grafana dashboard by running:
+
+```
+$ kubectl get svc
+NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
+alertmanager-operated ClusterIP None 9093/TCP,9094/TCP,9094/UDP 2m33s
+example-metrics-grafana LoadBalancer 10.96.82.33 141.145.220.114 80:31005/TCP 2m38s
+```
+
+The default load balancer comes with a fixed shape and a bandwidth of 100Mbps. You can switch to a [flexible](https://docs.oracle.com/en-us/iaas/Content/ContEng/Tasks/contengcreatingloadbalancers-subtopic.htm#contengcreatingloadbalancers_subtopic) shape and adapt the bandwidth according to your OCI limits if the bandwidth becomes a bottleneck.
+
+
+An example Grafana dashboard is available in dashboard.json. Use the
+import function in Grafana to import and view this dashboard.
+
+## Deploy the Inference Server
+
+Deploy the inference server using the default configuration with the
+following commands.
+
+```
+$ cd
+$ helm install example .
+```
+
+Use kubectl to see status and wait until the inference server pods are
+running.
+
+```
+$ kubectl get pods
+NAME READY STATUS RESTARTS AGE
+example-triton-inference-server-5f74b55885-n6lt7 1/1 Running 0 2m21s
+```
+
+There are several ways of overriding the default configuration as
+described in this [helm
+documentation](https://helm.sh/docs/using_helm/#customizing-the-chart-before-installing).
+
+You can edit the values.yaml file directly or you can use the *--set*
+option to override a single parameter with the CLI. For example, to
+deploy a cluster of four inference servers, use *--set* to set the
+replicaCount parameter.
+
+```
+$ helm install example --set replicaCount=4 .
+```
+
+You can also write your own "config.yaml" file with the values you
+want to override and pass it to helm.
+
+```
+$ cat << EOF > config.yaml
+namespace: MyCustomNamespace
+image:
+ imageName: nvcr.io/nvidia/tritonserver:custom-tag
+ modelRepositoryPath: s3://https://.compat.objectstorage..oraclecloud.com:443/triton-inference-server-repository
+EOF
+$ helm install example -f config.yaml .
+```
+
+## Using Triton Inference Server
+
+Now that the inference server is running, you can send HTTP or GRPC
+requests to it to perform inferencing. By default, the inferencing
+service is exposed with a LoadBalancer service type. Use the following
+to find the external IP for the inference server. In this case it is
+34.83.9.133.
+
+```
+$ kubectl get services
+NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
+...
+example-triton-inference-server LoadBalancer 10.18.13.28 34.83.9.133 8000:30249/TCP,8001:30068/TCP,8002:32723/TCP 47m
+```
+
+The inference server exposes an HTTP endpoint on port 8000, a GRPC
+endpoint on port 8001, and a Prometheus metrics endpoint on
+port 8002. You can use curl to get the metadata of the inference server
+from the HTTP endpoint.
+
+```
+$ curl 34.83.9.133:8000/v2
+```
+
+Follow the [QuickStart](../../docs/getting_started/quickstart.md) to get the example
+image classification client that can be used to perform inferencing
+using image classification models being served by the inference
+server. For example,
+
+```
+$ image_client -u 34.83.9.133:8000 -m inception_graphdef -s INCEPTION -c3 mug.jpg
+Request 0, batch size 1
+Image 'images/mug.jpg':
+ 504 (COFFEE MUG) = 0.723992
+ 968 (CUP) = 0.270953
+ 967 (ESPRESSO) = 0.00115997
+```
+
+## Cleanup
+
+Once you've finished using the inference server, you should use helm to
+delete the deployment.
+
+```
+$ helm list
+NAME REVISION UPDATED STATUS CHART APP VERSION NAMESPACE
+example 1 Wed Feb 27 22:16:55 2019 DEPLOYED triton-inference-server-1.0.0 1.0 default
+example-metrics 1 Tue Jan 21 12:24:07 2020 DEPLOYED prometheus-operator-6.18.0 0.32.0 default
+
+$ helm uninstall example
+$ helm uninstall example-metrics
+```
+
+For the Prometheus and Grafana services, you should [explicitly delete
+CRDs](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack#uninstall-helm-chart):
+
+```
+$ kubectl delete crd alertmanagerconfigs.monitoring.coreos.com alertmanagers.monitoring.coreos.com podmonitors.monitoring.coreos.com probes.monitoring.coreos.com prometheuses.monitoring.coreos.com prometheusrules.monitoring.coreos.com servicemonitors.monitoring.coreos.com thanosrulers.monitoring.coreos.com
+```
+
+You may also want to delete the OCI bucket you created to hold the
+model repository.
+
+```
+$ oci os bucket delete --bucket-name triton-inference-server-repository --empty
+```
diff --git a/deploy/oci/dashboard.json b/deploy/oci/dashboard.json
new file mode 100644
index 0000000000..8960b41d35
--- /dev/null
+++ b/deploy/oci/dashboard.json
@@ -0,0 +1,411 @@
+{
+ "__inputs": [
+ {
+ "name": "DS_PROMETHEUS",
+ "label": "Prometheus",
+ "description": "",
+ "type": "datasource",
+ "pluginId": "prometheus",
+ "pluginName": "Prometheus"
+ }
+ ],
+ "__requires": [
+ {
+ "type": "grafana",
+ "id": "grafana",
+ "name": "Grafana",
+ "version": "6.3.5"
+ },
+ {
+ "type": "panel",
+ "id": "graph",
+ "name": "Graph",
+ "version": ""
+ },
+ {
+ "type": "panel",
+ "id": "heatmap",
+ "name": "Heatmap",
+ "version": ""
+ },
+ {
+ "type": "datasource",
+ "id": "prometheus",
+ "name": "Prometheus",
+ "version": "1.0.0"
+ }
+ ],
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "id": null,
+ "links": [],
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 0,
+ "y": 0
+ },
+ "id": 2,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "nv_inference_request_success",
+ "legendFormat": "Success {{instance}}",
+ "refId": "A"
+ },
+ {
+ "expr": "nv_inference_request_failure",
+ "legendFormat": "Failure {{instance}}",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Cumulative Inference Requests",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "cards": {
+ "cardPadding": null,
+ "cardRound": null
+ },
+ "color": {
+ "cardColor": "#b4ff00",
+ "colorScale": "sqrt",
+ "colorScheme": "interpolateReds",
+ "exponent": 0.5,
+ "mode": "spectrum"
+ },
+ "dataFormat": "timeseries",
+ "gridPos": {
+ "h": 9,
+ "w": 12,
+ "x": 12,
+ "y": 0
+ },
+ "heatmap": {},
+ "hideZeroBuckets": false,
+ "highlightCards": true,
+ "id": 7,
+ "legend": {
+ "show": false
+ },
+ "options": {},
+ "reverseYBuckets": false,
+ "targets": [
+ {
+ "expr": "sum(increase(nv_inference_load_ratio_bucket[1m])) by (le)",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Load Ratio (Total Time / Compute Time)",
+ "tooltip": {
+ "show": true,
+ "showHistogram": false
+ },
+ "type": "heatmap",
+ "xAxis": {
+ "show": true
+ },
+ "xBucketNumber": null,
+ "xBucketSize": null,
+ "yAxis": {
+ "decimals": null,
+ "format": "short",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true,
+ "splitFactor": null
+ },
+ "yBucketBound": "auto",
+ "yBucketNumber": null,
+ "yBucketSize": null
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 9
+ },
+ "id": 4,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(nv_inference_queue_duration_us[30s]) / 1000",
+ "legendFormat": "{{instance}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Queue Time (milliseconds)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "Queue Time (ms)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "${DS_PROMETHEUS}",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 9
+ },
+ "id": 5,
+ "legend": {
+ "avg": false,
+ "current": false,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": false
+ },
+ "lines": true,
+ "linewidth": 1,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(nv_inference_compute_duration_us[30s]) / 1000",
+ "legendFormat": "{{instance}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Compute Time (milliseconds)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "Compute Time (ms)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "refresh": "5s",
+ "schemaVersion": 19,
+ "style": "dark",
+ "tags": [],
+ "templating": {
+ "list": []
+ },
+ "time": {
+ "from": "now-15m",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ]
+ },
+ "timezone": "",
+ "title": "Triton Inference Server",
+ "uid": "slEY4dsZk",
+ "version": 8
+}
diff --git a/deploy/oci/templates/_helpers.tpl b/deploy/oci/templates/_helpers.tpl
new file mode 100644
index 0000000000..6dba910012
--- /dev/null
+++ b/deploy/oci/templates/_helpers.tpl
@@ -0,0 +1,92 @@
+{{/*
+# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/}}
+
+{{/* vim: set filetype=mustache: */}}
+{{/*
+Create inference server name.
+*/}}
+{{- define "triton-inference-server.name" -}}
+{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+
+{{/*
+Create a default fully qualified app name.
+We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
+If release name contains chart name it will be used as a full name.
+*/}}
+{{- define "triton-inference-server.fullname" -}}
+{{- if .Values.fullnameOverride -}}
+{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
+{{- else -}}
+{{- $name := default .Chart.Name .Values.nameOverride -}}
+{{- if contains $name .Release.Name -}}
+{{- .Release.Name | trunc 63 | trimSuffix "-" -}}
+{{- else -}}
+{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+{{- end -}}
+{{- end -}}
+
+{{/*
+ Create inference server metrics service name and fullname derived from above and
+ truncated appropriately.
+*/}}
+{{- define "triton-inference-server-metrics.name" -}}
+{{- $basename := include "triton-inference-server.name" . -}}
+{{- $basename_trimmed := $basename | trunc 55 | trimSuffix "-" -}}
+{{- printf "%s-%s" $basename_trimmed "metrics" -}}
+{{- end -}}
+
+{{- define "triton-inference-server-metrics.fullname" -}}
+{{- $basename := include "triton-inference-server.fullname" . -}}
+{{- $basename_trimmed := $basename | trunc 55 | trimSuffix "-" -}}
+{{- printf "%s-%s" $basename_trimmed "metrics" -}}
+{{- end -}}
+
+{{/*
+ Create inference server metrics monitor name and fullname derived from
+ above and truncated appropriately.
+*/}}
+{{- define "triton-inference-server-metrics-monitor.name" -}}
+{{- $basename := include "triton-inference-server.name" . -}}
+{{- $basename_trimmed := $basename | trunc 47 | trimSuffix "-" -}}
+{{- printf "%s-%s" $basename_trimmed "metrics-monitor" -}}
+{{- end -}}
+
+{{- define "triton-inference-server-metrics-monitor.fullname" -}}
+{{- $basename := include "triton-inference-server.fullname" . -}}
+{{- $basename_trimmed := $basename | trunc 47 | trimSuffix "-" -}}
+{{- printf "%s-%s" $basename_trimmed "metrics-monitor" -}}
+{{- end -}}
+
+{{/*
+Create chart name and version as used by the chart label.
+*/}}
+{{- define "triton-inference-server.chart" -}}
+{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
diff --git a/deploy/oci/templates/deployment.yaml b/deploy/oci/templates/deployment.yaml
new file mode 100644
index 0000000000..f374bd181f
--- /dev/null
+++ b/deploy/oci/templates/deployment.yaml
@@ -0,0 +1,100 @@
+# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: {{ template "triton-inference-server.fullname" . }}
+ namespace: {{ .Release.Namespace }}
+ labels:
+ app: {{ template "triton-inference-server.name" . }}
+ chart: {{ template "triton-inference-server.chart" . }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+spec:
+ replicas: {{ .Values.replicaCount }}
+ selector:
+ matchLabels:
+ app: {{ template "triton-inference-server.name" . }}
+ release: {{ .Release.Name }}
+ template:
+ metadata:
+ labels:
+ app: {{ template "triton-inference-server.name" . }}
+ release: {{ .Release.Name }}
+
+ spec:
+ containers:
+ - name: {{ .Chart.Name }}
+ image: "{{ .Values.image.imageName }}"
+ imagePullPolicy: {{ .Values.image.pullPolicy }}
+
+ resources:
+ limits:
+ nvidia.com/gpu: {{ .Values.image.numGpus }}
+
+ args: ["tritonserver", "--model-store={{ .Values.image.modelRepositoryPath }}",
+ "--model-control-mode=poll",
+ "--repository-poll-secs=5"]
+
+ env:
+ - name: AWS_DEFAULT_REGION
+ valueFrom:
+ secretKeyRef:
+ name: oci-credentials
+ key: OCI_DEFAULT_REGION
+ - name: AWS_ACCESS_KEY_ID
+ valueFrom:
+ secretKeyRef:
+ name: oci-credentials
+ key: OCI_ACCESS_KEY_ID
+ - name: AWS_SECRET_ACCESS_KEY
+ valueFrom:
+ secretKeyRef:
+ name: oci-credentials
+ key: OCI_SECRET_ACCESS_KEY
+
+ ports:
+ - containerPort: 8000
+ name: http
+ - containerPort: 8001
+ name: grpc
+ - containerPort: 8002
+ name: metrics
+ livenessProbe:
+ httpGet:
+ path: /v2/health/live
+ port: http
+ readinessProbe:
+ initialDelaySeconds: 5
+ periodSeconds: 5
+ httpGet:
+ path: /v2/health/ready
+ port: http
+
+ securityContext:
+ runAsUser: 1000
+ fsGroup: 1000
diff --git a/deploy/oci/templates/secrets.yaml b/deploy/oci/templates/secrets.yaml
new file mode 100644
index 0000000000..0546fdda9d
--- /dev/null
+++ b/deploy/oci/templates/secrets.yaml
@@ -0,0 +1,35 @@
+# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+apiVersion: v1
+kind: Secret
+metadata:
+ name: oci-credentials
+type: Opaque
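+# NOTE: as with any Kubernetes Secret, the values under "data" must be
+# base64-encoded strings (for example, produced with `echo -n <value> | base64`).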
+data:
+ OCI_DEFAULT_REGION: {{ .Values.secret.region }}
+ OCI_ACCESS_KEY_ID: {{ .Values.secret.id }}
+ OCI_SECRET_ACCESS_KEY: {{ .Values.secret.key }}
diff --git a/deploy/oci/templates/service.yaml b/deploy/oci/templates/service.yaml
new file mode 100644
index 0000000000..3315fd77db
--- /dev/null
+++ b/deploy/oci/templates/service.yaml
@@ -0,0 +1,91 @@
+# Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+apiVersion: v1
+kind: Service
+metadata:
+ name: {{ template "triton-inference-server.fullname" . }}
+ namespace: {{ .Release.Namespace }}
+ labels:
+ app: {{ template "triton-inference-server.name" . }}
+ chart: {{ template "triton-inference-server.chart" . }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+spec:
+ type: {{ .Values.service.type }}
+ ports:
+ - port: 8000
+ targetPort: http
+ name: http-inference-server
+ - port: 8001
+ targetPort: grpc
+ name: grpc-inference-server
+ - port: 8002
+ targetPort: metrics
+ name: metrics-inference-server
+ selector:
+ app: {{ template "triton-inference-server.name" . }}
+ release: {{ .Release.Name }}
+---
+apiVersion: v1
+kind: Service
+metadata:
+ name: {{ template "triton-inference-server-metrics.fullname" . }}
+ namespace: {{ .Release.Namespace }}
+ labels:
+ app: {{ template "triton-inference-server-metrics.name" . }}
+ chart: {{ template "triton-inference-server.chart" . }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+ annotations:
+ alpha.monitoring.coreos.com/non-namespaced: "true"
+spec:
+ ports:
+ - name: metrics
+ port: 8080
+ targetPort: metrics
+ protocol: TCP
+ selector:
+ app: {{ template "triton-inference-server.name" . }}
+ release: {{ .Release.Name }}
+---
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+ name: {{ template "triton-inference-server-metrics-monitor.fullname" . }}
+ namespace: {{ .Release.Namespace }}
+ labels:
+ app: {{ template "triton-inference-server-metrics-monitor.name" . }}
+ chart: {{ template "triton-inference-server.chart" . }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+spec:
+ selector:
+ matchLabels:
+ app: {{ template "triton-inference-server-metrics.name" . }}
+ endpoints:
+ - port: metrics
+ interval: 15s
diff --git a/deploy/oci/values.yaml b/deploy/oci/values.yaml
new file mode 100644
index 0000000000..00d66d2594
--- /dev/null
+++ b/deploy/oci/values.yaml
@@ -0,0 +1,41 @@
+# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+replicaCount: 1
+
+image:
+ imageName: nvcr.io/nvidia/tritonserver:24.03-py3
+ pullPolicy: IfNotPresent
+  modelRepositoryPath: s3://https://<OCI_NAMESPACE>.compat.objectstorage.<OCI_REGION>.oraclecloud.com:443/triton-inference-server-repository
+ numGpus: 1
+
+service:
+ type: LoadBalancer
+
+secret:
+ region: OCI_REGION
+ id: OCI_SECRET_KEY_ID
+ key: OCI_SECRET_ACCESS_KEY
\ No newline at end of file
diff --git a/docker/README.third-party-src b/docker/README.third-party-src
new file mode 100644
index 0000000000..85f17d11ee
--- /dev/null
+++ b/docker/README.third-party-src
@@ -0,0 +1,5 @@
+This directory contains the licenses and source code for software
+included in the Triton Inference Server build. To extract the files
+use:
+
+ $ tar zxf src.tar.gz
diff --git a/docker/cpu_only/entrypoint.d/12-banner.sh b/docker/cpu_only/entrypoint.d/12-banner.sh
new file mode 100755
index 0000000000..0b4adda84b
--- /dev/null
+++ b/docker/cpu_only/entrypoint.d/12-banner.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+# Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+prodname_uc=$(echo "${NVIDIA_PRODUCT_NAME}" | tr [:lower:] [:upper:] | sed 's/ /_/g' | sed 's/^NVIDIA_//') # Product name
+_prodver="NVIDIA_${prodname_uc}_VERSION" # Container product version variable name
+_compver="${prodname_uc}_VERSION" # Upstream component version variable name
+
+echo
+echo "NVIDIA Release ${!_prodver} (build ${NVIDIA_BUILD_ID})"
+[ -n "${!_compver}" ] && echo "${NVIDIA_PRODUCT_NAME} Version ${!_compver}"
diff --git a/docker/cpu_only/entrypoint.d/50-gpu-driver-check2.sh b/docker/cpu_only/entrypoint.d/50-gpu-driver-check2.sh
new file mode 100755
index 0000000000..4caa8eeff7
--- /dev/null
+++ b/docker/cpu_only/entrypoint.d/50-gpu-driver-check2.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+export TRITON_SERVER_CPU_ONLY=1
diff --git a/docker/cpu_only/nvidia_entrypoint.sh b/docker/cpu_only/nvidia_entrypoint.sh
new file mode 100755
index 0000000000..82859d1bb6
--- /dev/null
+++ b/docker/cpu_only/nvidia_entrypoint.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+# Copyright 2016-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# Gather parts in alpha order
+shopt -s nullglob extglob
+SCRIPT_DIR="$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")"
+declare -a PARTS=( "${SCRIPT_DIR}/entrypoint.d"/*@(.txt|.sh) )
+shopt -u nullglob extglob
+
+# Execute the entrypoint parts
+for file in "${PARTS[@]}"; do
+ case "${file}" in
+ *.txt) cat "${file}";;
+ *.sh) source "${file}";;
+ esac
+done
+
+echo
+
+# This script can either be a wrapper around arbitrary command lines,
+# or it will simply exec bash if no arguments were given
+if [[ $# -eq 0 ]]; then
+ exec "/bin/bash"
+else
+ exec "$@"
+fi
diff --git a/docker/entrypoint.d/10-banner.txt b/docker/entrypoint.d/10-banner.txt
new file mode 100644
index 0000000000..56a8b28e55
--- /dev/null
+++ b/docker/entrypoint.d/10-banner.txt
@@ -0,0 +1,4 @@
+
+=============================
+== Triton Inference Server ==
+=============================
diff --git a/docker/entrypoint.d/15-container-copyright.txt b/docker/entrypoint.d/15-container-copyright.txt
new file mode 100644
index 0000000000..5e077f288f
--- /dev/null
+++ b/docker/entrypoint.d/15-container-copyright.txt
@@ -0,0 +1,2 @@
+
+Copyright (c) 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
diff --git a/docker/entrypoint.d/50-gpu-driver-check2.sh b/docker/entrypoint.d/50-gpu-driver-check2.sh
new file mode 100755
index 0000000000..bc22dd55ad
--- /dev/null
+++ b/docker/entrypoint.d/50-gpu-driver-check2.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+if [[ "${NVIDIA_CPU_ONLY:-0}" == "1" ]]; then
+ export TRITON_SERVER_CPU_ONLY=1
+fi
diff --git a/docker/entrypoint.d/56-network-driver-version-check.sh b/docker/entrypoint.d/56-network-driver-version-check.sh
new file mode 100644
index 0000000000..8b13789179
--- /dev/null
+++ b/docker/entrypoint.d/56-network-driver-version-check.sh
@@ -0,0 +1 @@
+
diff --git a/docker/entrypoint.d/70-shm-check.sh b/docker/entrypoint.d/70-shm-check.sh
new file mode 100644
index 0000000000..8b13789179
--- /dev/null
+++ b/docker/entrypoint.d/70-shm-check.sh
@@ -0,0 +1 @@
+
diff --git a/docker/entrypoint.d/99-check-run-aip-mode.sh b/docker/entrypoint.d/99-check-run-aip-mode.sh
new file mode 100755
index 0000000000..ec9249e944
--- /dev/null
+++ b/docker/entrypoint.d/99-check-run-aip-mode.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+# Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+# If detect Vertex AI environment, launch tritonserver with supplied arguments
+
+# This has the effect of "unshifting" the tritonserver command onto the front
+# of $@ if AIP_MODE is nonempty; it will then be exec'd by entrypoint.sh
+set -- ${AIP_MODE:+"/opt/tritonserver/bin/tritonserver"} "$@"
diff --git a/docker/sagemaker/serve b/docker/sagemaker/serve
new file mode 100755
index 0000000000..e9abc00bf5
--- /dev/null
+++ b/docker/sagemaker/serve
@@ -0,0 +1,169 @@
+#!/bin/bash
+# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+SAGEMAKER_SINGLE_MODEL_REPO=/opt/ml/model/
+
+# Use 'ready' for ping check in single-model endpoint mode, and use 'live' for ping check in multi-model endpoint mode
+# https://github.com/kserve/kserve/blob/master/docs/predict-api/v2/rest_predict_v2.yaml#L10-L26
+if [ -n "$SAGEMAKER_TRITON_OVERRIDE_PING_MODE" ]; then
+ SAGEMAKER_TRITON_PING_MODE=${SAGEMAKER_TRITON_OVERRIDE_PING_MODE}
+else
+ SAGEMAKER_TRITON_PING_MODE="ready"
+fi
+
+# Note: in Triton on SageMaker, each model URL is registered as a separate repository,
+# e.g., /opt/ml/models//model. Specifying the MME model repo path as /opt/ml/models causes Triton
+# to treat it as an additional empty repository and changes
+# the state of all models to UNAVAILABLE in the model repository
+# https://github.com/triton-inference-server/core/blob/main/src/model_repository_manager.cc#L914,L922
+# On Triton, this path will be a dummy path as it's mandatory to specify a model repo when starting triton
+SAGEMAKER_MULTI_MODEL_REPO=/tmp/sagemaker
+
+SAGEMAKER_MODEL_REPO=${SAGEMAKER_SINGLE_MODEL_REPO}
+is_mme_mode=false
+
+if [ -n "$SAGEMAKER_MULTI_MODEL" ]; then
+ if [ "$SAGEMAKER_MULTI_MODEL" == "true" ]; then
+ mkdir -p ${SAGEMAKER_MULTI_MODEL_REPO}
+ SAGEMAKER_MODEL_REPO=${SAGEMAKER_MULTI_MODEL_REPO}
+ if [ -n "$SAGEMAKER_TRITON_OVERRIDE_PING_MODE" ]; then
+ SAGEMAKER_TRITON_PING_MODE=${SAGEMAKER_TRITON_OVERRIDE_PING_MODE}
+ else
+ SAGEMAKER_TRITON_PING_MODE="live"
+ fi
+ is_mme_mode=true
+ echo -e "Triton is running in SageMaker MME mode. Using Triton ping mode: \"${SAGEMAKER_TRITON_PING_MODE}\""
+ fi
+fi
+
+SAGEMAKER_ARGS="--model-repository=${SAGEMAKER_MODEL_REPO}"
+# Set model namespacing to true, but allow disabling if required
+if [ -n "$SAGEMAKER_TRITON_DISABLE_MODEL_NAMESPACING" ]; then
+ SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --model-namespacing=${SAGEMAKER_TRITON_DISABLE_MODEL_NAMESPACING}"
+else
+ SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --model-namespacing=true"
+fi
+if [ -n "$SAGEMAKER_BIND_TO_PORT" ]; then
+ SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --sagemaker-port=${SAGEMAKER_BIND_TO_PORT}"
+fi
+if [ -n "$SAGEMAKER_SAFE_PORT_RANGE" ]; then
+ SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --sagemaker-safe-port-range=${SAGEMAKER_SAFE_PORT_RANGE}"
+fi
+if [ -n "$SAGEMAKER_TRITON_ALLOW_GRPC" ]; then
+ SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --allow-grpc=${SAGEMAKER_TRITON_ALLOW_GRPC}"
+else
+ SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --allow-grpc=false"
+fi
+if [ -n "$SAGEMAKER_TRITON_ALLOW_METRICS" ]; then
+ SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --allow-metrics=${SAGEMAKER_TRITON_ALLOW_METRICS}"
+else
+ SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --allow-metrics=false"
+fi
+if [ -n "$SAGEMAKER_TRITON_METRICS_PORT" ]; then
+ SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --metrics-port=${SAGEMAKER_TRITON_METRICS_PORT}"
+fi
+if [ -n "$SAGEMAKER_TRITON_GRPC_PORT" ]; then
+ SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --grpc-port=${SAGEMAKER_TRITON_GRPC_PORT}"
+fi
+if [ -n "$SAGEMAKER_TRITON_BUFFER_MANAGER_THREAD_COUNT" ]; then
+ SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --buffer-manager-thread-count=${SAGEMAKER_TRITON_BUFFER_MANAGER_THREAD_COUNT}"
+fi
+if [ -n "$SAGEMAKER_TRITON_THREAD_COUNT" ]; then
+ SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --sagemaker-thread-count=${SAGEMAKER_TRITON_THREAD_COUNT}"
+fi
+# Enable verbose logging by default. If env variable is specified, use value from env variable
+if [ -n "$SAGEMAKER_TRITON_LOG_VERBOSE" ]; then
+ SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --log-verbose=${SAGEMAKER_TRITON_LOG_VERBOSE}"
+else
+ SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --log-verbose=true"
+fi
+if [ -n "$SAGEMAKER_TRITON_LOG_INFO" ]; then
+ SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --log-info=${SAGEMAKER_TRITON_LOG_INFO}"
+fi
+if [ -n "$SAGEMAKER_TRITON_LOG_WARNING" ]; then
+ SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --log-warning=${SAGEMAKER_TRITON_LOG_WARNING}"
+fi
+if [ -n "$SAGEMAKER_TRITON_LOG_ERROR" ]; then
+ SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --log-error=${SAGEMAKER_TRITON_LOG_ERROR}"
+fi
+if [ -n "$SAGEMAKER_TRITON_SHM_DEFAULT_BYTE_SIZE" ]; then
+ SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --backend-config=python,shm-default-byte-size=${SAGEMAKER_TRITON_SHM_DEFAULT_BYTE_SIZE}"
+else
+ SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --backend-config=python,shm-default-byte-size=16777216" #16MB
+fi
+if [ -n "$SAGEMAKER_TRITON_SHM_GROWTH_BYTE_SIZE" ]; then
+ SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --backend-config=python,shm-growth-byte-size=${SAGEMAKER_TRITON_SHM_GROWTH_BYTE_SIZE}"
+else
+ SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --backend-config=python,shm-growth-byte-size=1048576" #1MB
+fi
+if [ -n "$SAGEMAKER_TRITON_TENSORFLOW_VERSION" ]; then
+ SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --backend-config=tensorflow,version=${SAGEMAKER_TRITON_TENSORFLOW_VERSION}"
+fi
+if [ -n "$SAGEMAKER_TRITON_MODEL_LOAD_GPU_LIMIT" ]; then
+ num_gpus=$(nvidia-smi -L | wc -l)
+ for ((i=0; i<${num_gpus}; i++)); do
+ SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --model-load-gpu-limit ${i}:${SAGEMAKER_TRITON_MODEL_LOAD_GPU_LIMIT}"
+ done
+fi
+if [ -n "$SAGEMAKER_TRITON_ADDITIONAL_ARGS" ]; then
+ SAGEMAKER_ARGS="${SAGEMAKER_ARGS} ${SAGEMAKER_TRITON_ADDITIONAL_ARGS}"
+fi
+
+
+if [ "${is_mme_mode}" = false ] && [ -f "${SAGEMAKER_MODEL_REPO}/config.pbtxt" ]; then
+ echo "ERROR: Incorrect directory structure."
+ echo " Model directory needs to contain the top level folder"
+ exit 1
+fi
+
+if [ "${is_mme_mode}" = false ] && [ -n "$SAGEMAKER_TRITON_DEFAULT_MODEL_NAME" ]; then
+ if [ -d "${SAGEMAKER_MODEL_REPO}/$SAGEMAKER_TRITON_DEFAULT_MODEL_NAME" ]; then
+ SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --load-model=${SAGEMAKER_TRITON_DEFAULT_MODEL_NAME}"
+ else
+ echo "ERROR: Directory with provided SAGEMAKER_TRITON_DEFAULT_MODEL_NAME ${SAGEMAKER_TRITON_DEFAULT_MODEL_NAME} does not exist"
+ exit 1
+ fi
+elif [ "${is_mme_mode}" = false ]; then
+ MODEL_DIRS=(`find "${SAGEMAKER_MODEL_REPO}" -mindepth 1 -maxdepth 1 -type d -printf "%f\n"`)
+ case ${#MODEL_DIRS[@]} in
+ 0) echo "ERROR: No model found in model repository";
+ exit 1
+ ;;
+ 1) echo "WARNING: No SAGEMAKER_TRITON_DEFAULT_MODEL_NAME provided."
+ echo " Starting with the only existing model directory ${MODEL_DIRS[0]}";
+ export SAGEMAKER_TRITON_DEFAULT_MODEL_NAME=${MODEL_DIRS[0]}
+ ;;
+ *) echo "ERROR: More than 1 model directory found in model repository."
+ echo " Either provide a single directory or set SAGEMAKER_TRITON_DEFAULT_MODEL_NAME to run the ensemble backend."
+ echo " Directories found in model repository: ${MODEL_DIRS[@]}";
+ exit 1
+ ;;
+ esac
+ SAGEMAKER_ARGS="${SAGEMAKER_ARGS} --load-model=${SAGEMAKER_TRITON_DEFAULT_MODEL_NAME}"
+fi
+
+tritonserver --allow-sagemaker=true --allow-http=false --model-control-mode=explicit $SAGEMAKER_ARGS
diff --git a/docs/Dockerfile.docs b/docs/Dockerfile.docs
new file mode 100644
index 0000000000..ba30a144ac
--- /dev/null
+++ b/docs/Dockerfile.docs
@@ -0,0 +1,78 @@
+# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+FROM ubuntu:22.04
+
+# various documentation dependencies
+RUN apt-get update -q=2 \
+ && apt-get install -y --no-install-recommends \
+ build-essential \
+ curl \
+ doxygen \
+ git \
+ git-lfs \
+ pandoc \
+ python3-dev \
+ python3-pip \
+ ssh \
+ unzip \
+ wget \
+ && rm -rf /var/lib/apt/lists/*
+
+# install protobuf
+RUN wget https://github.com/google/protobuf/releases/download/v3.6.1/protoc-3.6.1-linux-x86_64.zip -O /tmp/proto.zip \
+ && unzip /tmp/proto.zip -d /usr/local \
+ && rm /tmp/proto.zip
+
+# install pseudomuto/protoc-gen-doc
+RUN wget https://github.com/pseudomuto/protoc-gen-doc/releases/download/v1.3.2/protoc-gen-doc-1.3.2.linux-amd64.go1.12.6.tar.gz -O /tmp/protoc-gen-doc.tar.gz \
+ && tar -xvf /tmp/protoc-gen-doc.tar.gz --strip-components=1 -C /usr/local/bin/ \
+ && rm /tmp/protoc-gen-doc.tar.gz
+
+# install sphinx et al
+RUN pip3 install \
+ ablog \
+ attrs \
+ breathe \
+ docutils \
+ exhale \
+ ipython \
+ myst-nb \
+ nbclient \
+ nbsphinx \
+ rst-to-myst \
+ sphinx==5.0.0 \
+ sphinx-book-theme \
+ sphinx-copybutton \
+ sphinx-design \
+ sphinx-prompt \
+ sphinx-sitemap \
+ sphinx-tabs \
+ sphinxcontrib-bibtex
+
+# Set visitor script to be included on every HTML page
+ENV VISITS_COUNTING_SCRIPT="//assets.adobedtm.com/b92787824f2e0e9b68dc2e993f9bd995339fe417/satelliteLib-7ba51e58dc61bcb0e9311aadd02a0108ab24cc6c.js"
+
diff --git a/docs/Makefile b/docs/Makefile
index fb11718781..b8cf4b654b 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -1,4 +1,4 @@
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -24,34 +24,35 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-# Makefile for Sphinx documentation
+# Minimal makefile for Sphinx documentation
#
-# You can set these variables from the command line.
-SPHINXOPTS =
-SPHINXBUILD = sphinx-build
-SPHINXPROJ = TRTIS
-SOURCEDIR = .
-BUILDDIR = build
-EXHALEDIRS = cpp_api doxyoutput
-PROTOBUFFILES = $(wildcard ../src/core/*.proto)
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS ?=
+SPHINXBUILD ?= sphinx-build
+SOURCEDIR = .
+BUILDDIR = build
+TRITONCLIENTRSTDIR = _reference/tritonclient
+
+#PROTOBUFFILES = $(wildcard ../triton/proto/*.proto)
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
clean:
- @rm -fr $(BUILDDIR) $(EXHALEDIRS)
- @rm -f protobuf_api/*.proto.rst
+ @rm -fr ${BUILDDIR}
+ @rm -fr ${TRITONCLIENTRSTDIR}
+
+.PHONY: help Makefile clean
-protobufdoc: protobuf_api/gen_proto_doc.sh
- cd protobuf_api && \
- rm -f *.proto.rst && \
- bash -x ./gen_proto_doc.sh $(PROTOBUFFILES:%=../%)
+# protobuf: source/reference/protos/gen_proto_doc.sh
+# cd source/reference/protos && \
+# rm -f *.proto.rst && \
+# bash -x ./gen_proto_doc.sh $(PROTOBUFFILES:%=../%)
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
-%: Makefile protobufdoc
+%:
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-
-.PHONY: help clean protobufdoc Makefile
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 0000000000..22e0c0d691
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,218 @@
+
+
+# **Triton Inference Server Documentation**
+
+| [Installation](README.md#installation) | [Getting Started](README.md#getting-started) | [User Guide](README.md#user-guide) | [API Guide](protocol/README.md) | [Additional Resources](README.md#resources) | [Customization Guide](README.md#customization-guide) |
+| ------------ | --------------- | --------------- | ------------ | --------------- | --------------- |
+
+**New to Triton Inference Server?** Make use of
+[these tutorials](https://github.com/triton-inference-server/tutorials)
+ to begin your Triton journey!
+
+## **Installation**
+Before you can use the Triton Docker image you must install
+[Docker](https://docs.docker.com/engine/install). If you plan on using
+a GPU for inference you must also install the [NVIDIA Container
+Toolkit](https://github.com/NVIDIA/nvidia-docker). DGX users should
+follow [Preparing to use NVIDIA
+Containers](http://docs.nvidia.com/deeplearning/dgx/preparing-containers/index.html).
+
+Pull the image using the following command.
+
+```
+$ docker pull nvcr.io/nvidia/tritonserver:<xx.yy>-py3
+```
+
+Where \<xx.yy\> is the version of Triton that you want to pull. For a complete list of all the variants and versions of the Triton Inference Server Container, visit the [NGC Page](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver). More information about customizing the Triton Container can be found in [this section](customization_guide/compose.md) of the User Guide.
+
+## **Getting Started**
+
+This guide covers the simplest possible workflow for deploying a model using a Triton Inference Server.
+- [Create a Model Repository](getting_started/quickstart.md#create-a-model-repository)
+- [Launch Triton](getting_started/quickstart.md#launch-triton)
+- [Send an Inference Request](getting_started/quickstart.md#send-an-inference-request)
+
+Triton Inference Server has a considerable list of versatile and powerful features. All new users are encouraged to explore the [User Guide](README.md#user-guide) and the [additional resources](README.md#resources) sections for the features most relevant to their use case.
+
+## **User Guide**
+The User Guide describes how to configure Triton, organize and configure your models, use the C++ and Python clients, etc. This guide includes the following:
+* Creating a Model Repository [[Overview](README.md#model-repository) || [Details](user_guide/model_repository.md)]
+* Writing a Model Configuration [[Overview](README.md#model-configuration) || [Details](user_guide/model_configuration.md)]
+* Building a Model Pipeline [[Overview](README.md#model-pipeline)]
+* Managing Model Availability [[Overview](README.md#model-management) || [Details](user_guide/model_management.md)]
+* Collecting Server Metrics [[Overview](README.md#metrics) || [Details](user_guide/metrics.md)]
+* Supporting Custom Ops/layers [[Overview](README.md#framework-custom-operations) || [Details](user_guide/custom_operations.md)]
+* Using the Client API [[Overview](README.md#client-libraries-and-examples) || [Details](https://github.com/triton-inference-server/client)]
+* Cancelling Inference Requests [[Overview](README.md#cancelling-inference-requests) || [Details](user_guide/request_cancellation.md)]
+* Analyzing Performance [[Overview](README.md#performance-analysis)]
+* Deploying on edge (Jetson) [[Overview](README.md#jetson-and-jetpack)]
+* Debugging Guide [Details](./user_guide/debugging_guide.md)
+
+### Model Repository
+[Model Repositories](user_guide/model_repository.md) are the organizational hub for using Triton. All models, configuration files, and additional resources needed to serve the models are housed inside a model repository.
+- [Cloud Storage](user_guide/model_repository.md#model-repository-locations)
+- [File Organization](user_guide/model_repository.md#model-files)
+- [Model Versioning](user_guide/model_repository.md#model-versions)
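+
+As an illustrative sketch (the model and file names below are hypothetical), a minimal repository holding a single ONNX model could be laid out as:
+
+```
+$ find model_repository -type f
+model_repository/my_model/config.pbtxt
+model_repository/my_model/1/model.onnx
+```
+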
+### Model Configuration
+
+A [Model Configuration](user_guide/model_configuration.md) file is where you set the model-level options, such as output tensor reshaping and dynamic batch sizing.
+
+#### Required Model Configuration
+
+Triton Inference Server requires some [Minimum Required parameters](user_guide/model_configuration.md#minimal-model-configuration) to be filled in the Model Configuration. These required parameters essentially pertain to the structure of the model. For TensorFlow, ONNX and TensorRT models, users can rely on Triton to [Auto Generate](user_guide/model_configuration.md#auto-generated-model-configuration) the Minimum Required model configuration.
+- [Maximum Batch Size - Batching and Non-Batching Models](user_guide/model_configuration.md#maximum-batch-size)
+- [Input and Output Tensors](user_guide/model_configuration.md#inputs-and-outputs)
+ - [Tensor Datatypes](user_guide/model_configuration.md#datatypes)
+ - [Tensor Reshape](user_guide/model_configuration.md#reshape)
+ - [Shape Tensor](user_guide/model_configuration.md#shape-tensors)
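+
+As an illustrative sketch of these required parameters (the model name, tensor names, datatypes, and shapes below are hypothetical), a minimal config.pbtxt might look like:
+
+```
+$ cat model_repository/my_model/config.pbtxt
+name: "my_model"
+backend: "onnxruntime"
+max_batch_size: 8
+input [
+  {
+    name: "INPUT0"
+    data_type: TYPE_FP32
+    dims: [ 4 ]
+  }
+]
+output [
+  {
+    name: "OUTPUT0"
+    data_type: TYPE_FP32
+    dims: [ 4 ]
+  }
+]
+```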
+
+#### Versioning Models
+Users need the ability to save and serve different versions of models based on business requirements. Triton allows users to set policies to make available different versions of the model as needed. [Learn More](user_guide/model_configuration.md#version-policy).
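+
+As an illustrative sketch, a version policy that serves only the two most recent versions of a model can be set in its config.pbtxt:
+
+```
+version_policy: { latest: { num_versions: 2 } }
+```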
+
+#### Instance Groups
+Triton allows users to run multiple instances of the same model. Users can specify how many instances (copies) of a model to load and whether to use the GPU or CPU. If the model is being loaded on a GPU, users can also select which GPUs to use. [Learn more](user_guide/model_configuration.md#instance-groups).
+- [Specifying Multiple Model Instances](user_guide/model_configuration.md#multiple-model-instances)
+- [CPU and GPU Instances](user_guide/model_configuration.md#cpu-model-instance)
+- [Configuring Rate Limiter](user_guide/model_configuration.md#rate-limiter-configuration)
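+
+As an illustrative sketch, the following config.pbtxt snippet (the instance counts and GPU index are hypothetical) places two instances of a model on GPU 0 and two on the CPU:
+
+```
+instance_group [
+  {
+    count: 2
+    kind: KIND_GPU
+    gpus: [ 0 ]
+  },
+  {
+    count: 2
+    kind: KIND_CPU
+  }
+]
+```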
+
+#### Optimization Settings
+
+The Model Configuration ModelOptimizationPolicy property is used to specify optimization and prioritization settings for a model. These settings control if/how a model is optimized by the backend and how it is scheduled and executed by Triton. See the [ModelConfig Protobuf](https://github.com/triton-inference-server/common/blob/main/protobuf/model_config.proto) and [Optimization Documentation](user_guide/optimization.md#optimization) for the currently available settings.
+- [Framework-Specific Optimization](user_guide/optimization.md#framework-specific-optimization)
+ - [ONNX-TensorRT](user_guide/optimization.md#onnx-with-tensorrt-optimization-ort-trt)
+ - [ONNX-OpenVINO](user_guide/optimization.md#onnx-with-openvino-optimization)
+ - [TensorFlow-TensorRT](user_guide/optimization.md#tensorflow-with-tensorrt-optimization-tf-trt)
+ - [TensorFlow-Mixed-Precision](user_guide/optimization.md#tensorflow-automatic-fp16-optimization)
+- [NUMA Optimization](user_guide/optimization.md#numa-optimization)
+
+#### Scheduling and Batching
+
+Triton supports batching individual inference requests to improve compute resource utilization. This is extremely important because individual requests typically will not saturate GPU resources, and thus will not leverage the parallelism provided by GPUs to its full extent. Learn more about Triton's [Batcher and Scheduler](user_guide/model_configuration.md#scheduling-and-batching); a minimal dynamic batching configuration sketch follows the list below.
+- [Default Scheduler - Non-Batching](user_guide/model_configuration.md#default-scheduler)
+- [Dynamic Batcher](user_guide/model_configuration.md#dynamic-batcher)
+ - [How to Configure Dynamic Batcher](user_guide/model_configuration.md#recommended-configuration-process)
+ - [Delayed Batching](user_guide/model_configuration.md#delayed-batching)
+ - [Preferred Batch Size](user_guide/model_configuration.md#preferred-batch-sizes)
+ - [Preserving Request Ordering](user_guide/model_configuration.md#preserve-ordering)
+ - [Priority Levels](user_guide/model_configuration.md#priority-levels)
+ - [Queuing Policies](user_guide/model_configuration.md#queue-policy)
+ - [Ragged Batching](user_guide/ragged_batching.md)
+- [Sequence Batcher](user_guide/model_configuration.md#sequence-batcher)
+ - [Stateful Models](user_guide/architecture.md#stateful-models)
+ - [Control Inputs](user_guide/architecture.md#control-inputs)
+ - [Implicit State - Stateful Inference Using a Stateless Model](user_guide/architecture.md#implicit-state-management)
+ - [Sequence Scheduling Strategies](user_guide/architecture.md#scheduling-strategies)
+ - [Direct](user_guide/architecture.md#direct)
+ - [Oldest](user_guide/architecture.md#oldest)
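+
+As an illustrative sketch (the batch sizes and delay are hypothetical values), dynamic batching with preferred batch sizes and a small queueing delay can be enabled with a config.pbtxt snippet like:
+
+```
+dynamic_batching {
+  preferred_batch_size: [ 4, 8 ]
+  max_queue_delay_microseconds: 100
+}
+```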
+
+#### Rate Limiter
+Rate limiter manages the rate at which requests are scheduled on model instances by Triton. The rate limiter operates across all models loaded in Triton to allow cross-model prioritization. [Learn more](user_guide/rate_limiter.md).
+
+#### Model Warmup
+For a few of the backends (check [Additional Resources](README.md#resources)), some or all of the initialization is deferred until the first inference request is received. The benefit is resource conservation, but it comes with the downside of the initial requests being processed slower than expected. Users can "warm up" the model ahead of time by instructing Triton to initialize it. [Learn more](user_guide/model_configuration.md#model-warmup).
+
+#### Inference Request/Response Cache
+Triton has a feature which allows inference responses to be cached. [Learn More](user_guide/response_cache.md).
+
+### Model Pipeline
+Building ensembles is as easy as adding an additional configuration file which outlines the specific flow of tensors from one model to another. Any additional changes required by the model ensemble can be made in the existing (individual) model configurations.
+- [Model Ensemble](user_guide/architecture.md#ensemble-models)
+- [Business Logic Scripting (BLS)](https://github.com/triton-inference-server/python_backend#business-logic-scripting)
+### Model Management
+Users can specify policies in the model configuration for loading and unloading of models. This [section](user_guide/model_management.md) covers the user-selectable policy details.
+- [Explicit Model Loading and Unloading](user_guide/model_management.md#model-control-mode-explicit)
+- [Modifying the Model Repository](user_guide/model_management.md#modifying-the-model-repository)
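+
+For example, when Triton runs with explicit model control enabled, models can be loaded and unloaded at runtime through the repository API (the model name and address below are illustrative):
+
+```
+$ curl -X POST localhost:8000/v2/repository/models/my_model/load
+$ curl -X POST localhost:8000/v2/repository/models/my_model/unload
+```
+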
+### Metrics
+Triton provides Prometheus metrics like GPU Utilization, Memory Usage, Latency and more. Learn about [available metrics](user_guide/metrics.md).
+### Framework Custom Operations
+Some frameworks provide the option of building custom layers/operations. These can be added to specific Triton Backends for those frameworks. [Learn more](user_guide/custom_operations.md)
+- [TensorRT](user_guide/custom_operations.md#tensorrt)
+- [TensorFlow](user_guide/custom_operations.md#tensorflow)
+- [PyTorch](user_guide/custom_operations.md#pytorch)
+- [ONNX](user_guide/custom_operations.md#onnx)
+### Client Libraries and Examples
+Use the [Triton Client](https://github.com/triton-inference-server/client) API to integrate client applications over the network via the HTTP/gRPC API, or integrate applications directly with Triton using CUDA shared memory to remove network overhead.
+- [C++ HTTP/GRPC Libraries](https://github.com/triton-inference-server/client#client-library-apis)
+- [Python HTTP/GRPC Libraries](https://github.com/triton-inference-server/client#client-library-apis)
+- [Java HTTP Library](https://github.com/triton-inference-server/client/tree/main/src/java)
+- GRPC Generated Libraries
+ - [go](https://github.com/triton-inference-server/client/tree/main/src/grpc_generated/go)
+ - [Java/Scala](https://github.com/triton-inference-server/client/tree/main/src/grpc_generated/java)
+ - [Javascript](https://github.com/triton-inference-server/client/tree/main/src/grpc_generated/javascript)
+- [Shared Memory Extension](protocol/extension_shared_memory.md)
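+
+As an illustrative sketch of the underlying HTTP/REST inference protocol (the model name, tensor name, shape, and data below are hypothetical), a request can also be sent with plain curl:
+
+```
+$ curl -X POST localhost:8000/v2/models/my_model/infer \
+    -H "Content-Type: application/json" \
+    -d '{"inputs": [{"name": "INPUT0", "shape": [1, 4], "datatype": "FP32", "data": [1.0, 2.0, 3.0, 4.0]}]}'
+```
+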
+### Cancelling Inference Requests
+Triton can detect and handle requests that have been cancelled from the client side. This [document](user_guide/request_cancellation.md) discusses the scope and limitations of the feature.
+### Performance Analysis
+Understanding inference performance is key to better resource utilization. Use Triton's tools to customize your deployment; a sample Perf Analyzer invocation follows the list below.
+- [Performance Tuning Guide](user_guide/performance_tuning.md)
+- [Optimization](user_guide/optimization.md)
+- [Model Analyzer](user_guide/model_analyzer.md)
+- [Performance Analyzer](https://github.com/triton-inference-server/client/blob/main/src/c++/perf_analyzer/README.md)
+- [Inference Request Tracing](user_guide/trace.md)
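+
+For example, a typical Perf Analyzer sweep over client concurrency levels against a running server might look like this (the model name is illustrative):
+
+```
+$ perf_analyzer -m my_model -u localhost:8001 -i grpc --concurrency-range 1:4
+```
+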
+### Jetson and JetPack
+Triton can be deployed on edge devices. Explore [resources](user_guide/jetson.md) and [examples](examples/jetson/README.md).
+
+## **Resources**
+
+The following resources are recommended to explore the full suite of Triton Inference Server's functionalities.
+- **Clients**: Triton Inference Server comes with C++, Python and Java APIs with which users can send HTTP/REST or gRPC requests (with possible extensions for other languages). Explore the [client repository](https://github.com/triton-inference-server/server/tree/main/docs/protocol) for examples and documentation.
+
+- **Configuring Deployment**: Triton comes with three tools which can be used to configure deployment setting, measure performance and recommend optimizations.
+  - [Model Analyzer](https://github.com/triton-inference-server/model_analyzer): Model Analyzer is a CLI tool built to recommend deployment configurations for Triton Inference Server based on the user's Quality of Service requirements. It also generates detailed reports about model performance to summarize the benefits and trade-offs of different configurations.
+ - [Perf Analyzer](https://github.com/triton-inference-server/client/blob/main/src/c++/perf_analyzer/README.md):
+ Perf Analyzer is a CLI application built to generate inference requests and
+    measure the latency of those requests and the throughput of the model being
+ served.
+ - [Model Navigator](https://github.com/triton-inference-server/model_navigator):
+    The Triton Model Navigator is a tool that automates the process of moving a model from source to the optimal format and configuration for deployment on Triton Inference Server. The tool supports exporting models from source to all possible formats and applies the Triton Inference Server backend optimizations.
+
+- **Backends**: Triton supports a wide variety of frameworks used to run models. Users can extend this functionality by creating custom backends.
+ - [PyTorch](https://github.com/triton-inference-server/pytorch_backend): Widely used Open Source DL Framework
+ - [TensorFlow](https://github.com/triton-inference-server/tensorflow_backend): Widely used Open Source DL Framework
+  - [TensorRT](https://github.com/triton-inference-server/tensorrt_backend): NVIDIA [TensorRT](https://developer.nvidia.com/tensorrt) is an inference acceleration SDK that provides a wide range of graph optimizations, kernel optimizations, use of lower precision, and more.
+ - [ONNX](https://github.com/triton-inference-server/onnxruntime_backend): ONNX Runtime is a cross-platform inference and training machine-learning accelerator.
+  - [OpenVINO](https://github.com/triton-inference-server/openvino_backend): OpenVINO™ is an open-source toolkit for optimizing and deploying AI inference.
+ - [Paddle Paddle](https://github.com/triton-inference-server/paddlepaddle_backend): Widely used Open Source DL Framework
+ - [Python](https://github.com/triton-inference-server/python_backend): Users can add custom business logic, or any Python code/model, for serving requests (a minimal sketch follows this list).
+ - [Forest Inference Library](https://github.com/triton-inference-server/fil_backend): Backend built for forest models trained by several popular machine learning frameworks (including XGBoost, LightGBM, Scikit-Learn, and cuML)
+ - [DALI](https://github.com/triton-inference-server/dali_backend): NVIDIA [DALI](https://developer.nvidia.com/dali) is a Data Loading Library purpose-built to accelerate the pre-processing and data loading steps in a deep learning pipeline.
+ - [HugeCTR](https://github.com/triton-inference-server/hugectr_backend): HugeCTR is a GPU-accelerated recommender framework designed to distribute training across multiple GPUs and nodes and estimate Click-Through Rates
+ - [Managed Stateful Models](https://github.com/triton-inference-server/stateful_backend): This backend automatically manages the input and output states of a model. The states are associated with a sequence id and need to be tracked for inference requests associated with the sequence id.
+ - [Faster Transformer](https://github.com/triton-inference-server/fastertransformer_backend): NVIDIA [FasterTransformer](https://github.com/NVIDIA/FasterTransformer/) (FT) is a library implementing an accelerated engine for the inference of transformer-based neural networks, with a special emphasis on large models, spanning many GPUs and nodes in a distributed manner.
+ - [Building Custom Backends](https://github.com/triton-inference-server/backend/tree/main/examples#tutorial)
+ - [Sample Custom Backend: Repeat_backend](https://github.com/triton-inference-server/repeat_backend): Backend built to demonstrate sending zero, one, or multiple responses per request.
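+
+As a concrete illustration of the client libraries above (a minimal sketch only), an inference request can be sent with the Python HTTP client as follows; the model name `my_model` and the tensor names/shapes are placeholders that must match your model configuration.
+
+```python
+import numpy as np
+import tritonclient.http as httpclient
+
+# Placeholder model/tensor names; adjust to your model configuration.
+client = httpclient.InferenceServerClient(url="localhost:8000")
+
+inputs = [httpclient.InferInput("INPUT0", [1, 16], "FP32")]
+inputs[0].set_data_from_numpy(np.ones((1, 16), dtype=np.float32))
+outputs = [httpclient.InferRequestedOutput("OUTPUT0")]
+
+result = client.infer(model_name="my_model", inputs=inputs, outputs=outputs)
+print(result.as_numpy("OUTPUT0"))
+```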
+
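+As a rough sketch of the Python backend item above: a Python backend model is a `model.py` that defines a `TritonPythonModel` class and runs inside Triton (the `triton_python_backend_utils` module is only available there); the tensor names are placeholders.
+
+```python
+# model.py -- minimal Python backend sketch (placeholder tensor names)
+import triton_python_backend_utils as pb_utils
+
+
+class TritonPythonModel:
+    def execute(self, requests):
+        responses = []
+        for request in requests:
+            # Echo INPUT0 back as OUTPUT0; a real model would add its own logic here.
+            in0 = pb_utils.get_input_tensor_by_name(request, "INPUT0")
+            out0 = pb_utils.Tensor("OUTPUT0", in0.as_numpy())
+            responses.append(pb_utils.InferenceResponse(output_tensors=[out0]))
+        return responses
+```
+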
+## **Customization Guide**
+This guide describes how to build and test Triton and also how Triton can be extended with new functionality.
+
+- [Build](customization_guide/build.md)
+- [Protocols and APIs](customization_guide/inference_protocols.md)
+- [Backends](https://github.com/triton-inference-server/backend)
+- [Repository Agents](customization_guide/repository_agents.md)
+- [Test](customization_guide/test.md)
diff --git a/docs/_reference/tritonclient_api.rst b/docs/_reference/tritonclient_api.rst
new file mode 100644
index 0000000000..33dd53127a
--- /dev/null
+++ b/docs/_reference/tritonclient_api.rst
@@ -0,0 +1,10 @@
+Python tritonclient Package API
+===============================
+
+The tritonclient Python package is hosted on `pypi.org `_. This package documentation for tritonclient is generated by the Sphinx autosummary extension.
+
+.. autosummary::
+ :toctree: tritonclient
+ :recursive:
+
+ tritonclient
diff --git a/docs/_static/.gitattributes b/docs/_static/.gitattributes
new file mode 100644
index 0000000000..04865f126a
--- /dev/null
+++ b/docs/_static/.gitattributes
@@ -0,0 +1,2 @@
+nvidia-logo-horiz-rgb-blk-for-screen.png filter=lfs diff=lfs merge=lfs -text
+nvidia-logo-vert-rgb-blk-for-screen.png filter=lfs diff=lfs merge=lfs -text
diff --git a/docs/_static/custom.css b/docs/_static/custom.css
new file mode 100644
index 0000000000..46bab57d4e
--- /dev/null
+++ b/docs/_static/custom.css
@@ -0,0 +1,319 @@
+/*
+# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+@font-face {
+ font-family: "NVIDIA Sans";
+ src: url(https://aws1.discourse-cdn.com/nvidia/original/3X/5/2/52891dda673228d54e5d57bf1e4a3880d4b22405.woff2) format("woff2"),
+ url(https://aws1.discourse-cdn.com/nvidia/original/3X/e/0/e090b7dda7a582522c7f9045c6ce949cce60134f.woff) format("woff");
+ font-weight: 300;
+ font-style: normal;
+}
+@font-face {
+ font-family: "NVIDIA Sans";
+ src: url(https://aws1.discourse-cdn.com/nvidia/original/3X/a/1/a107baabcbf6b241099122336bce7429bcfd377a.woff2) format("woff2"),
+ url(https://aws1.discourse-cdn.com/nvidia/original/3X/3/a/3a6060a4e3bce70e5552ba0de8af4b22c6cf9144.woff) format("woff");
+ font-weight: 300;
+ font-style: italic;
+}
+@font-face {
+ font-family: "NVIDIA Sans";
+ src: url(https://aws1.discourse-cdn.com/nvidia/original/3X/9/9/9920d2b172b01d92fc9c1c0e521dcf45b59c47c3.woff2) format("woff2"),
+ url(https://aws1.discourse-cdn.com/nvidia/original/3X/6/c/6c7d947928a7e4ef3e80ed409bef6c243f2148cb.woff) format("woff");
+ font-weight: 400;
+ font-style: normal;
+}
+@font-face {
+ font-family: "NVIDIA Sans";
+ src: url(https://aws1.discourse-cdn.com/nvidia/original/3X/e/8/e8e63fe1244372cd942d957f44a5616a1eba0644.woff2) format("woff2"),
+ url(https://aws1.discourse-cdn.com/nvidia/original/3X/0/f/0f1fb2af0283ab09d36e7097bb07d895c3228f12.woff) format("woff");
+ font-weight: 400;
+ font-style: italic;
+}
+@font-face {
+ font-family: "NVIDIA Sans";
+ src: url(https://aws1.discourse-cdn.com/nvidia/original/3X/7/9/79d3c513a9cd72c59f65354f39f89ca52dc17dd2.woff2) format("woff2"),
+ url(https://aws1.discourse-cdn.com/nvidia/original/3X/2/5/2581ac533f5d01f4985d8a7245b0766b4630ced8.woff) format("woff");
+ font-weight: 500;
+ font-style: normal;
+}
+@font-face {
+ font-family: "NVIDIA Sans";
+ src: url(https://aws1.discourse-cdn.com/nvidia/original/3X/3/9/39d9ef1ee9770dd503f19bb2ace2fdb4eff3bb50.woff2) format("woff2"),
+ url(https://aws1.discourse-cdn.com/nvidia/original/3X/7/b/7bb5d5e2e71b2e13c8098b2e67c0a0ed9258e6c7.woff) format("woff");
+ font-weight: 500;
+ font-style: italic;
+}
+@font-face {
+ font-family: "NVIDIA Sans";
+ src: url(https://aws1.discourse-cdn.com/nvidia/original/3X/0/5/05276a55a43eb3f74981ec1e93252727afcd9d16.woff2) format("woff2"),
+ url(https://aws1.discourse-cdn.com/nvidia/original/3X/9/c/9cfec7ed941b06564aa4d5ca14610e81542d070f.woff) format("woff");
+ font-weight: 700;
+ font-style: normal;
+}
+@font-face {
+ font-family: "NVIDIA Sans";
+ src: url(https://aws1.discourse-cdn.com/nvidia/original/3X/a/e/aebd14d09ba56f541e1b8735fb051e33710f9ae7.woff2) format("woff2"),
+ url(https://aws1.discourse-cdn.com/nvidia/original/3X/e/d/edbdabef43acc5c12e84a94baaa5542c9404cfeb.woff) format("woff");
+ font-weight: 700;
+ font-style: italic;
+}
+
+/* Custom Styles */
+:root {
+--pst-font-size-base: none;
+--pst-color-primary: 0, 133, 197;
+--pst-color-admonition-note: var(--pst-color-primary);
+--pst-color-admonition-default: var(--pst-color-primary);
+--pst-color-info: 255, 193, 7;
+--pst-color-admonition-tip: var(--pst-color-info);
+--pst-color-admonition-hint: var(--pst-color-info);
+--pst-color-admonition-important: var(--pst-color-info);
+--pst-color-warning: 245, 162, 82;
+--pst-color-danger: 230, 101, 129;
+--pst-color-admonition-warning: var(--pst-color-danger);
+--pst-color-link: 118, 185, 0;
+--pst-color-inline-code: 92, 22, 130;
+--font-family-sans-serif: NVIDIA Sans, Helvetica, Arial, Sans-serif;
+--pst-font-family-base-system: NVIDIA Sans, Helvetica, Arial, Sans-serif;
+font-family: NVIDIA Sans, Helvetica, Arial, Sans-serif;
+}
+
+.prev-next-area {
+ font-size: small;
+}
+
+.docutils caption {
+ caption-side: top;
+}
+
+#site-navigation h1.site-logo {
+ font-size: 0.85em;
+}
+
+/* colors
+nv green 118,185,0
+black 0, 0, 0
+light gray 205, 205, 205
+medium gray 140, 140, 140
+dark gray 94, 94, 94
+
+emerald 0, 133, 100
+emerald #008564
+amethyst 92, 22, 130
+amethyst #5C1682
+cpu blue 0, 133, 197
+cpu blue #0085C5
+garnet 137, 12, 88
+garnet 890C58
+fluorite 250, 194, 0
+fluorite FAC200
+*/
+
+:root {
+ --nv-green: #76b900;
+ --nv-green-darken: #6ead00;
+ --emerald: #008564;
+ --emerald-darken: #017c5d;
+ --amethyst: #5d1682;
+ --amethyst-darken: #4c116b;
+ --cpu-blue: #0071c5;
+ --cpu-blue-darken: #0062ad;
+ --garnet: #890c58;
+ --garnet-darken: #7a0c4e;
+ --fluorite: #fac200;
+ --fluorite-darken: #e4b301;
+ --dark-gray: #5e5e5e;
+ --light-gray: #cdcdcd;
+ --medium-gray: #8c8c8c;
+ --medium-gray-darken: #8c8c8cde;
+ --primary: #76b900;
+ --secondary: #008564;
+ --success: #5d1682;
+ --info: #0071c5;
+ --warning: #fac200;
+ --danger: #890c58;
+}
+
+/* Riva TBYB (ASR and TTS) Styling */
+.demo-box {
+ background-color: rgb(245,245,245);
+}
+a:link { text-decoration: none; }
+.scrollable {
+ height: 125px;
+ overflow-y: auto;
+ font-size: 1.3rem;
+}
+.dot {
+ height: 8px;
+ width: 8px;
+ background-color: rgb(228, 77, 77);
+ border-radius: 50%;
+ display: inline-block;
+}
+.timer {
+ font-size: 80%;
+ text-transform: uppercase;
+ white-space: nowrap;
+}
+.form-select {
+ border-radius: 0%;
+ font-size: 80%;
+}
+.form-control {
+ border-radius: 0%;
+}
+.input-group-text {
+ border-radius: 0%;
+ font-size: 80%;
+ text-transform: uppercase;
+ background-color: rgb(245,245,245);
+}
+.card {
+ border-radius: 0%;
+}
+.speech-control {
+ border-top-width: 0px;
+}
+.btn {
+ border-radius: 0%;
+ font-size: 80%;
+ text-transform: uppercase;
+ white-space: nowrap;
+ min-width: 125px;
+}
+.btn-primary {
+ background-color: var(--nv-green);
+ border-color: var(--nv-green);
+}
+.btn-primary:hover {
+ background-color: var(--nv-green-darken);
+ border-color: var(--nv-green-darken);
+}
+.btn-primary:focus, .btn-primary.focus {
+ background-color: var(--nv-green-darken);
+ border-color: var(--nv-green-darken);
+ -webkit-box-shadow: 0 0 0 0.2rem rgba(147, 173, 102, 0.5);
+ box-shadow: 0 0 0 0.2rem rgba(147, 173, 102, 0.5);
+}
+.btn-primary.disabled, .btn-primary:disabled {
+ background-color: var(--nv-green);
+ border-color: var(--nv-green);
+}
+.btn-primary:not(:disabled):not(.disabled):active, .btn-primary:not(:disabled):not(.disabled).active,
+.show > .btn-primary.dropdown-toggle {
+ background-color: var(--nv-green-darken);
+ border-color: var(--nv-green-darken);
+}
+.btn-primary:not(:disabled):not(.disabled):active:focus, .btn-primary:not(:disabled):not(.disabled).active:focus,
+.show > .btn-primary.dropdown-toggle:focus {
+ -webkit-box-shadow: 0 0 0 0.2rem rgba(147, 173, 102, 0.5);
+ box-shadow: 0 0 0 0.2rem rgba(147, 173, 102, 0.5);
+}
+.btn-secondary {
+ background-color: var(--medium-gray);
+ border-color: var(--medium-gray);
+}
+.btn-secondary:hover {
+ background-color: var(--medium-gray-darken);
+ border-color: var(--medium-gray-darken);
+}
+.btn-secondary:focus, .btn-secondary.focus {
+ background-color: var(--medium-gray-darken);
+ border-color: var(--medium-gray-darken);
+ -webkit-box-shadow: 0 0 0 0.2rem rgba(140, 140, 140, 0.5);
+ box-shadow: 0 0 0 0.2rem rgba(140, 140, 140, 0.5);
+}
+.btn-secondary.disabled, .btn-secondary:disabled {
+ background-color: var(--medium-gray);
+ border-color: var(--medium-gray);
+}
+.btn-secondary:not(:disabled):not(.disabled):active, .btn-secondary:not(:disabled):not(.disabled).active,
+.show > .btn-secondary.dropdown-toggle {
+ background-color: var(--medium-gray-darken);
+ border-color: var(--medium-gray-darken);
+}
+.btn-secondary:not(:disabled):not(.disabled):active:focus, .btn-secondary:not(:disabled):not(.disabled).active:focus,
+.show > .btn-secondary.dropdown-toggle:focus {
+ -webkit-box-shadow: 0 0 0 0.2rem rgba(140, 140, 140, 0.5);
+ box-shadow: 0 0 0 0.2rem rgba(140, 140, 140, 0.5);
+}
+.btn-link {
+ color: var(--nv-green);
+ text-decoration-line: none;
+}
+.btn-link:hover {
+ color: var(--nv-green-darken);
+}
+.btn-link:focus, .btn-link.focus {
+ color: var(--nv-green-darken);
+ -webkit-box-shadow: 0 0 0 0.2rem rgba(147, 173, 102, 0.5);
+ box-shadow: 0 0 0 0.2rem rgba(147, 173, 102, 0.5);
+}
+.link-primary {
+ color: var(--nv-green);
+}
+.link-primary:hover {
+ color: var(--nv-green-darken);
+}
+
+/* Riva ASR Styles */
+#riva-upload-label {
+ margin-top: 0.5rem;
+}
+
+/* Riva TTS Styles */
+.tts-control {
+ justify-content: space-between;
+ align-items: center;
+}
+
+.tts-control > p {
+ margin: unset;
+}
+
+#riva-tts-field {
+ resize: none;
+ border: unset;
+ padding: 0;
+ height: 100%;
+ font-size: 1.0rem;
+}
+
+#riva-terms-of-use p {
+ max-width: 620px;
+}
+
+/* Media Queries */
+@media (max-width: 1024px) {
+
+ /* Riva TTS and ASR */
+ .scrollable {
+ height: 250px;
+ }
+}
+
diff --git a/docs/_static/logo_2color_horizontal.svg b/docs/_static/logo_2color_horizontal.svg
new file mode 100644
index 0000000000..5ab0442d32
--- /dev/null
+++ b/docs/_static/logo_2color_horizontal.svg
@@ -0,0 +1,2 @@
+
+
diff --git a/docs/_static/logo_2color_vertical.svg b/docs/_static/logo_2color_vertical.svg
new file mode 100644
index 0000000000..69e64b7001
--- /dev/null
+++ b/docs/_static/logo_2color_vertical.svg
@@ -0,0 +1,2 @@
+
+
diff --git a/docs/_static/nvidia-logo-horiz-rgb-blk-for-screen.png b/docs/_static/nvidia-logo-horiz-rgb-blk-for-screen.png
new file mode 100644
index 0000000000..6316a9340f
--- /dev/null
+++ b/docs/_static/nvidia-logo-horiz-rgb-blk-for-screen.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dd57ffce985e08c97c6af5fdadd2a28e4a92996455edc2d0598dd964cca51eae
+size 48928
diff --git a/docs/_static/nvidia-logo-vert-rgb-blk-for-screen.png b/docs/_static/nvidia-logo-vert-rgb-blk-for-screen.png
new file mode 100644
index 0000000000..5546c1b57d
--- /dev/null
+++ b/docs/_static/nvidia-logo-vert-rgb-blk-for-screen.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:17a25111e145aa52b77ec5a89eb3b0c7d9a2a90dea25a0bb867a937514fc783c
+size 63541
diff --git a/docs/_static/rtd-data.js b/docs/_static/rtd-data.js
new file mode 100644
index 0000000000..7ed13e8ee0
--- /dev/null
+++ b/docs/_static/rtd-data.js
@@ -0,0 +1,36 @@
+/*
+# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+// Dummy data for testing ReadTheDocs footer insertion
+// This mimics RTD data for a project that uses both versions + languages
+var READTHEDOCS_DATA = {
+ project: "frc-docs",
+ version: "latest",
+ language: "en",
+ proxied_api_host: "https://readthedocs.org",
+};
diff --git a/docs/_templates/layout.html b/docs/_templates/layout.html
new file mode 100644
index 0000000000..570aba8ba3
--- /dev/null
+++ b/docs/_templates/layout.html
@@ -0,0 +1,31 @@
+
+{% extends "!layout.html" %}
+{%- block footer %}
+
+{%- endblock %}
diff --git a/docs/architecture.rst b/docs/architecture.rst
deleted file mode 100644
index 75da5b0574..0000000000
--- a/docs/architecture.rst
+++ /dev/null
@@ -1,118 +0,0 @@
-..
- # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions
- # are met:
- # * Redistributions of source code must retain the above copyright
- # notice, this list of conditions and the following disclaimer.
- # * Redistributions in binary form must reproduce the above copyright
- # notice, this list of conditions and the following disclaimer in the
- # documentation and/or other materials provided with the distribution.
- # * Neither the name of NVIDIA CORPORATION nor the names of its
- # contributors may be used to endorse or promote products derived
- # from this software without specific prior written permission.
- #
- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
- # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
- # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-Architecture
-============
-
-The following figure shows the TensorRT Inference Server high-level
-architecture. The :ref:`model repository `
-is a file-system based store of the models that TRTIS will make
-available for inferencing. Inference requests arrive at the server via
-either :ref:`HTTP or GRPC ` and are then
-routed to the appropriate per-model scheduler queue. The scheduler
-performs fair scheduling and dynamic batching for each model’s
-requests. The schedule passes each request to the framework backend
-corresponding to the model type. The framework backend performs
-inferencing using the inputs provided in the request to produce the
-requested outputs. The outputs are then formatted and a response is
-sent.
-
-.. image:: images/arch.png
-
-.. _section-concurrent-model-execution:
-
-Concurrent Model Execution
---------------------------
-
-The TRTIS architecture allows multiple models and/or multiple
-instances of the same model to execute in parallel on a single
-GPU. The following figure shows an example with two models; model0 and
-model1. Assuming TRTIS is not currently processing any request, when
-two requests arrive simultaneously, one for each model, TRTIS
-immediately schedules both of them onto the GPU and the GPU’s hardware
-scheduler begins working on both computations in parallel.
-
-.. image:: images/multi_model_exec.png
-
-By default, if multiple requests for the same model arrive at the same
-time, TRTIS will serialize their execution by scheduling only one at a
-time on the GPU, as shown in the following figure.
-
-.. image:: images/multi_model_serial_exec.png
-
-The TensorRT inference server provides an :ref:`instance-group
-` feature that allows each model to specify
-how many parallel executions of that model should be allowed. Each
-such enabled parallel execution is referred to as an *execution
-instance*. By default, TRTIS gives each model a single execution
-instance, which means that only a single execution of the model is
-allowed to be in progress at a time as shown in the above figure. By
-using instance-group the number of execution instances for a model can
-be increased. The following figure shows model execution when model1
-is configured to allow three execution instances. As shown in the
-figure, the first three model1 inference requests are immediately
-executed in parallel on the GPU. The fourth model1 inference request
-must wait until one of the first three executions completes before
-beginning.
-
-.. image:: images/multi_model_parallel_exec.png
-
-To provide the current model execution capabilities shown in the above
-figures, TRTIS uses `CUDA streams
-`_
-to exploit the GPU’s hardware scheduling capabilities. CUDA streams
-allow TRTIS to communicate independent sequences of memory-copy and
-kernel executions to the GPU. The hardware scheduler in the GPU takes
-advantage of the independent execution streams to fill the GPU with
-independent memory-copy and kernel executions. For example, using
-streams allows the GPU to execute a memory-copy for one model, a
-kernel for another model, and a different kernel for yet another model
-at the same time.
-
-The following figure shows some details of how this works within the
-TensorRT Inference Server. Each framework backend (TensorRT,
-TensorFlow, Caffe2) provides an API for creating an execution context
-that is used to execute a given model (each framework uses different
-terminology for this concept but here we refer to them generally as
-execution contexts). Each framework allows an execution context to be
-associated with a CUDA stream. This CUDA stream is used by the
-framework to execute all memory copies and kernels needed for the
-model associated with the execution context. For a given model, TRTIS
-creates one execution context for each execution instance specified
-for the model. When an inference request arrives for a given model,
-that request is queued in the model scheduler associated with that
-model. The model scheduler waits for any execution context associated
-with that model to be idle and then sends the queued request to the
-context. The execution context then issues all the memory copies and
-kernel executions required to execute the model to the CUDA stream
-associated with that execution context. The memory copies and kernels
-in each CUDA stream are independent of memory copies and kernels in
-other CUDA streams. The GPU hardware scheduler looks across all CUDA
-streams to find independent memory copies and kernels to execute on
-the GPU.
-
-.. image:: images/cuda_stream_exec.png
diff --git a/docs/build.rst b/docs/build.rst
deleted file mode 100644
index 9e9d7f1ce5..0000000000
--- a/docs/build.rst
+++ /dev/null
@@ -1,121 +0,0 @@
-..
- # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions
- # are met:
- # * Redistributions of source code must retain the above copyright
- # notice, this list of conditions and the following disclaimer.
- # * Redistributions in binary form must reproduce the above copyright
- # notice, this list of conditions and the following disclaimer in the
- # documentation and/or other materials provided with the distribution.
- # * Neither the name of NVIDIA CORPORATION nor the names of its
- # contributors may be used to endorse or promote products derived
- # from this software without specific prior written permission.
- #
- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
- # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
- # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-Building
-========
-
-The TensorRT Inference Server is built using Docker and the TensorFlow
-and PyTorch containers from `NVIDIA GPU Cloud (NGC)
-`_. Before building you must install Docker
-and nvidia-docker and login to the NGC registry by following the
-instructions in :ref:`section-installing-prebuilt-containers`.
-
-.. _section-building-the-server:
-
-Building the Server
--------------------
-
-To build a release version of the TRTIS container, change directory to
-the root of the repo and issue the following command::
-
- $ docker build --pull -t tensorrtserver .
-
-Incremental Builds
-^^^^^^^^^^^^^^^^^^
-
-For typical development you will want to run the *build* container
-with your local repo’s source files mounted so that your local changes
-can be incrementally built. This is done by first building the
-*tensorrtserver_build* container::
-
- $ docker build --pull -t tensorrtserver_build --target trtserver_build .
-
-By mounting /path/to/tensorrtserver/src into the container at
-/workspace/src, changes to your local repo will be reflected in the
-container::
-
- $ nvidia-docker run -it --rm -v/path/to/tensorrtserver/src:/workspace/src tensorrtserver_build
-
-Within the container you can perform an incremental server build
-with::
-
- # cd /workspace
- # bazel build -c opt --config=cuda src/servers/trtserver
- # cp /workspace/bazel-bin/src/servers/trtserver /opt/tensorrtserver/bin/trtserver
-
-Similarly, within the container you can perform an incremental build
-of the C++ and Python client libraries and example executables with::
-
- # cd /workspace
- # bazel build -c opt --config=cuda src/clients/…
- # mkdir -p /opt/tensorrtserver/bin
- # cp bazel-bin/src/clients/c++/image_client /opt/tensorrtserver/bin/.
- # cp bazel-bin/src/clients/c++/perf_client /opt/tensorrtserver/bin/.
- # cp bazel-bin/src/clients/c++/simple_client /opt/tensorrtserver/bin/.
- # mkdir -p /opt/tensorrtserver/lib
- # cp bazel-bin/src/clients/c++/librequest.so /opt/tensorrtserver/lib/.
- # cp bazel-bin/src/clients/c++/librequest.a /opt/tensorrtserver/lib/.
- # mkdir -p /opt/tensorrtserver/pip
- # bazel-bin/src/clients/python/build_pip /opt/tensorrtserver/pip/.
-
-Some source changes seem to cause bazel to get confused and not
-correctly rebuild all required sources. You can force bazel to rebuild
-all of the TRTIS source without requiring a complete rebuild of the
-TensorFlow and Caffe2 components by doing the following before issuing
-the above build command::
-
- # rm -fr bazel-bin/src
-
-.. include:: client.rst
- :start-after: build-client-begin-marker-do-not-remove
- :end-before: build-client-end-marker-do-not-remove
-
-Building the Documentation
---------------------------
-
-The TRTIS documentation is found in the docs/ directory and is based
-on `Sphinx `_. `Doxygen
-`_ integrated with `Exhale
-`_ is used for C++ API
-docuementation.
-
-To build the docs install the required dependencies::
-
- $ apt-get update
- $ apt-get install -y --no-install-recommends doxygen
- $ pip install --upgrade sphinx sphinx-rtd-theme nbsphinx exhale
-
-To get the Python client library API docs the TensorRT Inference
-Server Python package must be installed::
-
- $ pip install --upgrade tensorrtserver-*.whl
-
-Then use Sphinx to build the documentation into the build/html
-directory::
-
- $ cd docs
- $ make clean html
diff --git a/docs/client.rst b/docs/client.rst
deleted file mode 100644
index 0335b32cbd..0000000000
--- a/docs/client.rst
+++ /dev/null
@@ -1,383 +0,0 @@
-..
- # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions
- # are met:
- # * Redistributions of source code must retain the above copyright
- # notice, this list of conditions and the following disclaimer.
- # * Redistributions in binary form must reproduce the above copyright
- # notice, this list of conditions and the following disclaimer in the
- # documentation and/or other materials provided with the distribution.
- # * Neither the name of NVIDIA CORPORATION nor the names of its
- # contributors may be used to endorse or promote products derived
- # from this software without specific prior written permission.
- #
- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
- # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
- # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-.. _section-client-libraries-and-examples:
-
-Client Libraries and Examples
-=============================
-
-The TRTIS *client libraries* make it easy to communicate with the
-TensorRT Inference Server from you C++ or Python application. Using
-these libraries you can send either HTTP or GRPC requests to TRTIS to
-check server status or health and to make inference requests.
-
-A couple of example applications show how to use the client libraries
-to perform image classification and to test performance:
-
-* C++ and Python versions of *image\_client*, an example application
- that uses the C++ or Python client library to execute image
- classification models on the TensorRT Inference Server.
-
-* Python version of *grpc\_image\_client*, an example application that
- is functionally equivalent to *image\_client* but that uses GRPC
- generated client code to communicate with TRTIS (instead of the
- client library).
-
-* C++ version of *perf\_client*, an example application that issues a
- large number of concurrent requests to TRTIS to measure latency and
- throughput for a given model. You can use this to experiment with
- different model configuration settings for your models.
-
-.. build-client-begin-marker-do-not-remove
-
-.. _section-building-the-client-libraries-and-examples:
-
-Building the Client Libraries and Examples
-------------------------------------------
-
-The provided Dockerfile can be used to build just the client libraries
-and examples. Issue the following command to build the C++ client
-library, C++ and Python examples, and a Python wheel file for the
-Python client library::
-
- $ docker build -t tensorrtserver_clients --target trtserver_build --build-arg "PYVER=" --build-arg "BUILD_CLIENTS_ONLY=1" .
-
-The -\\-build-arg setting PYVER is optional and can be used to set the
-Python version that you want the Python client library built for (the
-default is 3.5).
-
-After the build completes, the easiest way to extract the built
-libraries and examples from the docker image is to mount a host
-directory and then copy them out from within the container::
-
- $ docker run -it --rm -v/tmp:/tmp/host tensorrtserver_clients
- # cp /opt/tensorrtserver/bin/image_client /tmp/host/.
- # cp /opt/tensorrtserver/bin/perf_client /tmp/host/.
- # cp /opt/tensorrtserver/bin/simple_client /tmp/host/.
- # cp /opt/tensorrtserver/pip/tensorrtserver-*.whl /tmp/host/.
- # cp /opt/tensorrtserver/lib/librequest.* /tmp/host/.
-
-You can now access the files from /tmp on the host system. To run the
-C++ examples you must install some dependencies on your host system::
-
- $ apt-get install curl libcurl3-dev libopencv-dev libopencv-core-dev python-pil
-
-To run the Python examples you will need to additionally install the
-client whl file and some other dependencies::
-
- $ apt-get install python3 python3-pip
- $ pip3 install --user --upgrade tensorrtserver-*.whl pillow
-
-.. build-client-end-marker-do-not-remove
-
-.. _section-image_classification_example:
-
-Image Classification Example Application
-----------------------------------------
-
-The image classification example that uses the C++ client API is
-available at `src/clients/c++/image\_client.cc
-`_. The
-Python version of the image classification client is available at
-`src/clients/python/image\_client.py
-`_.
-
-To use image\_client (or image\_client.py) you must first have a
-running TRTIS that is serving one or more image classification
-models. The image\_client application requires that the model have a
-single image input and produce a single classification output. If you
-don't have a model repository with image classification models see
-:ref:`section-example-model-repository` for instructions on how to
-create one.
-
-Follow the instructions in :ref:`section-running-the-inference-server`
-to launch TRTIS using the model repository. Once the server is running
-you can use the image\_client application to send inference requests
-to the server. You can specify a single image or a directory holding
-images. Here we send a request for the resnet50_netdef model from the
-:ref:`example model repository ` for
-an image from the `qa/images
-`_
-directory::
-
- $ image_client -m resnet50_netdef -s INCEPTION qa/images/mug.jpg
- Request 0, batch size 1
- Image '../qa/images/mug.jpg':
- 504 (COFFEE MUG) = 0.723991
-
-The Python version of the application accepts the same command-line
-arguments::
-
- $ src/clients/python/image_client.py -m resnet50_netdef -s INCEPTION qa/images/mug.jpg
- Request 0, batch size 1
- Image '../qa/images/mug.jpg':
- 504 (COFFEE MUG) = 0.778078556061
-
-The image\_client and image\_client.py applications use the TRTIS
-client library to talk to the server. By default image\_client
-instructs the client library to use HTTP protocol to talk to TRTIS,
-but you can use GRPC protocol by providing the \-i flag. You must also
-use the \-u flag to point at the GRPC endpoint on TRTIS::
-
- $ image_client -i grpc -u localhost:8001 -m resnet50_netdef -s INCEPTION qa/images/mug.jpg
- Request 0, batch size 1
- Image '../qa/images/mug.jpg':
- 504 (COFFEE MUG) = 0.723991
-
-By default the client prints the most probable classification for the
-image. Use the \-c flag to see more classifications::
-
- $ image_client -m resnet50_netdef -s INCEPTION -c 3 qa/images/mug.jpg
- Request 0, batch size 1
- Image '../qa/images/mug.jpg':
- 504 (COFFEE MUG) = 0.723991
- 968 (CUP) = 0.270953
- 967 (ESPRESSO) = 0.00115996
-
-The \-b flag allows you to send a batch of images for inferencing.
-The image\_client application will form the batch from the image or
-images that you specified. If the batch is bigger than the number of
-images then image\_client will just repeat the images to fill the
-batch::
-
- $ image_client -m resnet50_netdef -s INCEPTION -c 3 -b 2 qa/images/mug.jpg
- Request 0, batch size 2
- Image '../qa/images/mug.jpg':
- 504 (COFFEE MUG) = 0.778078556061
- 968 (CUP) = 0.213262036443
- 967 (ESPRESSO) = 0.00293014757335
- Image '../qa/images/mug.jpg':
- 504 (COFFEE MUG) = 0.778078556061
- 968 (CUP) = 0.213262036443
- 967 (ESPRESSO) = 0.00293014757335
-
-Provide a directory instead of a single image to perform inferencing
-on all images in the directory::
-
- $ image_client -m resnet50_netdef -s INCEPTION -c 3 -b 2 qa/images
- Request 0, batch size 2
- Image '../qa/images/car.jpg':
- 817 (SPORTS CAR) = 0.836187
- 511 (CONVERTIBLE) = 0.0708251
- 751 (RACER) = 0.0597549
- Image '../qa/images/mug.jpg':
- 504 (COFFEE MUG) = 0.723991
- 968 (CUP) = 0.270953
- 967 (ESPRESSO) = 0.00115996
- Request 1, batch size 2
- Image '../qa/images/vulture.jpeg':
- 23 (VULTURE) = 0.992326
- 8 (HEN) = 0.00231854
- 84 (PEACOCK) = 0.00201471
- Image '../qa/images/car.jpg':
- 817 (SPORTS CAR) = 0.836187
- 511 (CONVERTIBLE) = 0.0708251
- 751 (RACER) = 0.0597549
-
-The grpc\_image\_client.py application at available at
-`src/clients/python/grpc\_image\_client.py
-`_
-behaves the same as the image\_client except that instead of using the
-TRTIS client library it uses the GRPC generated client library to
-communicate with TRTIS.
-
-Performance Example Application
--------------------------------
-
-The perf\_client example application located at
-`src/clients/c++/perf\_client.cc
-`_
-uses the C++ client API to send concurrent requests to TRTIS to
-measure latency and inferences per second under varying client loads.
-
-To use perf\_client you must first have a running TRTIS that is
-serving one or more models. The perf\_client application works with
-any type of model by sending random data for all input tensors and by
-reading and ignoring all output tensors. If you don't have a model
-repository see :ref:`section-example-model-repository` for
-instructions on how to create one.
-
-Follow the instructions in :ref:`section-running-the-inference-server`
-to launch TRTIS using the model repository.
-
-The perf\_client application has two major modes. In the first mode
-you specify how many concurrent clients you want to simulate and
-perf\_client finds a stable latency and inferences/second for that
-level of concurrency. Use the \-t flag to control concurrency and \-v
-to see verbose output. The following example simulates four clients
-continuously sending requests to TRTIS::
-
- $ perf_client -m resnet50_netdef -p3000 -t4 -v
- *** Measurement Settings ***
- Batch size: 1
- Measurement window: 3000 msec
-
- Request concurrency: 4
- Pass [1] throughput: 207 infer/sec. Avg latency: 19268 usec (std 910 usec)
- Pass [2] throughput: 206 infer/sec. Avg latency: 19362 usec (std 941 usec)
- Pass [3] throughput: 208 infer/sec. Avg latency: 19252 usec (std 841 usec)
- Client:
- Request count: 624
- Throughput: 208 infer/sec
- Avg latency: 19252 usec (standard deviation 841 usec)
- Avg HTTP time: 19224 usec (send 714 usec + response wait 18486 usec + receive 24 usec)
- Server:
- Request count: 749
- Avg request latency: 17886 usec (overhead 55 usec + queue 26 usec + compute 17805 usec)
-
-In the second mode perf\_client will generate an inferences/second
-vs. latency curve by increasing concurrency until a specific latency
-limit or concurrency limit is reached. This mode is enabled by using
-the \-d option and \-l to specify the latency limit and optionally the
-\-c to specify a maximum concurrency limit::
-
- $ perf_client -m resnet50_netdef -p3000 -d -l50 -c 3
- *** Measurement Settings ***
- Batch size: 1
- Measurement window: 3000 msec
- Latency limit: 50 msec
- Concurrency limit: 3 concurrent requests
-
- Request concurrency: 1
- Client:
- Request count: 327
- Throughput: 109 infer/sec
- Avg latency: 9191 usec (standard deviation 822 usec)
- Avg HTTP time: 9188 usec (send/recv 1007 usec + response wait 8181 usec)
- Server:
- Request count: 391
- Avg request latency: 7661 usec (overhead 90 usec + queue 68 usec + compute 7503 usec)
-
- Request concurrency: 2
- Client:
- Request count: 521
- Throughput: 173 infer/sec
- Avg latency: 11523 usec (standard deviation 616 usec)
- Avg HTTP time: 11448 usec (send/recv 711 usec + response wait 10737 usec)
- Server:
- Request count: 629
- Avg request latency: 10018 usec (overhead 70 usec + queue 41 usec + compute 9907 usec)
-
- Request concurrency: 3
- Client:
- Request count: 580
- Throughput: 193 infer/sec
- Avg latency: 15518 usec (standard deviation 635 usec)
- Avg HTTP time: 15487 usec (send/recv 779 usec + response wait 14708 usec)
- Server:
- Request count: 697
- Avg request latency: 14083 usec (overhead 59 usec + queue 30 usec + compute 13994 usec)
-
- Inferences/Second vs. Client Average Batch Latency
- Concurrency: 1, 109 infer/sec, latency 9191 usec
- Concurrency: 2, 173 infer/sec, latency 11523 usec
- Concurrency: 3, 193 infer/sec, latency 15518 usec
-
-Use the \-f flag to generate a file containing CSV output of the
-results::
-
- $ perf_client -m resnet50_netdef -p3000 -d -l50 -c 3 -f perf.csv
-
-You can then import the CSV file into a spreadsheet to help visualize
-the latency vs inferences/second tradeoff as well as see some
-components of the latency. Follow these steps:
-
-- Open `this spreadsheet `_
-- Make a copy from the File menu "Make a copy..."
-- Open the copy
-- Select the A2 cell
-- From the File menu select "Import..."
-- Select "Upload" and upload the file
-- Select "Replace data at selected cell" and then select the "Import data" button
-
-.. _section-client-api:
-
-Client API
-----------
-
-The C++ client API exposes a class-based interface for querying server
-and model status and for performing inference. The commented interface
-is available at `src/clients/c++/request.h
-`_
-and in the API Reference.
-
-The Python client API provides similar capabilities as the C++
-API. The commented interface is available at
-`src/clients/python/\_\_init\_\_.py
-`_
-and in the API Reference.
-
-A very simple C++ example application at
-`src/clients/c++/simple\_client.cc
-`_
-and a Python version at `src/clients/python/simple\_client.py
-`_
-demonstrate basic client API usage.
-
-To run the the C++ version of the simple example, first build as
-described in :ref:`section-building-the-client-libraries-and-examples`
-and then::
-
- $ simple_client
- 0 + 1 = 1
- 0 - 1 = -1
- 1 + 1 = 2
- 1 - 1 = 0
- 2 + 1 = 3
- 2 - 1 = 1
- 3 + 1 = 4
- 3 - 1 = 2
- 4 + 1 = 5
- 4 - 1 = 3
- 5 + 1 = 6
- 5 - 1 = 4
- 6 + 1 = 7
- 6 - 1 = 5
- 7 + 1 = 8
- 7 - 1 = 6
- 8 + 1 = 9
- 8 - 1 = 7
- 9 + 1 = 10
- 9 - 1 = 8
- 10 + 1 = 11
- 10 - 1 = 9
- 11 + 1 = 12
- 11 - 1 = 10
- 12 + 1 = 13
- 12 - 1 = 11
- 13 + 1 = 14
- 13 - 1 = 12
- 14 + 1 = 15
- 14 - 1 = 13
- 15 + 1 = 16
- 15 - 1 = 14
-
-To run the the Python version of the simple example, first build as
-described in :ref:`section-building-the-client-libraries-and-examples`
-and install the tensorrtserver whl, then::
-
- $ python src/clients/python/simple_client.py
diff --git a/docs/conf.py b/docs/conf.py
old mode 100644
new mode 100755
index 917cb7ea71..564ff3e1af
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -1,4 +1,6 @@
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#!/usr/bin/env python3
+
+# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -24,13 +26,11 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-# -*- coding: utf-8 -*-
-#
# Configuration file for the Sphinx documentation builder.
#
-# This file does only contain a selection of the most common options. For a
-# full list see the documentation:
-# http://www.sphinx-doc.org/en/master/config
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
@@ -38,223 +38,242 @@
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
-# import os
-# import sys
-# sys.path.insert(0, os.path.abspath('..'))
-from builtins import str
import os
-import re
-import sphinx_rtd_theme
-import subprocess
-import textwrap
-
-# -- Project information -----------------------------------------------------
-
-project = u'NVIDIA TensorRT Inference Server'
-copyright = u'2018, NVIDIA Corporation'
-author = u'NVIDIA Corporation'
-
-version_long = u'0.0.0'
-with open("../VERSION") as f:
- version_long = f.readline()
-version_short = re.match('^[\d]+\.[\d]+', version_long).group(0)
+from docutils import nodes
+from sphinx import search
-git_sha = os.getenv("GIT_SHA")
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
-if not git_sha:
- try:
- git_sha = subprocess.check_output(["git", "log", "--pretty=format:'%h'", "-n1"]).decode('ascii').replace("'","").strip()
- except:
- git_sha = u'0000000'
+# -- Project information -----------------------------------------------------
-git_sha = git_sha[:7] if len(git_sha) > 7 else git_sha
+project = "NVIDIA Triton Inference Server"
+copyright = "2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved"
+author = "NVIDIA"
-version = str(version_long + u"-" + git_sha)
# The full version, including alpha/beta/rc tags
-release = str(version_long)
+# Env only set during riva-release process, otherwise keep as dev for all internal builds
+release = os.getenv("TRITON_VERSION", "dev")
-# hack: version is used for html creation, so put the version picker
-# link here as well:
-version = version + """
-Version select: """
+# maintain left-side bar toctrees in `contents` file
+# so it doesn't show up needlessly in the index page
+master_doc = "contents"
# -- General configuration ---------------------------------------------------
-# If your documentation needs a minimal Sphinx version, state it here.
-#
-# needs_sphinx = '1.0'
-
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
- 'sphinx.ext.autodoc',
- 'sphinx.ext.mathjax',
- 'sphinx.ext.napoleon',
- 'sphinx.ext.ifconfig',
- 'sphinx.ext.extlinks',
- 'nbsphinx',
- 'breathe',
- 'exhale'
+ "ablog",
+ "myst_parser",
+ "sphinx_copybutton",
+ "sphinx_design",
+ "sphinx-prompt",
+ # "sphinxcontrib.bibtex",
+ "sphinx_tabs.tabs",
+ "sphinx_sitemap",
+ "sphinx.ext.autodoc",
+ "sphinx.ext.autosummary",
+ "sphinx.ext.mathjax",
+ "sphinx.ext.napoleon",
+ "sphinx.ext.ifconfig",
+ "sphinx.ext.extlinks",
]
-# Add any paths that contain templates here, relative to this directory.
-templates_path = ['templates']
-
-# The suffix(es) of source filenames.
-# You can specify multiple suffix as a list of string:
-#
-# source_suffix = ['.rst', '.md']
-source_suffix = '.rst'
-
-# The master toctree document.
-master_doc = 'index'
-
-# The language for content autogenerated by Sphinx. Refer to documentation
-# for a list of supported languages.
-#
-# This is also used if you do content translation via gettext catalogs.
-# Usually you set "language" from the command line for these cases.
-language = None
-
-# List of patterns, relative to source directory, that match files and
-# directories to ignore when looking for source files.
-# This pattern also affects html_static_path and html_extra_path .
-exclude_patterns = [u'build', 'Thumbs.db', '.DS_Store', '**.ipynb_checkpoints']
+suppress_warnings = ["myst.domains", "ref.ref", "myst.header"]
-# The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
+source_suffix = [".rst", ".md"]
-# Setup the breathe extension
-breathe_projects = {
- "BreatheTRTIS": "./doxyoutput/xml"
-}
-breathe_default_project = "BreatheTRTIS"
-
-# Setup the exhale extension
-exhale_args = {
- # These arguments are required
- "containmentFolder": "./cpp_api",
- "rootFileName": "cpp_api_root.rst",
- "rootFileTitle": "C++ API",
- "doxygenStripFromPath": "..",
- # Suggested optional arguments
- "createTreeView": True,
- # TIP: if using the sphinx-bootstrap-theme, you need
- # "treeViewIsBootstrap": True,
- "exhaleExecutesDoxygen": True,
- "exhaleDoxygenStdin": textwrap.dedent('''
- JAVADOC_AUTOBRIEF = YES
- INPUT = ../src/clients/c++/request.h
- ''')
+autodoc_default_options = {
+ "members": True,
+ "undoc-members": True,
+ "private-members": True,
}
-# Tell sphinx what the primary language being documented is.
-#primary_domain = 'cpp'
+autosummary_generate = True
+autosummary_mock_imports = [
+ "tritonclient.grpc.model_config_pb2",
+ "tritonclient.grpc.service_pb2",
+ "tritonclient.grpc.service_pb2_grpc",
+]
-# Tell sphinx what the pygments highlight language should be.
-highlight_language = 'text'
+napoleon_include_special_with_doc = True
+
+numfig = True
+
+# final location of docs for seo/sitemap
+html_baseurl = (
+ "https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/"
+)
+
+myst_enable_extensions = [
+ "dollarmath",
+ "amsmath",
+ "deflist",
+ # "html_admonition",
+ "html_image",
+ "colon_fence",
+ # "smartquotes",
+ "replacements",
+ # "linkify",
+ "substitution",
+]
+myst_heading_anchors = 5
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ["_templates"]
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = ["README.md", "examples/README.md", "user_guide/perf_analyzer.md"]
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
-html_theme = 'sphinx_rtd_theme'
-html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
-
-# Theme options are theme-specific and customize the look and feel of a theme
-# further. For a list of options available for each theme, see the
-# documentation.
-#
-html_theme_options = {
- 'canonical_url': 'https://docs.nvidia.com/deeplearning/sdk/tensorrt-inference-server-guide/docs/index.html',
- 'collapse_navigation': False,
- 'display_version': True,
- 'logo_only': False,
-}
+html_theme = "sphinx_book_theme"
+html_logo = "_static/nvidia-logo-horiz-rgb-blk-for-screen.png"
+html_title = "NVIDIA Triton Inference Server"
+html_short_title = "Triton"
+html_copy_source = True
+html_sourcelink_suffix = ""
+html_favicon = "_static/nvidia-logo-vert-rgb-blk-for-screen.png"
+html_last_updated_fmt = ""
+html_additional_files = ["index.html"]
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
-#html_static_path = ['_static']
-
-# Custom sidebar templates, must be a dictionary that maps document names
-# to template names.
-#
-# The default sidebars (for documents that don't match any pattern) are
-# defined by theme itself. Builtin themes are using these templates by
-# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
-# 'searchbox.html']``.
-#
-# html_sidebars = {}
-
-
-# -- Options for HTMLHelp output ---------------------------------------------
-
-# Output file base name for HTML help builder.
-htmlhelp_basename = 'NVIDIATRTISdoc'
-
-
-# -- Options for LaTeX output ------------------------------------------------
+html_static_path = ["_static"]
+html_css_files = ["custom.css"]
-latex_elements = {
- # The paper size ('letterpaper' or 'a4paper').
- #
- # 'papersize': 'letterpaper',
-
- # The font size ('10pt', '11pt' or '12pt').
- #
- # 'pointsize': '10pt',
-
- # Additional stuff for the LaTeX preamble.
- #
- # 'preamble': '',
-
- # Latex figure (float) alignment
- #
- # 'figure_align': 'htbp',
+html_theme_options = {
+ "path_to_docs": "docs",
+ # "launch_buttons": {
+ # "binderhub_url": "https://mybinder.org",
+ # "colab_url": "https://colab.research.google.com/",
+ # "deepnote_url": "https://deepnote.com/",
+ # "notebook_interface": "jupyterlab",
+ # "thebe": True,
+ # # "jupyterhub_url": "https://datahub.berkeley.edu", # For testing
+ # },
+ "use_edit_page_button": False,
+ "use_issues_button": True,
+ "use_repository_button": True,
+ "use_download_button": False,
+ "logo_only": False,
+ "show_toc_level": 2,
+ "extra_navbar": "",
+ "extra_footer": "",
+ "repository_url": "https://github.com/triton-inference-server/server",
+ "use_repository_button": True,
}
-# Grouping the document tree into LaTeX files. List of tuples
-# (source start file, target name, title,
-# author, documentclass [howto, manual, or own class]).
-latex_documents = [
- (master_doc, 'NVIDIATRTIS.tex', u'NVIDIA TensorRT Inference Server Documentation',
- u'NVIDIA Corporation', 'manual'),
-]
-
-
-# -- Options for manual page output ------------------------------------------
-
-# One entry per manual page. List of tuples
-# (source start file, name, description, authors, manual section).
-man_pages = [
- (master_doc, 'nvidiatrtis', u'NVIDIA TensorRT Inference Server Documentation',
- [author], 1)
-]
+version_short = release
+deploy_ngc_org = "nvidia"
+deploy_ngc_team = "triton"
+myst_substitutions = {
+ "VersionNum": version_short,
+ "deploy_ngc_org_team": f"{deploy_ngc_org}/{deploy_ngc_team}"
+ if deploy_ngc_team
+ else deploy_ngc_org,
+}
-# -- Options for Texinfo output ----------------------------------------------
+def ultimateReplace(app, docname, source):
+ result = source[0]
+ for key in app.config.ultimate_replacements:
+ result = result.replace(key, app.config.ultimate_replacements[key])
+ source[0] = result
-# Grouping the document tree into Texinfo files. List of tuples
-# (source start file, target name, title, author,
-# dir menu entry, description, category)
-texinfo_documents = [
- (master_doc, 'NVIDIATRTIS', u'NVIDIA TensorRT Inference Server Documentation',
- author, 'NVIDIATRTIS', 'One line description of project.',
- 'Miscellaneous'),
-]
+# this is a necessary hack to allow us to fill in variables that exist in code blocks
+ultimate_replacements = {
+ "{VersionNum}": version_short,
+ "{SamplesVersionNum}": version_short,
+ "{NgcOrgTeam}": f"{deploy_ngc_org}/{deploy_ngc_team}"
+ if deploy_ngc_team
+ else deploy_ngc_org,
+}
-# -- Extension configuration -------------------------------------------------
-extlinks = {'issue': ('https://github.com/NVIDIA/tensorrt-inference-server/issues/%s',
- 'issue '),
- 'fileref': ('https://github.com/NVIDIA/tensorrt-inference-server/tree/' +
- (git_sha if git_sha != u'0000000' else "master") + '/%s', ''),}
+# bibtex_bibfiles = ["references.bib"]
+# To test that style looks good with common bibtex config
+# bibtex_reference_style = "author_year"
+# bibtex_default_style = "plain"
+
+### We currently use Myst: https://myst-nb.readthedocs.io/en/latest/use/execute.html
+nb_execution_mode = "off" # Global execution disable
+# execution_excludepatterns = ['tutorials/tts-python-basics.ipynb'] # Individual notebook disable
+
+
+def setup(app):
+ app.add_config_value("ultimate_replacements", {}, True)
+ app.connect("source-read", ultimateReplace)
+ app.add_js_file("https://js.hcaptcha.com/1/api.js")
+
+ visitor_script = (
+ "//assets.adobedtm.com/5d4962a43b79/c1061d2c5e7b/launch-191c2462b890.min.js"
+ )
+
+ if visitor_script:
+ app.add_js_file(visitor_script)
+
+ # if not os.environ.get("READTHEDOCS") and not os.environ.get("GITHUB_ACTIONS"):
+ # app.add_css_file(
+ # "https://assets.readthedocs.org/static/css/readthedocs-doc-embed.css"
+ # )
+ # app.add_css_file("https://assets.readthedocs.org/static/css/badge_only.css")
+
+ # # Create the dummy data file so we can link it
+ # # ref: https://github.com/readthedocs/readthedocs.org/blob/bc3e147770e5740314a8e8c33fec5d111c850498/readthedocs/core/static-src/core/js/doc-embed/footer.js # noqa: E501
+ # app.add_js_file("rtd-data.js")
+ # app.add_js_file(
+ # "https://assets.readthedocs.org/static/javascript/readthedocs-doc-embed.js",
+ # priority=501,
+ # )
+
+
+# Patch for sphinx.search stemming short terms (i.e. tts -> tt)
+# https://github.com/sphinx-doc/sphinx/blob/4.5.x/sphinx/search/__init__.py#L380
+def sphinxSearchIndexFeed(
+ self, docname: str, filename: str, title: str, doctree: nodes.document
+):
+ """Feed a doctree to the index."""
+ self._titles[docname] = title
+ self._filenames[docname] = filename
+
+ visitor = search.WordCollector(doctree, self.lang)
+ doctree.walk(visitor)
+
+ # memoize self.lang.stem
+ def stem(word: str) -> str:
+ try:
+ return self._stem_cache[word]
+ except KeyError:
+ self._stem_cache[word] = self.lang.stem(word).lower()
+ return self._stem_cache[word]
+
+ _filter = self.lang.word_filter
+
+ for word in visitor.found_title_words:
+ stemmed_word = stem(word)
+ if len(stemmed_word) > 3 and _filter(stemmed_word):
+ self._title_mapping.setdefault(stemmed_word, set()).add(docname)
+ elif _filter(word): # stemmer must not remove words from search index
+ self._title_mapping.setdefault(word.lower(), set()).add(docname)
+
+ for word in visitor.found_words:
+ stemmed_word = stem(word)
+ # again, stemmer must not remove words from search index
+ if len(stemmed_word) <= 3 or not _filter(stemmed_word) and _filter(word):
+ stemmed_word = word.lower()
+ already_indexed = docname in self._title_mapping.get(stemmed_word, set())
+ if _filter(stemmed_word) and not already_indexed:
+ self._mapping.setdefault(stemmed_word, set()).add(docname)
+
+
+search.IndexBuilder.feed = sphinxSearchIndexFeed
diff --git a/docs/contents.md b/docs/contents.md
new file mode 100644
index 0000000000..cf5653340d
--- /dev/null
+++ b/docs/contents.md
@@ -0,0 +1,149 @@
+
+
+```{toctree}
+:maxdepth: 1
+:caption: Getting Started
+
+getting_started/quickstart
+```
+
+```{toctree}
+:maxdepth: 1
+:caption: User Guide
+
+user_guide/performance_tuning
+user_guide/architecture
+user_guide/model_repository
+customization_guide/repository_agents
+user_guide/model_configuration
+user_guide/request_cancellation
+user_guide/optimization
+user_guide/ragged_batching
+user_guide/rate_limiter
+user_guide/model_analyzer
+user_guide/model_management
+user_guide/custom_operations
+user_guide/decoupled_models
+user_guide/response_cache
+user_guide/metrics
+user_guide/trace
+user_guide/jetson
+user_guide/v1_to_v2
+customization_guide/deploy
+```
+
+```{toctree}
+:maxdepth: 1
+:caption: Debugging
+
+user_guide/debugging_guide
+user_guide/faq
+```
+
+```{toctree}
+:maxdepth: 1
+:caption: Protocol Guides
+
+protocol/README
+customization_guide/inference_protocols
+protocol/extension_binary_data
+protocol/extension_classification
+protocol/extension_generate
+protocol/extension_logging
+protocol/extension_model_configuration
+protocol/extension_model_repository
+protocol/extension_schedule_policy
+protocol/extension_sequence
+protocol/extension_shared_memory
+protocol/extension_statistics
+protocol/extension_trace
+protocol/extension_parameters
+```
+
+```{toctree}
+:maxdepth: 1
+:caption: Customization Guide
+
+customization_guide/build
+customization_guide/compose
+customization_guide/test
+```
+
+```{toctree}
+:maxdepth: 1
+:caption: Examples
+
+examples/jetson/README
+examples/jetson/concurrency_and_dynamic_batching/README
+```
+
+```{toctree}
+:maxdepth: 1
+:caption: Client
+
+client/README
+_reference/tritonclient_api.rst
+client/src/java/README
+client/src/grpc_generated/go/README
+client/src/grpc_generated/javascript/README
+client/src/grpc_generated/java/README
+```
+
+```{toctree}
+:maxdepth: 1
+:caption: Performance Analyzer
+
+client/src/c++/perf_analyzer/README
+client/src/c++/perf_analyzer/docs/README
+client/src/c++/perf_analyzer/docs/install
+client/src/c++/perf_analyzer/docs/quick_start
+client/src/c++/perf_analyzer/docs/cli
+client/src/c++/perf_analyzer/docs/inference_load_modes
+client/src/c++/perf_analyzer/docs/input_data
+client/src/c++/perf_analyzer/docs/measurements_metrics
+client/src/c++/perf_analyzer/docs/benchmarking
+client/src/c++/perf_analyzer/genai-perf/README
+client/src/c++/perf_analyzer/genai-perf/examples/tutorial
+```
+
+```{toctree}
+:maxdepth: 1
+:caption: Python Backend
+
+python_backend/README
+python_backend/inferentia/README
+python_backend/examples/auto_complete/README
+python_backend/examples/bls/README
+python_backend/examples/bls_decoupled/README
+python_backend/examples/custom_metrics/README
+python_backend/examples/decoupled/README
+python_backend/examples/instance_kind/README
+python_backend/examples/jax/README
+python_backend/examples/preprocessing/README
+```
diff --git a/docs/contribute.rst b/docs/contribute.rst
deleted file mode 100644
index edc1bc0e3a..0000000000
--- a/docs/contribute.rst
+++ /dev/null
@@ -1,45 +0,0 @@
-..
- # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
- #
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions
- # are met:
- # * Redistributions of source code must retain the above copyright
- # notice, this list of conditions and the following disclaimer.
- # * Redistributions in binary form must reproduce the above copyright
- # notice, this list of conditions and the following disclaimer in the
- # documentation and/or other materials provided with the distribution.
- # * Neither the name of NVIDIA CORPORATION nor the names of its
- # contributors may be used to endorse or promote products derived
- # from this software without specific prior written permission.
- #
- # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
- # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
- # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-Contributing
-============
-
-Contributions to TensorRT Inference Server are more than welcome. To
-contribute make a pull request and follow the guidelines outlined in
-the `CONTRIBUTING
-`_
-document.
-
-Coding Convention
------------------
-
-Use clang-format to format all source files (\*.h, \*.cc, \*.proto) to
-a consistent format. You should run clang-format on all source files
-before submitting a pull request::
-
- $ apt-get install clang-format clang-format-6.0
- $ clang-format-6.0 --style=file -i *.proto *.cc *.h
diff --git a/docs/customization_guide/build.md b/docs/customization_guide/build.md
new file mode 100644
index 0000000000..2f8b8f69d4
--- /dev/null
+++ b/docs/customization_guide/build.md
@@ -0,0 +1,521 @@
+
+
+# Building Triton
+
+This section describes how to build the Triton server from source. For
+information on building the Triton client libraries and examples see
+[Client Libraries and
+Examples](https://github.com/triton-inference-server/client). For
+information on building the Triton SDK container see [Build SDK
+Image](test.md#build-sdk-image). For information on testing your
+Triton build see [Testing Triton](test.md).
+
+You can create a customized Triton Docker image that contains a subset
+of the released backends without building from source. For example,
+you may want a Triton image that contains only the TensorRT and Python
+backends. For this type of customization you don't need to build
+Triton from source and instead can use [the *compose*
+utility](compose.md).
+
+The Triton source is distributed across multiple GitHub repositories
+that together can be built and installed to create a complete Triton
+installation. Triton server is built using CMake and (optionally)
+Docker. To simplify the build process, Triton provides a
+[build.py](https://github.com/triton-inference-server/server/blob/main/build.py) script.
+The build.py script will generate the CMake and Docker build steps required to
+build Triton, and will optionally invoke those steps or leave the invocation to
+you, as described below.
+
+The build.py script currently supports building Triton for the
+following platforms. See [Building on Unsupported
+Platforms](#building-on-unsupported-platforms) if you are attempting
+to build Triton on a platform that is not listed here.
+
+* [Ubuntu 22.04, x86-64](#building-for-ubuntu-2204)
+
+* [Jetpack 4.x, NVIDIA Jetson (Xavier, Nano, TX2)](#building-for-jetpack-4x)
+
+* [Windows 10, x86-64](#building-for-windows-10)
+
+If you are developing or debugging Triton, see [Development and
+Incremental Builds](#development-and-incremental-builds) for information
+on how to perform incremental builds.
+
+## Building for Ubuntu 22.04
+
+For Ubuntu-22.04, build.py supports both a Docker build and a
+non-Docker build.
+
+* [Build using Docker](#building-with-docker) and the TensorFlow and PyTorch
+ Docker images from [NVIDIA GPU Cloud (NGC)](https://ngc.nvidia.com).
+
+* [Build without Docker](#building-without-docker).
+
+### Building With Docker
+
+The easiest way to build Triton is to use Docker. The result of the
+build will be a Docker image called *tritonserver* that will contain
+the tritonserver executable in /opt/tritonserver/bin and the required
+shared libraries in /opt/tritonserver/lib. The backends and
+repository-agents built for Triton will be in
+/opt/tritonserver/backends and /opt/tritonserver/repoagents,
+respectively.
+
+The first step for the build is to clone the
+[triton-inference-server/server](https://github.com/triton-inference-server/server)
+repo branch for the release you are interested in building (or the
+*main* branch to build from the development branch). Then run build.py
+as described below. The build.py script performs these steps when
+building with Docker.
+
+* In the *build* subdirectory of the server repo, generate the
+ docker_build script, the cmake_build script and the Dockerfiles
+ needed to build Triton. If you use the --dryrun flag, build.py will
+ stop here so that you can examine these files.
+
+* Run the docker_build script to perform the Docker-based build. The
+ docker_build script performs the following steps.
+
+ * Build the *tritonserver_buildbase* Docker image that collects all
+ the build dependencies needed to build Triton. The
+ *tritonserver_buildbase* image is based on a minimal/base
+ image. When building with GPU support (--enable-gpu), the *min*
+ image is the
+ [\<xx.yy\>-py3-min](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver)
+ image pulled from [NGC](https://ngc.nvidia.com) that contains the
+ CUDA, cuDNN, TensorRT and other dependencies that are required to
+ build Triton. When building without GPU support, the *min* image
+ is the standard ubuntu:22.04 image.
+
+ * Run the cmake_build script within the *tritonserver_buildbase*
+ image to actually build Triton. The cmake_build script performs
+ the following steps.
+
+ * Invoke CMake in the server repo to build Triton's core shared
+ library and *tritonserver* executable.
+
+ * Clone each requested backend and build it using CMake. For
+ example, the ONNX Runtime backend is built using
+ [triton-inference-server/onnxruntime_backend/CMakeLists.txt](https://github.com/triton-inference-server/onnxruntime_backend/blob/main/CMakeLists.txt). Some
+ of the backends may use Docker as part of their build (for
+ example [ONNX
+ Runtime](https://github.com/triton-inference-server/onnxruntime_backend)
+ and
+ [OpenVINO](https://github.com/triton-inference-server/openvino_backend)). If
+ you don't want to use Docker in those cases you must consult the
+ build process for those backends.
+
+ * Clone each repository agent and build it using the CMake file
+ from the corresponding repo. For example, the
+ [Checksum](https://github.com/triton-inference-server/checksum_repository_agent)
+ repository agent is built using
+ [triton-inference-server/checksum_repository_agent/CMakeLists.txt](https://github.com/triton-inference-server/checksum_repository_agent/blob/main/CMakeLists.txt).
+
+ * Copy the built artifacts out of the container and into the build
+ subdirectory on the host system.
+
+ * Create the final *tritonserver* Docker image that contains the
+ libraries, executables and other artifacts from the build.
+
+ * Create a *tritonserver_cibase* Docker image that contains the QA
+ artifacts needed for testing, as described in [Testing
+ Triton](test.md).
+
+By default, build.py does not enable any of Triton's optional features
+but you can enable all features, backends, and repository agents with
+the --enable-all flag. The -v flag turns on verbose output.
+
+```bash
+$ ./build.py -v --enable-all
+```
+
+If you want to enable only certain Triton features, backends and
+repository agents, do not specify --enable-all. Instead you must
+specify the individual flags as documented by --help.
+
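+For example, a hypothetical reduced build that keeps logging, statistics,
+tracing, GPU support, both endpoints, and a small set of backends might look
+like the following sketch; the flag names come from `./build.py --help`, and
+the selection shown is illustrative rather than a recommendation.
+
+```bash
+# Illustrative subset build -- adjust the feature, backend and repoagent
+# selection to match your deployment.
+./build.py -v \
+    --enable-logging --enable-stats --enable-tracing --enable-gpu \
+    --endpoint=http --endpoint=grpc \
+    --backend=ensemble --backend=tensorrt --backend=python \
+    --repoagent=checksum
+```
+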
+#### Building With Specific GitHub Branches
+
+As described above, the build is performed in the server repo, but
+source from several other repos is fetched during the build
+process. Typically you do not need to specify anything about these
+other repos, but if you want to control which branch is used in these
+other repos you can as shown in the following example.
+
+```bash
+$ ./build.py ... --repo-tag=common:<container tag> --repo-tag=core:<container tag> --repo-tag=backend:<container tag> --repo-tag=thirdparty:<container tag> ... --backend=tensorrt:<container tag> ... --repoagent=checksum:<container tag> ...
+```
+
+If you are building on a release branch then `<container tag>` will
+default to the branch name. For example, if you are building on the
+r24.03 branch, `<container tag>` will default to r24.03. If you are
+building on any other branch (including the *main* branch) then
+`<container tag>` will default to "main". Therefore, you typically do
+not need to provide `<container tag>` at all (nor the preceding
+colon). You can use a different `<container tag>` for a component to
+instead use the corresponding branch/tag in the build. For example, if
+you have a branch called "mybranch" in the
+[onnxruntime_backend](https://github.com/triton-inference-server/onnxruntime_backend)
+repo that you want to use in the build, you would specify
+--backend=onnxruntime:mybranch.
+
+#### CPU-Only Build
+
+If you want to build without GPU support you must specify individual
+feature flags and not include the `--enable-gpu` and
+`--enable-gpu-metrics` flags. Only the following backends are
+available for a non-GPU / CPU-only build: `identity`, `repeat`, `ensemble`,
+`square`, `tensorflow2`, `pytorch`, `onnxruntime`, `openvino`,
+`python` and `fil`.
+
+To include the TensorFlow2 backend in your CPU-only build, you must
+provide this additional flag to build.py:
+`--extra-backend-cmake-arg=tensorflow2:TRITON_TENSORFLOW_INSTALL_EXTRA_DEPS=ON`.
+
+CPU-only builds of the TensorFlow and PyTorch backends require some CUDA stubs
+and runtime dependencies that are not present in the CPU-only base container.
+These are retrieved from a GPU base container, which can be changed with the
+`--image=gpu-base,nvcr.io/nvidia/tritonserver:<xx.yy>-py3-min` flag.
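+
+As a rough sketch (the backend selection is illustrative and drawn from the
+CPU-only list above), such a build might be invoked as:
+
+```bash
+# Hypothetical CPU-only build: note the absence of --enable-gpu and
+# --enable-gpu-metrics.
+./build.py -v \
+    --enable-logging --enable-stats \
+    --endpoint=http --endpoint=grpc \
+    --backend=ensemble --backend=onnxruntime --backend=openvino --backend=python
+```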
+
+### Building Without Docker
+
+To build Triton without using Docker you must install the build
+dependencies that are handled automatically when building with Docker.
+
+The first step for the build is to clone the
+[triton-inference-server/server](https://github.com/triton-inference-server/server)
+repo branch for the release you are interested in building (or the
+*main* branch to build from the development branch).
+
+To determine what dependencies are required by the build, run build.py
+with the --dryrun flag, and then look in the build subdirectory at
+Dockerfile.buildbase.
+
+```bash
+$ ./build.py -v --enable-all --dryrun
+```
+
+From Dockerfile.buildbase you can see what dependencies you need to
+install on your host system. Note that when building with --enable-gpu
+(or --enable-all), Dockerfile.buildbase depends on the
+[\<xx.yy\>-py3-min](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver)
+image pulled from [NGC](https://ngc.nvidia.com). Unfortunately, a
+Dockerfile is not currently available for the
+[\<xx.yy\>-py3-min](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver)
+image. Instead, you must manually install [CUDA and
+cuDNN](#cuda-cublas-cudnn) and [TensorRT](#tensorrt) dependencies as
+described below.
+
+Once you have installed these dependencies on your build system you
+can then use build.py with the --no-container-build flag to build
+Triton.
+
+```bash
+$ ./build.py -v --no-container-build --build-dir=`pwd`/build --enable-all
+```
+
+See [Building with Docker](#building-with-docker) for more details on how the
+cmake_build script is used to perform the build.
+
+#### CUDA, cuBLAS, cuDNN
+
+For Triton to support NVIDIA GPUs you must install CUDA, cuBLAS and
+cuDNN. These libraries must be installed on the system include and
+library paths so that they are available for the build. The version of
+the libraries used for a given release can be found in the [Framework
+Containers Support
+Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html).
+
+For a given version of Triton you can attempt to build with
+non-supported versions of the libraries but you may have build or
+execution issues since non-supported versions are not tested.
+
+#### TensorRT
+
+The TensorRT headers and libraries must be installed on system include
+and library paths so that they are available for the build. The
+version of TensorRT used in a given release can be found in the
+[Framework Containers Support
+Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html).
+
+For a given version of Triton you can attempt to build with
+non-supported versions of TensorRT but you may have build or execution
+issues since non-supported versions are not tested.
+
+## Building for JetPack 4.x
+
+*Under Construction*
+
+## Building for Windows 10
+
+For Windows 10, build.py supports both a Docker build and a non-Docker
+build in a similar way as described for [Ubuntu](#building-for-ubuntu-2204). The primary
+difference is that the minimal/base image used as the base of the
+Dockerfile.buildbase image can be built from the provided
+[Dockerfile.win10.min](https://github.com/triton-inference-server/server/blob/main/Dockerfile.win10.min)
+file as described in [Windows 10 "Min" Image](#windows-10-min-image). When running build.py
+use the --image flag to specify the tag that you assigned to this
+image. For example, --image=base,win10-py3-min.
+
+### Windows and Docker
+
+Depending on your version of Windows 10 and your version of Docker you
+may need to perform these additional steps before any of the following
+steps.
+
+* Set your Docker to work with "Windows containers". Right click on
+ the whale icon in the lower-right status area and select "Switch to
+ Windows containers".
+
+### Windows 10 "Min" Image
+
+The "min" container describes the base dependencies needed to perform
+the Windows build. The Windows min container is
+[Dockerfile.win10.min](https://github.com/triton-inference-server/server/blob/main/Dockerfile.win10.min).
+
+Before building the min container you must download the appropriate
+cuDNN and TensorRT versions and place them in the same directory as
+Dockerfile.win10.min.
+
+* For cuDNN the CUDNN_VERSION and CUDNN_ZIP arguments defined in
+ Dockerfile.win10.min indicate the version of cuDNN that you should
+ download from https://developer.nvidia.com/rdp/cudnn-download.
+
+* For TensorRT the TENSORRT_VERSION and TENSORRT_ZIP arguments defined
+ in Dockerfile.win10.min indicate the version of TensorRT that you
+ should download from
+ https://developer.nvidia.com/nvidia-tensorrt-download.
+
+After downloading the zip files for cuDNN and TensorRT, you build the
+min container using the following command.
+
+```bash
+$ docker build -t win10-py3-min -f Dockerfile.win10.min .
+```
+
+### Build Triton Server
+
+Triton is built using the build.py script. The build system must have
+Docker, Python3 (with the *docker* module installed via pip) and git
+installed so that it can execute build.py and perform a Docker build. By
+default, build.py does not enable any of Triton's optional features
+and so you must enable them explicitly. The following build.py
+invocation builds all features and backends available on Windows.
+
+```bash
+python build.py --cmake-dir=<path/to/repo>/build --build-dir=/tmp/citritonbuild --no-container-pull --image=base,win10-py3-min --enable-logging --enable-stats --enable-tracing --enable-gpu --endpoint=grpc --endpoint=http --repo-tag=common:<container tag> --repo-tag=core:<container tag> --repo-tag=backend:<container tag> --repo-tag=thirdparty:<container tag> --backend=ensemble --backend=tensorrt:<container tag> --backend=onnxruntime:<container tag> --backend=openvino:<container tag>
+```
+
+If you are building on *main* branch then '<container tag>' will
+default to "main". If you are building on a release branch then
+'<container tag>' will default to the branch name. For example, if you
+are building on the r24.03 branch, '<container tag>' will default to
+r24.03. Therefore, you typically do not need to provide '<container
+tag>' at all (nor the preceding colon). You can use a different
+'<container tag>' for a component to instead use the corresponding
+branch/tag in the build. For example, if you have a branch called
+"mybranch" in the
+[onnxruntime_backend](https://github.com/triton-inference-server/onnxruntime_backend)
+repo that you want to use in the build, you would specify
+--backend=onnxruntime:mybranch.
+
+### Extract Build Artifacts
+
+When build.py completes, a Docker image called *tritonserver* will
+contain the built Triton Server executable, libraries and other
+artifacts. Windows containers do not support GPU access so you likely
+want to extract the necessary files from the tritonserver image and
+run them directly on your host system. All the Triton artifacts can be
+found in /opt/tritonserver directory of the tritonserver image. Your
+host system will need to install the CUDA, cuDNN, TensorRT and other
+dependencies that were used for the build.
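+
+One possible way to copy the artifacts out of the image is with
+`docker create` and `docker cp`; the container name and destination
+directory below are arbitrary.
+
+```bash
+# Create a stopped container from the built image and copy the install tree out.
+docker create --name triton_extract tritonserver
+docker cp triton_extract:/opt/tritonserver ./tritonserver_install
+docker rm triton_extract
+```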
+
+## Building on Unsupported Platforms
+
+Building for an unsupported OS and/or hardware platform is
+possible. All of the build scripting, Dockerfiles and CMake
+invocations are included in the public repos or are generated by
+build.py as described in [Building with Docker](#building-with-docker). From
+these files you can find the required dependencies and CMake
+invocations. However, due to differences in compilers, libraries,
+package management, etc. you may have to make changes in the build
+scripts, Dockerfiles, CMake files and the source code.
+
+To see the generated build scripts and Dockerfiles referred to below,
+use:
+
+```bash
+$ ./build.py -v --enable-all --dryrun
+```
+
+You should familiarize yourself with the build process for supported
+platforms by reading the above documentation and then follow the
+process for the supported platform that most closely matches the
+platform you are interested in (for example, if you are trying to
+build for RHEL/x86-64 then follow the [Building for Ubuntu
+22.04](#building-for-ubuntu-2204) process). You will likely need to
+make changes in the following areas and then manually run docker_build
+and cmake_build or the equivalent commands to perform a build.
+
+* The generated Dockerfiles install dependencies for the build using
+ platform-specific packaging tools, for example, apt-get for
+ Ubuntu. You will need to change build.py to use the packaging tool
+ appropriate for your platform.
+
+* The package and library names for your platform may differ from
+ those used by the generated Dockerfiles. You will need to find the
+ corresponding packages and libraries on your platform.
+
+* Your platform may use a different compiler or compiler version than
+ the supported platforms. As a result you may encounter build errors
+ that need to be fixed by editing the source code or changing the
+ compilation flags.
+
+* Triton depends on a large number of open-source packages that it
+ builds from source. If one of these packages does not support your
+ platform then you may need to disable the Triton feature that
+ depends on that package. For example, Triton supports the S3
+ filesystem by building the aws-sdk-cpp package. If aws-sdk-cpp
+ doesn't build for your platform then you can remove the need for
+ that package by not specifying --filesystem=s3 when you run
+ build.py. In general, you should start by running build.py with the
+ minimal required feature set.
+
+* The
+ [TensorFlow](https://github.com/triton-inference-server/tensorflow_backend)
+ backend extracts pre-built shared libraries from the TensorFlow NGC
+ container as part of the build. This container is only available for
+ Ubuntu-22.04 / x86-64, so if you require the TensorFlow backend for
+ your platform you will need to download the TensorFlow container and
+ modify its build to produce shared libraries for your platform. You
+ must use the TensorFlow source and build scripts from within the NGC
+ container because they contain Triton-specific patches that are
+ required for the Triton TensorFlow backend.
+
+* By default, the
+ [PyTorch](https://github.com/triton-inference-server/pytorch_backend)
+ backend build extracts pre-built shared libraries from the PyTorch
+ NGC container. But the build can also use PyTorch shared libraries
+ that you build separately for your platform. See the pytorch_backend
+ build process for details.
+
+## Development and Incremental Builds
+
+### Development Builds Without Docker
+
+If you are [building without Docker](#building-without-docker), use the
+CMake invocation steps in cmake_build to set up a build environment
+where you can invoke make/msbuild.exe to incrementally build the
+Triton core, a backend, or a repository agent.
+
+### Development Builds With Docker
+
+If you are [building with Docker](#building-with-docker), the generated
+*tritonserver_buildbase* image contains all the dependencies needed to
+perform a full or incremental build. Within *tritonserver_buildbase*,
+/workspace/build/cmake_build contains the CMake invocations that are
+used to build the Triton core, the backends, and the repository
+agents.
+
+To perform an incremental build within the *tritonserver_buildbase*
+container, map your source into the container and then run the
+appropriate CMake and `make` (or `msbuild.exe`) steps from cmake_build
+within the container.
+
+#### Development Build of Triton Core
+
+Assume you have a clone of the [server
+repo](https://github.com/triton-inference-server/server) on your host
+system where you are making changes, and you want to perform
+incremental builds to test those changes. Your source code is in
+/home/me/server. Run the *tritonserver_buildbase* container and map
+your server source directory into the container at /server.
+
+```
+$ docker run -it --rm -v/home/me/server:/server tritonserver_buildbase bash
+```
+
+Look at /workspace/build/cmake_build within the container for the
+section of commands that build "Triton core library". You can follow
+those commands exactly, or you can modify them to change the build
+directory or the CMake options. You **must** change the CMake command
+to use /server instead of /workspace as the location for the
+CMakeLists.txt file and source:
+
+```
+$ cmake /server
+```
+
+Then you can change directory into the build directory and run `make`
+(or `msbuild.exe`) as shown in cmake_build. As you make changes to the
+source on your host system, you can perform incremental builds by
+re-running `make` (or `msbuild.exe`).
+
+#### Development Build of Backend or Repository Agent
+
+Performing a full or incremental build of a backend or repository
+agent is similar to building the Triton core. As an example we will
+use the TensorRT backend. Assume you have a clone of the [TensorRT
+backend
+repo](https://github.com/triton-inference-server/tensorrt_backend) on
+your host system where you are making changes, and you want to perform
+incremental builds to test those changes. Your source code is in
+/home/me/tensorrt_backend. Run the *tritonserver_buildbase*
+container and map your TensorRT backend source directory into the
+container at /tensorrt_backend. Note that some backends will use
+Docker as part of their build, and so the host's Docker registry must
+be made available within the *tritonserver_buildbase* by mounting
+docker.sock (on Windows use
+-v\\.\pipe\docker_engine:\\.\pipe\docker_engine).
+
+```
+$ docker run -it --rm -v/var/run/docker.sock:/var/run/docker.sock -v/home/me/tensorrt_backend:/tensorrt_backend tritonserver_buildbase bash
+```
+
+Look at /workspace/build/cmake_build within the container for the
+section of commands that build "TensorRT backend". You can follow
+those commands exactly, or you can modify them to change the build
+directory or the CMake options. You **must** change the CMake command
+to use /tensorrt_backend instead of /workspace as the location for the
+CMakeLists.txt file and source:
+
+```
+$ cmake /tensorrt_backend
+```
+
+Then you can change directory into the build directory and run `make`
+(or `msbuild.exe`) as shown in cmake_build. As you make changes to the
+source on your host system, you can perform incremental builds by
+re-running `make` (or `msbuild.exe`).
+
+### Building with Debug Symbols
+
+To build with Debug symbols, use the --build-type=Debug argument while
+launching build.py. If building directly with CMake use
+-DCMAKE_BUILD_TYPE=Debug. You can then launch the built server with
+gdb and see the debug symbols/information in the gdb trace.
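+
+For example, a debug build through build.py, or an equivalent direct CMake
+configuration for the incremental builds described above, might look like the
+following sketch (paths and parallelism are illustrative):
+
+```bash
+# Debug build via build.py.
+./build.py -v --enable-all --build-type=Debug
+
+# Or, when invoking CMake directly inside the build container:
+cmake -DCMAKE_BUILD_TYPE=Debug /server
+make -j8
+
+# Launch the resulting server under gdb to use the debug information.
+gdb --args tritonserver --model-repository=/models
+```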
diff --git a/docs/customization_guide/compose.md b/docs/customization_guide/compose.md
new file mode 100644
index 0000000000..3baa9e9df6
--- /dev/null
+++ b/docs/customization_guide/compose.md
@@ -0,0 +1,147 @@
+
+
+# Customize Triton Container
+
+Two Docker images are available from [NVIDIA GPU Cloud
+(NGC)](https://ngc.nvidia.com) that make it possible to easily
+construct customized versions of Triton. By customizing Triton you can
+significantly reduce the size of the Triton image by removing
+functionality that you don't require.
+
+Currently the customization is limited as described below but future
+releases will increase the amount of customization that is available.
+It is also possible to [build Triton](build.md#building-triton)
+from source to get more exact customization.
+
+## Use the compose.py script
+
+The `compose.py` script can be found in the
+[server repository](https://github.com/triton-inference-server/server).
+Simply clone the repository and run `compose.py` to create a custom container.
+Note: The created container version will depend on the branch that was cloned.
+For example, branch
+ [r24.03](https://github.com/triton-inference-server/server/tree/r24.03)
+should be used to create an image based on the NGC 24.03 Triton release.
+
+`compose.py` provides `--backend` and `--repoagent` options that allow you to
+specify which backends and repository agents to include in the custom image.
+For example, the following creates a new Docker image that
+contains only the PyTorch and TensorFlow backends and the checksum
+repository agent.
+
+Example:
+```
+python3 compose.py --backend pytorch --backend tensorflow --repoagent checksum
+```
+will provide an image tagged `tritonserver` locally. You can run a container
+from it with
+```
+$ docker run -it tritonserver:latest
+```
+
+Note: If `compose.py` is run on release versions `r21.08` and earlier,
+the resulting container will have DCGM version 2.2.3 installed.
+This may result in different GPU statistic reporting behavior.
+
+### Compose a specific version of Triton
+
+`compose.py` requires two containers: a `min` container, which is the
+base the composed container is built from, and a `full` container, from which
+the script will extract components. The versions of the `min` and `full`
+containers are determined by the branch of Triton that `compose.py` is on.
+For example, running
+```
+python3 compose.py --backend pytorch --repoagent checksum
+```
+on branch [r24.03](https://github.com/triton-inference-server/server/tree/r24.03) pulls:
+- `min` container `nvcr.io/nvidia/tritonserver:24.03-py3-min`
+- `full` container `nvcr.io/nvidia/tritonserver:24.03-py3`
+
+Alternatively, users can specify the version of Triton container to pull from
+any branch by either:
+1. Adding the flag `--container-version <container version>` to the command:
+```
+python3 compose.py --backend pytorch --repoagent checksum --container-version 24.03
+```
+2. Specifying `--image min,<min container image> --image full,<full container image>`.
+ The user is responsible for specifying compatible `min` and `full` containers.
+```
+python3 compose.py --backend pytorch --repoagent checksum --image min,nvcr.io/nvidia/tritonserver:24.03-py3-min --image full,nvcr.io/nvidia/tritonserver:24.03-py3
+```
+Methods 1 and 2 will result in the same composed container. Furthermore,
+`--image` flag overrides the `--container-version` flag when both are specified.
+
+Note:
+1. All contents in the `/opt/tritonserver` directory of the `min` image will be
+ removed to ensure dependencies of the composed image are added properly.
+2. vLLM and TensorRT-LLM backends are currently not supported backends for
+`compose.py`. If you want to build additional backends on top of these backends,
+it would be better to [build it yourself](#build-it-yourself) by using
+`nvcr.io/nvidia/tritonserver:24.03-vllm-python-py3` or
+`nvcr.io/nvidia/tritonserver:24.03-trtllm-python-py3` as a `min` container.
+
+
+### CPU-only container composition
+
+CPU-only containers are not yet available for customization. Please see
+ [build documentation](build.md) for instructions to build a full CPU-only
+ container. When including TensorFlow or PyTorch backends in the composed
+ container, an additional `gpu-min` container is needed
+since this container provides the CUDA stubs and runtime dependencies which are
+not provided in the CPU-only min container.
+
+## Build it yourself
+
+If you would like to do what `compose.py` is doing under the hood yourself, you
+ can run `compose.py` with the `--dry-run` option and then modify the
+ `Dockerfile.compose` file to satisfy your needs.
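+
+For example, a dry run that only generates `Dockerfile.compose` for later
+editing might look like this (the backend and repository agent shown are
+illustrative):
+
+```
+python3 compose.py --backend pytorch --repoagent checksum --dry-run
+```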
+
+
+### Triton with Unsupported and Custom Backends
+
+You can [create and build your own Triton
+backend](https://github.com/triton-inference-server/backend). The
+result of that build should be a directory containing your backend
+shared library and any additional files required by the
+backend. Assuming your backend is called "mybackend" and that the
+directory is "./mybackend", adding the following to the Dockerfile `compose.py`
+created will create a Triton image that contains all the supported Triton
+backends plus your custom backend.
+
+```
+COPY ./mybackend /opt/tritonserver/backends/mybackend
+```
+
+You also need to install any additional dependencies required by your
+backend as part of the Dockerfile. Then use Docker to create the
+image.
+
+```
+$ docker build -t tritonserver_custom -f Dockerfile.compose .
+```
diff --git a/docs/customization_guide/deploy.md b/docs/customization_guide/deploy.md
new file mode 100644
index 0000000000..112a2cebcf
--- /dev/null
+++ b/docs/customization_guide/deploy.md
@@ -0,0 +1,279 @@
+
+
+# Secure Deployment Considerations
+
+The Triton Inference Server project is designed for flexibility and
+allows developers to create and deploy inferencing solutions in a
+variety of ways. Developers can deploy Triton as an http server, a
+grpc server, a server supporting both, or embed a Triton server into
+their own application. Developers can deploy Triton locally or in the
+cloud, within a Kubernetes cluster behind an API gateway or as a
+standalone process. This guide is intended to provide some key points
+and best practices that users deploying Triton based solutions should
+consider.
+
+| [Deploying Behind a Secure Gateway or Proxy](#deploying-behind-a-secure-proxy-or-gateway) | [Running with Least Privilege](#running-with-least-privilege) |
+
+> [!IMPORTANT]
+> Ultimately the security of a solution based on Triton
+> is the responsibility of the developer building and deploying that
+> solution. When deploying in production settings please have security
+> experts review any potential risks and threats.
+
+> [!WARNING]
+> Dynamic updates to model repositories are disabled by
+> default. Enabling dynamic updates to model repositories either
+> through model loading APIs or through directory polling can lead to
+> arbitrary code execution. Model repository access control is
+> critical in production deployments. If dynamic updates are required,
+> ensure only trusted entities have access to model loading APIs and
+> model repository directories.
+
+## Deploying Behind a Secure Proxy or Gateway
+
+The Triton Inference Server is designed primarily as a microservice to
+be deployed as part of a larger solution within an application
+framework or service mesh.
+
+In such deployments it is typical to utilize dedicated gateway or
+proxy servers to handle authorization, access control, resource
+management, encryption, load balancing, redundancy and many other
+security and availability features.
+
+The full design of such systems is outside the scope of this
+deployment guide but in such scenarios dedicated ingress controllers
+handle access from outside the trusted network while Triton Inference
+Server handles only trusted, validated requests.
+
+In such scenarios Triton Inference Server is not exposed directly to
+an untrusted network.
+
+### References on Secure Deployments
+
+In the following references, Triton Inference Server would be deployed
+as an "Application" or "Service" within the trusted internal network.
+
+* [https://www.nginx.com/blog/architecting-zero-trust-security-for-kubernetes-apps-with-nginx/]
+* [https://istio.io/latest/docs/concepts/security/]
+* [https://konghq.com/blog/enterprise/envoy-service-mesh]
+* [https://www.solo.io/topics/envoy-proxy/]
+
+## Running with Least Privilege
+
+ The security principle of least privilege advocates that a process be
+ granted the minimum permissions required to do its job.
+
+ For an inference solution based on Triton Inference Server there are a
+ number of ways to reduce security risks by limiting the permissions
+ and capabilities of the server to the minimum required for correct
+ operation.
+
+### 1. Follow Best Practices for Securing Kubernetes Deployments
+
+ When deploying Triton within a Kubernetes pod ensure that it is
+ running with a service account with the fewest possible
+ permissions. Ensure that you have configured [role based access
+ control](https://kubernetes.io/docs/reference/access-authn-authz/rbac/)
+ to limit access to resources and capabilities as required by your
+ application.
+
+### 2. Follow Best Practices for Launching Standalone Docker Containers
+
+ When Triton is deployed as a containerized service, standard docker
+ security practices apply. This includes limiting the resources that a
+ container has access to as well as limiting network access to the
+ container. https://docs.docker.com/engine/security/
+
+### 3. Run as a Non-Root User
+
+ Triton's pre-built containers contain a non-root user that can be used
+ to launch the tritonserver application with limited permissions. This
+ user, `triton-server` is created with `user id 1000`. When launching
+ the container using docker the user can be set with the `--user`
+ command line option.
+
+##### Example Launch Command
+
+ ```
+ docker run --rm --user triton-server -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:YY.MM-py3 tritonserver --model-repository=/models
+ ```
+
+### 4. Restrict or Disable Access to Protocols and APIs
+
+The pre-built Triton Inference Server application enables a full set
+of features including health checks, server metadata, inference apis,
+shared memory apis, model and model repository configuration,
+statistics, tracing and logging. Care should be taken to only expose
+those capabilities that are required for your solution.
+
+#### Disabling Features at Compile Time
+
+When building a custom inference server application, features can be
+selectively enabled or disabled using the `build.py` script. As an
+example a developer can use the flags `--endpoint http` and
+`--endpoint grpc` to compile support for `http`, `grpc` or
+both. Support for individual backends can be enabled as well. For more
+details please see [documentation](build.md) on building a custom
+inference server application.
+
+#### Disabling / Restricting Features at Run Time
+
+The `tritonserver` application provides a number of command line
+options to enable and disable features when launched. For a full list
+of options please see `tritonserver --help`. The following subset is
+described here with basic recommendations.
+
+##### `--exit-on-error <boolean>, default True`
+
+Exits the inference server if any error occurs during
+initialization. Recommended to set to `True` to catch any
+unanticipated errors.
+
+##### `--disable-auto-complete-config, default enabled`
+
+Disables backends from auto-completing model configuration. If
+auto-completion is not required for your solution, it is recommended to
+disable it so that model configurations are defined statically.
+
+##### `--strict-readiness <boolean>, default True`
+
+If set to true `/v2/health/ready` will only report ready when all
+selected models are loaded. Recommended to set to `True` to provide a
+signal to other services and orchestration frameworks when full
+initialization is complete and server is healthy.
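+
+As an illustration, an external health probe could poll the readiness endpoint
+on the default HTTP port (8000) and treat anything other than a success
+response as not ready:
+
+```
+curl -sf localhost:8000/v2/health/ready && echo "server ready"
+```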
+
+##### `--model-control-mode , default "none"`
+
+Specifies the mode for model management.
+
+> [!WARNING]
+> Allowing dynamic updates to the model repository can lead
+> to arbitrary code execution. Model repository access control is
+> critical in production deployments. Unless required for operation, it's recommended
+> to disable dynamic updates. If required, please ensure only trusted entities
+> can add or remove models from a model repository.
+
+Options:
+
+ * `none`- Models are loaded at start up and can not be modified.
+ * `poll`- Server process will poll the model repository for changes.
+ * `explicit` - Models can be loaded and unloaded via the model control APIs.
+
+Recommended to set to `none` unless dynamic updates are required. If
+dynamic updates are required care must be taken to control access to
+the model repository files and load and unload APIs.
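+
+For example, a launch using explicit mode that pre-loads a single model at
+startup might look like the following sketch (the model name is a placeholder;
+see `tritonserver --help` for `--load-model`):
+
+```
+tritonserver --model-repository=/models \
+    --model-control-mode=explicit \
+    --load-model=<model-name>
+```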
+
+##### `--allow-http <boolean>, default True`
+
+Enable HTTP request handling. Recommended to set to `False` if not required.
+
+##### `--allow-grpc <boolean>, default True`
+
+Enable gRPC request handling. Recommended to set to `False` if not required.
+
+##### `--grpc-use-ssl <boolean> default False`
+
+Use SSL authentication for gRPC requests. Recommended to set to `True` if service is not protected by a gateway or proxy.
+
+##### `--grpc-use-ssl-mutual <boolean> default False`
+
+Use mutual SSL authentication for gRPC requests. Recommended to set to `True` if service is not protected by a gateway or proxy.
+
+##### `--grpc-restricted-protocol <<string>:<string>=<string>>`
+
+Restrict access to specific gRPC protocol categories to users with
+specific key, value pair shared secret. See
+[limit-endpoint-access](inference_protocols.md#limit-endpoint-access-beta)
+for more information.
+
+> [!Note]
+> Restricting access can be used to limit exposure to model
+> control APIs to trusted users.
+
+##### `--http-restricted-api <<string>:<string>=<string>>`
+
+Restrict access to specific HTTP API categories to users with
+specific key, value pair shared secret. See
+[limit-endpoint-access](inference_protocols.md#limit-endpoint-access-beta)
+for more information.
+
+> [!Note]
+> Restricting access can be used to limit exposure to model
+> control APIs to trusted users.
+
+##### `--allow-sagemaker <boolean> default False`
+
+Enable Sagemaker request handling. Recommended to set to `False` unless required.
+
+##### `--allow-vertex-ai <boolean> default depends on environment variable`
+
+Enable Vertex AI request handling. Default is `True` if
+`AIP_MODE=PREDICTION`, `False` otherwise. Recommended to set to
+`False` unless required.
+
+##### `--allow-metrics <boolean> default True`
+
+Allow server to publish prometheus style metrics. Recommended to set
+to `False` if not required to avoid capturing or exposing any sensitive information.
+
+#### `--trace-config level= default "off"`
+
+Tracing mode. Trace mode supports `triton` and `opentelemetry`. Unless required `--trace-config level=off` should be set to avoid capturing or exposing any sensitive information.
+
+
+##### `backend-directory <string> default /opt/tritonserver/backends`
+
+Directory where backend shared libraries are found.
+
+> [!Warning]
+> Access to add or remove files from the backend directory
+> must be access controlled. Adding untrusted files
+> can lead to arbitrary code execution.
+
+##### `repoagent-directory <string> default /opt/tritonserver/repoagents`
+Directory where repository agent shared libraries are found.
+
+> [!Warning]
+> Access to add or remove files from the repoagent directory
+> must be access controlled. Adding untrusted files
+> can lead to arbitrary code execution.
+
+##### `cache-directory <string> default /opt/tritonserver/caches`
+
+Directory where cache shared libraries are found.
+
+> [!Warning]
+> Access to add or remove files from the cache directory
+> must be access controlled. Adding untrusted files
+> can lead to arbitrary code execution.
+
+
+
+
+
diff --git a/docs/customization_guide/inference_protocols.md b/docs/customization_guide/inference_protocols.md
new file mode 100644
index 0000000000..592f26e7d1
--- /dev/null
+++ b/docs/customization_guide/inference_protocols.md
@@ -0,0 +1,506 @@
+
+
+# Inference Protocols and APIs
+
+Clients can communicate with Triton using either an [HTTP/REST
+protocol](#httprest-and-grpc-protocols), a [GRPC
+protocol](#httprest-and-grpc-protocols), or an [in-process C
+API](#in-process-triton-server-api) or its
+[C++ wrapper](https://github.com/triton-inference-server/developer_tools/tree/main/server).
+
+## HTTP/REST and GRPC Protocols
+
+Triton exposes both HTTP/REST and GRPC endpoints based on [standard
+inference
+protocols](https://github.com/kserve/kserve/tree/master/docs/predict-api/v2)
+that have been proposed by the [KServe
+project](https://github.com/kserve). To fully enable all capabilities
+Triton also implements [HTTP/REST and GRPC
+extensions](https://github.com/triton-inference-server/server/tree/main/docs/protocol)
+to the KServe inference protocol. GRPC protocol also provides a
+bi-directional streaming version of the inference RPC to allow a
+sequence of inference requests/responses to be sent over a
+GRPC stream. We typically recommend using the unary version for
+inference requests. The streaming version should be used only if the
+situation demands it. Some such use cases are:
+
+* Assume a system with multiple Triton server instances running
+ behind a Load Balancer. If a sequence of inference requests is
+ needed to hit the same Triton server instance, a GRPC stream
+ will hold a single connection throughout the lifetime and hence
+ ensure the requests are delivered to the same Triton instance.
+* If the order of requests/responses needs to be preserved over
+ the network, a GRPC stream will ensure that the server receives
+ the requests in the same order as they were sent from the
+ client.
+
+The HTTP/REST and GRPC protocols also provide endpoints to check
+server and model health, metadata and statistics. Additional
+endpoints allow model loading and unloading, and inferencing. See
+the KServe and extension documentation for details.
+
+### HTTP Options
+Triton provides the following configuration options for server-client network transactions over HTTP protocol.
+
+#### Compression
+
+Triton allows the on-wire compression of request/response on HTTP through its clients. See [HTTP Compression](https://github.com/triton-inference-server/client/tree/main#compression) for more details.
+
+### GRPC Options
+Triton exposes various GRPC parameters for configuring the server-client network transactions. For usage of these options, refer to the output from `tritonserver --help`.
+
+#### SSL/TLS
+
+These options can be used to configure a secured channel for communication. The server-side options include:
+
+* `--grpc-use-ssl`
+* `--grpc-use-ssl-mutual`
+* `--grpc-server-cert`
+* `--grpc-server-key`
+* `--grpc-root-cert`
+
+For client-side documentation, see [Client-Side GRPC SSL/TLS](https://github.com/triton-inference-server/client/tree/main#ssltls)
+
+For an overview of authentication in gRPC, refer to the [gRPC authentication guide](https://grpc.io/docs/guides/auth/).
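+
+As a sketch, a server-side TLS configuration might combine the options above
+as follows (the certificate and key paths are placeholders; check
+`tritonserver --help` for the exact value syntax):
+
+```
+tritonserver --model-repository=/models \
+    --grpc-use-ssl=true \
+    --grpc-server-cert=/certs/server.crt \
+    --grpc-server-key=/certs/server.key \
+    --grpc-root-cert=/certs/ca.crt
+```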
+
+#### Compression
+
+Triton allows the on-wire compression of request/response messages by exposing following option on server-side:
+
+* `--grpc-infer-response-compression-level`
+
+For client-side documentation, see [Client-Side GRPC Compression](https://github.com/triton-inference-server/client/tree/main#compression-1)
+
+Compression can be used to reduce the amount of bandwidth used in server-client communication. For more details, see [gRPC Compression](https://grpc.github.io/grpc/core/md_doc_compression.html).
+
+#### GRPC KeepAlive
+
+Triton exposes GRPC KeepAlive parameters with the default values for both
+client and server described [here](https://github.com/grpc/grpc/blob/master/doc/keepalive.md).
+
+These options can be used to configure the KeepAlive settings:
+
+* `--grpc-keepalive-time`
+* `--grpc-keepalive-timeout`
+* `--grpc-keepalive-permit-without-calls`
+* `--grpc-http2-max-pings-without-data`
+* `--grpc-http2-min-recv-ping-interval-without-data`
+* `--grpc-http2-max-ping-strikes`
+
+For client-side documentation, see [Client-Side GRPC KeepAlive](https://github.com/triton-inference-server/client/blob/main/README.md#grpc-keepalive).
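+
+As an illustrative sketch (the values are arbitrary and use the units reported
+by `tritonserver --help`), a server tuned for long-lived client streams might
+set:
+
+```
+tritonserver --model-repository=/models \
+    --grpc-keepalive-time=10000 \
+    --grpc-keepalive-timeout=5000 \
+    --grpc-http2-max-pings-without-data=2 \
+    --grpc-http2-min-recv-ping-interval-without-data=5000
+```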
+
+### Limit Endpoint Access (BETA)
+
+Triton users may want to restrict access to protocols or APIs that are
+provided by the GRPC or HTTP endpoints of a server. For example, users
+can provide one set of access credentials for inference APIs and
+another for model control APIs such as model loading and unloading.
+
+The following options can be specified to declare a restricted
+protocol group (GRPC) or restricted API group (HTTP):
+
+```
+--grpc-restricted-protocol=<protocol_1>,<protocol_2>,...:<restricted-key>=<restricted-value>
+--http-restricted-api=<API_1>,<API_2>,...:<restricted-key>=<restricted-value>
+```
+
+The option can be specified multiple times to specify multiple groups of
+protocols or APIs with different restriction settings.
+
+* `protocols / APIs` : A comma-separated list of protocols / APIs to be included in this
+group. Note that currently a given protocol / API is not allowed to be included in
+multiple groups. The following protocols / APIs are recognized:
+
+ * `health` : Health endpoint defined for [HTTP/REST](https://github.com/kserve/kserve/blob/master/docs/predict-api/v2/required_api.md#health) and [GRPC](https://github.com/kserve/kserve/blob/master/docs/predict-api/v2/required_api.md#health-1). For GRPC endpoint, this value also exposes [GRPC health check protocol](https://github.com/triton-inference-server/common/blob/main/protobuf/health.proto).
+ * `metadata` : Server / model metadata endpoints defined for [HTTP/REST](https://github.com/kserve/kserve/blob/master/docs/predict-api/v2/required_api.md#server-metadata) and [GRPC](https://github.com/kserve/kserve/blob/master/docs/predict-api/v2/required_api.md#server-metadata-1).
+ * `inference` : Inference endpoints defined for [HTTP/REST](https://github.com/kserve/kserve/blob/master/docs/predict-api/v2/required_api.md#inference) and [GRPC](https://github.com/kserve/kserve/blob/master/docs/predict-api/v2/required_api.md#inference-1).
+ * `shared-memory` : [Shared-memory endpoint](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_shared_memory.md).
+ * `model-config` : [Model configuration endpoint](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_model_configuration.md).
+ * `model-repository` : [Model repository endpoint](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_model_repository.md).
+ * `statistics` : [statistics endpoint](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_statistics.md).
+ * `trace` : [trace endpoint](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_trace.md).
+ * `logging` : [logging endpoint](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_logging.md).
+
+* `restricted-key` : The GRPC / HTTP request header
+to be checked when a request is received. The
+completed header for GRPC will be in the form of
+`triton-grpc-protocol-<restricted-key>`. The completed header for HTTP
+will be in the form of `<restricted-key>`.
+
+* `restricted-value` : The header value required to access the specified protocols.
+
+#### Example
+
+To start the server with a set of protocols and APIs restricted for
+`admin` usage and the rest of the protocols and APIs left unrestricted
+use the following command line arguments:
+
+
+```
+tritonserver --grpc-restricted-protocol=shared-memory,model-config,model-repository,statistics,trace:<admin-key>=<admin-value> \
+             --http-restricted-api=shared-memory,model-config,model-repository,statistics,trace:<admin-key>=<admin-value> ...
+```
+
+GRPC requests to `admin` protocols require that an additional header
+`triton-grpc-protocol-<admin-key>` is provided with value
+`<admin-value>`. HTTP requests to `admin` APIs require that an
+additional header `<admin-key>` is provided with value `<admin-value>`.
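+
+For example, assuming the server above was started with the literal values
+`admin-key` and `admin-value`, an HTTP request to a restricted API such as the
+model configuration endpoint would need to carry that header (the model name
+is a placeholder):
+
+```
+curl -H "admin-key: admin-value" localhost:8000/v2/models/<model-name>/config
+```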
+
+
+## In-Process Triton Server API
+
+The Triton Inference Server provides a backwards-compatible C API that
+allows Triton to be linked directly into a C/C++ application. This API
+is called the "Triton Server API" or just "Server API" for short. The
+API is implemented in the Triton shared library which is built from
+source contained in the [core
+repository](https://github.com/triton-inference-server/core). On Linux
+this library is libtritonserver.so and on Windows it is
+tritonserver.dll. In the Triton Docker image the shared library is
+found in /opt/tritonserver/lib. The header file that defines and
+documents the Server API is
+[tritonserver.h](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonserver.h).
+[Java bindings for In-Process Triton Server API](#java-bindings-for-in-process-triton-server-api)
+are built on top of `tritonserver.h` and can be used for Java applications that
+need to use Tritonserver in-process.
+
+All capabilities of Triton server are encapsulated in the shared
+library and are exposed via the Server API. The `tritonserver`
+executable implements HTTP/REST and GRPC endpoints and uses the Server
+API to communicate with core Triton logic. The primary source files
+for the endpoints are [grpc_server.cc](https://github.com/triton-inference-server/server/blob/main/src/grpc/grpc_server.cc) and
+[http_server.cc](https://github.com/triton-inference-server/server/blob/main/src/http_server.cc). In these source files you can
+see the Server API being used.
+
+You can use the Server API in your own application as well. A simple
+example using the Server API can be found in
+[simple.cc](https://github.com/triton-inference-server/server/blob/main/src/simple.cc).
+
+### API Description
+
+Triton server functionality is encapsulated in a shared library which
+is built from source contained in the [core
+repository](https://github.com/triton-inference-server/core). You can
+include the full capabilities of Triton by linking the shared library
+into your application and by using the C API defined in
+[tritonserver.h](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonserver.h).
+
+When you link the Triton shared library into your application you are
+*not* spawning a separate Triton process; instead, you are including
+the Triton core logic directly in your application. The Triton
+HTTP/REST or GRPC protocols are not used to communicate with this
+Triton core logic, instead all communication between your application
+and the Triton core logic must take place via the [Server
+API](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonserver.h).
+
+The top-level abstraction used by Server API is `TRITONSERVER_Server`,
+which represents the Triton core logic that is capable of implementing
+all of the features and capabilities of Triton. A
+`TRITONSERVER_Server` object is created by calling
+`TRITONSERVER_ServerNew` with a set of options that indicate how the
+object should be initialized. Use of `TRITONSERVER_ServerNew` is
+demonstrated in [simple.cc](https://github.com/triton-inference-server/server/blob/main/src/simple.cc). Once you have created a
+`TRITONSERVER_Server` object, you can begin using the rest of the
+Server API as described below.
+
+#### Error Handling
+
+Most Server API functions return an error object indicating success or
+failure. Success is indicated by returning `nullptr` (`NULL`). Failure is
+indicated by returning a `TRITONSERVER_Error` object. The error code
+and message can be retrieved from a `TRITONSERVER_Error` object with
+`TRITONSERVER_ErrorCode` and `TRITONSERVER_ErrorMessage`.
+
+The lifecycle and ownership of all Server API objects is documented in
+[tritonserver.h](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonserver.h). For
+`TRITONSERVER_Error`, ownership of the object passes to the caller of
+the Server API function. As a result, your application is responsible
+for managing the lifecycle of the returned `TRITONSERVER_Error`
+object. You must delete the error object using
+`TRITONSERVER_ErrorDelete` when you are done using it. Macros such as
+`FAIL_IF_ERR` shown in [common.h](https://github.com/triton-inference-server/server/blob/main/src/common.h) are useful for
+managing error object lifetimes.
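+
+For illustration, a sketch of this pattern, using `TRITONSERVER_ServerIsLive`
+only as an example of a call that can fail:
+
+```
+// Check a Server API call for failure, report the error, and delete the
+// error object since ownership passed to the caller.
+bool live = false;
+TRITONSERVER_Error* err = TRITONSERVER_ServerIsLive(server, &live);
+if (err != nullptr) {
+  std::cerr << "error: " << TRITONSERVER_ErrorCodeString(err) << " - "
+            << TRITONSERVER_ErrorMessage(err) << std::endl;
+  TRITONSERVER_ErrorDelete(err);
+}
+```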
+
+#### Versioning and Backwards Compatibility
+
+A typical pattern, demonstrated in [simple.cc](https://github.com/triton-inference-server/server/blob/main/src/simple.cc) and
+shown below, shows how you can compare the Server API version provided
+by the shared library against the Server API version that you compiled
+your application against. The Server API is backwards compatible, so
+as long as the major version provided by the shared library matches
+the major version that you compiled against, and the minor version
+provided by the shared library is greater-than-or-equal to the minor
+version that you compiled against, then your application can use the
+Server API.
+
+```
+#include "tritonserver.h"
+// Error checking removed for clarity...
+uint32_t api_version_major, api_version_minor;
+TRITONSERVER_ApiVersion(&api_version_major, &api_version_minor);
+if ((TRITONSERVER_API_VERSION_MAJOR != api_version_major) ||
+ (TRITONSERVER_API_VERSION_MINOR > api_version_minor)) {
+ // Error, the shared library implementing the Server API is older than
+ // the version of the Server API that you compiled against.
+}
+```
+
+#### Non-Inference APIs
+
+The Server API contains functions for checking health and readiness,
+getting model information, getting model statistics and metrics,
+loading and unloading models, etc. The use of these functions is
+straightforward and some of these functions are demonstrated in
+[simple.cc](https://github.com/triton-inference-server/server/blob/main/src/simple.cc) and all are documented in
+[tritonserver.h](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonserver.h).
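+
+A sketch of a few of these calls is shown below; error handling is omitted and
+the model name and version are placeholders:
+
+```
+// Liveness and readiness checks.
+bool live = false, ready = false;
+TRITONSERVER_ServerIsLive(server, &live);
+TRITONSERVER_ServerIsReady(server, &ready);
+
+// Check whether a specific model version is ready ("my_model" is a
+// placeholder name).
+bool model_ready = false;
+TRITONSERVER_ServerModelIsReady(server, "my_model", 1 /* version */, &model_ready);
+
+// Server metadata is returned as a TRITONSERVER_Message owned by the caller.
+TRITONSERVER_Message* server_metadata = nullptr;
+TRITONSERVER_ServerMetadata(server, &server_metadata);
+const char* base = nullptr;
+size_t byte_size = 0;
+TRITONSERVER_MessageSerializeToJson(server_metadata, &base, &byte_size);
+// ... 'base' points to 'byte_size' bytes of JSON metadata ...
+TRITONSERVER_MessageDelete(server_metadata);
+```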
+
+#### Inference APIs
+
+Performing an inference request requires the use of many Server API
+functions and objects, as demonstrated in
+[simple.cc](https://github.com/triton-inference-server/server/blob/main/src/simple.cc). The general usage requires the
+following steps; a condensed sketch that pulls them together is shown after the list.
+
+* Create a `TRITONSERVER_ResponseAllocator` using
+ `TRITONSERVER_ResponseAllocatorNew`. You can use the same response
+ allocator for all of your inference requests, or you can create
+ multiple response allocators. When Triton produces an output
+ tensor, it needs a memory buffer into which it can store the
+ contents of that tensor. Triton defers the allocation of these
+ output buffers by invoking callback functions in your
+ application. You communicate these callback functions to Triton with
+ the `TRITONSERVER_ResponseAllocator` object. You must implement two
+ callback functions, one for buffer allocation and one for buffer
+  release. The signatures for these functions are
+ `TRITONSERVER_ResponseAllocatorAllocFn_t` and
+ `TRITONSERVER_ResponseAllocatorReleaseFn_t` as defined in
+ [tritonserver.h](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonserver.h). In
+ [simple.cc](https://github.com/triton-inference-server/server/blob/main/src/simple.cc), these callback functions are
+ implemented as `ResponseAlloc` and `ResponseRelease`.
+
+* Create an inference request as a `TRITONSERVER_InferenceRequest`
+ object. The inference request is where you specify what model you
+ want to use, the input tensors and their values, the output tensors
+ that you want returned, and other request parameters. You create an
+ inference request using `TRITONSERVER_InferenceRequestNew`. You
+ create each input tensor in the request using
+ `TRITONSERVER_InferenceRequestAddInput` and set the data for the
+ input tensor using `TRITONSERVER_InferenceRequestAppendInputData`
+ (or one of the `TRITONSERVER_InferenceRequestAppendInputData*`
+ variants defined in
+ [tritonserver.h](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonserver.h)). By
+ default, Triton will return all output tensors, but you can limit
+ Triton to only return some outputs by using
+ `TRITONSERVER_InferenceRequestAddRequestedOutput`.
+
+ To correctly manage the lifecycle of the inference request, you must
+ use `TRITONSERVER_InferenceRequestSetReleaseCallback` to set a
+ callback into a function in your application. This callback will be
+  invoked by Triton to return ownership of the
+ `TRITONSERVER_InferenceRequest` object. Typically, in this callback
+ you will just delete the `TRITONSERVER_InferenceRequest` object by
+ using `TRITONSERVER_InferenceRequestDelete`. But you may also
+  implement different lifecycle management; for example, if you are
+ reusing inference request objects you would want to make the object
+ available for reuse.
+
+ You can optionally use `TRITONSERVER_InferenceRequestSetId` to set a
+ user-defined ID on the request. This ID is not used by Triton but
+ will be returned in the response.
+
+ You can reuse an existing `TRITONSERVER_InferenceRequest` object for
+ a new inference request. A couple of examples of how this is done
+ and why it is useful are shown in [simple.cc](https://github.com/triton-inference-server/server/blob/main/src/simple.cc).
+
+* Ask Triton to execute the inference request using
+ `TRITONSERVER_ServerInferAsync`. `TRITONSERVER_ServerInferAsync` is
+  an asynchronous call that returns immediately. The inference response
+ is returned via a callback into your application. You register this
+ callback using `TRITONSERVER_InferenceRequestSetResponseCallback`
+ before you invoke `TRITONSERVER_ServerInferAsync`. In
+ [simple.cc](https://github.com/triton-inference-server/server/blob/main/src/simple.cc) this callback is
+ `InferResponseComplete`.
+
+ When you invoke `TRITONSERVER_ServerInferAsync` and it returns
+ without error, you are passing ownership of the
+ `TRITONSERVER_InferenceRequest` object to Triton, and so you must
+ not access that object in any way until Triton returns ownership to
+ you via the callback you registered with
+ `TRITONSERVER_InferenceRequestSetReleaseCallback`.
+
+* Process the inference response. The inference response is returned
+ to the callback function you registered with
+ `TRITONSERVER_InferenceRequestSetResponseCallback`. Your callback
+ receives the response as a `TRITONSERVER_InferenceResponse`
+ object. Your callback takes ownership of the
+ `TRITONSERVER_InferenceResponse` object and so must free it with
+ `TRITONSERVER_InferenceResponseDelete` when it is no longer needed.
+
+ The first step in processing a response is to use
+ `TRITONSERVER_InferenceResponseError` to check if the response is
+ returning an error or if it is returning valid results. If the
+ response is valid you can use
+ `TRITONSERVER_InferenceResponseOutputCount` to iterate over the
+ output tensors, and `TRITONSERVER_InferenceResponseOutput` to get
+ information about each output tensor.
+
+ Note that the [simple.cc](https://github.com/triton-inference-server/server/blob/main/src/simple.cc) example uses a
+  `std::promise` to simply wait for the response, but synchronizing
+ response handling in this way is not required. You can have multiple
+ inference requests in flight at the same time and can issue
+ inference requests from the same thread or from multiple different
+ threads.
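+
+The following condensed sketch pulls these steps together. It loosely follows
+[simple.cc](https://github.com/triton-inference-server/server/blob/main/src/simple.cc):
+the model name, input/output names, shape, and input buffer are placeholders,
+the callback implementations (`ResponseAlloc`, `ResponseRelease`,
+`InferRequestComplete`, `InferResponseComplete`) are only referenced, and
+error checking is omitted.
+
+```
+// Response allocator with application-provided allocation/release callbacks.
+TRITONSERVER_ResponseAllocator* allocator = nullptr;
+TRITONSERVER_ResponseAllocatorNew(
+    &allocator, ResponseAlloc, ResponseRelease, nullptr /* start_fn */);
+
+// Build the inference request. Version -1 selects the latest model version.
+TRITONSERVER_InferenceRequest* irequest = nullptr;
+TRITONSERVER_InferenceRequestNew(&irequest, server, "my_model", -1);
+TRITONSERVER_InferenceRequestSetId(irequest, "request_0");
+TRITONSERVER_InferenceRequestSetReleaseCallback(
+    irequest, InferRequestComplete, nullptr /* request_release_userp */);
+
+const int64_t shape[] = {1, 16};
+TRITONSERVER_InferenceRequestAddInput(
+    irequest, "INPUT0", TRITONSERVER_TYPE_FP32, shape, 2 /* dim_count */);
+TRITONSERVER_InferenceRequestAppendInputData(
+    irequest, "INPUT0", input_data, input_byte_size,
+    TRITONSERVER_MEMORY_CPU, 0 /* memory_type_id */);
+TRITONSERVER_InferenceRequestAddRequestedOutput(irequest, "OUTPUT0");
+
+// Register the response callback and execute. The std::promise is just one
+// way to wait for the single response; it is not required.
+auto completed = new std::promise<TRITONSERVER_InferenceResponse*>();
+std::future<TRITONSERVER_InferenceResponse*> fut = completed->get_future();
+TRITONSERVER_InferenceRequestSetResponseCallback(
+    irequest, allocator, nullptr /* response_allocator_userp */,
+    InferResponseComplete, completed /* response_userp */);
+TRITONSERVER_ServerInferAsync(server, irequest, nullptr /* trace */);
+
+// InferResponseComplete is assumed to fulfill the promise with the response.
+TRITONSERVER_InferenceResponse* response = fut.get();
+delete completed;
+
+// Check the response for an error before reading outputs. The returned
+// error object, if any, is owned by the caller and must be deleted.
+TRITONSERVER_Error* response_err = TRITONSERVER_InferenceResponseError(response);
+if (response_err == nullptr) {
+  uint32_t output_count = 0;
+  TRITONSERVER_InferenceResponseOutputCount(response, &output_count);
+  // ... TRITONSERVER_InferenceResponseOutput(...) for each output tensor ...
+} else {
+  TRITONSERVER_ErrorDelete(response_err);
+}
+TRITONSERVER_InferenceResponseDelete(response);
+```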
+
+## Java bindings for In-Process Triton Server API
+
+The Triton Inference Server uses [JavaCPP](https://github.com/bytedeco/javacpp)
+to generate Java bindings around the Tritonserver C API.
+
+The API is documented in
+[tritonserver.java](https://github.com/bytedeco/javacpp-presets/blob/master/tritonserver/src/gen/java/org/bytedeco/tritonserver/global/tritonserver.java).
+Alternatively, the user can refer to the web version [API docs](http://bytedeco.org/javacpp-presets/tritonserver/apidocs/)
+generated from `tritonserver.java`.
+**Note:** Currently, `tritonserver.java` contains bindings for both the `In-process C-API`
+and the `C-API Wrapper`. More information about the [developer_tools/server C-API wrapper](https://github.com/triton-inference-server/developer_tools/blob/main/server/README.md) can be found in the [developer_tools repository](https://github.com/triton-inference-server/developer_tools/).
+
+A simple example using the Java API can be found in the
+[Samples folder](https://github.com/bytedeco/javacpp-presets/tree/master/tritonserver/samples),
+which includes `Simple.java`, similar to
+[`simple.cc`](https://github.com/triton-inference-server/server/blob/main/src/simple.cc).
+Please refer to
+[sample usage documentation](https://github.com/bytedeco/javacpp-presets/tree/master/tritonserver#sample-usage)
+to learn how to build and run `Simple.java`.
+
+In the [QA folder](https://github.com/triton-inference-server/server/blob/main/qa), folders starting with `L0_java` include Java API tests.
+These can be useful references for getting started, such as the
+[ResNet50 test](https://github.com/triton-inference-server/server/blob/main/qa/L0_java_resnet).
+
+### Java API setup instructions
+
+To use the Tritonserver Java API, you will need to have the Tritonserver library
+and dependencies installed in your environment. There are two ways to do this:
+
+1. Use a Tritonserver Docker container with
+    1. the `.jar` Java bindings to the C API (recommended), or
+    2. Maven, building the bindings yourself
+2. Build Triton in your environment without Docker (not recommended)
+
+#### Run Tritonserver container and install dependencies
+
+To set up your environment with the Triton Java API, follow these steps:
+1. First, run the Docker container:
+```bash
+ $ docker run -it --gpus=all -v $(pwd):/workspace nvcr.io/nvidia/tritonserver:-py3 bash
+```
+2. Install `jdk`:
+```bash
+ $ apt update && apt install -y openjdk-11-jdk
+```
+3. Install `maven` (only if you want to build the bindings yourself):
+```bash
+ $ cd /opt/tritonserver
+ $ wget https://archive.apache.org/dist/maven/maven-3/3.8.4/binaries/apache-maven-3.8.4-bin.tar.gz
+ $ tar zxvf apache-maven-3.8.4-bin.tar.gz
+ $ export PATH=/opt/tritonserver/apache-maven-3.8.4/bin:$PATH
+```
+
+#### Run Java program with Java bindings Jar
+
+After ensuring that Tritonserver and dependencies are installed, you can run your
+Java program with the Java bindings with the following steps:
+
+1. Place Java bindings into your environment. You can do this by either:
+
+ a. Building Java API bindings with provided build script:
+ ```bash
+ # Clone Triton client repo. Recommended client repo tag is: main
+ $ git clone --single-branch --depth=1 -b
+ https://github.com/triton-inference-server/client.git clientrepo
+ # Run build script
+ ## For In-Process C-API Java Bindings
+ $ source clientrepo/src/java-api-bindings/scripts/install_dependencies_and_build.sh
+ ## For C-API Wrapper (Triton with C++ bindings) Java Bindings
+ $ source clientrepo/src/java-api-bindings/scripts/install_dependencies_and_build.sh --enable-developer-tools-server
+ ```
+ This will install the Java bindings to `/workspace/install/java-api-bindings/tritonserver-java-bindings.jar`
+
+ *or*
+
+ b. Copying "Uber Jar" from Triton SDK container to your environment
+ ```bash
+ $ id=$(docker run -dit nvcr.io/nvidia/tritonserver:-py3-sdk bash)
+ $ docker cp ${id}:/workspace/install/java-api-bindings/tritonserver-java-bindings.jar /tritonserver-java-bindings.jar
+ $ docker stop ${id}
+ ```
+ **Note:** `tritonserver-java-bindings.jar` only includes the `In-Process Java Bindings`. To use the `C-API Wrapper Java Bindings`, please use the build script.
+2. Use the built "Uber Jar" that contains the Java bindings
+ ```bash
+ $ java -cp /tritonserver-java-bindings.jar
+ ```
+
+#### Build Java bindings and run Java program with Maven
+
+If you want to make changes to the Java bindings, you can use Maven to
+build them yourself. You can refer to part 1.a of [Run Java program with Java
+bindings Jar](#run-java-program-with-java-bindings-jar) to build the jar
+without any modifications to the Tritonserver bindings in
+JavaCPP-presets.
+To build with your own changes, use the following steps:
+
+1. Create the JNI binaries in your local repository (`/root/.m2/repository`)
+ with [`javacpp-presets/tritonserver`](https://github.com/bytedeco/javacpp-presets/tree/master/tritonserver).
+ For C-API Wrapper Java bindings (Triton with C++ bindings), you need to
+   install some build-specific dependencies, including CMake and RapidJSON.
+   Refer to the [java installation script](https://github.com/triton-inference-server/client/blob/main/src/java-api-bindings/scripts/install_dependencies_and_build.sh)
+   for the dependencies you need to install and the modifications you need to make for your container.
+After installing the dependencies, you can build the tritonserver project in javacpp-presets:
+```bash
+ $ git clone https://github.com/bytedeco/javacpp-presets.git
+ $ cd javacpp-presets
+ $ mvn clean install --projects .,tritonserver
+ $ mvn clean install -f platform --projects ../tritonserver/platform -Djavacpp.platform=linux-x86_64
+```
+2. Create your custom `*.pom` file for Maven. Please refer to
+ [samples/simple/pom.xml](https://github.com/bytedeco/javacpp-presets/blob/master/tritonserver/samples/simple/pom.xml) as
+ reference for how to create your pom file.
+3. After creating your `pom.xml` file you can build your application with:
+```bash
+ $ mvn compile exec:java -Djavacpp.platform=linux-x86_64 -Dexec.args=""
+```
diff --git a/docs/customization_guide/repository_agents.md b/docs/customization_guide/repository_agents.md
new file mode 100644
index 0000000000..02fb1d57ec
--- /dev/null
+++ b/docs/customization_guide/repository_agents.md
@@ -0,0 +1,176 @@
+
+
+# Repository Agent
+
+A *repository agent* extends Triton with new functionality that
+operates when a model is loaded or unloaded. You can introduce your
+own code to perform authentication, decryption, conversion, or similar
+operations when a model is loaded.
+
+**BETA: The repository agent API is beta quality and is subject to
+non-backward-compatible changes for one or more releases.**
+
+A repository agent communicates with Triton using the [repository agent
+API](https://github.com/triton-inference-server/core/tree/main/include/triton/core/tritonrepoagent.h). The
+[checksum_repository_agent GitHub
+repo](https://github.com/triton-inference-server/checksum_repository_agent)
+provides an example repository agent that verifies file checksums
+before loading a model.
+
+## Using a Repository Agent
+
+A model can use one or more repository agents by specifying them in
+the *ModelRepositoryAgents* section of the [model
+configuration](../user_guide/model_configuration.md). Each repository agent can have
+parameters specific to that agent that are specified in the model
+configuration to control the behavior of the agent. To understand the
+parameters available for a given agent consult the documentation for
+that agent.
+
+Multiple agents may be specified for the same model and they will be
+invoked in order when a model is loaded or unloaded. The following
+example model configuration shows how two agents, "agent0"
+and "agent1", are specified so that they are invoked in that order
+with the given parameters.
+
+```
+model_repository_agents
+{
+ agents [
+ {
+ name: "agent0",
+ parameters [
+ {
+ key: "key0",
+ value: "value0"
+ },
+ {
+ key: "key1",
+ value: "value1"
+ }
+ ]
+ },
+ {
+ name: "agent1",
+ parameters [
+ {
+ key: "keyx",
+ value: "valuex"
+ }
+ ]
+ }
+ ]
+}
+```
+
+## Implementing a Repository Agent
+
+A repository agent must be implemented as a shared library and the
+name of the shared library must be
+*libtritonrepoagent_\.so*. The shared library should
+hide all symbols except those needed by the repository agent API. See
+the [checksum example's
+CMakeList.txt](https://github.com/triton-inference-server/checksum_repository_agent/blob/main/CMakeLists.txt)
+for an example of how to use an ldscript to expose only the necessary
+symbols.
+
+The shared library will be dynamically loaded by Triton when it is
+needed. For a repository agent called *A*, the shared library must be
+installed as \/A/libtritonrepoagent_A.so,
+where \ is by default
+/opt/tritonserver/repoagents. The --repoagent-directory flag can be
+used to override the default.
+
+Your repository agent must implement the repository agent API as
+documented in
+[tritonrepoagent.h](https://github.com/triton-inference-server/core/tree/main/include/triton/core/tritonrepoagent.h).
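+
+As a minimal sketch (not the complete checksum example), the core of an agent
+is a `TRITONREPOAGENT_ModelAction` implementation exported from the shared
+library; a real agent would examine the model's repository location and its
+configured parameters here:
+
+```
+#include "triton/core/tritonrepoagent.h"
+
+extern "C" {
+
+// Called by Triton at each point in the model load/unload lifecycle
+// described below.
+TRITONSERVER_Error*
+TRITONREPOAGENT_ModelAction(
+    TRITONREPOAGENT_Agent* agent, TRITONREPOAGENT_AgentModel* model,
+    const TRITONREPOAGENT_ActionType action_type)
+{
+  switch (action_type) {
+    case TRITONREPOAGENT_ACTION_LOAD:
+      // Inspect (or replace) the model repository before the model loads.
+      // Returning nullptr indicates success; returning an error created
+      // with TRITONSERVER_ErrorNew causes the model load to fail.
+      return nullptr;
+    default:
+      // No action needed for the LOAD_COMPLETE, LOAD_FAIL, UNLOAD and
+      // UNLOAD_COMPLETE notifications in this sketch.
+      return nullptr;
+  }
+}
+
+}  // extern "C"
+```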
+
+Triton follows these steps when loading a model:
+
+* Load the model's configuration file (config.pbtxt) and extract the
+ *ModelRepositoryAgents* settings. Even if a repository agent
+ modifies the config.pbtxt file, the repository agent settings from
+ the initial config.pbtxt file are used for the entire loading
+ process.
+
+* For each repository agent specified:
+
+ * Initialize the corresponding repository agent, loading the shared
+ library if necessary. Model loading fails if the shared library is
+ not available or if initialization fails.
+
+ * Invoke the repository agent's *TRITONREPOAGENT_ModelAction*
+ function with action TRITONREPOAGENT_ACTION_LOAD. As input the
+ agent can access the model's repository as either a cloud storage
+ location or a local filesystem location.
+
+ * The repository agent can return *success* to indicate that no
+    changes were made to the repository, can return *failure* to
+ indicate that the model load should fail, or can create a new
+ repository for the model (for example, by decrypting the input
+ repository) and return *success* to indicate that the new
+ repository should be used.
+
+  * If the agent returns *success*, Triton continues to the next
+ agent. If the agent returns *failure*, Triton skips invocation of
+ any additional agents.
+
+* If all agents returned *success*, Triton attempts to load the model
+ using the final model repository.
+
+* For each repository agent that was invoked with
+ TRITONREPOAGENT_ACTION_LOAD, in reverse order:
+
+ * Triton invokes the repository agent's
+ *TRITONREPOAGENT_ModelAction* function with action
+ TRITONREPOAGENT_ACTION_LOAD_COMPLETE if the model loaded
+ successfully or TRITONREPOAGENT_ACTION_LOAD_FAIL if the model
+ failed to load.
+
+Triton follows these steps when unloading a model:
+
+* Triton uses the repository agent settings from the initial
+ config.pbtxt file, even if during loading one or more agents
+ modified its contents.
+
+* For each repository agent that was invoked with
+ TRITONREPOAGENT_ACTION_LOAD, in the same order:
+
+ * Triton invokes the repository agent's
+ *TRITONREPOAGENT_ModelAction* function with action
+ TRITONREPOAGENT_ACTION_UNLOAD.
+
+* Triton unloads the model.
+
+* For each repository agent that was invoked with
+ TRITONREPOAGENT_ACTION_UNLOAD, in reverse order:
+
+ * Triton invokes the repository agent's
+ *TRITONREPOAGENT_ModelAction* function with action
+ TRITONREPOAGENT_ACTION_UNLOAD_COMPLETE.
diff --git a/docs/customization_guide/test.md b/docs/customization_guide/test.md
new file mode 100644
index 0000000000..a64d81a27f
--- /dev/null
+++ b/docs/customization_guide/test.md
@@ -0,0 +1,134 @@
+
+
+# Testing Triton
+
+Currently there is no CI testing enabled for Triton repositories. We
+will enable CI testing in a future update.
+
+However, there is a set of tests in the qa/ directory that can be run
+manually to provide extensive testing. Before running these tests you
+must first generate a few model repositories containing the models
+needed by the tests.
+
+## Generate QA Model Repositories
+
+The QA model repositories contain some simple models that are used to
+verify the correctness of Triton. To generate the QA model
+repositories:
+
+```
+$ cd qa/common
+$ ./gen_qa_model_repository
+$ ./gen_qa_custom_ops
+```
+
+This will create multiple model repositories in /tmp//qa_*
+(for example /tmp/24.03/qa_model_repository). The TensorRT models
+will be created for the GPU on the system that CUDA considers device 0
+(zero). If you have multiple GPUs on your system see the documentation
+in the scripts for how to target a specific GPU.
+
+## Build SDK Image
+
+Build the *tritonserver_sdk* image that contains the client
+libraries, model analyzer, and examples using the following
+commands. You must first check out the branch of the
+*client* repo into the clientrepo/ subdirectory. Typically you want
+it to be the same as your current server branch.
+
+```
+$ cd
+$ git clone --single-branch --depth=1 -b https://github.com/triton-inference-server/client.git clientrepo
+$ docker build -t tritonserver_sdk -f Dockerfile.sdk .
+```
+
+## Build QA Image
+
+Next you need to build a QA version of the Triton Docker image. This
+image will contain Triton, the QA tests, and all the dependencies
+needed to run the QA tests. First do a [Docker image
+build](build.md#building-with-docker) to produce the
+*tritonserver_cibase* and *tritonserver* images.
+
+Then, build the actual QA image.
+
+```
+$ docker build -t tritonserver_qa -f Dockerfile.QA .
+```
+
+## Run QA Tests
+
+Now run the QA image and mount the QA model repositories into the
+container so the tests will be able to access them.
+
+```
+$ docker run --gpus=all -it --rm -v/tmp:/data/inferenceserver tritonserver_qa
+```
+
+Within the container the QA tests are in /opt/tritonserver/qa. To run
+a test, change directory to the test and run the test.sh script.
+
+```
+$ cd
+$ bash -x ./test.sh
+```
+
+### Sanity Tests
+
+Many tests require that you use a complete Triton build, with all
+backends and other features enabled. There are three sanity tests that
+are parameterized so that you can run them even if you have built a
+Triton that contains only a subset of all supported Triton
+backends. These tests are L0_infer, L0_batcher and
+L0_sequence_batcher. For these tests the following envvars are
+available to control how the tests behave:
+
+* BACKENDS: Control which backends are tested. Look in the test.sh
+ file of the test to see the default and allowed values.
+
+* ENSEMBLES: Enable testing of ensembles. Set to "0" to disable, set
+ to "1" to enable. If enabled you must have the *identity* backend
+ included in your Triton build.
+
+* EXPECTED_NUM_TESTS: The tests perform a check of the total number of
+ test sub-cases. The exact number of sub-cases that run will depend
+ on the values you use for BACKENDS and ENSEMBLES. So you will need
+ to adjust this as appropriate for your testing.
+
+For example, if you build a Triton that has only the TensorRT backend
+you can run L0_infer as follows:
+
+```
+$ BACKENDS="plan" ENSEMBLES=0 EXPECTED_NUM_TESTS= bash -x ./test.sh
+```
+
+Where '\' is the number of sub-tests expected to be run for
+just TensorRT testing and no ensembles. Depending on which backend(s)
+you are testing you will need to experiment and determine the correct
+value for '\'.
diff --git a/docs/examples/README.md b/docs/examples/README.md
new file mode 100644
index 0000000000..84bfcb9499
--- /dev/null
+++ b/docs/examples/README.md
@@ -0,0 +1,35 @@
+
+
+# Triton Examples
+
+**New to Triton Inference Server?** Make use of [these tutorials](https://github.com/triton-inference-server/tutorials) to begin your Triton journey!
+
+This folder contains the following:
+* jetson: This covers deploying Triton Inference Server on Jetson devices.
+* model_repository: This folder is a basic model repository for deploying models using the Triton Inference Server.
\ No newline at end of file
diff --git a/docs/examples/fetch_models.sh b/docs/examples/fetch_models.sh
index 0612dfc6cb..5594878b3e 100755
--- a/docs/examples/fetch_models.sh
+++ b/docs/examples/fetch_models.sh
@@ -27,16 +27,14 @@
set -ex
-# Caffe2 resnet50
-mkdir -p model_repository/resnet50_netdef/1
-wget -O model_repository/resnet50_netdef/1/model.netdef \
- http://download.caffe2.ai.s3.amazonaws.com/models/resnet50/predict_net.pb
-wget -O model_repository/resnet50_netdef/1/init_model.netdef \
- http://download.caffe2.ai.s3.amazonaws.com/models/resnet50/init_net.pb
-
# TensorFlow inception
mkdir -p model_repository/inception_graphdef/1
wget -O /tmp/inception_v3_2016_08_28_frozen.pb.tar.gz \
https://storage.googleapis.com/download.tensorflow.org/models/inception_v3_2016_08_28_frozen.pb.tar.gz
(cd /tmp && tar xzf inception_v3_2016_08_28_frozen.pb.tar.gz)
mv /tmp/inception_v3_2016_08_28_frozen.pb model_repository/inception_graphdef/1/model.graphdef
+
+# ONNX densenet
+mkdir -p model_repository/densenet_onnx/1
+wget -O model_repository/densenet_onnx/1/model.onnx \
+ https://contentmamluswest001.blob.core.windows.net/content/14b2744cf8d6418c87ffddc3f3127242/9502630827244d60a1214f250e3bbca7/08aed7327d694b8dbaee2c97b8d0fcba/densenet121-1.2.onnx
diff --git a/docs/examples/jetson/README.md b/docs/examples/jetson/README.md
new file mode 100644
index 0000000000..281d5f2a97
--- /dev/null
+++ b/docs/examples/jetson/README.md
@@ -0,0 +1,68 @@
+
+
+# Using Triton Inference Server as a shared library for execution on Jetson
+
+## Overview
+This project demonstrates how to run C API applications using Triton Inference Server as a shared library. We also show how to build and execute such applications on Jetson.
+
+### Prerequisites
+
+* JetPack >= 4.6
+* OpenCV >= 4.1.1
+* TensorRT >= 8.0.1.6
+
+### Installation
+
+Follow the installation instructions from the GitHub release page ([https://github.com/triton-inference-server/server/releases/](https://github.com/triton-inference-server/server/releases/)).
+
+In our example, we placed the contents of the downloaded release directory under `/opt/tritonserver`.
+
+## Part 1. Concurrent inference and dynamic batching
+
+The purpose of the sample located under [concurrency_and_dynamic_batching](concurrency_and_dynamic_batching/README.md)
+is to demonstrate the important features of Triton Inference Server such as concurrent model execution and
+dynamic batching. In order to do that, we implemented a people detection application using the C API and Triton
+Inference Server as a shared library.
+
+## Part 2. Analyzing model performance with perf_analyzer
+
+To analyze model performance on Jetson, the
+[perf_analyzer](https://github.com/triton-inference-server/client/blob/main/src/c++/perf_analyzer/README.md)
+tool is used. `perf_analyzer` is included in the release tar file or can be
+compiled from source.
+
+From this directory of the repository, execute the following to evaluate model performance:
+
+```shell
+./perf_analyzer -m peoplenet -b 2 --service-kind=triton_c_api --model-repo=$(pwd)/concurrency_and_dynamic_batching/trtis_model_repo_sample_1 --triton-server-directory=/opt/tritonserver --concurrency-range 1:6 -f perf_c_api.csv
+```
+
+In the example above we saved the results as a `.csv` file. To visualize these
+results, follow the steps described
+[here](https://github.com/triton-inference-server/client/blob/main/src/c++/perf_analyzer/README.md).
diff --git a/docs/examples/jetson/concurrency_and_dynamic_batching/Makefile b/docs/examples/jetson/concurrency_and_dynamic_batching/Makefile
new file mode 100644
index 0000000000..6506314999
--- /dev/null
+++ b/docs/examples/jetson/concurrency_and_dynamic_batching/Makefile
@@ -0,0 +1,47 @@
+# Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+TARGET=people_detection
+GCC=g++
+GCC_PARMS+=-I../../server -I/usr/include/opencv4 -I../../core/include/ -I/usr/local/cuda/targets/aarch64-linux/include
+GCC_PARMS+=-I${HOME}/tritonserver/include/tritonserver -D TRITON_ENABLE_GPU=ON -D TRITON_MIN_COMPUTE_CAPABILITY=5.3
+
+GCC_LIBS=-L${HOME}/tritonserver/lib -L/usr/lib -L/usr/local/cuda/targets/aarch64-linux/lib
+GCC_LIBS+=-lpthread -ltritonserver -lopencv_core -lopencv_highgui -lopencv_imgproc -lopencv_imgcodecs -lopencv_dnn -lcudart
+
+all: $(TARGET)
+
+
+%.o: %.cc
+ $(GCC) $(GCC_PARMS) -c -g -o $@ $^
+
+$(TARGET): $(TARGET).o
+ $(GCC) $^ $(GCC_LIBS) -o $@
+
+clean:
+ rm -f $(TARGET).o $(TARGET)
+
+.PHONY: all clean
diff --git a/docs/examples/jetson/concurrency_and_dynamic_batching/README.md b/docs/examples/jetson/concurrency_and_dynamic_batching/README.md
new file mode 100644
index 0000000000..1f96dd365d
--- /dev/null
+++ b/docs/examples/jetson/concurrency_and_dynamic_batching/README.md
@@ -0,0 +1,331 @@
+
+
+# Concurrent inference and dynamic batching
+
+The purpose of this sample is to demonstrate the important features of Triton Inference Server such as concurrent model execution and dynamic batching.
+
+We will be using a purpose built deployable people detection model, which we download from [Nvidia GPU Cloud (NGC)](https://ngc.nvidia.com/).
+
+## Acquiring the model
+
+Download the pruned [PeopleNet](https://ngc.nvidia.com/catalog/models/nvidia:tlt_peoplenet) model from NGC. This model is available as a ready-to-use model, and you can download it from NGC using either the `wget` method:
+
+```shell
+wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/tao/peoplenet/versions/pruned_v2.1/zip -O pruned_v2.1.zip
+```
+
+or via CLI command:
+
+```shell
+ngc registry model download-version "nvidia/tao/peoplenet:pruned_v2.1"
+```
+
+For the latter, you need to set up the [NGC CLI](https://ngc.nvidia.com/setup).
+
+Having downloaded the model from the NGC, unzip the archive `peoplenet_pruned_v2.1.zip` into `concurrency_and_dynamic_batching/tao/models/peoplenet`.
+
+If you have the zip archive in the `concurrency_and_dynamic_batching` directory, the following will automatically place the model in the correct location:
+
+```shell
+unzip pruned_v2.1.zip -d $(pwd)/tao/models/peoplenet
+```
+
+Verify that you can see the model file `resnet34_peoplenet_pruned.etlt` under
+
+```
+concurrency_and_dynamic_batching
+└── tao
+    └── models
+        └── peoplenet
+            ├── labels.txt
+            └── resnet34_peoplenet_pruned.etlt
+```
+
+## Converting the model to TensorRT
+
+After you have acquired the model file in `.etlt` format, you will need to convert the model to [TensorRT](https://developer.nvidia.com/tensorrt) format. NVIDIA TensorRT is an SDK for high-performance deep learning inference. It includes a deep learning inference optimizer and runtime that delivers low latency and high throughput for deep learning inference applications. The latest versions of JetPack include TensorRT.
+
+In order to convert an `.etlt` model to TensorRT format, you need to use the `tao-converter` tool.
+
+The `tao-converter` tool is available as a compiled release file for different platforms. The download links corresponding to your deployment system are provided among the [TLT Getting Started resources](https://developer.nvidia.com/tlt-get-started).
+
+After you have downloaded `tao-converter`, you might need to execute
+
+```shell
+chmod 777 tao-converter
+```
+
+in the directory with the tool.
+
+We provide a conversion script, `tao/convert_peoplenet.sh`, which expects the model to be present at the following location:
+
+```shell
+tao
+└── models
+    └── peoplenet
+```
+
+To execute it, you can place the `tao-converter` executable in the `tao` directory of the project and, in the same directory, run
+
+```shell
+bash convert_peoplenet.sh
+```
+
+After you execute it, verify that a `model.plan` file was placed in the directories `/trtis_model_repo_sample_1/peoplenet/1` and `/trtis_model_repo_sample_2/peoplenet/1`. Note that we have two slightly different repositories for the same model to demonstrate different features of Triton.
+
+Also note that this step has to be performed on the target hardware: if you are planning to execute this application on Jetson, the conversion has to be performed on Jetson.
+
+To learn more about `tao-converter` parameters, run:
+
+```shell
+./tao-converter -h
+```
+
+## Building the app
+
+To compile the sample, pull the following repositories:
+* [https://github.com/triton-inference-server/server](https://github.com/triton-inference-server/server)
+* [https://github.com/triton-inference-server/core](https://github.com/triton-inference-server/core)
+
+Make sure you copied the contents of the release you downloaded to `$HOME`:
+
+```shell
+sudo cp -rf tritonserver2.x.y-jetpack4.6 $HOME/tritonserver
+```
+
+Open the terminal in `concurrency_and_dynamic_batching` and build the app by executing
+
+```shell
+make
+```
+
+An example Makefile is provided for Jetson.
+
+## Demonstration case 1: Concurrent model execution
+
+With Triton Inference Server, multiple models (or multiple instances of the same model) can run simultaneously on the same GPU or on multiple GPUs. In this example, we are demonstrating how to run multiple instances of the same model on a single Jetson GPU.
+
+### Running the sample
+
+To execute from the terminal, run from the `concurrency_and_dynamic_batching` directory:
+
+```shell
+LD_LIBRARY_PATH=$HOME/tritonserver/lib ./people_detection -m system -v -r $(pwd)/trtis_model_repo_sample_1 -t 6 -s false -p $HOME/tritonserver
+```
+
+The parameter `-t` controls the number of concurrent inference calls we want to execute. We will be executing the same model on the same sample image with the purpose of demonstrating how setting different concurrency options affects the performance.
+
+You can enable saving detected bounding boxes in the project directory in the form of overlays over the original image for each execution thread. You can turn the visualization on by setting the parameter `-s` to `true` upon execution (`-s` is set to `false` by default).
+
+### Expected output
+
+Upon execution, in the terminal log you will see _Model 'peoplenet' Stats_ in JSON format reflecting the inference performance. We also output _TOTAL INFERENCE TIME_, which simply reflects the elapsed time required to run the application, including data loading, pre-processing, and post-processing.
+
+A typical output in the log for _Model 'peoplenet' Stats_ looks as follows:
+
+```json
+{
+ "model_stats":[
+ {
+ "name":"peoplenet",
+ "version":"1",
+ "last_inference":1626448309997,
+ "inference_count":6,
+ "execution_count":6,
+ "inference_stats":{
+ "success":{
+ "count":6,
+ "ns":574589968
+ },
+ "fail":{
+ "count":0,
+ "ns":0
+ },
+ "queue":{
+ "count":6,
+ "ns":234669630
+ },
+ "compute_input":{
+ "count":6,
+ "ns":194884512
+ },
+ "compute_infer":{
+ "count":6,
+ "ns":97322636
+ },
+ "compute_output":{
+ "count":6,
+ "ns":47700806
+ }
+ },
+ "batch_stats":[
+ {
+ "batch_size":1,
+ "compute_input":{
+ "count":6,
+ "ns":194884512
+ },
+ "compute_infer":{
+ "count":6,
+ "ns":97322636
+ },
+ "compute_output":{
+ "count":6,
+ "ns":47700806
+ }
+ }
+ ]
+ }
+ ]
+}
+
+"TOTAL INFERENCE TIME: 174ms"
+```
+
+To learn about the different statistics, check out the [documentation](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_statistics.md#statistics-extension).
+
+To see how setting different values for concurrency affects total execution time and its components reflected in the model stats, you need to modify a single parameter in the model config file.
+
+To enable concurrent model execution support for a model, the corresponding model config file `trtis_model_repo_sample_1/peoplenet/config.pbtxt` includes the following:
+
+```
+instance_group [
+ {
+ count: 3
+ kind: KIND_GPU
+ }
+]
+```
+
+You can change the instance `count` for the model and observe how it affects performance in _Model 'peoplenet' Stats_ and _TOTAL INFERENCE TIME_. Note that on Jetson we don't recommend setting values too high: for instance, on a device like a Jetson Xavier AGX we don't recommend setting the number higher than 6. Values in the range 1-3 are optimal.
+
+While trying out different values, note how they affect total inference time as well as some inference statistics (like queue and compute times).
+
+## Demonstration case 2: Dynamic batching
+
+For models that support batching, Triton implements multiple scheduling and batching algorithms that combine individual inference requests together to improve inference throughput. In this example, we want to demonstrate how enabling automatic dynamic batching affects inference performance.
+
+### Running the sample
+
+To observe the effect of dynamic batching, from the `concurrency_and_dynamic_batching` directory execute:
+
+```shell
+LD_LIBRARY_PATH=$HOME/tritonserver/lib ./people_detection -m system -v -r $(pwd)/trtis_model_repo_sample_2 -t 6 -s false -p $HOME/tritonserver
+```
+
+### Expected output
+
+Take a look at _Model 'peoplenet' Stats_ and _TOTAL INFERENCE TIME_ to see the effect of dynamic batching. A possible outcome looks like this:
+
+```json
+{
+ "model_stats":[
+ {
+ "name":"peoplenet",
+ "version":"1",
+ "last_inference":1626447787832,
+ "inference_count":6,
+ "execution_count":2,
+ "inference_stats":{
+ "success":{
+ "count":6,
+ "ns":558981051
+ },
+ "fail":{
+ "count":0,
+ "ns":0
+ },
+ "queue":{
+ "count":6,
+ "ns":49271380
+ },
+ "compute_input":{
+ "count":6,
+ "ns":170634044
+ },
+ "compute_infer":{
+ "count":6,
+ "ns":338079193
+ },
+ "compute_output":{
+ "count":6,
+ "ns":950544
+ }
+ },
+ "batch_stats":[
+ {
+ "batch_size":1,
+ "compute_input":{
+ "count":1,
+ "ns":15955684
+ },
+ "compute_infer":{
+ "count":1,
+ "ns":29917093
+ },
+ "compute_output":{
+ "count":1,
+ "ns":152264
+ }
+ },
+ {
+ "batch_size":5,
+ "compute_input":{
+ "count":1,
+ "ns":30935672
+ },
+ "compute_infer":{
+ "count":1,
+ "ns":61632420
+ },
+ "compute_output":{
+ "count":1,
+ "ns":159656
+ }
+ }
+ ]
+ }
+ ]
+}
+
+"TOTAL INFERENCE TIME: 162ms"
+```
+
+Notice that this time the model was executed only twice (as indicated by `execution_count`). Also, unlike in the previous example, the `batch_stats` part of the statistics looks different: we see that our model was executed once with `batch = 1` and once with `batch = 5`. This helped decrease the total inference time.
+
+In order to enable dynamic batching, the following is present in the model config `trtis_model_repo_sample_2/peoplenet/config.pbtxt`:
+
+```
+dynamic_batching {
+}
+```
+
+To try further options of the dynamic batcher, see the [documentation](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#dynamic-batcher).
+
+You can also try enabling both concurrent model execution and dynamic batching.
\ No newline at end of file
diff --git a/docs/examples/jetson/concurrency_and_dynamic_batching/capture.jpg b/docs/examples/jetson/concurrency_and_dynamic_batching/capture.jpg
new file mode 100644
index 0000000000..82e2cb38e0
Binary files /dev/null and b/docs/examples/jetson/concurrency_and_dynamic_batching/capture.jpg differ
diff --git a/docs/examples/jetson/concurrency_and_dynamic_batching/common.h b/docs/examples/jetson/concurrency_and_dynamic_batching/common.h
new file mode 100644
index 0000000000..b55c8b71c5
--- /dev/null
+++ b/docs/examples/jetson/concurrency_and_dynamic_batching/common.h
@@ -0,0 +1,106 @@
+// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+// * Neither the name of NVIDIA CORPORATION nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#pragma once
+
+#include <iostream>
+#include <string>
+
+#include "triton/core/tritonserver.h"
+
+#define RETURN_IF_ERR(X) \
+ do { \
+ TRITONSERVER_Error* err__ = (X); \
+ if (err__ != nullptr) { \
+ return err__; \
+ } \
+ } while (false)
+
+#define RETURN_MSG_IF_ERR(X, MSG) \
+ do { \
+ TRITONSERVER_Error* err__ = (X); \
+ if (err__ != nullptr) { \
+ return TRITONSERVER_ErrorNew( \
+ TRITONSERVER_ErrorCode(err__), \
+ (std::string(MSG) + ": " + TRITONSERVER_ErrorMessage(err__)) \
+ .c_str()); \
+ } \
+ } while (false)
+
+#define GOTO_IF_ERR(X, T) \
+ do { \
+ TRITONSERVER_Error* err__ = (X); \
+ if (err__ != nullptr) { \
+ goto T; \
+ } \
+ } while (false)
+
+#define FAIL(MSG) \
+ do { \
+ std::cerr << "error: " << (MSG) << std::endl; \
+ exit(1); \
+ } while (false)
+
+#define FAIL_IF_ERR(X, MSG) \
+ do { \
+ TRITONSERVER_Error* err__ = (X); \
+ if (err__ != nullptr) { \
+ std::cerr << "error: " << (MSG) << ": " \
+ << TRITONSERVER_ErrorCodeString(err__) << " - " \
+ << TRITONSERVER_ErrorMessage(err__) << std::endl; \
+ TRITONSERVER_ErrorDelete(err__); \
+ exit(1); \
+ } \
+ } while (false)
+
+#define IGNORE_ERR(X) \
+ do { \
+ TRITONSERVER_Error* err__ = (X); \
+ if (err__ != nullptr) { \
+ TRITONSERVER_ErrorDelete(err__); \
+ } \
+ } while (false)
+
+#ifdef TRITON_ENABLE_GPU
+#define FAIL_IF_CUDA_ERR(X, MSG) \
+ do { \
+ cudaError_t err__ = (X); \
+ if (err__ != cudaSuccess) { \
+ std::cerr << "error: " << (MSG) << ": " << cudaGetErrorString(err__) \
+ << std::endl; \
+ exit(1); \
+ } \
+ } while (false)
+#endif // TRITON_ENABLE_GPU
+
+/// Get the integral version from a string, or fail if string does not
+/// represent a valid version.
+///
+/// \param version_string The string version.
+/// \param version Returns the integral version.
+/// \return The error status. Failure if 'version_string' doesn't
+/// convert to valid version.
+TRITONSERVER_Error* GetModelVersionFromString(
+ const std::string& version_string, int64_t* version);
diff --git a/docs/examples/jetson/concurrency_and_dynamic_batching/labels.txt b/docs/examples/jetson/concurrency_and_dynamic_batching/labels.txt
new file mode 100644
index 0000000000..8ae80671d6
--- /dev/null
+++ b/docs/examples/jetson/concurrency_and_dynamic_batching/labels.txt
@@ -0,0 +1,4 @@
+person
+bag
+face
+
diff --git a/docs/examples/jetson/concurrency_and_dynamic_batching/people_detection.cc b/docs/examples/jetson/concurrency_and_dynamic_batching/people_detection.cc
new file mode 100644
index 0000000000..ce22bdcba9
--- /dev/null
+++ b/docs/examples/jetson/concurrency_and_dynamic_batching/people_detection.cc
@@ -0,0 +1,1158 @@
+// Copyright (c) 2021, NVIDIA CORPORATION& AFFILIATES.All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+// * Neither the name of NVIDIA CORPORATION nor the names of its
+// contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include
+#include
+#include
+
+#include
+#include