Merge branch 'main' into zhanga5-fix-sdk-dockerfile
zhanga5 authored Jul 31, 2024
2 parents ded1c20 + 69d768d commit 2fc47c8
Showing 50 changed files with 903 additions and 215 deletions.
50 changes: 39 additions & 11 deletions Dockerfile.sdk
@@ -29,12 +29,14 @@
#

# Base image on the minimum Triton container
ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.06-py3-min
ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:24.07-py3-min

ARG TRITON_CLIENT_REPO_SUBDIR=clientrepo
ARG TRITON_REPO_ORGANIZATION=http://github.com/triton-inference-server
ARG TRITON_PA_REPO_SUBDIR=perfanalyzerrepo
ARG TRITON_COMMON_REPO_TAG=main
ARG TRITON_CORE_REPO_TAG=main
ARG TRITON_CLIENT_REPO_TAG=main
ARG TRITON_THIRD_PARTY_REPO_TAG=main
ARG TRITON_MODEL_ANALYZER_REPO_TAG=main
ARG TRITON_ENABLE_GPU=ON
@@ -104,8 +106,10 @@ RUN rm -f /usr/bin/python && \
# Build the client library and examples
ARG TRITON_REPO_ORGANIZATION
ARG TRITON_CLIENT_REPO_SUBDIR
ARG TRITON_PA_REPO_SUBDIR
ARG TRITON_COMMON_REPO_TAG
ARG TRITON_CORE_REPO_TAG
ARG TRITON_CLIENT_REPO_TAG
ARG TRITON_THIRD_PARTY_REPO_TAG
ARG TRITON_ENABLE_GPU
ARG JAVA_BINDINGS_MAVEN_VERSION
@@ -115,26 +119,53 @@ ARG TARGETPLATFORM
WORKDIR /workspace
COPY TRITON_VERSION .
COPY ${TRITON_CLIENT_REPO_SUBDIR} client
COPY ${TRITON_PA_REPO_SUBDIR} perf_analyzer

WORKDIR /workspace/build
WORKDIR /workspace/client_build
RUN cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \
-DTRITON_VERSION=`cat /workspace/TRITON_VERSION` \
-DTRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION} \
-DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \
-DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \
-DTRITON_THIRD_PARTY_REPO_TAG=${TRITON_THIRD_PARTY_REPO_TAG} \
-DTRITON_ENABLE_PERF_ANALYZER=OFF \
-DTRITON_ENABLE_CC_HTTP=ON -DTRITON_ENABLE_CC_GRPC=ON \
-DTRITON_ENABLE_PYTHON_HTTP=ON -DTRITON_ENABLE_PYTHON_GRPC=ON \
-DTRITON_ENABLE_PYTHON_HTTP=OFF -DTRITON_ENABLE_PYTHON_GRPC=OFF \
-DTRITON_ENABLE_JAVA_HTTP=ON \
-DTRITON_ENABLE_PERF_ANALYZER=ON \
-DTRITON_ENABLE_EXAMPLES=ON -DTRITON_ENABLE_TESTS=ON \
-DTRITON_ENABLE_GPU=${TRITON_ENABLE_GPU} /workspace/client
RUN make -j16 cc-clients java-clients && \
rm -fr ~/.m2

# TODO: PA will rebuild the CC clients since it depends on it.
# This should be optimized so that we do not have to build
# the CC clients twice. Similarly, because the SDK expectation is
# that PA is packaged with the python client, we hold off on building
# the python client until now. Post-migration we should focus
# effort on de-tangling these flows.
WORKDIR /workspace/pa_build
RUN cmake -DCMAKE_INSTALL_PREFIX=/workspace/install \
-DTRITON_VERSION=`cat /workspace/TRITON_VERSION` \
-DTRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION} \
-DTRITON_COMMON_REPO_TAG=${TRITON_COMMON_REPO_TAG} \
-DTRITON_CORE_REPO_TAG=${TRITON_CORE_REPO_TAG} \
-DTRITON_CLIENT_REPO_TAG=${TRITON_CLIENT_REPO_TAG} \
-DTRITON_ENABLE_PERF_ANALYZER_C_API=ON \
-DTRITON_ENABLE_PERF_ANALYZER_TFS=ON \
-DTRITON_ENABLE_PERF_ANALYZER_TS=ON \
-DTRITON_ENABLE_PERF_ANALYZER_OPENAI=ON \
-DTRITON_ENABLE_EXAMPLES=ON -DTRITON_ENABLE_TESTS=ON \
-DTRITON_ENABLE_GPU=${TRITON_ENABLE_GPU} /workspace/client
RUN make -j16 cc-clients python-clients java-clients && \
rm -fr ~/.m2
-DTRITON_ENABLE_CC_HTTP=ON \
-DTRITON_ENABLE_CC_GRPC=ON \
-DTRITON_ENABLE_PYTHON_HTTP=ON \
-DTRITON_ENABLE_PYTHON_GRPC=ON \
-DTRITON_PACKAGE_PERF_ANALYZER=ON \
-DTRITON_ENABLE_GPU=${TRITON_ENABLE_GPU} \
/workspace/perf_analyzer
RUN make -j16 perf-analyzer python-clients

RUN pip3 install build \
&& cd /workspace/perf_analyzer/genai-perf \
&& python3 -m build --wheel --outdir /workspace/install/python

# Install Java API Bindings
RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
@@ -145,9 +176,6 @@ RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
--jar-install-path /workspace/install/java-api-bindings; \
fi

RUN pip3 install build \
&& cd /workspace/client/src/c++/perf_analyzer/genai-perf \
&& python3 -m build --wheel --outdir /workspace/install/python
############################################################################
## Create sdk container
############################################################################
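Note: with this change the SDK image build expects the client and perf_analyzer sources as separate build-context subdirectories (the `TRITON_CLIENT_REPO_SUBDIR` and new `TRITON_PA_REPO_SUBDIR` args above). A minimal sketch of building the image against this revision — repository URLs, subdirectory names, and the image tag are illustrative, not part of this commit:

```bash
# Sketch only: stage the client and perf_analyzer checkouts in the build context,
# then build the SDK image from the server repo root (which provides TRITON_VERSION).
git clone https://github.com/triton-inference-server/client.git clientrepo
git clone https://github.com/triton-inference-server/perf_analyzer.git perfanalyzerrepo
docker build -f Dockerfile.sdk \
    --build-arg TRITON_CLIENT_REPO_SUBDIR=clientrepo \
    --build-arg TRITON_PA_REPO_SUBDIR=perfanalyzerrepo \
    -t tritonserver-sdk .
```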
20 changes: 10 additions & 10 deletions Dockerfile.win10.min
@@ -1,4 +1,4 @@
# Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -37,9 +37,9 @@ RUN choco install unzip -y
#
# Installing TensorRT
#
ARG TENSORRT_VERSION=10.0.1.6
ARG TENSORRT_ZIP="TensorRT-${TENSORRT_VERSION}.Windows10.x86_64.cuda-12.4.zip"
ARG TENSORRT_SOURCE=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.1/zip/TensorRT-10.0.1.6.Windows10.win10.cuda-12.4.zip
ARG TENSORRT_VERSION=10.2.0.19
ARG TENSORRT_ZIP="TensorRT-${TENSORRT_VERSION}.Windows10.x86_64.cuda-12.5.zip"
ARG TENSORRT_SOURCE=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.2.0/zip/TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5.zip
# COPY ${TENSORRT_ZIP} /tmp/${TENSORRT_ZIP}
ADD ${TENSORRT_SOURCE} /tmp/${TENSORRT_ZIP}
RUN unzip /tmp/%TENSORRT_ZIP%
@@ -51,9 +51,9 @@ LABEL TENSORRT_VERSION="${TENSORRT_VERSION}"
#
# Installing cuDNN
#
ARG CUDNN_VERSION=9.1.0.70
ARG CUDNN_VERSION=9.2.1.18
ARG CUDNN_ZIP=cudnn-windows-x86_64-${CUDNN_VERSION}_cuda12-archive.zip
ARG CUDNN_SOURCE=https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.1.0.70_cuda12-archive.zip
ARG CUDNN_SOURCE=https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.2.1.18_cuda12-archive.zip
ADD ${CUDNN_SOURCE} /tmp/${CUDNN_ZIP}
RUN unzip /tmp/%CUDNN_ZIP%
RUN move cudnn-* cudnn
@@ -88,7 +88,7 @@ LABEL PYTHON_VERSION=${PYTHON_VERSION}
#
# Installing CMake
#
ARG CMAKE_VERSION=3.29.3
ARG CMAKE_VERSION=3.30.0
RUN pip install cmake==%CMAKE_VERSION%

ENV CMAKE_TOOLCHAIN_FILE /vcpkg/scripts/buildsystems/vcpkg.cmake
@@ -150,7 +150,7 @@ WORKDIR /
#
ARG CUDA_MAJOR=12
ARG CUDA_MINOR=5
ARG CUDA_PATCH=0
ARG CUDA_PATCH=1
ARG CUDA_VERSION=${CUDA_MAJOR}.${CUDA_MINOR}.${CUDA_PATCH}
ARG CUDA_PACKAGES="nvcc_${CUDA_MAJOR}.${CUDA_MINOR} \
cudart_${CUDA_MAJOR}.${CUDA_MINOR} \
@@ -175,15 +175,15 @@ RUN copy "%CUDA_INSTALL_ROOT_WP%\extras\visual_studio_integration\MSBuildExtensi

RUN setx PATH "%CUDA_INSTALL_ROOT_WP%\bin;%PATH%"

ARG CUDNN_VERSION=9.1.0.70
ARG CUDNN_VERSION=9.2.1.18
ENV CUDNN_VERSION ${CUDNN_VERSION}
COPY --from=dependency_base /cudnn /cudnn
RUN copy cudnn\bin\cudnn*.dll "%CUDA_INSTALL_ROOT_WP%\bin\."
RUN copy cudnn\lib\x64\cudnn*.lib "%CUDA_INSTALL_ROOT_WP%\lib\x64\."
RUN copy cudnn\include\cudnn*.h "%CUDA_INSTALL_ROOT_WP%\include\."
LABEL CUDNN_VERSION="${CUDNN_VERSION}"

ARG TENSORRT_VERSION=10.0.1.6
ARG TENSORRT_VERSION=10.2.0.19
ENV TRT_VERSION ${TENSORRT_VERSION}
COPY --from=dependency_base /TensorRT /TensorRT
RUN setx PATH "c:\TensorRT\lib;%PATH%"
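Note: the TensorRT, cuDNN, and CMake pins above are plain build ARGs, so they can be overridden without editing the file; the matching `*_SOURCE` download URLs must be kept in sync with any version override. A hedged sketch (shown with POSIX-shell line continuations; on Windows cmd use `^` instead):

```bash
# Sketch: override the pinned dependency versions at build time
# (values illustrative; TENSORRT_SOURCE/CUDNN_SOURCE must point at matching archives).
docker build -f Dockerfile.win10.min \
    --build-arg TENSORRT_VERSION=10.2.0.19 \
    --build-arg CUDNN_VERSION=9.2.1.18 \
    --build-arg CMAKE_VERSION=3.30.0 \
    -t win10-py3-min .
```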
17 changes: 9 additions & 8 deletions README.md
@@ -1,5 +1,5 @@
<!--
# Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -30,10 +30,11 @@

[![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause)

> [!WARNING]
> ##### LATEST RELEASE
> You are currently on the `main` branch which tracks under-development progress towards the next release.
> The current release is version [2.47.0](https://github.com/triton-inference-server/server/releases/latest) and corresponds to the 24.06 container release on NVIDIA GPU Cloud (NGC).
> [!WARNING]
> ##### LATEST RELEASE
> You are currently on the `main` branch which tracks under-development progress towards the next release.
> The current release is version [2.48.0](https://github.com/triton-inference-server/server/releases/latest) and corresponds to the 24.07 container release on NVIDIA GPU Cloud (NGC).

Triton Inference Server is an open source inference serving software that
streamlines AI inferencing. Triton enables teams to deploy any AI model from
@@ -91,16 +92,16 @@ Inference Server with the

```bash
# Step 1: Create the example model repository
git clone -b r24.06 https://github.com/triton-inference-server/server.git
git clone -b r24.07 https://github.com/triton-inference-server/server.git
cd server/docs/examples
./fetch_models.sh

# Step 2: Launch triton from the NGC Triton container
docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:24.06-py3 tritonserver --model-repository=/models
docker run --gpus=1 --rm --net=host -v ${PWD}/model_repository:/models nvcr.io/nvidia/tritonserver:24.07-py3 tritonserver --model-repository=/models

# Step 3: Sending an Inference Request
# In a separate console, launch the image_client example from the NGC Triton SDK container
docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:24.06-py3-sdk
docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:24.07-py3-sdk
/workspace/install/bin/image_client -m densenet_onnx -c 3 -s INCEPTION /workspace/images/mug.jpg

# Inference should return the following
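As a follow-on to the quickstart above, the same 24.07 SDK container also ships perf_analyzer; a hedged example of load-testing the served model with it (the concurrency range is illustrative, not taken from this commit):

```bash
# Sketch: run perf_analyzer from the SDK container against the densenet_onnx
# model started in Step 2 above (concurrency range is illustrative).
docker run -it --rm --net=host nvcr.io/nvidia/tritonserver:24.07-py3-sdk \
  perf_analyzer -m densenet_onnx --concurrency-range 1:4
```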
2 changes: 1 addition & 1 deletion TRITON_VERSION
@@ -1 +1 @@
2.48.0dev
2.49.0dev
43 changes: 28 additions & 15 deletions build.py
@@ -69,14 +69,14 @@
# incorrectly load the other version of the openvino libraries.
#
TRITON_VERSION_MAP = {
"2.48.0dev": (
"24.06dev", # triton container
"24.06", # upstream container
"2.49.0dev": (
"24.08dev", # triton container
"24.07", # upstream container
"1.18.1", # ORT
"2024.0.0", # ORT OpenVINO
"2024.0.0", # Standalone OpenVINO
"3.2.6", # DCGM version
"0.5.0.post1", # vLLM version
"0.5.3.post1", # vLLM version
)
}

@@ -1086,18 +1086,23 @@ def create_dockerfile_linux(
# Remove contents that are not needed in runtime
# Setuptools has breaking changes in version 70.0.0, so fix it to 69.5.1
# The generated code in grpc_service_pb2_grpc.py depends on grpcio>=1.64.0, so fix it to 1.64.0
RUN ldconfig && \
ARCH="$(uname -i)" && \
rm -fr ${TRT_ROOT}/bin ${TRT_ROOT}/targets/${ARCH}-linux-gnu/bin ${TRT_ROOT}/data && \
rm -fr ${TRT_ROOT}/doc ${TRT_ROOT}/onnx_graphsurgeon ${TRT_ROOT}/python && \
rm -fr ${TRT_ROOT}/samples ${TRT_ROOT}/targets/${ARCH}-linux-gnu/samples && \
python3 -m pip install --upgrade pip && \
pip3 install --no-cache-dir transformers && \
find /usr -name libtensorrt_llm.so -exec dirname {} \; > /etc/ld.so.conf.d/tensorrt-llm.conf && \
find /opt/tritonserver -name libtritonserver.so -exec dirname {} \; > /etc/ld.so.conf.d/triton-tensorrtllm-worker.conf && \
pip3 install --no-cache-dir setuptools==69.5.1 grpcio-tools==1.64.0
RUN ldconfig && \\
ARCH="$(uname -i)" && \\
rm -fr ${TRT_ROOT}/bin ${TRT_ROOT}/targets/${ARCH}-linux-gnu/bin ${TRT_ROOT}/data && \\
rm -fr ${TRT_ROOT}/doc ${TRT_ROOT}/onnx_graphsurgeon ${TRT_ROOT}/python && \\
rm -fr ${TRT_ROOT}/samples ${TRT_ROOT}/targets/${ARCH}-linux-gnu/samples && \\
python3 -m pip install --upgrade pip && \\
pip3 install --no-cache-dir transformers && \\
find /usr -name libtensorrt_llm.so -exec dirname {} \; > /etc/ld.so.conf.d/tensorrt-llm.conf && \\
find /opt/tritonserver -name libtritonserver.so -exec dirname {} \; > /etc/ld.so.conf.d/triton-tensorrtllm-worker.conf && \\
pip3 install --no-cache-dir grpcio-tools==1.64.0 && \\
pip3 uninstall -y setuptools
ENV LD_LIBRARY_PATH=/usr/local/tensorrt/lib/:/opt/tritonserver/backends/tensorrtllm:$LD_LIBRARY_PATH
# There are some ucc issues when spawning mpi processes with ompi v4.1.7a1.
# Downgrade to ompi v4.1.5rc2 to avoid the issue.
RUN rm -fr /opt/hpcx/ompi
COPY --from=nvcr.io/nvidia/tritonserver:24.02-py3-min /opt/hpcx/ompi /opt/hpcx/ompi
"""
with open(os.path.join(ddir, dockerfile_name), "w") as dfile:
dfile.write(df)
@@ -1229,6 +1234,14 @@ def dockerfile_prepare_container_linux(argmap, backends, enable_gpu, target_mach
virtualenv \\
&& rm -rf /var/lib/apt/lists/*
"""
if "tensorrtllm" in backends:
df += """
# Updating the openssh-client to fix for the CVE-2024-6387. This can be removed when trtllm uses a later CUDA container(12.5 or later)
RUN apt-get update \\
&& apt-get install -y --no-install-recommends \\
openssh-client \\
&& rm -rf /var/lib/apt/lists/*
"""

if "vllm" in backends:
df += """
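For context, `TRITON_VERSION_MAP` above is consumed when build.py generates the server Dockerfiles; a hedged sketch of a typical invocation at this revision (flag names are illustrative — check `./build.py --help` for the exact set supported):

```bash
# Sketch: generate and build a GPU-enabled server image from this checkout
# (flags illustrative; verify against ./build.py --help).
python3 build.py --enable-gpu --enable-logging --endpoint=grpc --endpoint=http --backend=onnxruntime
```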
6 changes: 3 additions & 3 deletions deploy/aws/values.yaml
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2019-2024, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -27,7 +27,7 @@
replicaCount: 1

image:
imageName: nvcr.io/nvidia/tritonserver:24.06-py3
imageName: nvcr.io/nvidia/tritonserver:24.07-py3
pullPolicy: IfNotPresent
modelRepositoryPath: s3://triton-inference-server-repository/model_repository
numGpus: 1
@@ -38,4 +38,4 @@ service:
secret:
region: AWS_REGION
id: AWS_SECRET_KEY_ID
key: AWS_SECRET_ACCESS_KEY
key: AWS_SECRET_ACCESS_KEY
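A hedged example of consuming this values file — installing the AWS chart with Helm and overriding the image tag from the command line (release name and chart path are illustrative):

```bash
# Sketch: install the chart with the updated 24.07 image
# (release name and chart path are illustrative).
helm install triton-inference-server ./deploy/aws \
  --set image.imageName=nvcr.io/nvidia/tritonserver:24.07-py3
```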
4 changes: 2 additions & 2 deletions deploy/fleetcommand/Chart.yaml
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2019-2024, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -26,7 +26,7 @@

apiVersion: v1
# appVersion is the Triton version; update when changing release
appVersion: "2.47.0"
appVersion: "2.48.0"
description: Triton Inference Server (Fleet Command)
name: triton-inference-server
# version is the Chart version; update when changing anything in the chart
8 changes: 4 additions & 4 deletions deploy/fleetcommand/values.yaml
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2019-2024, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -27,7 +27,7 @@
replicaCount: 1

image:
imageName: nvcr.io/nvidia/tritonserver:24.06-py3
imageName: nvcr.io/nvidia/tritonserver:24.07-py3
pullPolicy: IfNotPresent
numGpus: 1
serverCommand: tritonserver
@@ -47,13 +47,13 @@ image:
#
# To set model control mode, uncomment and configure below
# TODO: Fix the following url, it is invalid
# See https://github.com/triton-inference-server/server/blob/r24.06/docs/model_management.md
# See https://github.com/triton-inference-server/server/blob/r24.07/docs/model_management.md
# for more details
#- --model-control-mode=explicit|poll|none
#
# Additional server args
#
# see https://github.com/triton-inference-server/server/blob/r24.06/README.md
# see https://github.com/triton-inference-server/server/blob/r24.07/README.md
# for more details

service:
6 changes: 3 additions & 3 deletions deploy/gcp/values.yaml
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2019-2024, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -27,10 +27,10 @@
replicaCount: 1

image:
imageName: nvcr.io/nvidia/tritonserver:24.06-py3
imageName: nvcr.io/nvidia/tritonserver:24.07-py3
pullPolicy: IfNotPresent
modelRepositoryPath: gs://triton-inference-server-repository/model_repository
numGpus: 1

service:
type: LoadBalancer
type: LoadBalancer
@@ -1,4 +1,4 @@
# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -33,7 +33,7 @@ metadata:
namespace: default
spec:
containers:
- image: nvcr.io/nvidia/tritonserver:24.06-py3-sdk
- image: nvcr.io/nvidia/tritonserver:24.07-py3-sdk
imagePullPolicy: Always
name: nv-triton-client
securityContext:
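A hedged sketch of using the client pod manifest above once applied; the manifest filename and pod name are placeholders (the pod's `metadata.name` is not shown in this hunk), while the container name `nv-triton-client` and the `image_client` path come from the manifest and the SDK image. It also assumes a running Triton server is reachable from the pod (pass `-u <host>:<port>` to image_client otherwise):

```bash
# Sketch: apply the client pod manifest, then run the image_client example inside it
# (filename and pod name are placeholders; metadata.name is not shown in the hunk above).
kubectl apply -f client-pod.yaml
kubectl exec -it <client-pod-name> -c nv-triton-client -- \
  /workspace/install/bin/image_client -m densenet_onnx -c 3 -s INCEPTION /workspace/images/mug.jpg
```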