Tag 0.0.9 and update versions to 24.06 #79

Merged (1 commit, Jul 26, 2024)
README.md (17 changes: 9 additions & 8 deletions)
@@ -22,8 +22,8 @@ and running the CLI from within the latest corresponding `tritonserver`
container image, which should have all necessary system dependencies installed.

For vLLM and TRT-LLM, you can use their respective images:
-- `nvcr.io/nvidia/tritonserver:24.05-vllm-python-py3`
-- `nvcr.io/nvidia/tritonserver:24.05-trtllm-python-py3`
+- `nvcr.io/nvidia/tritonserver:24.06-vllm-python-py3`
+- `nvcr.io/nvidia/tritonserver:24.06-trtllm-python-py3`

If you decide to run the CLI on the host or in a custom image, please
see this list of [additional dependencies](#additional-dependencies-for-custom-environments)
@@ -38,6 +38,7 @@ matrix below:

| Triton CLI Version | TRT-LLM Version | Triton Container Tag |
|:------------------:|:---------------:|:--------------------:|
+| 0.0.9 | v0.10.0 | 24.06 |
| 0.0.8 | v0.9.0 | 24.05 |
| 0.0.7 | v0.9.0 | 24.04 |
| 0.0.6 | v0.8.0 | 24.02, 24.03 |
@@ -55,7 +56,7 @@ It is also possible to install from a specific branch name, a commit hash
or a tag name. For example to install `triton_cli` with a specific tag:

```bash
GIT_REF="0.0.8"
GIT_REF="0.0.9"
pip install git+https://github.com/triton-inference-server/triton_cli.git@${GIT_REF}
```

@@ -90,7 +91,7 @@ triton -h
triton import -m gpt2

# Start server pointing at the default model repository
-triton start --image nvcr.io/nvidia/tritonserver:24.05-vllm-python-py3
+triton start --image nvcr.io/nvidia/tritonserver:24.06-vllm-python-py3

# Infer with CLI
triton infer -m gpt2 --prompt "machine learning is"
@@ -144,10 +145,10 @@ docker run -ti \
--shm-size=1g --ulimit memlock=-1 \
-v ${HOME}/models:/root/models \
-v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
-nvcr.io/nvidia/tritonserver:24.05-vllm-python-py3
+nvcr.io/nvidia/tritonserver:24.06-vllm-python-py3

# Install the Triton CLI
-pip install git+https://github.com/triton-inference-server/triton_cli.git@0.0.8
+pip install git+https://github.com/triton-inference-server/triton_cli.git@0.0.9

# Authenticate with huggingface for restricted models like Llama-2 and Llama-3
huggingface-cli login
@@ -213,10 +214,10 @@ docker run -ti \
-v /tmp:/tmp \
-v ${HOME}/models:/root/models \
-v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
-nvcr.io/nvidia/tritonserver:24.05-trtllm-python-py3
+nvcr.io/nvidia/tritonserver:24.06-trtllm-python-py3

# Install the Triton CLI
-pip install git+https://github.com/triton-inference-server/triton_cli.git@0.0.8
+pip install git+https://github.com/triton-inference-server/triton_cli.git@0.0.9

# Authenticate with huggingface for restricted models like Llama-2 and Llama-3
huggingface-cli login
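The README changes above pair Triton CLI 0.0.9 with the 24.06 container images. As a rough sketch of keeping the two tags in sync on the command line (the shell variables below are purely illustrative, not part of the CLI):

```bash
# Pairing taken from the compatibility matrix in this diff:
# Triton CLI 0.0.9 <-> TRT-LLM v0.10.0 <-> Triton container tag 24.06
CLI_TAG="0.0.9"
CONTAINER_TAG="24.06"

# Pull the matching vLLM container image and install the matching CLI tag
docker pull nvcr.io/nvidia/tritonserver:${CONTAINER_TAG}-vllm-python-py3
pip install "git+https://github.com/triton-inference-server/triton_cli.git@${CLI_TAG}"
```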
src/triton_cli/__init__.py (2 changes: 1 addition & 1 deletion)
@@ -24,4 +24,4 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

__version__ = "0.0.9dev"
__version__ = "0.0.9"
src/triton_cli/docker/Dockerfile (2 changes: 1 addition & 1 deletion)
@@ -3,7 +3,7 @@ FROM nvcr.io/nvidia/tritonserver:24.06-trtllm-python-py3

# Setup vLLM Triton backend
RUN mkdir -p /opt/tritonserver/backends/vllm && \
-wget -P /opt/tritonserver/backends/vllm https://raw.githubusercontent.com/triton-inference-server/vllm_backend/main/src/model.py
+wget -P /opt/tritonserver/backends/vllm https://raw.githubusercontent.com/triton-inference-server/vllm_backend/r24.06/src/model.py

# vLLM runtime dependencies
RUN pip install "vllm==0.4.3"
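For reference, the updated Dockerfile can be built from a checkout of the tagged release roughly as follows; the local image name `triton-cli:0.0.9` is only an example:

```bash
# Clone the tagged release and build the CLI's container image.
# The Dockerfile path comes from this diff; the -t name is arbitrary.
git clone --branch 0.0.9 https://github.com/triton-inference-server/triton_cli.git
cd triton_cli
docker build -t triton-cli:0.0.9 -f src/triton_cli/docker/Dockerfile .
```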