
Commit

Sync with main
rmccorm4 committed May 29, 2024
2 parents 935bf92 + 2a1691a commit 3d55a8e
Showing 3 changed files with 7 additions and 11 deletions.
1 change: 0 additions & 1 deletion ci/L0_multi_gpu/vllm_backend/test.sh
@@ -111,7 +111,6 @@ function run_multi_gpu_test() {
 }
 
 ### Test
-python3 -m pip install --upgrade pip && pip3 install tritonclient[grpc] nvidia-ml-py3
 rm -f *.log
 RET=0
 
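For context, the deleted line was the only explicit dependency install in this test script; the updated test imports `pynvml` instead of `nvidia_smi`, so the environment is presumably expected to already provide both `tritonclient[grpc]` and an NVML binding. A minimal pre-flight check along those lines (a hypothetical addition, not part of this commit) could be:

# Hypothetical sanity check (not part of this commit): verify the test's
# runtime dependencies are importable now that test.sh no longer installs
# them explicitly.
import importlib

for module_name in ("tritonclient.grpc", "pynvml"):
    importlib.import_module(module_name)  # raises ImportError if missing
print("test dependencies available")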
14 changes: 7 additions & 7 deletions ci/L0_multi_gpu/vllm_backend/vllm_multi_gpu_test.py
@@ -30,7 +30,7 @@
 import unittest
 from functools import partial
 
-import nvidia_smi
+import pynvml
 import tritonclient.grpc as grpcclient
 from tritonclient.utils import *
 
@@ -40,19 +40,19 @@
 
 class VLLMMultiGPUTest(TestResultCollector):
     def setUp(self):
-        nvidia_smi.nvmlInit()
+        pynvml.nvmlInit()
         self.triton_client = grpcclient.InferenceServerClient(url="localhost:8001")
 
     def get_gpu_memory_utilization(self, gpu_id):
-        handle = nvidia_smi.nvmlDeviceGetHandleByIndex(gpu_id)
-        info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
+        handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
+        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
         return info.used
 
     def get_available_gpu_ids(self):
-        device_count = nvidia_smi.nvmlDeviceGetCount()
+        device_count = pynvml.nvmlDeviceGetCount()
         available_gpus = []
         for gpu_id in range(device_count):
-            handle = nvidia_smi.nvmlDeviceGetHandleByIndex(gpu_id)
+            handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
             if handle:
                 available_gpus.append(gpu_id)
         return available_gpus
@@ -178,7 +178,7 @@ def test_multi_gpu_model(self):
         self._test_vllm_multi_gpu_utilization(model)
 
     def tearDown(self):
-        nvidia_smi.nvmlShutdown()
+        pynvml.nvmlShutdown()
         self.triton_client.close()


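The Python changes above swap the legacy `nvidia_smi` module for `pynvml`; the NVML function names themselves are unchanged, only the module name differs. As a standalone illustration (assuming a package that provides the `pynvml` module, such as `nvidia-ml-py`, is installed), the init → query → shutdown flow used by the test looks roughly like this:

# Sketch of the NVML flow exercised by the updated test:
# initialize, enumerate GPUs, read per-GPU memory usage, shut down.
import pynvml

pynvml.nvmlInit()
try:
    for gpu_id in range(pynvml.nvmlDeviceGetCount()):
        handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        print(f"GPU {gpu_id}: {info.used / 1024**2:.0f} MiB used")
finally:
    pynvml.nvmlShutdown()

Wrapping the queries in try/finally mirrors the setUp/tearDown pairing in the test class: nvmlShutdown still runs even if a query fails.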
3 changes: 0 additions & 3 deletions docs/llama_multi_lora_tutorial.md
@@ -61,9 +61,6 @@ sudo docker run --gpus all -it --net=host -p 8001:8001 --shm-size=12G \
 Triton's vLLM container has been introduced starting from 23.10 release, and `multi-lora` experimental support was added in vLLM v0.3.0 release.
 
 > Docker image version `nvcr.io/nvidia/tritonserver:24.05-vllm-python-py3` or higher version is strongly recommended.
-> [!IMPORTANT]
-> 24.05 release is still under active development, and relevant NGC containers are not available at this time.
----
 
 For **pre-24.05 containers**, the docker images didn't support multi-lora feature, so you need to replace that provided in the container `/opt/tritonserver/backends/vllm/model.py` with the most up to date version. Just follow this command:
