From f77614efbc23252e70979ccbe61312a3a0e1684b Mon Sep 17 00:00:00 2001
From: Sai Kiran Polisetty
Date: Wed, 29 May 2024 23:14:29 +0530
Subject: [PATCH 1/2] Update CI - Bump vllm to v0.4.2 (#43)

---
 ci/L0_multi_gpu/vllm_backend/test.sh          |  2 --
 .../vllm_backend/vllm_multi_gpu_test.py       | 14 +++++++-------
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/ci/L0_multi_gpu/vllm_backend/test.sh b/ci/L0_multi_gpu/vllm_backend/test.sh
index 36369196..09a0bb08 100755
--- a/ci/L0_multi_gpu/vllm_backend/test.sh
+++ b/ci/L0_multi_gpu/vllm_backend/test.sh
@@ -42,8 +42,6 @@ rm -rf models && mkdir -p models
 cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_opt
 sed -i '3s/^/ "tensor_parallel_size": 2,\n/' models/vllm_opt/1/model.json
 
-python3 -m pip install --upgrade pip && pip3 install tritonclient[grpc] nvidia-ml-py3
-
 RET=0
 
 run_server
diff --git a/ci/L0_multi_gpu/vllm_backend/vllm_multi_gpu_test.py b/ci/L0_multi_gpu/vllm_backend/vllm_multi_gpu_test.py
index baa71632..f9bb56b3 100644
--- a/ci/L0_multi_gpu/vllm_backend/vllm_multi_gpu_test.py
+++ b/ci/L0_multi_gpu/vllm_backend/vllm_multi_gpu_test.py
@@ -28,7 +28,7 @@
 import unittest
 from functools import partial
 
-import nvidia_smi
+import pynvml
 import tritonclient.grpc as grpcclient
 from tritonclient.utils import *
 
@@ -38,20 +38,20 @@ class VLLMMultiGPUTest(TestResultCollector):
     def setUp(self):
-        nvidia_smi.nvmlInit()
+        pynvml.nvmlInit()
         self.triton_client = grpcclient.InferenceServerClient(url="localhost:8001")
         self.vllm_model_name = "vllm_opt"
 
     def get_gpu_memory_utilization(self, gpu_id):
-        handle = nvidia_smi.nvmlDeviceGetHandleByIndex(gpu_id)
-        info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
+        handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
+        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
         return info.used
 
     def get_available_gpu_ids(self):
-        device_count = nvidia_smi.nvmlDeviceGetCount()
+        device_count = pynvml.nvmlDeviceGetCount()
         available_gpus = []
         for gpu_id in range(device_count):
-            handle = nvidia_smi.nvmlDeviceGetHandleByIndex(gpu_id)
+            handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
             if handle:
                 available_gpus.append(gpu_id)
         return available_gpus
@@ -119,7 +119,7 @@ def _test_vllm_model(self, send_parameters_as_tensor=True):
         self.triton_client.stop_stream()
 
     def tearDown(self):
-        nvidia_smi.nvmlShutdown()
+        pynvml.nvmlShutdown()
         self.triton_client.close()

From 2a1691a9452815709f215ff545eb5777dbc2ff3b Mon Sep 17 00:00:00 2001
From: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com>
Date: Wed, 29 May 2024 12:12:27 -0700
Subject: [PATCH 2/2] Removed the note regarding 24.05 being in development (#44)

---
 docs/llama_multi_lora_tutorial.md | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/docs/llama_multi_lora_tutorial.md b/docs/llama_multi_lora_tutorial.md
index e5e6b579..c12910e6 100644
--- a/docs/llama_multi_lora_tutorial.md
+++ b/docs/llama_multi_lora_tutorial.md
@@ -61,9 +61,6 @@ sudo docker run --gpus all -it --net=host -p 8001:8001 --shm-size=12G \
 Triton's vLLM container has been introduced starting from 23.10 release, and `multi-lora` experimental support was added in vLLM v0.3.0 release.
 
 > Docker image version `nvcr.io/nvidia/tritonserver:24.05-vllm-python-py3` or higher version is strongly recommended.
-
-> [!IMPORTANT]
-> 24.05 release is still under active development, and relevant NGC containers are not available at this time.
 
 ---
 For **pre-24.05 containers**, the docker images didn't support multi-lora feature, so you need to replace that provided in the container `/opt/tritonserver/backends/vllm/model.py` with the most up to date version. Just follow this command:
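
Note on the first patch: it drops the explicit `pip3 install ... nvidia-ml-py3` step and switches the multi-GPU test from the deprecated `nvidia_smi` module to `pynvml`, which is presumably already available through vLLM's own dependencies (which is why the explicit install could be removed). The snippet below is a minimal, illustrative sketch — not part of either patch — of the NVML call pattern the updated test relies on; the enumeration loop and printing are added here only for demonstration and assume an NVIDIA GPU with a working driver.

```python
# Minimal sketch (not part of the patches) of the pynvml pattern used by the
# updated test: initialize NVML, enumerate GPUs, read per-device memory usage.
import pynvml

pynvml.nvmlInit()
try:
    for gpu_id in range(pynvml.nvmlDeviceGetCount()):
        handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        # info.used reports the bytes currently allocated on this GPU; the test's
        # get_gpu_memory_utilization() returns this value for a given gpu_id.
        print(f"GPU {gpu_id}: {info.used / 1024**2:.0f} MiB in use")
finally:
    pynvml.nvmlShutdown()
```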