
Commit

Sync with main
rmccorm4 committed May 29, 2024
2 parents 935bf92 + 2a1691a commit 3d55a8e
Showing 3 changed files with 7 additions and 11 deletions.
1 change: 0 additions & 1 deletion ci/L0_multi_gpu/vllm_backend/test.sh
@@ -111,7 +111,6 @@ function run_multi_gpu_test() {
 }
 
 ### Test
-python3 -m pip install --upgrade pip && pip3 install tritonclient[grpc] nvidia-ml-py3
 rm -f *.log
 RET=0
 
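For context, the deleted line was the only explicit dependency install in this test script; the updated test imports `pynvml` instead of `nvidia_smi`, so the environment is presumably expected to already provide both `tritonclient[grpc]` and an NVML binding. A minimal pre-flight check along those lines (a hypothetical addition, not part of this commit) could be:

# Hypothetical sanity check (not part of this commit): verify the test's
# runtime dependencies are importable now that test.sh no longer installs
# them explicitly.
import importlib

for module_name in ("tritonclient.grpc", "pynvml"):
    importlib.import_module(module_name)  # raises ImportError if missing
print("test dependencies available")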
14 changes: 7 additions & 7 deletions ci/L0_multi_gpu/vllm_backend/vllm_multi_gpu_test.py
@@ -30,7 +30,7 @@
 import unittest
 from functools import partial
 
-import nvidia_smi
+import pynvml
 import tritonclient.grpc as grpcclient
 from tritonclient.utils import *
 
@@ -40,19 +40,19 @@
 
 class VLLMMultiGPUTest(TestResultCollector):
     def setUp(self):
-        nvidia_smi.nvmlInit()
+        pynvml.nvmlInit()
         self.triton_client = grpcclient.InferenceServerClient(url="localhost:8001")
 
     def get_gpu_memory_utilization(self, gpu_id):
-        handle = nvidia_smi.nvmlDeviceGetHandleByIndex(gpu_id)
-        info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
+        handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
+        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
         return info.used
 
     def get_available_gpu_ids(self):
-        device_count = nvidia_smi.nvmlDeviceGetCount()
+        device_count = pynvml.nvmlDeviceGetCount()
         available_gpus = []
         for gpu_id in range(device_count):
-            handle = nvidia_smi.nvmlDeviceGetHandleByIndex(gpu_id)
+            handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
             if handle:
                 available_gpus.append(gpu_id)
         return available_gpus
@@ -178,7 +178,7 @@ def test_multi_gpu_model(self):
         self._test_vllm_multi_gpu_utilization(model)
 
     def tearDown(self):
-        nvidia_smi.nvmlShutdown()
+        pynvml.nvmlShutdown()
         self.triton_client.close()


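The Python changes above swap the legacy `nvidia_smi` module for `pynvml`; the NVML function names themselves are unchanged, only the module name differs. As a standalone illustration (assuming a package that provides the `pynvml` module, such as `nvidia-ml-py`, is installed), the init → query → shutdown flow used by the test looks roughly like this:

# Sketch of the NVML flow exercised by the updated test:
# initialize, enumerate GPUs, read per-GPU memory usage, shut down.
import pynvml

pynvml.nvmlInit()
try:
    for gpu_id in range(pynvml.nvmlDeviceGetCount()):
        handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        print(f"GPU {gpu_id}: {info.used / 1024**2:.0f} MiB used")
finally:
    pynvml.nvmlShutdown()

Wrapping the queries in try/finally mirrors the setUp/tearDown pairing in the test class: nvmlShutdown still runs even if a query fails.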
3 changes: 0 additions & 3 deletions docs/llama_multi_lora_tutorial.md
@@ -61,9 +61,6 @@ sudo docker run --gpus all -it --net=host -p 8001:8001 --shm-size=12G \
 Triton's vLLM container has been introduced starting from 23.10 release, and `multi-lora` experimental support was added in vLLM v0.3.0 release.
 
 > Docker image version `nvcr.io/nvidia/tritonserver:24.05-vllm-python-py3` or higher version is strongly recommended.
-> [!IMPORTANT]
-> 24.05 release is still under active development, and relevant NGC containers are not available at this time.
----
 
 For **pre-24.05 containers**, the docker images didn't support multi-lora feature, so you need to replace that provided in the container `/opt/tritonserver/backends/vllm/model.py` with the most up to date version. Just follow this command:
