Updated tests
oandreeva-nv committed Apr 9, 2024
1 parent c1c88fa commit 65669e9
Showing 1 changed file with 4 additions and 5 deletions.
9 changes: 4 additions & 5 deletions ci/L0_backend_vllm/vllm_backend/test.sh
@@ -114,23 +114,22 @@ if [[ "$COUNT" -ne 2 ]]; then
     echo "Cmdline parameters verification Failed"
 fi
 
-# Test loading multiple vllm models at the same time
+# Test loading multiple vllm models
 SERVER_ARGS="--model-repository=$(pwd)/models --backend-directory=${BACKEND_DIR} --model-control-mode=explicit --load-model=vllm_one"
 SERVER_LOG="./vllm_test_multi_model.log"
 
 # Create two models, one is just a copy of the other, and make sure gpu
 # utilization is low enough for multiple models to avoid OOM.
 # vLLM changed behavior of their GPU profiler from total to free memory,
-# so to load two small models at the same time, we need to start
-# triton server in explicit mode, load first model with
-# `gpu_memory_utilization` 0.4 and second should be 0.9.
+# so to load two small models, we need to start
+# triton server in explicit mode.
 MODEL1="vllm_one"
 MODEL2="vllm_two"
 mkdir -p models
 cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/${MODEL1}/
 cp -r models/${MODEL1} models/${MODEL2}
 sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.4/' models/${MODEL1}/1/model.json
-sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.9/' models/${MODEL2}/1/model.json
+sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.4/' models/${MODEL2}/1/model.json
 
 run_server
 if [ "$SERVER_PID" == "0" ]; then
Expand Down
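Because the server starts with `--model-control-mode=explicit` and only `--load-model=vllm_one`, the second copy must be loaded after startup. The rest of the test is not shown in this hunk; with explicit model control, one way to do it is Triton's model repository load API. A sketch, where the default HTTP port 8000 is an assumption:

# Hypothetical follow-up load of the second model via Triton's
# model repository API (available in explicit model-control mode).
curl -s -X POST localhost:8000/v2/repository/models/vllm_two/load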
