Updating vLLM tests to handle changes between vLLM version 0.2.3 and 0.3.0 #30

Merged: 9 commits, Feb 7, 2024
103 changes: 74 additions & 29 deletions ci/L0_backend_vllm/accuracy_test/accuracy_test.py
@@ -1,4 +1,4 @@
# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -24,7 +24,9 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import argparse
import asyncio
import pickle
import sys
import unittest
from functools import partial
@@ -39,12 +41,24 @@
sys.path.append("../../common")
from test_util import TestResultCollector, UserData, callback, create_vllm_request

VLLM_ENGINE_CONFIG = {
"model": "facebook/opt-125m",
"gpu_memory_utilization": 0.3,
}


PROMPTS = [
"The most dangerous animal is",
"The capital of France is",
"The future of AI is",
]

SAMPLING_PARAMETERS = {"temperature": 0, "top_p": 1}


async def generate_python_vllm_output(prompt, llm_engine):
request_id = random_uuid()
sampling_parameters = {"temperature": 0, "top_p": 1}
sampling_params = SamplingParams(**sampling_parameters)

sampling_params = SamplingParams(**SAMPLING_PARAMETERS)
python_vllm_output = None
last_output = None

@@ -59,50 +73,68 @@ async def generate_python_vllm_output(prompt, llm_engine):
return python_vllm_output


def prepare_vllm_baseline_outputs():
"""
Helper function that starts the async vLLM engine and generates output for each
prompt in `PROMPTS`. Saves the resulting baselines in `vllm_baseline_output.pkl`
for further use.
"""
llm_engine = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(**VLLM_ENGINE_CONFIG))
python_vllm_output = []
for i in range(len(PROMPTS)):
python_vllm_output.extend(
asyncio.run(generate_python_vllm_output(PROMPTS[i], llm_engine))
)

with open("vllm_baseline_output.pkl", "wb") as f:
pickle.dump(python_vllm_output, f)

return

Review comment (Member):
Perhaps the return statement can be removed here since it always returns None.
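
A minimal sketch of the suggested change (an editorial illustration, not part of the diff): the helper can simply end after writing the pickle file, since a Python function returns None implicitly.

    def prepare_vllm_baseline_outputs():
        """Start the async vLLM engine, generate an output for each prompt in
        `PROMPTS`, and save the baselines to `vllm_baseline_output.pkl`."""
        llm_engine = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(**VLLM_ENGINE_CONFIG))
        python_vllm_output = []
        for prompt in PROMPTS:
            python_vllm_output.extend(
                asyncio.run(generate_python_vllm_output(prompt, llm_engine))
            )

        with open("vllm_baseline_output.pkl", "wb") as f:
            pickle.dump(python_vllm_output, f)
        # No explicit return: the function implicitly returns None.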


class VLLMTritonAccuracyTest(TestResultCollector):
def setUp(self):
self.triton_client = grpcclient.InferenceServerClient(url="localhost:8001")
vllm_engine_config = {
"model": "facebook/opt-125m",
"gpu_memory_utilization": 0.3,
}

self.llm_engine = AsyncLLMEngine.from_engine_args(
AsyncEngineArgs(**vllm_engine_config)
)
self.vllm_model_name = "vllm_opt"
self.python_vllm_output = []
with open("vllm_baseline_output.pkl", "rb") as f:
self.python_vllm_output = pickle.load(f)

self.assertNotEqual(
self.python_vllm_output,
[],
"Loaded baseline outputs' list should not be empty",
)
self.assertIsNotNone(
self.python_vllm_output, "Loaded baseline outputs' list should not be None"
)
self.assertEqual(
len(self.python_vllm_output),
len(PROMPTS),
"Unexpected number of baseline outputs loaded, expected {}, but got {}".format(
len(PROMPTS), len(self.python_vllm_output)
),
)

def test_vllm_model(self):
user_data = UserData()
stream = False
prompts = [
"The most dangerous animal is",
"The capital of France is",
"The future of AI is",
]
number_of_vllm_reqs = len(prompts)
sampling_parameters = {"temperature": "0", "top_p": "1"}
python_vllm_output = []
triton_vllm_output = []

self.triton_client.start_stream(callback=partial(callback, user_data))
for i in range(number_of_vllm_reqs):
for i in range(len(PROMPTS)):
request_data = create_vllm_request(
prompts[i], i, stream, sampling_parameters, self.vllm_model_name
PROMPTS[i], i, stream, SAMPLING_PARAMETERS, self.vllm_model_name
)
self.triton_client.async_stream_infer(
model_name=self.vllm_model_name,
request_id=request_data["request_id"],
inputs=request_data["inputs"],
outputs=request_data["outputs"],
parameters=sampling_parameters,
)

python_vllm_output.extend(
asyncio.run(generate_python_vllm_output(prompts[i], self.llm_engine))
parameters=SAMPLING_PARAMETERS,
)

for i in range(number_of_vllm_reqs):
for i in range(len(PROMPTS)):
result = user_data._completed_requests.get()
self.assertIsNot(type(result), InferenceServerException, str(result))

@@ -112,11 +144,24 @@ def test_vllm_model(self):
triton_vllm_output.extend(output)

self.triton_client.stop_stream()
self.assertEqual(python_vllm_output, triton_vllm_output)
# Compare sorted copies so response ordering does not matter.
self.assertEqual(sorted(self.python_vllm_output), sorted(triton_vllm_output))

def tearDown(self):
self.triton_client.close()


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--generate-baseline",
action="store_true",
required=False,
default=False,
help="Generates baseline output for accuracy tests",
)
FLAGS = parser.parse_args()
if FLAGS.generate_baseline:
prepare_vllm_baseline_outputs()
exit(0)

unittest.main()
14 changes: 11 additions & 3 deletions ci/L0_backend_vllm/accuracy_test/test.sh
@@ -1,5 +1,5 @@
#!/bin/bash
# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -36,14 +36,22 @@ CLIENT_LOG="./accuracy_test_client.log"
TEST_RESULT_FILE='test_results.txt'
CLIENT_PY="./accuracy_test.py"
SAMPLE_MODELS_REPO="../../../samples/model_repository"
VLLM_ENGINE_LOG="vllm_engine.log"
EXPECTED_NUM_TESTS=1

rm -rf models && mkdir -p models
cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_opt
sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.3/' models/vllm_opt/1/model.json

[ -f vllm_baseline_output.pkl ] && rm vllm_baseline_output.pkl
RET=0

set +e
# Need to generate baseline first, since running 2 vLLM engines causes
# memory issues: https://github.com/vllm-project/vllm/issues/2248
python3 $CLIENT_PY --generate-baseline >> $VLLM_ENGINE_LOG 2>&1 & BASELINE_PID=$!
wait $BASELINE_PID
set -e

run_server
if [ "$SERVER_PID" == "0" ]; then
cat $SERVER_LOG
@@ -52,7 +60,7 @@ if [ "$SERVER_PID" == "0" ]; then
fi

set +e
python3 $CLIENT_PY -v > $CLIENT_LOG 2>&1
python3 $CLIENT_PY > $CLIENT_LOG 2>&1

if [ $? -ne 0 ]; then
cat $CLIENT_LOG
28 changes: 25 additions & 3 deletions ci/L0_backend_vllm/vllm_backend/test.sh
@@ -38,6 +38,16 @@ CLIENT_PY="./vllm_backend_test.py"
SAMPLE_MODELS_REPO="../../../samples/model_repository"
EXPECTED_NUM_TESTS=3

# Helpers =======================================
function assert_curl_success {
message="${1}"
if [ "$code" != "200" ]; then
cat ./curl.out
echo -e "\n***\n*** ${message} : line ${BASH_LINENO}\n***"
RET=1
fi
}

rm -rf models && mkdir -p models
cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_opt

@@ -105,17 +115,22 @@ if [[ "$COUNT" -ne 2 ]]; then
fi

# Test loading multiple vllm models at the same time
SERVER_ARGS="--model-repository=$(pwd)/models --backend-directory=${BACKEND_DIR}"
SERVER_ARGS="--model-repository=$(pwd)/models --backend-directory=${BACKEND_DIR} --model-control-mode=explicit --load-model=vllm_one"
SERVER_LOG="./vllm_test_multi_model.log"

# Create two models, one is just a copy of the other, and make sure gpu
# utilization is low enough for multiple models to avoid OOM
# utilization is low enough for multiple models to avoid OOM.
# vLLM changed the behavior of its GPU profiler from total to free memory,
# so to load two small models at the same time we need to start the
# Triton server in explicit mode and load the first model with
# `gpu_memory_utilization` 0.4 and the second with 0.9.
MODEL1="vllm_one"
MODEL2="vllm_two"
mkdir -p models
cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/${MODEL1}/
sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.3/' models/${MODEL1}/1/model.json
cp -r models/${MODEL1} models/${MODEL2}
sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.4/' models/${MODEL1}/1/model.json
sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.9/' models/${MODEL2}/1/model.json

run_server
if [ "$SERVER_PID" == "0" ]; then
@@ -124,6 +139,13 @@ if [ "$SERVER_PID" == "0" ]; then
exit 1
fi

# Explicitly load model
rm -f ./curl.out
set +e
code=`curl -s -w %{http_code} -o ./curl.out -X POST localhost:8000/v2/repository/models/vllm_two/load`
set -e
assert_curl_success "Failed to load 'vllm_two' model"

kill $SERVER_PID
wait $SERVER_PID
rm -rf "./models"
3 changes: 2 additions & 1 deletion samples/model_repository/vllm_model/1/model.json
@@ -1,5 +1,6 @@
{
"model":"facebook/opt-125m",
"disable_log_requests": "true",
"gpu_memory_utilization": 0.5
"gpu_memory_utilization": 0.5,
"enforce_eager": "true"
}
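
For orientation (an editorial sketch, not part of the diff): the keys in model.json mirror vLLM engine arguments, and the accuracy test above builds its engine the same way from a Python dict. Assuming the JSON is forwarded to AsyncEngineArgs largely as-is, loading it would look roughly like:

    import json

    from vllm import AsyncEngineArgs, AsyncLLMEngine

    with open("models/vllm_opt/1/model.json") as f:
        config = json.load(f)

    # Note: boolean-like fields are stored as strings ("true") in this sample config;
    # a real loader would coerce them to bool before constructing the engine args.
    engine = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(**config))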