From aded3c3b76550db35c9232f008b5397d29663544 Mon Sep 17 00:00:00 2001
From: oandreeva-nv
Date: Wed, 31 Jan 2024 12:26:35 -0800
Subject: [PATCH 1/9] trying workaround for accuracy test with the latest vllm 0.3.0

---
 ci/L0_backend_vllm/accuracy_test/test.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/L0_backend_vllm/accuracy_test/test.sh b/ci/L0_backend_vllm/accuracy_test/test.sh
index 584d107d..233366b9 100755
--- a/ci/L0_backend_vllm/accuracy_test/test.sh
+++ b/ci/L0_backend_vllm/accuracy_test/test.sh
@@ -40,7 +40,7 @@ EXPECTED_NUM_TESTS=1
 
 rm -rf models && mkdir -p models
 cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_opt
-sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.3/' models/vllm_opt/1/model.json
+sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.4/' models/vllm_opt/1/model.json
 
 RET=0
 

From d4521ac5010c4ededeaf3db30dcbeb1d70c3a97c Mon Sep 17 00:00:00 2001
From: oandreeva-nv
Date: Wed, 31 Jan 2024 17:55:45 -0800
Subject: [PATCH 2/9] Extracted baseline vLLM engine run from unittests

---
 .../accuracy_test/accuracy_test.py       | 87 +++++++++++++------
 ci/L0_backend_vllm/accuracy_test/test.sh |  9 +-
 2 files changed, 67 insertions(+), 29 deletions(-)

diff --git a/ci/L0_backend_vllm/accuracy_test/accuracy_test.py b/ci/L0_backend_vllm/accuracy_test/accuracy_test.py
index 59c669b6..e17fc8bf 100644
--- a/ci/L0_backend_vllm/accuracy_test/accuracy_test.py
+++ b/ci/L0_backend_vllm/accuracy_test/accuracy_test.py
@@ -25,6 +25,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 import asyncio
+import pickle
 import sys
 import unittest
 from functools import partial
@@ -39,12 +40,24 @@ sys.path.append("../../common")
 from test_util import TestResultCollector, UserData, callback, create_vllm_request
 
+VLLM_ENGINE_CONFIG = {
+    "model": "facebook/opt-125m",
+    "gpu_memory_utilization": 0.3,
+}
+
+
+PROMPTS = [
+    "The most dangerous animal is",
+    "The capital of France is",
+    "The future of AI is",
+]
+
+SAMPLING_PARAMETERS = {"temperature": 0, "top_p": 1}
+
 
 async def generate_python_vllm_output(prompt, llm_engine):
     request_id = random_uuid()
-    sampling_parameters = {"temperature": 0, "top_p": 1}
-    sampling_params = SamplingParams(**sampling_parameters)
-
+    sampling_params = SamplingParams(**SAMPLING_PARAMETERS)
     python_vllm_output = None
     last_output = None
 
@@ -59,50 +72,68 @@ async def generate_python_vllm_output(prompt, llm_engine):
     return python_vllm_output
 
 
+def prepare_vllm_engine_outputs():
+    """
+    Helper function that starts async vLLM engine and generates output for each
+    prompt in `PROMPTS`. Saves resulted baselines in `vllm_engine_output.pkl`
+    for further use.
+ """ + llm_engine = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(**VLLM_ENGINE_CONFIG)) + python_vllm_output = [] + for i in range(len(PROMPTS)): + python_vllm_output.extend( + asyncio.run(generate_python_vllm_output(PROMPTS[i], llm_engine)) + ) + + with open("vllm_engine_output.pkl", "wb") as f: + pickle.dump(python_vllm_output, f) + + return + + class VLLMTritonAccuracyTest(TestResultCollector): def setUp(self): self.triton_client = grpcclient.InferenceServerClient(url="localhost:8001") - vllm_engine_config = { - "model": "facebook/opt-125m", - "gpu_memory_utilization": 0.3, - } - - self.llm_engine = AsyncLLMEngine.from_engine_args( - AsyncEngineArgs(**vllm_engine_config) - ) self.vllm_model_name = "vllm_opt" + self.python_vllm_output = [] + with open("vllm_engine_output.pkl", "rb") as f: + self.python_vllm_output = pickle.load(f) + + self.assertNotEqual( + self.python_vllm_output, + [], + "Loaded baseline outputs' list should not be empty", + ) + self.assertIsNotNone( + self.python_vllm_output, "Loaded baseline outputs' list should not be None" + ) + self.assertEqual( + len(self.python_vllm_output), + len(PROMPTS), + "Unexpected number of baseline outputs loaded, expected {}, but got {}".format( + len(PROMPTS), len(self.python_vllm_output) + ), + ) def test_vllm_model(self): user_data = UserData() stream = False - prompts = [ - "The most dangerous animal is", - "The capital of France is", - "The future of AI is", - ] - number_of_vllm_reqs = len(prompts) - sampling_parameters = {"temperature": "0", "top_p": "1"} - python_vllm_output = [] triton_vllm_output = [] self.triton_client.start_stream(callback=partial(callback, user_data)) - for i in range(number_of_vllm_reqs): + for i in range(len(PROMPTS)): request_data = create_vllm_request( - prompts[i], i, stream, sampling_parameters, self.vllm_model_name + PROMPTS[i], i, stream, SAMPLING_PARAMETERS, self.vllm_model_name ) self.triton_client.async_stream_infer( model_name=self.vllm_model_name, request_id=request_data["request_id"], inputs=request_data["inputs"], outputs=request_data["outputs"], - parameters=sampling_parameters, - ) - - python_vllm_output.extend( - asyncio.run(generate_python_vllm_output(prompts[i], self.llm_engine)) + parameters=SAMPLING_PARAMETERS, ) - for i in range(number_of_vllm_reqs): + for i in range(len(PROMPTS)): result = user_data._completed_requests.get() self.assertIsNot(type(result), InferenceServerException, str(result)) @@ -112,7 +143,7 @@ def test_vllm_model(self): triton_vllm_output.extend(output) self.triton_client.stop_stream() - self.assertEqual(python_vllm_output, triton_vllm_output) + self.assertEqual(self.python_vllm_output.sort(), triton_vllm_output.sort()) def tearDown(self): self.triton_client.close() diff --git a/ci/L0_backend_vllm/accuracy_test/test.sh b/ci/L0_backend_vllm/accuracy_test/test.sh index 233366b9..a3a44dc7 100755 --- a/ci/L0_backend_vllm/accuracy_test/test.sh +++ b/ci/L0_backend_vllm/accuracy_test/test.sh @@ -36,14 +36,21 @@ CLIENT_LOG="./accuracy_test_client.log" TEST_RESULT_FILE='test_results.txt' CLIENT_PY="./accuracy_test.py" SAMPLE_MODELS_REPO="../../../samples/model_repository" +VLLM_ENGINE_LOG="vllm_engine.log" EXPECTED_NUM_TESTS=1 rm -rf models && mkdir -p models cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_opt -sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.4/' models/vllm_opt/1/model.json +sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.3/' models/vllm_opt/1/model.json RET=0 +set +e +# Need to generate baseline first, since 
+# memory issues: https://github.com/vllm-project/vllm/issues/2248
+python -c "import accuracy_test; accuracy_test.prepare_vllm_engine_outputs()" >> $VLLM_ENGINE_LOG 2>&1
+set -e
+
 run_server
 if [ "$SERVER_PID" == "0" ]; then
     cat $SERVER_LOG

From ae8185613ce327d6ca703e6e1644a9654298c516 Mon Sep 17 00:00:00 2001
From: oandreeva-nv
Date: Wed, 31 Jan 2024 18:07:06 -0800
Subject: [PATCH 3/9] Updated copyright

---
 ci/L0_backend_vllm/accuracy_test/accuracy_test.py | 2 +-
 ci/L0_backend_vllm/accuracy_test/test.sh          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ci/L0_backend_vllm/accuracy_test/accuracy_test.py b/ci/L0_backend_vllm/accuracy_test/accuracy_test.py
index e17fc8bf..4da4753f 100644
--- a/ci/L0_backend_vllm/accuracy_test/accuracy_test.py
+++ b/ci/L0_backend_vllm/accuracy_test/accuracy_test.py
@@ -1,4 +1,4 @@
-# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
diff --git a/ci/L0_backend_vllm/accuracy_test/test.sh b/ci/L0_backend_vllm/accuracy_test/test.sh
index a3a44dc7..c666579e 100755
--- a/ci/L0_backend_vllm/accuracy_test/test.sh
+++ b/ci/L0_backend_vllm/accuracy_test/test.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions

From 40296a5416e3294377113d2a4e46e233962c953d Mon Sep 17 00:00:00 2001
From: oandreeva-nv
Date: Thu, 1 Feb 2024 09:25:41 -0800
Subject: [PATCH 4/9] Adding fix to test

---
 ci/L0_backend_vllm/accuracy_test/test.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ci/L0_backend_vllm/accuracy_test/test.sh b/ci/L0_backend_vllm/accuracy_test/test.sh
index c666579e..bce2d9eb 100755
--- a/ci/L0_backend_vllm/accuracy_test/test.sh
+++ b/ci/L0_backend_vllm/accuracy_test/test.sh
@@ -48,7 +48,8 @@ RET=0
 set +e
 # Need to generate baseline first, since running 2 vLLM engines causes
 # memory issues: https://github.com/vllm-project/vllm/issues/2248
-python -c "import accuracy_test; accuracy_test.prepare_vllm_engine_outputs()" >> $VLLM_ENGINE_LOG 2>&1
+python -c "import accuracy_test; accuracy_test.prepare_vllm_engine_outputs()" >> $VLLM_ENGINE_LOG 2>&1 & BASELINE_PID=$!
+wait $BASELINE_PID
 set -e
 
 run_server

From 7764f9cc016403e217ea64104ece5b78e7516d85 Mon Sep 17 00:00:00 2001
From: oandreeva-nv
Date: Fri, 2 Feb 2024 09:00:37 -0800
Subject: [PATCH 5/9] Revision 1

---
 .../accuracy_test/accuracy_test.py       | 22 +++++++++++++++----
 ci/L0_backend_vllm/accuracy_test/test.sh |  6 ++---
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/ci/L0_backend_vllm/accuracy_test/accuracy_test.py b/ci/L0_backend_vllm/accuracy_test/accuracy_test.py
index 4da4753f..89598164 100644
--- a/ci/L0_backend_vllm/accuracy_test/accuracy_test.py
+++ b/ci/L0_backend_vllm/accuracy_test/accuracy_test.py
@@ -24,6 +24,7 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import argparse
 import asyncio
 import pickle
 import sys
@@ -73,10 +74,10 @@ async def generate_python_vllm_output(prompt, llm_engine):
     return python_vllm_output
 
 
-def prepare_vllm_engine_outputs():
+def prepare_vllm_baseline_outputs():
     """
     Helper function that starts async vLLM engine and generates output for each
-    prompt in `PROMPTS`. Saves resulted baselines in `vllm_engine_output.pkl`
+    prompt in `PROMPTS`. Saves resulted baselines in `vllm_baseline_output.pkl`
     for further use.
     """
     llm_engine = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(**VLLM_ENGINE_CONFIG))
@@ -85,7 +86,7 @@ def prepare_vllm_baseline_outputs():
             asyncio.run(generate_python_vllm_output(PROMPTS[i], llm_engine))
         )
 
-    with open("vllm_engine_output.pkl", "wb") as f:
+    with open("vllm_baseline_output.pkl", "wb") as f:
         pickle.dump(python_vllm_output, f)
 
     return
@@ -96,7 +97,7 @@ def setUp(self):
         self.triton_client = grpcclient.InferenceServerClient(url="localhost:8001")
         self.vllm_model_name = "vllm_opt"
         self.python_vllm_output = []
-        with open("vllm_engine_output.pkl", "rb") as f:
+        with open("vllm_baseline_output.pkl", "rb") as f:
             self.python_vllm_output = pickle.load(f)
 
         self.assertNotEqual(
@@ -150,4 +151,17 @@
 
 
 if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--generate-baseline",
+        action="store_true",
+        required=False,
+        default=False,
+        help="Generates baseline output for accuracy tests",
+    )
+    FLAGS = parser.parse_args()
+    if FLAGS.generate_baseline:
+        prepare_vllm_baseline_outputs()
+        exit(0)
+
     unittest.main()
diff --git a/ci/L0_backend_vllm/accuracy_test/test.sh b/ci/L0_backend_vllm/accuracy_test/test.sh
index bce2d9eb..7ba8ba5f 100755
--- a/ci/L0_backend_vllm/accuracy_test/test.sh
+++ b/ci/L0_backend_vllm/accuracy_test/test.sh
@@ -42,13 +42,13 @@ EXPECTED_NUM_TESTS=1
 rm -rf models && mkdir -p models
 cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_opt
 sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.3/' models/vllm_opt/1/model.json
-
+rm vllm_baseline_output.pkl
 RET=0
 
 set +e
 # Need to generate baseline first, since running 2 vLLM engines causes
 # memory issues: https://github.com/vllm-project/vllm/issues/2248
-python -c "import accuracy_test; accuracy_test.prepare_vllm_engine_outputs()" >> $VLLM_ENGINE_LOG 2>&1 & BASELINE_PID=$!
+python3 $CLIENT_PY --generate-baseline >> $VLLM_ENGINE_LOG 2>&1 & BASELINE_PID=$!
 wait $BASELINE_PID
 set -e
 
@@ -60,7 +60,7 @@ if [ "$SERVER_PID" == "0" ]; then
 fi
 
 set +e
-python3 $CLIENT_PY -v > $CLIENT_LOG 2>&1
+python3 $CLIENT_PY > $CLIENT_LOG 2>&1
 
 if [ $? -ne 0 ]; then
     cat $CLIENT_LOG

From 03e31c9fb823efeacc12b69be043de6aef09ea28 Mon Sep 17 00:00:00 2001
From: oandreeva-nv
Date: Fri, 2 Feb 2024 13:54:37 -0800
Subject: [PATCH 6/9] bash script follow up

---
 ci/L0_backend_vllm/accuracy_test/test.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/L0_backend_vllm/accuracy_test/test.sh b/ci/L0_backend_vllm/accuracy_test/test.sh
index 7ba8ba5f..b0b1c1b2 100755
--- a/ci/L0_backend_vllm/accuracy_test/test.sh
+++ b/ci/L0_backend_vllm/accuracy_test/test.sh
@@ -42,7 +42,7 @@ EXPECTED_NUM_TESTS=1
 rm -rf models && mkdir -p models
 cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_opt
 sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.3/' models/vllm_opt/1/model.json
-rm vllm_baseline_output.pkl
+[ -f vllm_baseline_output.pkl ] && rm vllm_baseline_output.pkl
 RET=0
 
 set +e

From bc7f9fc83957c490e47d5a13d1a69fb1579fa5c2 Mon Sep 17 00:00:00 2001
From: oandreeva-nv
Date: Wed, 7 Feb 2024 10:18:41 -0800
Subject: [PATCH 7/9] Add vLLM relevant changes

---
 ci/L0_backend_vllm/vllm_backend/test.sh      | 22 +++++++++++++++++--
 .../model_repository/vllm_model/1/model.json |  4 +++-
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/ci/L0_backend_vllm/vllm_backend/test.sh b/ci/L0_backend_vllm/vllm_backend/test.sh
index 32520b5d..c800b9d1 100755
--- a/ci/L0_backend_vllm/vllm_backend/test.sh
+++ b/ci/L0_backend_vllm/vllm_backend/test.sh
@@ -38,6 +38,16 @@ CLIENT_PY="./vllm_backend_test.py"
 SAMPLE_MODELS_REPO="../../../samples/model_repository"
 EXPECTED_NUM_TESTS=3
 
+# Helpers =======================================
+function assert_curl_success {
+  message="${1}"
+  if [ "$code" != "200" ]; then
+    cat ./curl.out
+    echo -e "\n***\n*** ${message} : line ${BASH_LINENO}\n***"
+    RET=1
+  fi
+}
+
 rm -rf models && mkdir -p models
 cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_opt
 
@@ -105,7 +115,7 @@ if [[ "$COUNT" -ne 2 ]]; then
 fi
 
 # Test loading multiple vllm models at the same time
-SERVER_ARGS="--model-repository=$(pwd)/models --backend-directory=${BACKEND_DIR}"
+SERVER_ARGS="--model-repository=$(pwd)/models --backend-directory=${BACKEND_DIR} --model-control-mode=explicit --load-model=vllm_one"
 SERVER_LOG="./vllm_test_multi_model.log"
 
 # Create two models, one is just a copy of the other, and make sure gpu
@@ -114,8 +124,9 @@ MODEL1="vllm_one"
 MODEL2="vllm_two"
 mkdir -p models
 cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/${MODEL1}/
-sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.3/' models/${MODEL1}/1/model.json
 cp -r models/${MODEL1} models/${MODEL2}
+sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.4/' models/${MODEL1}/1/model.json
+sed -i 's/"gpu_memory_utilization": 0.5/"gpu_memory_utilization": 0.9/' models/${MODEL2}/1/model.json
 
 run_server
 if [ "$SERVER_PID" == "0" ]; then
@@ -124,6 +135,13 @@ if [ "$SERVER_PID" == "0" ]; then
     exit 1
 fi
 
+# Explicitly load model
+rm -f ./curl.out
+set +e
+code=`curl -s -w %{http_code} -o ./curl.out -X POST localhost:8000/v2/repository/models/vllm_two/load`
+set -e
+assert_curl_success "Failed to load 'vllm_two' model"
+
 kill $SERVER_PID
 wait $SERVER_PID
 rm -rf "./models"
diff --git a/samples/model_repository/vllm_model/1/model.json b/samples/model_repository/vllm_model/1/model.json
index e610c3cb..d9fabdf5 100644
--- a/samples/model_repository/vllm_model/1/model.json
+++ b/samples/model_repository/vllm_model/1/model.json
@@ -1,5 +1,7 @@
 {
     "model":"facebook/opt-125m",
     "disable_log_requests": "true",
-    "gpu_memory_utilization": 0.5
+    "gpu_memory_utilization": 0.5,
"gpu_memory_utilization": 0.5, + "enforce_eager": "true", + "tensor_parallel_size": 1 } From f4c021674d131fec68245c790f3c400d1d025684 Mon Sep 17 00:00:00 2001 From: oandreeva-nv Date: Wed, 7 Feb 2024 10:34:49 -0800 Subject: [PATCH 8/9] Added clarification to tests --- ci/L0_backend_vllm/vllm_backend/test.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ci/L0_backend_vllm/vllm_backend/test.sh b/ci/L0_backend_vllm/vllm_backend/test.sh index c800b9d1..716ee6ff 100755 --- a/ci/L0_backend_vllm/vllm_backend/test.sh +++ b/ci/L0_backend_vllm/vllm_backend/test.sh @@ -119,7 +119,11 @@ SERVER_ARGS="--model-repository=$(pwd)/models --backend-directory=${BACKEND_DIR} SERVER_LOG="./vllm_test_multi_model.log" # Create two models, one is just a copy of the other, and make sure gpu -# utilization is low enough for multiple models to avoid OOM +# utilization is low enough for multiple models to avoid OOM. +# vLLM changed behavior of their GPU profiler from total to free memory, +# so to load two small models at the same time, we need to start +# triton server in explicit mode, load first model with +# `gpu_memory_utilization` 0.4 and second should be 0.9. MODEL1="vllm_one" MODEL2="vllm_two" mkdir -p models From e7b7e2bae7971d7c2853d7237d6980e487065366 Mon Sep 17 00:00:00 2001 From: oandreeva-nv Date: Wed, 7 Feb 2024 10:36:44 -0800 Subject: [PATCH 9/9] Removing tensor size from json --- samples/model_repository/vllm_model/1/model.json | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/samples/model_repository/vllm_model/1/model.json b/samples/model_repository/vllm_model/1/model.json index d9fabdf5..6eb5e070 100644 --- a/samples/model_repository/vllm_model/1/model.json +++ b/samples/model_repository/vllm_model/1/model.json @@ -2,6 +2,5 @@ "model":"facebook/opt-125m", "disable_log_requests": "true", "gpu_memory_utilization": 0.5, - "enforce_eager": "true", - "tensor_parallel_size": 1 + "enforce_eager": "true" }