From 3a222dbd07e8e3bc8c3bd71089f547061d372b34 Mon Sep 17 00:00:00 2001 From: oandreeva-nv Date: Mon, 30 Dec 2024 15:21:03 -0800 Subject: [PATCH 1/9] ip --- .../metrics_test/vllm_metrics_test.py | 2 +- src/model.py | 63 +++++++------------ 2 files changed, 24 insertions(+), 41 deletions(-) diff --git a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py index 1f8514e..e0ab7e0 100644 --- a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py +++ b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py @@ -48,7 +48,7 @@ def setUp(self): "The capital of France is", "The future of AI is", ] - self.sampling_parameters = {"temperature": "0", "top_p": "1"} + self.sampling_parameters = {"temperature": 0, "top_p": 1} def parse_vllm_metrics(self): """ diff --git a/src/model.py b/src/model.py index 4c351f1..32ed205 100644 --- a/src/model.py +++ b/src/model.py @@ -32,8 +32,9 @@ import queue import threading from io import BytesIO -from typing import Dict, List +from typing import Dict, List, Optional +import msgspec import numpy as np import torch import triton_python_backend_utils as pb_utils @@ -52,6 +53,14 @@ _MULTI_LORA_ARGS_FILENAME = "multi_lora.json" +class TritonSamplingParams(SamplingParams): + lora_name: Optional[str] = None + + def __repr__(self) -> str: + base = super().__repr__() + return f"{base}, lora_name={self.lora_name}" + + class TritonPythonModel: @classmethod def auto_complete_config(cls, auto_complete_model_config): @@ -430,14 +439,12 @@ async def _generate(self, request): additional_outputs, ) = self._get_input_tensors(request) - sampling_params_dict = self._get_sampling_params_dict(parameters) - lora_name = sampling_params_dict.pop("lora_name", None) - sampling_params = SamplingParams(**sampling_params_dict) + sampling_params = self._get_sampling_params_dict(parameters) lora_request = None - if lora_name is not None: - lora_id = str(self.supported_loras.index(lora_name) + 1) + if sampling_params.lora_name is not None: + lora_id = str(self.supported_loras.index(sampling_params.lora_name) + 1) lora_int_id = int(lora_id) - lora_local_path = self.lora_repository[lora_name] + lora_local_path = self.lora_repository[sampling_params.lora_name] lora_request = LoRARequest(lora_id, lora_int_id, lora_local_path) response_iterator = self._llm_engine.generate( @@ -704,32 +711,8 @@ def _create_response( return pb_utils.InferenceResponse(output_tensors=output_tensors) - def _get_sampling_params_dict(self, params_json): - params_dict = json.loads(params_json) - - # Special parsing for the supported sampling parameters - bool_keys = ["ignore_eos", "skip_special_tokens", "use_beam_search"] - for k in bool_keys: - if k in params_dict: - params_dict[k] = bool(params_dict[k]) - - float_keys = [ - "frequency_penalty", - "length_penalty", - "presence_penalty", - "temperature", - "top_p", - ] - for k in float_keys: - if k in params_dict: - params_dict[k] = float(params_dict[k]) - - int_keys = ["best_of", "max_tokens", "min_tokens", "n", "top_k"] - for k in int_keys: - if k in params_dict: - params_dict[k] = int(params_dict[k]) - - return params_dict + def _get_sampling_params_dict(self, params_json) -> TritonSamplingParams: + return msgspec.json.decode(params_json, type=TritonSamplingParams) def _verify_loras(self, request): # We will check if the requested lora exists here, if not we will send a @@ -737,26 +720,26 @@ def _verify_loras(self, request): # further processing. 
verified_request = None lora_error = None - lora_name = None parameters_input_tensor = pb_utils.get_input_tensor_by_name( request, "sampling_parameters" ) if parameters_input_tensor: parameters = parameters_input_tensor.as_numpy()[0].decode("utf-8") - sampling_params_dict = self._get_sampling_params_dict(parameters) - lora_name = sampling_params_dict.pop("lora_name", None) + sampling_params = self._get_sampling_params_dict(parameters) - if lora_name is not None: + if sampling_params.lora_name is not None: if not self.enable_lora: lora_error = pb_utils.TritonError("LoRA feature is not enabled.") self.logger.log_info( "[vllm] LoRA is not enabled, please restart the backend with LoRA enabled." ) - elif lora_name not in self.supported_loras: + elif sampling_params.lora_name not in self.supported_loras: lora_error = pb_utils.TritonError( - f"LoRA {lora_name} is not supported, we currently support {self.supported_loras}" + f"LoRA {sampling_params.lora_name} is not supported, we currently support {self.supported_loras}" + ) + self.logger.log_info( + f"[vllm] LoRA {sampling_params.lora_name} not found." ) - self.logger.log_info(f"[vllm] LoRA {lora_name} not found.") if lora_error is not None: output_tensor = pb_utils.Tensor( From f63c841a7cfc0131e1d4e1edba7a82cefd25f07c Mon Sep 17 00:00:00 2001 From: oandreeva-nv Date: Thu, 2 Jan 2025 14:48:57 -0800 Subject: [PATCH 2/9] refactor + clean up --- src/model.py | 42 +++++--------- src/utils/vllm_backend_utils.py | 100 ++++++++++++++++++++++++++++++++ 2 files changed, 115 insertions(+), 27 deletions(-) create mode 100644 src/utils/vllm_backend_utils.py diff --git a/src/model.py b/src/model.py index 32ed205..1b993eb 100644 --- a/src/model.py +++ b/src/model.py @@ -1,4 +1,4 @@ -# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -32,9 +32,8 @@ import queue import threading from io import BytesIO -from typing import Dict, List, Optional +from typing import Dict, List -import msgspec import numpy as np import torch import triton_python_backend_utils as pb_utils @@ -44,23 +43,15 @@ build_async_engine_client_from_engine_args, ) from vllm.lora.request import LoRARequest -from vllm.sampling_params import SamplingParams from vllm.utils import random_uuid from utils.metrics import VllmStatLogger +from utils.vllm_backend_utils import TritonSamplingParams _VLLM_ENGINE_ARGS_FILENAME = "model.json" _MULTI_LORA_ARGS_FILENAME = "multi_lora.json" -class TritonSamplingParams(SamplingParams): - lora_name: Optional[str] = None - - def __repr__(self) -> str: - base = super().__repr__() - return f"{base}, lora_name={self.lora_name}" - - class TritonPythonModel: @classmethod def auto_complete_config(cls, auto_complete_model_config): @@ -439,12 +430,13 @@ async def _generate(self, request): additional_outputs, ) = self._get_input_tensors(request) - sampling_params = self._get_sampling_params_dict(parameters) + sampling_params = TritonSamplingParams.from_dict(parameters, self.logger) + lora_name = sampling_params.lora_name lora_request = None - if sampling_params.lora_name is not None: - lora_id = str(self.supported_loras.index(sampling_params.lora_name) + 1) + if lora_name is not None: + lora_id = str(self.supported_loras.index(lora_name) + 1) lora_int_id = int(lora_id) - lora_local_path = self.lora_repository[sampling_params.lora_name] + lora_local_path = self.lora_repository[lora_name] lora_request = LoRARequest(lora_id, lora_int_id, lora_local_path) response_iterator = self._llm_engine.generate( @@ -509,7 +501,6 @@ async def _generate(self, request): ) except Exception as e: - self.logger.log_error(f"[vllm] Error generating stream: {e}") error = pb_utils.TritonError(f"Error generating stream: {e}") text_output_tensor = pb_utils.Tensor( "text_output", np.asarray(["N/A"], dtype=self.output_dtype) @@ -711,35 +702,32 @@ def _create_response( return pb_utils.InferenceResponse(output_tensors=output_tensors) - def _get_sampling_params_dict(self, params_json) -> TritonSamplingParams: - return msgspec.json.decode(params_json, type=TritonSamplingParams) - def _verify_loras(self, request): # We will check if the requested lora exists here, if not we will send a # response with `LoRA not found` information. In this way we may avoid # further processing. verified_request = None lora_error = None + lora_name = None parameters_input_tensor = pb_utils.get_input_tensor_by_name( request, "sampling_parameters" ) if parameters_input_tensor: parameters = parameters_input_tensor.as_numpy()[0].decode("utf-8") - sampling_params = self._get_sampling_params_dict(parameters) + sampling_params = TritonSamplingParams.from_dict(parameters, self.logger) + lora_name = sampling_params.lora_name - if sampling_params.lora_name is not None: + if lora_name is not None: if not self.enable_lora: lora_error = pb_utils.TritonError("LoRA feature is not enabled.") self.logger.log_info( "[vllm] LoRA is not enabled, please restart the backend with LoRA enabled." 
) - elif sampling_params.lora_name not in self.supported_loras: + elif lora_name not in self.supported_loras: lora_error = pb_utils.TritonError( - f"LoRA {sampling_params.lora_name} is not supported, we currently support {self.supported_loras}" - ) - self.logger.log_info( - f"[vllm] LoRA {sampling_params.lora_name} not found." + f"LoRA {lora_name} is not supported, we currently support {self.supported_loras}" ) + self.logger.log_info(f"[vllm] LoRA {lora_name} not found.") if lora_error is not None: output_tensor = pb_utils.Tensor( diff --git a/src/utils/vllm_backend_utils.py b/src/utils/vllm_backend_utils.py new file mode 100644 index 0000000..8d330fb --- /dev/null +++ b/src/utils/vllm_backend_utils.py @@ -0,0 +1,100 @@ +# Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of NVIDIA CORPORATION nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import json +from typing import Optional + +from vllm.sampling_params import GuidedDecodingParams, SamplingParams + + +class TritonSamplingParams(SamplingParams): + """ + Extended sampling parameters for text generation via + Triton Inference Server and vLLM backend. + + Attributes: + lora_name (Optional[str]): The name of the LoRA (Low-Rank Adaptation) + to use for inference. + """ + + lora_name: Optional[str] = None + + def __repr__(self) -> str: + """ + Returns a string representation of the `TritonSamplingParams` object. + + This method overrides the `__repr__` method of the parent class + to include additional attributes in the string representation. + + Returns: + A string representation of the object. + """ + base = super().__repr__() + return f"{base}, lora_name={self.lora_name}" + + @staticmethod + def from_dict( + params_dict_str: str, logger: "pb_utils.Logger" + ) -> "TritonSamplingParams": + """ + Creates a `TritonSamplingParams` object from a dictionary string. + + This method parses a JSON string containing sampling parameters, + converts the values to appropriate types, and creates a + `TritonSamplingParams` object. + + Args: + params_dict (str): A JSON string containing sampling parameters. 
+ logger (pb_utils.Logger): Triton Inference Server logger object. + + Returns: + TritonSamplingParams: An instance of TritonSamplingParams. + """ + try: + params_dict = json.loads(params_dict_str) + vllm_params_dict = SamplingParams.__annotations__ + type_mapping = { + int: int, + float: float, + bool: bool, + str: str, + Optional[int]: int, + } + for key, value in params_dict.items(): + if key == "guided_decoding": + params_dict[key] = GuidedDecodingParams(**json.loads(value)) + elif key in vllm_params_dict: + vllm_type = vllm_params_dict[key] + if vllm_type in type_mapping: + params_dict[key] = type_mapping[vllm_type](params_dict[key]) + + return TritonSamplingParams(**params_dict) + + except Exception as e: + logger.log_error( + f"[vllm] Was trying to create `TritonSamplingParams`, but got exception: {e}" + ) + return None From 4ad17c45dfce64b7f21e7e91a3b771f1d5b48415 Mon Sep 17 00:00:00 2001 From: oandreeva-nv Date: Fri, 3 Jan 2025 12:07:21 -0800 Subject: [PATCH 3/9] Added tests --- ci/L0_multi_gpu_vllm/multi_lora/test.sh | 154 +++++++++++++++++++++++- src/model.py | 9 +- 2 files changed, 158 insertions(+), 5 deletions(-) diff --git a/ci/L0_multi_gpu_vllm/multi_lora/test.sh b/ci/L0_multi_gpu_vllm/multi_lora/test.sh index b561a2d..841c57e 100755 --- a/ci/L0_multi_gpu_vllm/multi_lora/test.sh +++ b/ci/L0_multi_gpu_vllm/multi_lora/test.sh @@ -38,6 +38,60 @@ CLIENT_PY="./multi_lora_test.py" DOWNLOAD_PY="./download.py" SAMPLE_MODELS_REPO="../../../samples/model_repository" EXPECTED_NUM_TESTS=2 +GENERATE_ENDPOINT="localhost:8000/v2/models/vllm_llama_multi_lora/generate" +CHECK_FOR_ERROR=true + +make_api_call() { + local endpoint="$1" + local data="$2" + curl -X POST "$endpoint" --data-binary @- <<< "$data" +} + +check_response() { + local response="$1" + local expected_response="$2" + local error_message="$3" + local check_error="${4:-false}" + + if [ -z "$response" ]; then + echo -e "Expected a non-empty response from server" + echo -e "\n***\n*** $error_message \n***" + return 1 + fi + + local response_text=$(echo "$response" | jq '.text_output // empty') + local response_error=$(echo "$response" | jq '.error // empty') + + if [ "$check_error" = true ]; then + if [[ -n "$response_text" ]]; then + echo -e "Server didn't return an error." + echo "$response" + echo -e "\n***\n*** $error_message \n***" + return 1 + elif [[ "$expected_response" != "$response_error" ]]; then + echo -e "Expected error message doesn't match actual response." + echo "Expected: $expected_response." + echo "Received: $response_error" + echo -e "\n***\n*** $error_message\n***" + return 1 + fi + else + if [[ ! -z "$response_error" ]]; then + echo -e "Received an error from server." + echo "$response" + echo -e "\n***\n*** $error_message \n***" + return 1 + elif [[ "$expected_response" != "$response_text" ]]; then + echo "Expected response doesn't match actual" + echo "Expected: $expected_response." 
+ echo "Received: $response_text" + echo -e "\n***\n*** $error_message \n***" + return 1 + fi + fi + + return 0 +} # first we download weights pip install -U huggingface_hub @@ -58,7 +112,7 @@ model_json=$(cat < Date: Fri, 3 Jan 2025 12:11:09 -0800 Subject: [PATCH 4/9] clean up --- ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py | 2 +- ci/L0_multi_gpu_vllm/multi_lora/test.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py index e0ab7e0..1f8514e 100644 --- a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py +++ b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py @@ -48,7 +48,7 @@ def setUp(self): "The capital of France is", "The future of AI is", ] - self.sampling_parameters = {"temperature": 0, "top_p": 1} + self.sampling_parameters = {"temperature": "0", "top_p": "1"} def parse_vllm_metrics(self): """ diff --git a/ci/L0_multi_gpu_vllm/multi_lora/test.sh b/ci/L0_multi_gpu_vllm/multi_lora/test.sh index 841c57e..6e66f56 100755 --- a/ci/L0_multi_gpu_vllm/multi_lora/test.sh +++ b/ci/L0_multi_gpu_vllm/multi_lora/test.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -112,7 +112,7 @@ model_json=$(cat < Date: Fri, 3 Jan 2025 16:29:25 -0800 Subject: [PATCH 5/9] Add accuracy test for guided decoding --- .../accuracy_test/accuracy_test.py | 116 ++++++++++++++++-- ci/L0_backend_vllm/accuracy_test/test.sh | 6 +- ci/L0_multi_gpu_vllm/multi_lora/test.sh | 4 +- 3 files changed, 111 insertions(+), 15 deletions(-) diff --git a/ci/L0_backend_vllm/accuracy_test/accuracy_test.py b/ci/L0_backend_vllm/accuracy_test/accuracy_test.py index 8959816..2ed61b4 100644 --- a/ci/L0_backend_vllm/accuracy_test/accuracy_test.py +++ b/ci/L0_backend_vllm/accuracy_test/accuracy_test.py @@ -26,6 +26,7 @@ import argparse import asyncio +import json import pickle import sys import unittest @@ -36,6 +37,7 @@ from vllm import SamplingParams from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine +from vllm.sampling_params import GuidedDecodingParams from vllm.utils import random_uuid sys.path.append("../../common") @@ -53,14 +55,22 @@ "The future of AI is", ] +GUIDED_PROMPTS = ["Classify intent of the sentence: Harry Potter is underrated. 
"] + SAMPLING_PARAMETERS = {"temperature": 0, "top_p": 1} -async def generate_python_vllm_output(prompt, llm_engine): +async def generate_python_vllm_output( + prompt, + llm_engine, + sampling_params=SamplingParams(**SAMPLING_PARAMETERS), + guided_generation=None, +): request_id = random_uuid() - sampling_params = SamplingParams(**SAMPLING_PARAMETERS) python_vllm_output = None last_output = None + if guided_generation: + sampling_params.guided_decoding = guided_generation async for vllm_output in llm_engine.generate(prompt, sampling_params, request_id): last_output = vllm_output @@ -69,24 +79,28 @@ async def generate_python_vllm_output(prompt, llm_engine): python_vllm_output = [ (prompt + output.text).encode("utf-8") for output in last_output.outputs ] - return python_vllm_output -def prepare_vllm_baseline_outputs(): +def prepare_vllm_baseline_outputs( + export_file="vllm_baseline_output.pkl", prompts=PROMPTS, guided_generation=None +): """ Helper function that starts async vLLM engine and generates output for each - prompt in `PROMPTS`. Saves resulted baselines in `vllm_baseline_output.pkl` + prompt in `prompts`. Saves resulted baselines in `vllm_baseline_output.pkl` for further use. """ llm_engine = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(**VLLM_ENGINE_CONFIG)) python_vllm_output = [] - for i in range(len(PROMPTS)): + for i in range(len(prompts)): python_vllm_output.extend( - asyncio.run(generate_python_vllm_output(PROMPTS[i], llm_engine)) + asyncio.run( + generate_python_vllm_output( + prompts[i], llm_engine, guided_generation=guided_generation + ) + ) ) - - with open("vllm_baseline_output.pkl", "wb") as f: + with open(export_file, "wb") as f: pickle.dump(python_vllm_output, f) return @@ -96,6 +110,9 @@ class VLLMTritonAccuracyTest(TestResultCollector): def setUp(self): self.triton_client = grpcclient.InferenceServerClient(url="localhost:8001") self.vllm_model_name = "vllm_opt" + + def test_vllm_model(self): + # Reading and verifying baseline data self.python_vllm_output = [] with open("vllm_baseline_output.pkl", "rb") as f: self.python_vllm_output = pickle.load(f) @@ -116,11 +133,9 @@ def setUp(self): ), ) - def test_vllm_model(self): user_data = UserData() stream = False triton_vllm_output = [] - self.triton_client.start_stream(callback=partial(callback, user_data)) for i in range(len(PROMPTS)): request_data = create_vllm_request( @@ -131,7 +146,7 @@ def test_vllm_model(self): request_id=request_data["request_id"], inputs=request_data["inputs"], outputs=request_data["outputs"], - parameters=SAMPLING_PARAMETERS, + parameters=request_data["parameters"], ) for i in range(len(PROMPTS)): @@ -146,6 +161,63 @@ def test_vllm_model(self): self.triton_client.stop_stream() self.assertEqual(self.python_vllm_output.sort(), triton_vllm_output.sort()) + def test_guided_decoding(self): + # Reading and verifying baseline data + self.python_vllm_output = [] + with open("vllm_guided_baseline_output.pkl", "rb") as f: + self.python_vllm_output = pickle.load(f) + + self.assertNotEqual( + self.python_vllm_output, + [], + "Loaded baseline outputs' list should not be empty", + ) + self.assertIsNotNone( + self.python_vllm_output, "Loaded baseline outputs' list should not be None" + ) + self.assertEqual( + len(self.python_vllm_output), + len(GUIDED_PROMPTS), + "Unexpected number of baseline outputs loaded, expected {}, but got {}".format( + len(GUIDED_PROMPTS), len(self.python_vllm_output) + ), + ) + + user_data = UserData() + stream = False + triton_vllm_output = [] + + 
self.triton_client.start_stream(callback=partial(callback, user_data)) + sampling_params = SAMPLING_PARAMETERS + guided_decoding_params = { + "choice": ["Positive", "Negative"], + "backend": "outlines", + } + sampling_params["guided_decoding"] = json.dumps(guided_decoding_params) + for i in range(len(GUIDED_PROMPTS)): + request_data = create_vllm_request( + GUIDED_PROMPTS[i], i, stream, sampling_params, self.vllm_model_name + ) + self.triton_client.async_stream_infer( + model_name=self.vllm_model_name, + request_id=request_data["request_id"], + inputs=request_data["inputs"], + outputs=request_data["outputs"], + parameters=request_data["parameters"], + ) + + for i in range(len(GUIDED_PROMPTS)): + result = user_data._completed_requests.get() + self.assertIsNot(type(result), InferenceServerException, str(result)) + + output = result.as_numpy("text_output") + self.assertIsNotNone(output, "`text_output` should not be None") + + triton_vllm_output.extend(output) + + self.triton_client.stop_stream() + self.assertEqual(self.python_vllm_output.sort(), triton_vllm_output.sort()) + def tearDown(self): self.triton_client.close() @@ -159,9 +231,29 @@ def tearDown(self): default=False, help="Generates baseline output for accuracy tests", ) + parser.add_argument( + "--generate-guided-baseline", + action="store_true", + required=False, + default=False, + help="Generates baseline output for accuracy tests", + ) FLAGS = parser.parse_args() if FLAGS.generate_baseline: prepare_vllm_baseline_outputs() exit(0) + if FLAGS.generate_guided_baseline: + guided_decoding_params = { + "choice": ["Positive", "Negative"], + "backend": "outlines", + } + guided_generation = GuidedDecodingParams(**guided_decoding_params) + prepare_vllm_baseline_outputs( + export_file="vllm_guided_baseline_output.pkl", + prompts=GUIDED_PROMPTS, + guided_generation=guided_generation, + ) + exit(0) + unittest.main() diff --git a/ci/L0_backend_vllm/accuracy_test/test.sh b/ci/L0_backend_vllm/accuracy_test/test.sh index b0b1c1b..773e355 100755 --- a/ci/L0_backend_vllm/accuracy_test/test.sh +++ b/ci/L0_backend_vllm/accuracy_test/test.sh @@ -37,7 +37,7 @@ TEST_RESULT_FILE='test_results.txt' CLIENT_PY="./accuracy_test.py" SAMPLE_MODELS_REPO="../../../samples/model_repository" VLLM_ENGINE_LOG="vllm_engine.log" -EXPECTED_NUM_TESTS=1 +EXPECTED_NUM_TESTS=2 rm -rf models && mkdir -p models cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_opt @@ -50,6 +50,10 @@ set +e # memory issues: https://github.com/vllm-project/vllm/issues/2248 python3 $CLIENT_PY --generate-baseline >> $VLLM_ENGINE_LOG 2>&1 & BASELINE_PID=$! wait $BASELINE_PID + +python3 $CLIENT_PY --generate-guided-baseline > $VLLM_ENGINE_LOG 2>&1 & BASELINE_PID=$! +wait $BASELINE_PID + set -e run_server diff --git a/ci/L0_multi_gpu_vllm/multi_lora/test.sh b/ci/L0_multi_gpu_vllm/multi_lora/test.sh index 6e66f56..6ab8d62 100755 --- a/ci/L0_multi_gpu_vllm/multi_lora/test.sh +++ b/ci/L0_multi_gpu_vllm/multi_lora/test.sh @@ -381,7 +381,7 @@ else fi # Test generate endpoint + LoRA enabled (str flag) -EXPECTED_RESPONSE='"LoRA feature is not enabled."' +EXPECTED_RESPONSE='" feature is not enabled."' DATA='{ "text_input": "Instruct: What do you think of Computer Science?\nOutput:", "parameters": { @@ -393,7 +393,7 @@ DATA='{ } }' RESPONSE=$(make_api_call "$GENERATE_ENDPOINT" "$DATA") -check_response "$RESPONSE" "$EXPECTED_RESPONSE" "Disabled LoRA + Generate Endpoint Test FAILED." $CHECK_FOR_ERROR || RET=1 +check_response "$RESPONSE" "$EXPECTED_RESPONSE" "Disabled LoRA + Generate Endpoint Test FAILED." 
$CHECK_FOR_ERROR > $CLIENT_LOG 2>&1 || RET=1 set -e From cc6dfc6f2ed985fad9d1fdfa1835a2db39b8fd60 Mon Sep 17 00:00:00 2001 From: oandreeva-nv Date: Fri, 3 Jan 2025 16:31:53 -0800 Subject: [PATCH 6/9] Copyright --- ci/L0_backend_vllm/accuracy_test/accuracy_test.py | 2 +- ci/L0_backend_vllm/accuracy_test/test.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/L0_backend_vllm/accuracy_test/accuracy_test.py b/ci/L0_backend_vllm/accuracy_test/accuracy_test.py index 2ed61b4..b2a7e13 100644 --- a/ci/L0_backend_vllm/accuracy_test/accuracy_test.py +++ b/ci/L0_backend_vllm/accuracy_test/accuracy_test.py @@ -1,4 +1,4 @@ -# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions diff --git a/ci/L0_backend_vllm/accuracy_test/test.sh b/ci/L0_backend_vllm/accuracy_test/test.sh index 773e355..75093b6 100755 --- a/ci/L0_backend_vllm/accuracy_test/test.sh +++ b/ci/L0_backend_vllm/accuracy_test/test.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions From 07c5374e4ace17cb254a45196d006e0267b1dbc7 Mon Sep 17 00:00:00 2001 From: oandreeva-nv Date: Fri, 3 Jan 2025 16:36:35 -0800 Subject: [PATCH 7/9] Clean up --- src/model.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/model.py b/src/model.py index 48344a1..19ff713 100644 --- a/src/model.py +++ b/src/model.py @@ -501,6 +501,7 @@ async def _generate(self, request): ) except Exception as e: + self.logger.log_error(f"[vllm] Error generating stream: {e}") error = pb_utils.TritonError(f"Error generating stream: {e}") text_output_tensor = pb_utils.Tensor( "text_output", np.asarray(["N/A"], dtype=self.output_dtype) From cb819634af75a9ed645fde4b07cf3ba618f5613e Mon Sep 17 00:00:00 2001 From: oandreeva-nv Date: Mon, 6 Jan 2025 11:03:23 -0800 Subject: [PATCH 8/9] Test fix --- ci/L0_multi_gpu_vllm/multi_lora/test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/L0_multi_gpu_vllm/multi_lora/test.sh b/ci/L0_multi_gpu_vllm/multi_lora/test.sh index 6ab8d62..92c2dbb 100755 --- a/ci/L0_multi_gpu_vllm/multi_lora/test.sh +++ b/ci/L0_multi_gpu_vllm/multi_lora/test.sh @@ -381,7 +381,7 @@ else fi # Test generate endpoint + LoRA enabled (str flag) -EXPECTED_RESPONSE='" feature is not enabled."' +EXPECTED_RESPONSE='"LoRA feature is not enabled."' DATA='{ "text_input": "Instruct: What do you think of Computer Science?\nOutput:", "parameters": { From 8f9567c6c3b1cd0d5dae9a812177064f398f763b Mon Sep 17 00:00:00 2001 From: Olga Andreeva <124622579+oandreeva-nv@users.noreply.github.com> Date: Thu, 9 Jan 2025 12:12:44 -0800 Subject: [PATCH 9/9] Apply suggestions from code review --- ci/L0_multi_gpu_vllm/multi_lora/test.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/L0_multi_gpu_vllm/multi_lora/test.sh b/ci/L0_multi_gpu_vllm/multi_lora/test.sh index 92c2dbb..bcc5277 100755 --- a/ci/L0_multi_gpu_vllm/multi_lora/test.sh +++ b/ci/L0_multi_gpu_vllm/multi_lora/test.sh @@ -318,7 +318,7 @@ else fi fi -# Test generate endpoint + LoRA enabled (boolean flag) +# Test generate endpoint + LoRA disabled (boolean flag) 
EXPECTED_RESPONSE='"LoRA feature is not enabled."' DATA='{ "text_input": "Instruct: What do you think of Computer Science?\nOutput:", @@ -380,7 +380,7 @@ else fi fi -# Test generate endpoint + LoRA enabled (str flag) +# Test generate endpoint + LoRA disabled (str flag) EXPECTED_RESPONSE='"LoRA feature is not enabled."' DATA='{ "text_input": "Instruct: What do you think of Computer Science?\nOutput:",