test: Increase server startup timeout for vllm tests for test stability #101

Merged · 2 commits · Jan 17, 2025
49 changes: 47 additions & 2 deletions tests/conftest.py
@@ -1,3 +1,30 @@
+# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
 import os
 import sys
 from pathlib import Path
@@ -8,16 +35,34 @@


 @pytest.fixture(scope="function")
-def llm_server():
+def trtllm_server():
     llm_repo = None

-    # Give ample startup timeout for possible downloading of models
+    # TRT-LLM models should be pre-built offline, and only need to be read
+    # from disk at server startup time, so they should generally load faster
+    # than vLLM models, but still give some room for long startup.
     server = ScopedTritonServer(repo=llm_repo, timeout=600)
     yield server
     # Ensure server is cleaned up after each test
     server.stop()


+@pytest.fixture(scope="function")
+def vllm_server():
+    llm_repo = None
+
+    # vLLM models are downloaded on the fly during model loading as part of
+    # server startup, so give even more room for timeout in case of slow network
+    # TODO: Consider one of the following
+    # (a) Pre-download and mount larger models in test environment
+    # (b) Download model from HF for vLLM at import step to remove burden
+    #     from server startup step.
+    server = ScopedTritonServer(repo=llm_repo, timeout=1800)
+    yield server
+    # Ensure server is cleaned up after each test
+    server.stop()
+
+
 @pytest.fixture(scope="function")
 def simple_server():
     test_dir = os.path.dirname(os.path.realpath(__file__))
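A note on the TODO in the new vllm_server fixture: option (b) would move the Hugging Face download out of server startup entirely. A minimal sketch of what that could look like is below; it is illustrative only (not part of this change) and assumes huggingface_hub is installed in the test image, that MODEL_SOURCE carries a Hugging Face repo id, and that the fixture name is hypothetical.

```python
import os

import pytest
from huggingface_hub import snapshot_download  # assumed available in the test image


@pytest.fixture(scope="session")
def prefetched_vllm_model():
    """Hypothetical fixture sketching TODO option (b): fetch weights up front.

    Pulling the model into the local HF cache before the server starts would
    let the vllm_server fixture keep a tighter startup timeout, since startup
    would then only need to read weights from disk.
    """
    repo_id = os.environ.get("MODEL_SOURCE")  # assumed to be an HF repo id
    if not repo_id:
        pytest.skip("MODEL_SOURCE not set")
    # snapshot_download caches under the default HF cache directory and is a
    # near no-op when the snapshot is already present locally.
    return snapshot_download(repo_id=repo_id)
```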
26 changes: 11 additions & 15 deletions tests/test_e2e.py
@@ -35,6 +35,11 @@
 MODEL_REPO = os.path.join(TEST_DIR, "test_models")


+# Give ample 30min timeout for tests that download models from huggingface
+# where network speed can be intermittent, for test consistency.
+LLM_TIMEOUT_SECS = 1800
+
+
 class TestE2E:
     @pytest.mark.skipif(
         os.environ.get("IMAGE_KIND") != "TRTLLM", reason="Only run for TRT-LLM image"
@@ -52,10 +57,8 @@ class TestE2E:
             ),
         ],
     )
-    # Give ample 30min timeout for now as this test will currently download
-    # models from huggingface as well, and network speed is intermittent.
-    @pytest.mark.timeout(1800)
-    def test_tensorrtllm_e2e(self, llm_server, protocol):
+    @pytest.mark.timeout(LLM_TIMEOUT_SECS)
+    def test_tensorrtllm_e2e(self, trtllm_server, protocol):
         # NOTE: TRTLLM test models will be passed by the testing infrastructure.
         # Only a single model will be passed per test to enable tests to run concurrently.
         model = os.environ.get("TRTLLM_MODEL")
@@ -64,7 +67,7 @@ def test_tensorrtllm_e2e(self, llm_server, protocol):
         source = os.environ.get("MODEL_SOURCE")
         TritonCommands._clear()
         TritonCommands._import(model, source=source, backend="tensorrtllm")
-        llm_server.start()
+        trtllm_server.start()
         TritonCommands._infer(model, prompt=PROMPT, protocol=protocol)
         TritonCommands._profile(model, backend="tensorrtllm")

@@ -84,10 +87,8 @@ def test_tensorrtllm_e2e(self, llm_server, protocol):
             ),
         ],
     )
-    # Give ample 30min timeout for now as this test will currently download
-    # models from huggingface as well, and network speed is intermittent.
-    @pytest.mark.timeout(1800)
-    def test_vllm_e2e(self, llm_server, protocol):
+    @pytest.mark.timeout(LLM_TIMEOUT_SECS)
+    def test_vllm_e2e(self, vllm_server, protocol):
         # NOTE: VLLM test models will be passed by the testing infrastructure.
         # Only a single model will be passed per test to enable tests to run concurrently.
         model = os.environ.get("VLLM_MODEL")
@@ -96,12 +97,7 @@ def test_vllm_e2e(self, llm_server, protocol):
         source = os.environ.get("MODEL_SOURCE")
         TritonCommands._clear()
         TritonCommands._import(model, source=source)
-        # vLLM will download the model on the fly, so give it a big timeout
-        # TODO: Consider one of the following
-        # (a) Pre-download and mount larger models in test environment
-        # (b) Download model from HF for vLLM at import step to remove burden
-        # from server startup step.
-        llm_server.start()
+        vllm_server.start()
         TritonCommands._infer(model, prompt=PROMPT, protocol=protocol)
         TritonCommands._profile(model, backend="vllm")

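For context on what these timeouts guard: ScopedTritonServer's internals are not part of this diff, but a startup timeout like timeout=1800 typically bounds a readiness poll against Triton's standard HTTP health endpoint, while @pytest.mark.timeout(LLM_TIMEOUT_SECS) caps the entire test. A rough sketch of that polling pattern follows; the helper name and default port are illustrative assumptions, and only the /v2/health/ready path comes from Triton's documented HTTP API.

```python
import time

import requests  # any HTTP client works; requests is assumed here


def wait_until_ready(url: str = "http://localhost:8000/v2/health/ready",
                     timeout: float = 1800.0,
                     poll_interval: float = 5.0) -> None:
    """Poll Triton's readiness endpoint until it returns 200 or the deadline passes.

    For vLLM models the gap between process start and readiness can include
    downloading weights, which is why the fixture allows up to 30 minutes.
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            if requests.get(url, timeout=5).status_code == 200:
                return
        except requests.RequestException:
            pass  # server process not accepting connections yet
        time.sleep(poll_interval)
    raise TimeoutError(f"Triton server not ready within {timeout} seconds")
```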