diff --git a/tests/conftest.py b/tests/conftest.py
index a91b078..aede0be 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,3 +1,30 @@
+# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
 import os
 import sys
 from pathlib import Path
@@ -8,16 +35,34 @@
 
 
 @pytest.fixture(scope="function")
-def llm_server():
+def trtllm_server():
     llm_repo = None
 
-    # Give ample startup timeout for possible downloading of models
+    # TRT-LLM models should be pre-built offline, and only need to be read
+    # from disk at server startup time, so they should generally load faster
+    # than vLLM models, but still give some room for long startup.
     server = ScopedTritonServer(repo=llm_repo, timeout=600)
     yield server
     # Ensure server is cleaned up after each test
    server.stop()
 
 
+@pytest.fixture(scope="function")
+def vllm_server():
+    llm_repo = None
+
+    # vLLM models are downloaded on the fly during model loading as part of
+    # server startup, so give even more room for timeout in case of slow network
+    # TODO: Consider one of the following
+    # (a) Pre-download and mount larger models in test environment
+    # (b) Download model from HF for vLLM at import step to remove burden
+    #     from server startup step.
+    server = ScopedTritonServer(repo=llm_repo, timeout=1800)
+    yield server
+    # Ensure server is cleaned up after each test
+    server.stop()
+
+
 @pytest.fixture(scope="function")
 def simple_server():
     test_dir = os.path.dirname(os.path.realpath(__file__))
diff --git a/tests/test_e2e.py b/tests/test_e2e.py
index 95edc2c..42070a7 100644
--- a/tests/test_e2e.py
+++ b/tests/test_e2e.py
@@ -35,6 +35,11 @@
 MODEL_REPO = os.path.join(TEST_DIR, "test_models")
 
+# Give ample 30min timeout for tests that download models from huggingface
+# where network speed can be intermittent, for test consistency.
+LLM_TIMEOUT_SECS = 1800
+
+
 class TestE2E:
     @pytest.mark.skipif(
         os.environ.get("IMAGE_KIND") != "TRTLLM", reason="Only run for TRT-LLM image"
     )
@@ -52,10 +57,8 @@ class TestE2E:
             ),
         ],
     )
-    # Give ample 30min timeout for now as this test will currently download
-    # models from huggingface as well, and network speed is intermittent.
-    @pytest.mark.timeout(1800)
-    def test_tensorrtllm_e2e(self, llm_server, protocol):
+    @pytest.mark.timeout(LLM_TIMEOUT_SECS)
+    def test_tensorrtllm_e2e(self, trtllm_server, protocol):
         # NOTE: TRTLLM test models will be passed by the testing infrastructure.
         # Only a single model will be passed per test to enable tests to run concurrently.
         model = os.environ.get("TRTLLM_MODEL")
@@ -64,7 +67,7 @@ def test_tensorrtllm_e2e(self, llm_server, protocol):
         source = os.environ.get("MODEL_SOURCE")
         TritonCommands._clear()
         TritonCommands._import(model, source=source, backend="tensorrtllm")
-        llm_server.start()
+        trtllm_server.start()
         TritonCommands._infer(model, prompt=PROMPT, protocol=protocol)
         TritonCommands._profile(model, backend="tensorrtllm")
@@ -84,10 +87,8 @@ def test_tensorrtllm_e2e(self, llm_server, protocol):
             ),
         ],
     )
-    # Give ample 30min timeout for now as this test will currently download
-    # models from huggingface as well, and network speed is intermittent.
-    @pytest.mark.timeout(1800)
-    def test_vllm_e2e(self, llm_server, protocol):
+    @pytest.mark.timeout(LLM_TIMEOUT_SECS)
+    def test_vllm_e2e(self, vllm_server, protocol):
         # NOTE: VLLM test models will be passed by the testing infrastructure.
         # Only a single model will be passed per test to enable tests to run concurrently.
         model = os.environ.get("VLLM_MODEL")
@@ -96,12 +97,7 @@ def test_vllm_e2e(self, llm_server, protocol):
         source = os.environ.get("MODEL_SOURCE")
         TritonCommands._clear()
         TritonCommands._import(model, source=source)
-        # vLLM will download the model on the fly, so give it a big timeout
-        # TODO: Consider one of the following
-        # (a) Pre-download and mount larger models in test environment
-        # (b) Download model from HF for vLLM at import step to remove burden
-        #     from server startup step.
-        llm_server.start()
+        vllm_server.start()
         TritonCommands._infer(model, prompt=PROMPT, protocol=protocol)
         TritonCommands._profile(model, backend="vllm")
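
For reference, the two new conftest.py fixtures above differ only in their startup timeout. Below is a minimal sketch of how the same thing could be expressed once with a small fixture factory; it assumes only the ScopedTritonServer interface visible in the diff (a constructor taking repo= and timeout=, plus stop()), and the import path and helper names are placeholders rather than anything from the actual repo.

```python
# Sketch only, not part of the diff above. Assumes ScopedTritonServer exposes
# the interface used in tests/conftest.py (repo=/timeout= constructor args and
# a stop() method); adjust the import to match what conftest.py already uses.
import pytest

from utils import ScopedTritonServer  # assumed import path

# Startup timeouts mirror the fixtures above: TRT-LLM engines are pre-built and
# read from disk (600s), while vLLM downloads weights during startup (1800s).
LLM_SERVER_STARTUP_TIMEOUTS = {"trtllm": 600, "vllm": 1800}


def _llm_server_fixture(backend: str):
    """Build a function-scoped fixture for the given backend's startup timeout."""

    @pytest.fixture(scope="function", name=f"{backend}_server")
    def _server():
        server = ScopedTritonServer(
            repo=None, timeout=LLM_SERVER_STARTUP_TIMEOUTS[backend]
        )
        yield server
        # Ensure server is cleaned up after each test
        server.stop()

    return _server


# Registers fixtures named "trtllm_server" and "vllm_server", matching the
# names requested by the tests in test_e2e.py.
trtllm_server = _llm_server_fixture("trtllm")
vllm_server = _llm_server_fixture("vllm")
```

Whether that indirection is worth it is a readability trade-off; the two explicit fixtures in the diff keep each backend's timeout rationale right next to its definition.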