test: Increase server startup timeout for vllm tests for test stability #101

Merged · 2 commits · Jan 17, 2025
49 changes: 47 additions & 2 deletions tests/conftest.py
@@ -1,3 +1,30 @@
+# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
 import os
 import sys
 from pathlib import Path
@@ -8,16 +35,34 @@


 @pytest.fixture(scope="function")
-def llm_server():
+def trtllm_server():
     llm_repo = None

-    # Give ample startup timeout for possible downloading of models
+    # TRT-LLM models should be pre-built offline, and only need to be read
+    # from disk at server startup time, so they should generally load faster
+    # than vLLM models, but still give some room for long startup.
     server = ScopedTritonServer(repo=llm_repo, timeout=600)
     yield server
     # Ensure server is cleaned up after each test
     server.stop()


+@pytest.fixture(scope="function")
+def vllm_server():
+    llm_repo = None
+
+    # vLLM models are downloaded on the fly during model loading as part of
+    # server startup, so give even more room for timeout in case of slow network
+    # TODO: Consider one of the following
+    # (a) Pre-download and mount larger models in test environment
+    # (b) Download model from HF for vLLM at import step to remove burden
+    #     from server startup step.
+    server = ScopedTritonServer(repo=llm_repo, timeout=1800)
+    yield server
+    # Ensure server is cleaned up after each test
+    server.stop()
+
+
 @pytest.fixture(scope="function")
 def simple_server():
     test_dir = os.path.dirname(os.path.realpath(__file__))
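A note on the TODO in the new vllm_server fixture: option (b) would move the Hugging Face download out of server startup entirely. A minimal sketch of what that could look like is below; it is illustrative only (not part of this change) and assumes huggingface_hub is installed in the test image, that MODEL_SOURCE carries a Hugging Face repo id, and that the fixture name is hypothetical.

```python
import os

import pytest
from huggingface_hub import snapshot_download  # assumed available in the test image


@pytest.fixture(scope="session")
def prefetched_vllm_model():
    """Hypothetical fixture sketching TODO option (b): fetch weights up front.

    Pulling the model into the local HF cache before the server starts would
    let the vllm_server fixture keep a tighter startup timeout, since startup
    would then only need to read weights from disk.
    """
    repo_id = os.environ.get("MODEL_SOURCE")  # assumed to be an HF repo id
    if not repo_id:
        pytest.skip("MODEL_SOURCE not set")
    # snapshot_download caches under the default HF cache directory and is a
    # near no-op when the snapshot is already present locally.
    return snapshot_download(repo_id=repo_id)
```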
26 changes: 11 additions & 15 deletions tests/test_e2e.py
@@ -35,6 +35,11 @@
 MODEL_REPO = os.path.join(TEST_DIR, "test_models")


+# Give ample 30min timeout for tests that download models from huggingface
+# where network speed can be intermittent, for test consistency.
+LLM_TIMEOUT_SECS = 1800
+
+
 class TestE2E:
     @pytest.mark.skipif(
         os.environ.get("IMAGE_KIND") != "TRTLLM", reason="Only run for TRT-LLM image"
@@ -52,10 +57,8 @@ class TestE2E:
             ),
         ],
     )
-    # Give ample 30min timeout for now as this test will currently download
-    # models from huggingface as well, and network speed is intermittent.
-    @pytest.mark.timeout(1800)
-    def test_tensorrtllm_e2e(self, llm_server, protocol):
+    @pytest.mark.timeout(LLM_TIMEOUT_SECS)
+    def test_tensorrtllm_e2e(self, trtllm_server, protocol):
         # NOTE: TRTLLM test models will be passed by the testing infrastructure.
         # Only a single model will be passed per test to enable tests to run concurrently.
         model = os.environ.get("TRTLLM_MODEL")
@@ -64,7 +67,7 @@ def test_tensorrtllm_e2e(self, llm_server, protocol):
         source = os.environ.get("MODEL_SOURCE")
         TritonCommands._clear()
         TritonCommands._import(model, source=source, backend="tensorrtllm")
-        llm_server.start()
+        trtllm_server.start()
         TritonCommands._infer(model, prompt=PROMPT, protocol=protocol)
         TritonCommands._profile(model, backend="tensorrtllm")

@@ -84,10 +87,8 @@ def test_tensorrtllm_e2e(self, llm_server, protocol):
             ),
         ],
     )
-    # Give ample 30min timeout for now as this test will currently download
-    # models from huggingface as well, and network speed is intermittent.
-    @pytest.mark.timeout(1800)
-    def test_vllm_e2e(self, llm_server, protocol):
+    @pytest.mark.timeout(LLM_TIMEOUT_SECS)
+    def test_vllm_e2e(self, vllm_server, protocol):
         # NOTE: VLLM test models will be passed by the testing infrastructure.
         # Only a single model will be passed per test to enable tests to run concurrently.
         model = os.environ.get("VLLM_MODEL")
@@ -96,12 +97,7 @@ def test_vllm_e2e(self, llm_server, protocol):
         source = os.environ.get("MODEL_SOURCE")
         TritonCommands._clear()
         TritonCommands._import(model, source=source)
-        # vLLM will download the model on the fly, so give it a big timeout
-        # TODO: Consider one of the following
-        # (a) Pre-download and mount larger models in test environment
-        # (b) Download model from HF for vLLM at import step to remove burden
-        # from server startup step.
-        llm_server.start()
+        vllm_server.start()
         TritonCommands._infer(model, prompt=PROMPT, protocol=protocol)
         TritonCommands._profile(model, backend="vllm")

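For context on what these timeouts guard: ScopedTritonServer's internals are not part of this diff, but a startup timeout like timeout=1800 typically bounds a readiness poll against Triton's standard HTTP health endpoint, while @pytest.mark.timeout(LLM_TIMEOUT_SECS) caps the entire test. A rough sketch of that polling pattern follows; the helper name and default port are illustrative assumptions, and only the /v2/health/ready path comes from Triton's documented HTTP API.

```python
import time

import requests  # any HTTP client works; requests is assumed here


def wait_until_ready(url: str = "http://localhost:8000/v2/health/ready",
                     timeout: float = 1800.0,
                     poll_interval: float = 5.0) -> None:
    """Poll Triton's readiness endpoint until it returns 200 or the deadline passes.

    For vLLM models the gap between process start and readiness can include
    downloading weights, which is why the fixture allows up to 30 minutes.
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            if requests.get(url, timeout=5).status_code == 200:
                return
        except requests.RequestException:
            pass  # server process not accepting connections yet
        time.sleep(poll_interval)
    raise TimeoutError(f"Triton server not ready within {timeout} seconds")
```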