Enable calls to GenAI-Perf for profile subcommand (#52)
dyastremsky authored May 10, 2024
1 parent 9736608 commit 085ad83
Showing 12 changed files with 157 additions and 816 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yaml
@@ -41,7 +41,7 @@ jobs:
fail-fast: false
matrix:
os: ["ubuntu-22.04"]
python-version: ["3.8", "3.10"]
python-version: ["3.10"]

steps:
- uses: actions/checkout@v3
9 changes: 9 additions & 0 deletions .gitignore
@@ -8,6 +8,7 @@ __pycache__/

# Distribution / packaging
.Python
artifacts/
build/
develop-eggs/
dist/
@@ -17,12 +18,20 @@ eggs/
lib64/
parts/
sdist/
tests/checkpoints/
tests/output_dir/
tests/output_model_repository/
tests/plots/
tests/reports/
tests/results/
tmp/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
llm_inputs.json
MANIFEST

# PyInstaller
10 changes: 3 additions & 7 deletions README.md
@@ -16,7 +16,7 @@ Server.
## Pre-requisites

When using Triton and related tools on your host (outside of a Triton container
image) there are a number of additional dependencies that may be required for
image), there are a number of additional dependencies that may be required for
various workflows. Most system dependency issues can be resolved by installing
and running the CLI from within the latest corresponding `tritonserver`
container image, which should have all necessary system dependencies installed.
@@ -162,7 +162,7 @@ triton start
# Interact with model
triton infer -m llama-3-8b-instruct --prompt "machine learning is"

# Profile model with Perf Analyzer
# Profile model with GenAI-Perf
triton profile -m llama-3-8b-instruct --backend vllm
```

@@ -232,7 +232,7 @@ triton start
# Interact with model
triton infer -m llama-3-8b-instruct --prompt "machine learning is"

# Profile model with Perf Analyzer
# Profile model with GenAI-Perf
triton profile -m llama-3-8b-instruct --backend tensorrtllm
```
## Additional Dependencies for Custom Environments
@@ -269,10 +269,6 @@ sudo apt install libopenmpi-dev
```

## Known Limitations
- Triton CLI's `profile` command currently only supports TRT-LLM and vLLM models.
- Triton CLI's `profile` command will be migrating to use
[genai-perf](https://github.com/triton-inference-server/client/tree/main/src/c++/perf_analyzer/genai-perf)
as the backbone for LLM profiling soon.
- Models and configurations generated by Triton CLI are focused on ease-of-use,
and may not be as optimized as possible for your system or use case.
- Triton CLI currently uses the TRT-LLM dependencies installed in its environment
11 changes: 7 additions & 4 deletions pyproject.toml
@@ -37,32 +37,32 @@ classifiers = [
"Topic :: Scientific/Engineering",
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Operating System :: Unix",
]
authors = []
maintainers = []
keywords = []
requires-python = ">=3.8,<4"
requires-python = ">=3.10,<4"
# TODO: Add [gpu] set of dependencies for trtllm once it's available on pypi
dependencies = [
"directory-tree == 0.0.4", # may remove in future
"docker == 6.1.3",
"genai-perf @ git+https://github.com/triton-inference-server/[email protected]#subdirectory=src/c++/perf_analyzer/genai-perf",
# TODO: rely on tritonclient to pull in protobuf and numpy dependencies?
"numpy >= 1.21",
"protobuf>=3.7.0",
"prometheus-client == 0.19.0",
"psutil >= 5.9.5", # may remove later
"rich == 13.5.2",
# TODO: Test on cpu-only machine if [cuda] dependency is an issue
"tritonclient[all] >= 2.38",
"tritonclient[all] >= 2.45",
"huggingface-hub >= 0.19.4",
# Testing
"pytest >= 8.1.1", # may remove later
"pytest-timeout", # may remove later
"pytest-mock >= 3.13.0", # may remove later
]

# CLI Entrypoint
@@ -81,6 +81,9 @@ build-backend = "hatchling.build"
[tool.hatch.version]
path = "src/triton_cli/__init__.py"

[tool.hatch.metadata]
allow-direct-references = true

# Pre-commit hook tool configs
[tool.codespell]
# note: pre-commit passes explicit lists of files here, which this skip file list doesn't override -
2 changes: 1 addition & 1 deletion src/triton_cli/__init__.py
@@ -24,4 +24,4 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

__version__ = "0.0.7"
__version__ = "0.0.8dev"
94 changes: 18 additions & 76 deletions src/triton_cli/parser.py
@@ -26,6 +26,8 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import json
import subprocess
import sys
import time
import logging
import argparse
@@ -41,9 +41,9 @@
)
from triton_cli.client.client import InferenceServerException, TritonClient
from triton_cli.metrics import MetricsClient
from triton_cli.profile import add_unknown_args_to_args, build_command
from triton_cli.repository import ModelRepository
from triton_cli.server.server_factory import TritonServerFactory
from triton_cli.profiler import Profiler

logger = logging.getLogger(LOGGER_NAME)

@@ -159,41 +161,6 @@ def add_model_args(subcommands):
)


def add_profile_args(subcommands):
for subcommand in subcommands:
subcommand.add_argument(
"-b",
"--batch-size",
type=int,
default=1,
required=False,
help="The batch size / concurrency to benchmark. (Default: 1)",
)
subcommand.add_argument(
"--input-length",
type=int,
default=128,
required=False,
help="The input length (tokens) to use for benchmarking LLMs. (Default: 128)",
)
subcommand.add_argument(
"--output-length",
type=int,
default=128,
required=False,
help="The output length (tokens) to use for benchmarking LLMs. (Default: 128)",
)
# TODO: Revisit terminology here. Online/offline vs streaming, etc.
subcommand.add_argument(
"--profile-mode",
type=str,
choices=["online", "offline"],
default="online",
required=False,
help="Profiling mode: offline means one full response will be generated, online means response will be streaming tokens as they are generated.",
)


def add_client_args(subcommands):
# Add protocol/url/port to all client-based subcommands
for subcommand in subcommands:
@@ -396,49 +363,17 @@ def handle_infer(args: argparse.Namespace):
# Profile
# ================================================
def parse_args_profile(parser):
profile = parser.add_parser(
"profile", help="Profile LLM models using Perf Analyzer"
)
profile = parser.add_parser("profile", help="Profile models", add_help=False)
profile.set_defaults(func=handle_profile)
add_model_args([profile])
add_profile_args([profile])
add_backend_args([profile])
add_client_args([profile])
profile.add_argument(
"--help", action="store_true", help="Show help message and exit"
)


def handle_profile(args: argparse.Namespace):
client = TritonClient(url=args.url, port=args.port, protocol=args.protocol)
profile_model(args, client)


# TODO: Move to utils? <-- Delete?
def profile_model(args: argparse.Namespace, client: TritonClient):
if args.protocol != "grpc":
raise Exception("Profiler only supports 'grpc' protocol at this time.")

if not args.port:
args.port = 8001 if args.protocol == "grpc" else 8000

# TODO: Consider python(BLS)/ensemble case for the model
# receiving requests in the case of TRT-LLM. For now, TRT-LLM
# should be manually specified.
backend = args.backend
if not args.backend:
# Profiler needs to know TRT-LLM vs vLLM to form correct payload
backend = client.get_model_backend(args.model)

logger.info(f"Running Perf Analyzer profiler on '{args.model}'...")
Profiler.profile(
model=args.model,
backend=backend,
batch_size=args.batch_size,
url=f"{args.url}:{args.port}",
input_length=args.input_length,
output_length=args.output_length,
# Should be "online" for IFB / streaming, and "offline" for non-streaming
offline=(args.profile_mode == "offline"),
verbose=args.verbose,
)
cmd = build_command(args, "genai-perf")
logger.info(f"Running: '{' '.join(cmd)}'")
subprocess.run(cmd, check=True)


# ================================================
@@ -502,5 +437,12 @@ def parse_args(argv=None):
parse_args_profile(subcommands)
parse_args_utils(subcommands)
add_verbose_args([parser])
args = parser.parse_args(argv)

argv_ = argv if argv is not None else sys.argv[1:]
# Add special argparse handling for passthrough to genai-perf CLI
if argv_[0] == "profile":
args, unknown_args = parser.parse_known_args(argv_)
args = add_unknown_args_to_args(args, unknown_args)
else:
args = parser.parse_args(argv_)
return args
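
(Illustrative note: a minimal sketch of the passthrough parsing added above, assuming the package is installed so `triton_cli.profile` is importable. The tiny parser below is a stand-in for the CLI's real parser, and `--concurrency` / `--streaming` are example extra flags rather than options the profile subparser itself defines.)

```python
import argparse

from triton_cli.profile import add_unknown_args_to_args

# Stand-in for the CLI's parser: this profile subparser only knows
# -m/--model; anything else is left "unknown" and can be forwarded to GenAI-Perf.
parser = argparse.ArgumentParser()
subcommands = parser.add_subparsers()
profile = subcommands.add_parser("profile", add_help=False)
profile.add_argument("-m", "--model")

argv = ["profile", "-m", "gpt2", "--concurrency", "4", "--streaming"]
args, unknown = parser.parse_known_args(argv)
print(unknown)  # ['--concurrency', '4', '--streaming']

# Unknown flags are attached to the namespace so build_command can forward them.
args = add_unknown_args_to_args(args, unknown)
print(args.concurrency, args.streaming)  # 4 True
```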
91 changes: 91 additions & 0 deletions src/triton_cli/profile.py
@@ -0,0 +1,91 @@
#!/usr/bin/env python3
# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import argparse
from typing import List


# ================================================
# Helper functions
# ================================================
def build_command(args: argparse.Namespace, executable: str):
skip_args = ["func"]
cmd = [executable]
for arg, value in vars(args).items():
if arg in skip_args:
pass
elif value is False:
pass
elif value is True:
if len(arg) == 1:
cmd += [f"-{arg}"]
else:
cmd += [f"--{arg}"]
# [DLIS-6656] - Remove backend renaming.
# This allows "tensorrtllm" to be used as the backend for consistency.
# Once GenAI-Perf releases 24.05, "tensorrtllm" as the backend value
# will be supported by default.
elif arg == "backend" and value in ["tensorrtllm", "trtllm"]:
cmd += ["--backend", "trtllm"]
else:
if len(arg) == 1:
cmd += [f"-{arg}", f"{value}"]
else:
cmd += [f"--{arg}", f"{value}"]
return cmd


def add_unknown_args_to_args(args: argparse.Namespace, unknown_args: List[str]):
"""Add unknown args to args list"""
unknown_args_dict = turn_unknown_args_into_dict(unknown_args)
for key, value in unknown_args_dict.items():
setattr(args, key, value)
return args


def turn_unknown_args_into_dict(unknown_args: List[str]):
"""Convert list of unknown args to dictionary"""
it = iter(unknown_args)
unknown_args_dict = {}
try:
while True:
arg = next(it)
if arg.startswith(("-", "--")):
key = arg.lstrip("-")
# Peek to see if next item is a value or another flag
next_arg = next(it, None)
if next_arg and not next_arg.startswith(("-", "--")):
unknown_args_dict[key] = next_arg
else:
unknown_args_dict[key] = True
if next_arg:
it = iter([next_arg] + list(it))
else:
raise ValueError(f"Argument does not start with a '-' or '--': {arg}")
except StopIteration:
pass
return unknown_args_dict
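
(Illustrative note: a rough sketch of what `build_command` emits for a hypothetical namespace, again assuming `triton_cli.profile` is importable; in the CLI the namespace comes from the profile subparser plus any passthrough flags attached by `add_unknown_args_to_args`.)

```python
import argparse

from triton_cli.profile import build_command

# Hypothetical namespace with a handful of representative attributes.
args = argparse.Namespace(
    func=None,              # skipped via skip_args
    model="llama-3-8b-instruct",
    backend="tensorrtllm",  # renamed to "trtllm" for GenAI-Perf
    verbose=True,           # True booleans become bare flags
    streaming=False,        # False booleans are dropped
)

print(build_command(args, "genai-perf"))
# ['genai-perf', '--model', 'llama-3-8b-instruct', '--backend', 'trtllm', '--verbose']
```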