
Catch errors and improve logging in Profiler #23

Merged (3 commits) on Jan 14, 2024
1 change: 1 addition & 0 deletions src/triton_cli/parser.py
@@ -438,6 +438,7 @@ def profile_model(args: argparse.Namespace, client: TritonClient):
         url=f"{args.url}:{args.port}",
         input_length=args.input_length,
         output_length=args.output_length,
+        verbose=args.verbose,
     )


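For context on the parser.py change above: the new verbose keyword is forwarded from args.verbose, which assumes parser.py already defines a verbosity flag elsewhere (the flag definition is not part of this diff). A minimal argparse sketch of how such a flag could be declared; the option name and help text here are assumptions, not taken from the repository:

import argparse

# Hypothetical stand-in for the real triton_cli parser setup; the actual
# flag name and default are defined elsewhere in parser.py.
parser = argparse.ArgumentParser(prog="triton")
parser.add_argument(
    "--verbose",
    action="store_true",  # args.verbose becomes True when the flag is given
    help="Show Perf Analyzer output while profiling",
)

args = parser.parse_args(["--verbose"])
print(args.verbose)  # True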
70 changes: 59 additions & 11 deletions src/triton_cli/profiler.py
@@ -25,6 +25,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 import json
+import logging
 import subprocess
 from dataclasses import dataclass
 from itertools import tee
@@ -33,6 +34,11 @@

 import numpy as np

+from triton_cli.constants import LOGGER_NAME
+
+logger = logging.getLogger(LOGGER_NAME)
+
+
 INPUT_FILENAME = "generated_input_data.json"
 METRIC_FIELDS = {
     # "max_first_token_latency": ("Max first token latency", "ms"),
@@ -396,6 +402,28 @@ def calculate_offline_metrics(args, profile_result, export_data):


 def calculate_metrics(args, profile_result, export_data):
+    # Sanity check the number of responses received from backend
+    if args.ignore_eos:
+        requests = export_data["experiments"][0]["requests"]
+        for request in requests:
+            if len(request["response_timestamps"]) == args.max_tokens:
+                # Assume FINAL flag is returned with final token response
+                pass
+            elif len(request["response_timestamps"]) == args.max_tokens + 1:
+                # Assume FINAL flag was returned with an empty response after
+                # the final token
+                logger.warning(
+                    "Received an extra response from the backend. This may be "
+                    "due to the backend sending an 'empty final response'."
+                )
+            else:
+                raise ValueError(
+                    f"Expecting {args.max_tokens} tokens but received "
+                    f"{len(request['response_timestamps'])} tokens. "
+                    f"This could be due to an unsupported sequence length. "
+                    f"Please double check the input and output length."
+                )
+
     calculate_offline_metrics(args, profile_result, export_data)
     if not args.offline:
         calculate_online_metrics(args, profile_result, export_data)

Review comments on the logger.warning(...) lines in this hunk:

Collaborator: Can you include the comments I had in the other comment in-line here? The # Assume ... ones
Collaborator: This will help me remember the reasoning for the check/split later on
Contributor (author): Added comments
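The sanity check added in this hunk walks the profile export JSON written by Perf Analyzer. A rough sketch of the structure it assumes; the field values below are made up for illustration and only the keys read by the check are shown:

# Hypothetical, minimal stand-in for the Perf Analyzer export file.
export_data = {
    "experiments": [
        {
            "requests": [
                {"response_timestamps": [101, 102, 103]},       # exactly max_tokens responses
                {"response_timestamps": [201, 202, 203, 204]},  # one extra "empty final response"
            ]
        }
    ]
}

max_tokens = 3  # stand-in for args.max_tokens
for request in export_data["experiments"][0]["requests"]:
    n = len(request["response_timestamps"])
    if n == max_tokens:
        pass  # FINAL flag arrived with the last token
    elif n == max_tokens + 1:
        print("warning: extra empty final response")
    else:
        raise ValueError(f"expected {max_tokens} responses, got {n}")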
@@ -455,7 +483,15 @@ def profile(args, export_file):
         f"--concurrency-range={args.concurrency}"
     )

-    subprocess.run(args=[command], shell=True, stdout=subprocess.DEVNULL)
+    proc = subprocess.run(args=[command], shell=True, capture_output=True)
+
+    if args.verbose:
+        logger.info(f"Perf Analyzer output:\n{proc.stdout.decode('utf-8')}")
+    if proc.returncode:
+        raise RuntimeError(
+            "Encountered the following error while running Perf Analyzer:\n"
+            f"{proc.stderr.decode('utf-8').rstrip()}"
+        )


 def prepare_export_file(args, prompt):
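Replacing stdout=subprocess.DEVNULL with capture_output=True is what enables both behaviors in this hunk: stdout can be echoed when verbose is set, and stderr is attached to the RuntimeError when Perf Analyzer fails. The same pattern in isolation, using a deliberately failing command as a stand-in for the real perf_analyzer invocation:

import subprocess

# Stand-in command; the CLI builds a perf_analyzer command string instead.
command = "ls /path/that/does/not/exist"
proc = subprocess.run(args=[command], shell=True, capture_output=True)

print(proc.stdout.decode("utf-8"))  # empty here, but logged when verbose is set
if proc.returncode:
    raise RuntimeError(
        "Encountered the following error while running the command:\n"
        f"{proc.stderr.decode('utf-8').rstrip()}"
    )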
@@ -615,31 +651,43 @@ class Args:
     offline = False
     url = "localhost:8001"
     concurrency = 1
+    verbose = False


 class Profiler:
     @staticmethod
-    def profile(model, backend, batch_size, url, input_length=128, output_length=128):
+    def profile(
+        model,
+        backend,
+        batch_size,
+        url,
+        input_length=128,
+        output_length=128,
+        verbose=False,
+    ):
         args = Args()
         args.model = model
         args.backend = backend
         args.concurrency = batch_size  # inflight batch size
         args.url = url
         args.prompt_size_range = [input_length, input_length, 1]
         args.max_tokens = output_length
+        args.verbose = verbose

         start, end, step = args.prompt_size_range
         assert start == end and step == 1  # no sweeping for now

-        print("Warming up...")
+        logger.info("Warming up...")
         main(args, should_summarize=False)  # warm-up

-        print("Warmed up, profiling now...\n")
-        print("[ PROFILE CONFIGURATIONS ]")
-        print(f" * Model: {args.model}")
-        print(f" * Backend: {args.backend}")
-        print(f" * Batch size: {args.concurrency}")
-        print(f" * Input tokens: {args.prompt_size_range[0]}")
-        print(f" * Output tokens: {args.max_tokens}")
-        print("")
+        logger.info(
+            "Warmed up, profiling now...\n"
+            "[ PROFILE CONFIGURATIONS ]\n"
+            f" * Model: {args.model}\n"
+            f" * Backend: {args.backend}\n"
+            f" * Batch size: {args.concurrency}\n"
+            f" * Input tokens: {args.prompt_size_range[0]}\n"
+            f" * Output tokens: {args.max_tokens}\n"
+            ""
+        )
         main(args)
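With the new parameter in place, a caller can opt into the extra output by passing verbose=True and making the CLI logger visible. A sketch under assumed values; the model name, backend string, and URL below are placeholders that depend on the local setup:

import logging

from triton_cli.constants import LOGGER_NAME
from triton_cli.profiler import Profiler

# Make the profiler's logger.info / logger.warning messages visible.
logging.basicConfig(level=logging.INFO)
logging.getLogger(LOGGER_NAME).setLevel(logging.INFO)

# Placeholder values; substitute a model and backend that exist locally.
Profiler.profile(
    model="my-model",
    backend="tensorrtllm",
    batch_size=1,
    url="localhost:8001",
    input_length=128,
    output_length=128,
    verbose=True,  # forwards Perf Analyzer stdout to the logger
)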