Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Separating result processor out from profiler.py #23251

Merged
merged 11 commits into from
Jan 7, 2025
363 changes: 363 additions & 0 deletions onnxruntime/python/tools/transformers/profile_result_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,363 @@
import argparse
import json

"""
This profiler result processor print out the kernel time spent on each Node of the model.
Example of importing profile result file from onnxruntime_perf_test:
python profile_result_processor.py --input profile_2021-10-25_12-02-41.json
"""

NODES_TYPE_CONTAINING_SUBGRAPH = ["Scan", "Loop", "If"]
jchen351 marked this conversation as resolved.
Show resolved Hide resolved


def parse_arguments(argv=None):
parser = argparse.ArgumentParser()

parser.add_argument(
"-i",
"--input",
required=False,
type=str,
help="Set the input file for reading the profile results",
)

parser.add_argument(
"--threshold",
required=False,
type=float,
default=0.01,
help="Threshold of run time ratio among all nodes. Nodes with larger ratio will show in top expensive nodes.",
)

parser.add_argument(
"--provider",
required=False,
type=str,
default="cuda",
help="Execution provider to use",
)

parser.add_argument(
"--basic_optimization",
required=False,
action="store_true",
help="Enable only basic graph optimizations. By default, all optimizations are enabled in OnnxRuntime",
)
parser.set_defaults(basic_optimization=False)

parser.add_argument(
"--kernel_time_only",
required=False,
action="store_true",
help="Only include the kernel time and no fence time",
)

parser.set_defaults(kernel_time_only=False)

parser.add_argument("-v", "--verbose", required=False, action="store_true")
parser.set_defaults(verbose=False)

return parser.parse_args(argv)


def load_profile_json(profile_file):
print(f"loading profile output {profile_file} ...")

with open(profile_file) as opened_file:
sess_time = json.load(opened_file)

assert isinstance(sess_time, list)
return sess_time


def parse_kernel_results(sess_time, threshold=0):
jchen351 marked this conversation as resolved.
Show resolved Hide resolved
"""Parse profile data and output nodes in two sections - nodes in the original order, and top expensive nodes.

Args:
sess_time (List[Dict]): profile data
kernel_time_only (bool, optional): Only include items for kernel time. Defaults to False.
threshold (int, optional): Minimum ratio of duration among all. Defaults to 0.

Returns:
List[str]: lines of string for output.
"""
kernel_name_to_op_name = {}
kernel_time = {}
kernel_freq = {}
total = 0
session_init = False
for item in sess_time:
# Skip all MemcpyHostToDevice before session_initialization
if item["cat"] == "Session" and item["name"] == "session_initialization":
session_init = True
if not session_init:
continue

if item["cat"] == "Kernel" and "dur" in item and "args" in item and "op_name" in item["args"]:
kernel_name = item["name"]

op_name = item["args"]["op_name"]
if op_name in NODES_TYPE_CONTAINING_SUBGRAPH:
continue

# Handle MemcpyHostToDevice and MemcpyDeviceToHost here
if not op_name:
op_name = f"({kernel_name})"

if kernel_name in kernel_time:
kernel_time[kernel_name] += item["dur"]
kernel_freq[kernel_name] += 1
else:
kernel_time[kernel_name] = item["dur"]
kernel_freq[kernel_name] = 1
kernel_name_to_op_name[kernel_name] = op_name

total += item["dur"]

if not kernel_time:
return ["No kernel record found!"]

# Output items with run time ratio > thresholds, and sorted by duration in the descending order.
lines = []
lines.append(f"\nTop expensive kernels with Time% >= {threshold*100:.2f}:")
lines.append("-" * 64)
lines.append("Total(μs)\tTime%\tCalls\tAvg(μs)\tKernel")
for kernel_name, duration in sorted(kernel_time.items(), key=lambda x: x[1], reverse=True):
ratio = duration / total
if ratio < threshold:
continue

calls = kernel_freq[kernel_name]
avg_time = duration / float(calls)
lines.append(f"{duration:10d}\t{ratio * 100.0:5.2f}\t{calls:5d}\t{avg_time:8.1f}\t{kernel_name}")

# Group by operator
op_time = {}
for kernel_name, op_name in kernel_name_to_op_name.items():
duration = kernel_time[kernel_name]
if op_name in op_time:
op_time[op_name] += duration
else:
op_time[op_name] = duration

lines.append("\nGroup kernel time by operator:")
lines.append("-" * 64)
lines.append("Total(μs)\tTime%\tOperator")
for op_name, duration in sorted(op_time.items(), key=lambda x: x[1], reverse=True):
ratio = duration / total
lines.append(f"{duration:10d}\t{ratio * 100.0:5.2f}\t{op_name}")

return lines


def parse_node_results(sess_time, kernel_time_only=False, threshold=0):
"""Parse profile data and output nodes in two sections - nodes in the original order, and top expensive nodes.

Args:
sess_time (List[Dict]): profile data
kernel_time_only (bool, optional): Only include items for kernel time. Defaults to False.
threshold (int, optional): Minimum ratio of duration among all. Defaults to 0.

Returns:
List[str]: lines of string for output.
"""
node_name_list = []
node_time = {}
node_freq = {}
node_provider = {}
total = 0
for item in sess_time:
if item["cat"] == "Node" and "dur" in item and "args" in item and "op_name" in item["args"]:
node_name = (
item["name"].replace("_kernel_time", "").replace("_fence_before", "").replace("_fence_after", "")
)

if "provider" in item["args"]:
if item["args"]["provider"] == "CPUExecutionProvider":
device = "CPU"
elif item["args"]["provider"] == "CUDAExecutionProvider":
device = "CUDA"
elif item["args"]["provider"] == "DmlExecutionProvider":
device = "DML"

if node_name not in node_provider:
node_provider[node_name] = device
else:
assert node_provider[node_name] == device
elif kernel_time_only:
continue

op_name = item["args"]["op_name"]
if op_name in NODES_TYPE_CONTAINING_SUBGRAPH:
continue

if node_name in node_time:
node_time[node_name] += item["dur"]
node_freq[node_name] += 1
else:
node_time[node_name] = item["dur"]
node_freq[node_name] = 1
node_name_list.append(node_name)

total += item["dur"]

# Output items in the original order.
lines = [
"\nNodes in the original order:",
"-" * 64,
"Total(μs)\tTime%\tAcc %\tAvg(μs)\tCalls\tProvider\tNode",
]
before_percentage = 0.0
for node_name in node_name_list:
duration = node_time[node_name]
calls = node_freq[node_name]
avg_time = duration / float(calls)
percentage = (duration / total) * 100.0
provider = node_provider.get(node_name, "")
before_percentage += percentage
lines.append(
f"{duration:10d}\t{percentage:5.2f}\t{before_percentage:5.2f}\t{avg_time:8.1f}\t{calls:5d}\t{provider:8s}\t{node_name}"
)

# Output items with run time ratio > thresholds, and sorted by duration in the descending order.
lines.append(f"\nTop expensive nodes with Time% >= {threshold*100:.2f}:")
lines.append("-" * 64)
lines.append("Total(μs)\tTime%\tAvg(μs)\tCalls\tProvider\tNode")
for node_name, duration in sorted(node_time.items(), key=lambda x: x[1], reverse=True):
ratio = duration / total
if ratio < threshold:
continue

calls = node_freq[node_name]
avg_time = duration / float(calls)
percentage = (duration / total) * 100.0
provider = node_provider.get(node_name, "")
lines.append(f"{duration:10d}\t{percentage:5.2f}\t{avg_time:8.1f}\t{calls:5d}\t{provider:8s}\t{node_name}")

return lines


def group_node_results(sess_time):
jchen351 marked this conversation as resolved.
Show resolved Hide resolved
"""Group results by operator name.

Args:
sess_time (List[Dict]): profile data
kernel_time_only (bool): Only include items for kernel time.
use_gpu (bool): GPU is used in profiling or not.

Returns:
List[str]: lines of string for output.
"""
op_kernel_time = {}
op_kernel_records = {}
total_kernel_time = 0

provider_op_kernel_time = {}
provider_op_kernel_records = {}
provider_kernel_time = {}

op_fence_time = {}
total_fence_time = 0

provider_counter = {}
for item in sess_time:
if item["cat"] == "Node" and "dur" in item and "args" in item and "op_name" in item["args"]:
op_name = item["args"]["op_name"]

# TODO: shall we have a separated group for nodes with subgraph?
if op_name in NODES_TYPE_CONTAINING_SUBGRAPH:
continue

if "provider" not in item["args"]:
if "fence" in item["name"]:
if op_name in op_fence_time:
op_fence_time[op_name] += item["dur"]
else:
op_fence_time[op_name] = item["dur"]
total_fence_time += item["dur"]
continue

provider = item["args"].get("provider", "")
if provider in provider_counter:
provider_counter[provider] += 1
else:
provider_counter[provider] = 1

key = f"{provider}:{op_name}"
if key in provider_op_kernel_time:
provider_op_kernel_time[key] += item["dur"]
provider_op_kernel_records[key] += 1
else:
provider_op_kernel_time[key] = item["dur"]
provider_op_kernel_records[key] = 1

if provider in provider_kernel_time:
provider_kernel_time[provider] += item["dur"]
else:
provider_kernel_time[provider] = item["dur"]

if op_name in op_kernel_time:
op_kernel_time[op_name] += item["dur"]
op_kernel_records[op_name] += 1
else:
op_kernel_time[op_name] = item["dur"]
op_kernel_records[op_name] = 1

total_kernel_time += item["dur"]

lines = ["", "Grouped by operator"]
lines.append("-" * 64)
lines.append("Total(μs)\tTime%\tKernel(μs)\tKernel%\tCalls\tAvgKernel(μs)\tFence(μs)\tOperator")
for op_name, kernel_time in sorted(op_kernel_time.items(), key=lambda x: x[1], reverse=True):
fence_time = op_fence_time.get(op_name, 0)
kernel_time_ratio = kernel_time / total_kernel_time
total_time = kernel_time + fence_time
time_ratio = total_time / (total_kernel_time + total_fence_time)
kernel_calls = op_kernel_records[op_name]
avg_kernel_time = kernel_time / kernel_calls
lines.append(
f"{total_time:10d}\t{time_ratio * 100.0:5.2f}\t{kernel_time:11d}\t{kernel_time_ratio * 100.0:5.2f}\t{kernel_calls:5d}\t{avg_kernel_time:14.1f}\t{fence_time:10d}\t{op_name}"
)

lines += ["", "Grouped by provider + operator"]
lines.append("-" * 64)
lines.append("Kernel(μs)\tProvider%\tCalls\tAvgKernel(μs)\tProvider\tOperator")
for key, kernel_time in sorted(provider_op_kernel_time.items(), key=lambda x: x[1], reverse=True):
parts = key.split(":")
provider = parts[0]
op_name = parts[1]
short_ep = provider.replace("ExecutionProvider", "")
calls = provider_op_kernel_records[key]
avg_kernel_time = kernel_time / calls
provider_time_ratio = kernel_time / provider_kernel_time[provider]
lines.append(
f"{kernel_time:10d}\t{provider_time_ratio * 100.0:9.2f}\t{calls:5d}\t{avg_kernel_time:14.1f}\t{short_ep:8s}\t{op_name}"
)

return lines

def process_results(profile_file, args):
jchen351 marked this conversation as resolved.
Show resolved Hide resolved
profile_records = load_profile_json(profile_file)

lines = parse_kernel_results(profile_records, args.threshold)

lines += parse_node_results(profile_records, args.kernel_time_only, args.threshold)

lines += group_node_results(profile_records)

return lines

if __name__ == "__main__":
jchen351 marked this conversation as resolved.
Show resolved Hide resolved
arguments = parse_arguments()
print("Arguments", arguments)

from benchmark_helper import setup_logger

setup_logger(arguments.verbose)

profile_file = arguments.input

results = process_results(profile_file, arguments)

for line in results:
print(line)
jchen351 marked this conversation as resolved.
Show resolved Hide resolved
Loading
Loading