From c117213a7b5c7f78d4048e020c1c12f43ea21ca7 Mon Sep 17 00:00:00 2001
From: Jian Chen
Date: Sat, 4 Jan 2025 10:39:03 -0800
Subject: [PATCH 01/10] Separating result processor out from profiler.py
 without changing the behavior of the current profiler.py

---
 .../transformers/profile_result_processor.py | 363 ++++++++++++++++++
 .../python/tools/transformers/profiler.py    | 293 +-------------
 2 files changed, 364 insertions(+), 292 deletions(-)
 create mode 100644 onnxruntime/python/tools/transformers/profile_result_processor.py

diff --git a/onnxruntime/python/tools/transformers/profile_result_processor.py b/onnxruntime/python/tools/transformers/profile_result_processor.py
new file mode 100644
index 0000000000000..4e06e4c4948fd
--- /dev/null
+++ b/onnxruntime/python/tools/transformers/profile_result_processor.py
@@ -0,0 +1,363 @@
+import argparse
+import json
+
+"""
+This profiler result processor prints out the kernel time spent on each Node of the model.
+Example of importing profile result file from onnxruntime_perf_test:
+    python profile_result_processor.py --input profile_2021-10-25_12-02-41.json
+"""
+
+NODES_TYPE_CONTAINING_SUBGRAPH = ["Scan", "Loop", "If"]
+
+
+def parse_arguments(argv=None):
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "-i",
+        "--input",
+        required=False,
+        type=str,
+        help="Set the input file for reading the profile results",
+    )
+
+    parser.add_argument(
+        "--threshold",
+        required=False,
+        type=float,
+        default=0.01,
+        help="Threshold of run time ratio among all nodes. Nodes with larger ratio will show in top expensive nodes.",
+    )
+
+    parser.add_argument(
+        "--provider",
+        required=False,
+        type=str,
+        default="cuda",
+        help="Execution provider to use",
+    )
+
+    parser.add_argument(
+        "--basic_optimization",
+        required=False,
+        action="store_true",
+        help="Enable only basic graph optimizations. By default, all optimizations are enabled in OnnxRuntime",
+    )
+    parser.set_defaults(basic_optimization=False)
+
+    parser.add_argument(
+        "--kernel_time_only",
+        required=False,
+        action="store_true",
+        help="Only include the kernel time and no fence time",
+    )
+
+    parser.set_defaults(kernel_time_only=False)
+
+    parser.add_argument("-v", "--verbose", required=False, action="store_true")
+    parser.set_defaults(verbose=False)
+
+    return parser.parse_args(argv)
+
+
+def load_profile_json(profile_file):
+    print(f"loading profile output {profile_file} ...")
+
+    with open(profile_file) as opened_file:
+        sess_time = json.load(opened_file)
+
+    assert isinstance(sess_time, list)
+    return sess_time
+
+
+def parse_kernel_results(sess_time, threshold=0):
+    """Parse profile data and output nodes in two sections - nodes in the original order, and top expensive nodes.
+
+    Args:
+        sess_time (List[Dict]): profile data
+        kernel_time_only (bool, optional): Only include items for kernel time. Defaults to False.
+        threshold (int, optional): Minimum ratio of duration among all. Defaults to 0.
+
+    Returns:
+        List[str]: lines of string for output.
+    """
+    kernel_name_to_op_name = {}
+    kernel_time = {}
+    kernel_freq = {}
+    total = 0
+    session_init = False
+    for item in sess_time:
+        # Skip all MemcpyHostToDevice before session_initialization
+        if item["cat"] == "Session" and item["name"] == "session_initialization":
+            session_init = True
+        if not session_init:
+            continue
+
+        if item["cat"] == "Kernel" and "dur" in item and "args" in item and "op_name" in item["args"]:
+            kernel_name = item["name"]
+
+            op_name = item["args"]["op_name"]
+            if op_name in NODES_TYPE_CONTAINING_SUBGRAPH:
+                continue
+
+            # Handle MemcpyHostToDevice and MemcpyDeviceToHost here
+            if not op_name:
+                op_name = f"({kernel_name})"
+
+            if kernel_name in kernel_time:
+                kernel_time[kernel_name] += item["dur"]
+                kernel_freq[kernel_name] += 1
+            else:
+                kernel_time[kernel_name] = item["dur"]
+                kernel_freq[kernel_name] = 1
+                kernel_name_to_op_name[kernel_name] = op_name
+
+            total += item["dur"]
+
+    if not kernel_time:
+        return ["No kernel record found!"]
+
+    # Output items with run time ratio > thresholds, and sorted by duration in the descending order.
+    lines = []
+    lines.append(f"\nTop expensive kernels with Time% >= {threshold*100:.2f}:")
+    lines.append("-" * 64)
+    lines.append("Total(μs)\tTime%\tCalls\tAvg(μs)\tKernel")
+    for kernel_name, duration in sorted(kernel_time.items(), key=lambda x: x[1], reverse=True):
+        ratio = duration / total
+        if ratio < threshold:
+            continue
+
+        calls = kernel_freq[kernel_name]
+        avg_time = duration / float(calls)
+        lines.append(f"{duration:10d}\t{ratio * 100.0:5.2f}\t{calls:5d}\t{avg_time:8.1f}\t{kernel_name}")
+
+    # Group by operator
+    op_time = {}
+    for kernel_name, op_name in kernel_name_to_op_name.items():
+        duration = kernel_time[kernel_name]
+        if op_name in op_time:
+            op_time[op_name] += duration
+        else:
+            op_time[op_name] = duration
+
+    lines.append("\nGroup kernel time by operator:")
+    lines.append("-" * 64)
+    lines.append("Total(μs)\tTime%\tOperator")
+    for op_name, duration in sorted(op_time.items(), key=lambda x: x[1], reverse=True):
+        ratio = duration / total
+        lines.append(f"{duration:10d}\t{ratio * 100.0:5.2f}\t{op_name}")
+
+    return lines
+
+
+def parse_node_results(sess_time, kernel_time_only=False, threshold=0):
+    """Parse profile data and output nodes in two sections - nodes in the original order, and top expensive nodes.
+
+    Args:
+        sess_time (List[Dict]): profile data
+        kernel_time_only (bool, optional): Only include items for kernel time. Defaults to False.
+        threshold (int, optional): Minimum ratio of duration among all. Defaults to 0.
+
+    Returns:
+        List[str]: lines of string for output.
+    """
+    node_name_list = []
+    node_time = {}
+    node_freq = {}
+    node_provider = {}
+    total = 0
+    for item in sess_time:
+        if item["cat"] == "Node" and "dur" in item and "args" in item and "op_name" in item["args"]:
+            node_name = (
+                item["name"].replace("_kernel_time", "").replace("_fence_before", "").replace("_fence_after", "")
+            )
+
+            if "provider" in item["args"]:
+                if item["args"]["provider"] == "CPUExecutionProvider":
+                    device = "CPU"
+                elif item["args"]["provider"] == "CUDAExecutionProvider":
+                    device = "CUDA"
+                elif item["args"]["provider"] == "DmlExecutionProvider":
+                    device = "DML"
+
+                if node_name not in node_provider:
+                    node_provider[node_name] = device
+                else:
+                    assert node_provider[node_name] == device
+            elif kernel_time_only:
+                continue
+
+            op_name = item["args"]["op_name"]
+            if op_name in NODES_TYPE_CONTAINING_SUBGRAPH:
+                continue
+
+            if node_name in node_time:
+                node_time[node_name] += item["dur"]
+                node_freq[node_name] += 1
+            else:
+                node_time[node_name] = item["dur"]
+                node_freq[node_name] = 1
+                node_name_list.append(node_name)
+
+            total += item["dur"]
+
+    # Output items in the original order.
+    lines = [
+        "\nNodes in the original order:",
+        "-" * 64,
+        "Total(μs)\tTime%\tAcc %\tAvg(μs)\tCalls\tProvider\tNode",
+    ]
+    before_percentage = 0.0
+    for node_name in node_name_list:
+        duration = node_time[node_name]
+        calls = node_freq[node_name]
+        avg_time = duration / float(calls)
+        percentage = (duration / total) * 100.0
+        provider = node_provider.get(node_name, "")
+        before_percentage += percentage
+        lines.append(
+            f"{duration:10d}\t{percentage:5.2f}\t{before_percentage:5.2f}\t{avg_time:8.1f}\t{calls:5d}\t{provider:8s}\t{node_name}"
+        )
+
+    # Output items with run time ratio > thresholds, and sorted by duration in the descending order.
+    lines.append(f"\nTop expensive nodes with Time% >= {threshold*100:.2f}:")
+    lines.append("-" * 64)
+    lines.append("Total(μs)\tTime%\tAvg(μs)\tCalls\tProvider\tNode")
+    for node_name, duration in sorted(node_time.items(), key=lambda x: x[1], reverse=True):
+        ratio = duration / total
+        if ratio < threshold:
+            continue
+
+        calls = node_freq[node_name]
+        avg_time = duration / float(calls)
+        percentage = (duration / total) * 100.0
+        provider = node_provider.get(node_name, "")
+        lines.append(f"{duration:10d}\t{percentage:5.2f}\t{avg_time:8.1f}\t{calls:5d}\t{provider:8s}\t{node_name}")
+
+    return lines
+
+
+def group_node_results(sess_time):
+    """Group results by operator name.
+
+    Args:
+        sess_time (List[Dict]): profile data
+        kernel_time_only (bool): Only include items for kernel time.
+        use_gpu (bool): GPU is used in profiling or not.
+
+    Returns:
+        List[str]: lines of string for output.
+    """
+    op_kernel_time = {}
+    op_kernel_records = {}
+    total_kernel_time = 0
+
+    provider_op_kernel_time = {}
+    provider_op_kernel_records = {}
+    provider_kernel_time = {}
+
+    op_fence_time = {}
+    total_fence_time = 0
+
+    provider_counter = {}
+    for item in sess_time:
+        if item["cat"] == "Node" and "dur" in item and "args" in item and "op_name" in item["args"]:
+            op_name = item["args"]["op_name"]
+
+            # TODO: shall we have a separated group for nodes with subgraph?
+            if op_name in NODES_TYPE_CONTAINING_SUBGRAPH:
+                continue
+
+            if "provider" not in item["args"]:
+                if "fence" in item["name"]:
+                    if op_name in op_fence_time:
+                        op_fence_time[op_name] += item["dur"]
+                    else:
+                        op_fence_time[op_name] = item["dur"]
+                    total_fence_time += item["dur"]
+                continue
+
+            provider = item["args"].get("provider", "")
+            if provider in provider_counter:
+                provider_counter[provider] += 1
+            else:
+                provider_counter[provider] = 1
+
+            key = f"{provider}:{op_name}"
+            if key in provider_op_kernel_time:
+                provider_op_kernel_time[key] += item["dur"]
+                provider_op_kernel_records[key] += 1
+            else:
+                provider_op_kernel_time[key] = item["dur"]
+                provider_op_kernel_records[key] = 1
+
+            if provider in provider_kernel_time:
+                provider_kernel_time[provider] += item["dur"]
+            else:
+                provider_kernel_time[provider] = item["dur"]
+
+            if op_name in op_kernel_time:
+                op_kernel_time[op_name] += item["dur"]
+                op_kernel_records[op_name] += 1
+            else:
+                op_kernel_time[op_name] = item["dur"]
+                op_kernel_records[op_name] = 1
+
+            total_kernel_time += item["dur"]
+
+    lines = ["", "Grouped by operator"]
+    lines.append("-" * 64)
+    lines.append("Total(μs)\tTime%\tKernel(μs)\tKernel%\tCalls\tAvgKernel(μs)\tFence(μs)\tOperator")
+    for op_name, kernel_time in sorted(op_kernel_time.items(), key=lambda x: x[1], reverse=True):
+        fence_time = op_fence_time.get(op_name, 0)
+        kernel_time_ratio = kernel_time / total_kernel_time
+        total_time = kernel_time + fence_time
+        time_ratio = total_time / (total_kernel_time + total_fence_time)
+        kernel_calls = op_kernel_records[op_name]
+        avg_kernel_time = kernel_time / kernel_calls
+        lines.append(
+            f"{total_time:10d}\t{time_ratio * 100.0:5.2f}\t{kernel_time:11d}\t{kernel_time_ratio * 100.0:5.2f}\t{kernel_calls:5d}\t{avg_kernel_time:14.1f}\t{fence_time:10d}\t{op_name}"
+        )
+
+    lines += ["", "Grouped by provider + operator"]
+    lines.append("-" * 64)
+    lines.append("Kernel(μs)\tProvider%\tCalls\tAvgKernel(μs)\tProvider\tOperator")
+    for key, kernel_time in sorted(provider_op_kernel_time.items(), key=lambda x: x[1], reverse=True):
+        parts = key.split(":")
+        provider = parts[0]
+        op_name = parts[1]
+        short_ep = provider.replace("ExecutionProvider", "")
+        calls = provider_op_kernel_records[key]
+        avg_kernel_time = kernel_time / calls
+        provider_time_ratio = kernel_time / provider_kernel_time[provider]
+        lines.append(
+            f"{kernel_time:10d}\t{provider_time_ratio * 100.0:9.2f}\t{calls:5d}\t{avg_kernel_time:14.1f}\t{short_ep:8s}\t{op_name}"
+        )
+
+    return lines
+
+def process_results(profile_file, args):
+    profile_records = load_profile_json(profile_file)
+
+    lines = parse_kernel_results(profile_records, args.threshold)
+
+    lines += parse_node_results(profile_records, args.kernel_time_only, args.threshold)
+
+    lines += group_node_results(profile_records)
+
+    return lines
+
+if __name__ == "__main__":
+    arguments = parse_arguments()
+    print("Arguments", arguments)
+
+    from benchmark_helper import setup_logger
+
+    setup_logger(arguments.verbose)
+
+    profile_file = arguments.input
+
+    results = process_results(profile_file, arguments)
+
+    for line in results:
+        print(line)
\ No newline at end of file
diff --git a/onnxruntime/python/tools/transformers/profiler.py b/onnxruntime/python/tools/transformers/profiler.py
index 2306b579f92fe..cf9ef6c123691 100644
--- a/onnxruntime/python/tools/transformers/profiler.py
+++ b/onnxruntime/python/tools/transformers/profiler.py
@@ -1,5 +1,4 @@
 import argparse
-import json
 import os
 
 import numpy
@@ -175,284 +174,6 @@ def run_profile(onnx_model_path, use_gpu, provider, basic_optimization, thread_n
     profile_file = session.end_profiling()
     return profile_file
 
-
-def load_profile_json(profile_file):
-    print(f"loading profile output {profile_file} ...")
-
-    with open(profile_file) as opened_file:
-        sess_time = json.load(opened_file)
-
-    assert isinstance(sess_time, list)
-    return sess_time
-
-
-def parse_kernel_results(sess_time, threshold=0):
-    """Parse profile data and output nodes in two sections - nodes in the original order, and top expensive nodes.
-
-    Args:
-        sess_time (List[Dict]): profile data
-        kernel_time_only (bool, optional): Only include items for kernel time. Defaults to False.
-        threshold (int, optional): Minimum ratio of duration among all. Defaults to 0.
-
-    Returns:
-        List[str]: lines of string for output.
-    """
-    kernel_name_to_op_name = {}
-    kernel_time = {}
-    kernel_freq = {}
-    total = 0
-    session_init = False
-    for item in sess_time:
-        # Skip all MemcpyHostToDevice before session_initialization
-        if item["cat"] == "Session" and item["name"] == "session_initialization":
-            session_init = True
-        if not session_init:
-            continue
-
-        if item["cat"] == "Kernel" and "dur" in item and "args" in item and "op_name" in item["args"]:
-            kernel_name = item["name"]
-
-            op_name = item["args"]["op_name"]
-            if op_name in NODES_TYPE_CONTAINING_SUBGRAPH:
-                continue
-
-            # Handle MemcpyHostToDevice and MemcpyDeviceToHost here
-            if not op_name:
-                op_name = f"({kernel_name})"
-
-            if kernel_name in kernel_time:
-                kernel_time[kernel_name] += item["dur"]
-                kernel_freq[kernel_name] += 1
-            else:
-                kernel_time[kernel_name] = item["dur"]
-                kernel_freq[kernel_name] = 1
-                kernel_name_to_op_name[kernel_name] = op_name
-
-            total += item["dur"]
-
-    if not kernel_time:
-        return ["No kernel record found!"]
-
-    # Output items with run time ratio > thresholds, and sorted by duration in the descending order.
-    lines = []
-    lines.append(f"\nTop expensive kernels with Time% >= {threshold*100:.2f}:")
-    lines.append("-" * 64)
-    lines.append("Total(μs)\tTime%\tCalls\tAvg(μs)\tKernel")
-    for kernel_name, duration in sorted(kernel_time.items(), key=lambda x: x[1], reverse=True):
-        ratio = duration / total
-        if ratio < threshold:
-            continue
-
-        calls = kernel_freq[kernel_name]
-        avg_time = duration / float(calls)
-        lines.append(f"{duration:10d}\t{ratio * 100.0:5.2f}\t{calls:5d}\t{avg_time:8.1f}\t{kernel_name}")
-
-    # Group by operator
-    op_time = {}
-    for kernel_name, op_name in kernel_name_to_op_name.items():
-        duration = kernel_time[kernel_name]
-        if op_name in op_time:
-            op_time[op_name] += duration
-        else:
-            op_time[op_name] = duration
-
-    lines.append("\nGroup kernel time by operator:")
-    lines.append("-" * 64)
-    lines.append("Total(μs)\tTime%\tOperator")
-    for op_name, duration in sorted(op_time.items(), key=lambda x: x[1], reverse=True):
-        ratio = duration / total
-        lines.append(f"{duration:10d}\t{ratio * 100.0:5.2f}\t{op_name}")
-
-    return lines
-
-
-def parse_node_results(sess_time, kernel_time_only=False, threshold=0):
-    """Parse profile data and output nodes in two sections - nodes in the original order, and top expensive nodes.
-
-    Args:
-        sess_time (List[Dict]): profile data
-        kernel_time_only (bool, optional): Only include items for kernel time. Defaults to False.
-        threshold (int, optional): Minimum ratio of duration among all. Defaults to 0.
-
-    Returns:
-        List[str]: lines of string for output.
-    """
-    node_name_list = []
-    node_time = {}
-    node_freq = {}
-    node_provider = {}
-    total = 0
-    for item in sess_time:
-        if item["cat"] == "Node" and "dur" in item and "args" in item and "op_name" in item["args"]:
-            node_name = (
-                item["name"].replace("_kernel_time", "").replace("_fence_before", "").replace("_fence_after", "")
-            )
-
-            if "provider" in item["args"]:
-                if item["args"]["provider"] == "CPUExecutionProvider":
-                    device = "CPU"
-                elif item["args"]["provider"] == "CUDAExecutionProvider":
-                    device = "CUDA"
-                elif item["args"]["provider"] == "DmlExecutionProvider":
-                    device = "DML"
-
-                if node_name not in node_provider:
-                    node_provider[node_name] = device
-                else:
-                    assert node_provider[node_name] == device
-            elif kernel_time_only:
-                continue
-
-            op_name = item["args"]["op_name"]
-            if op_name in NODES_TYPE_CONTAINING_SUBGRAPH:
-                continue
-
-            if node_name in node_time:
-                node_time[node_name] += item["dur"]
-                node_freq[node_name] += 1
-            else:
-                node_time[node_name] = item["dur"]
-                node_freq[node_name] = 1
-                node_name_list.append(node_name)
-
-            total += item["dur"]
-
-    # Output items in the original order.
-    lines = [
-        "\nNodes in the original order:",
-        "-" * 64,
-        "Total(μs)\tTime%\tAcc %\tAvg(μs)\tCalls\tProvider\tNode",
-    ]
-    before_percentage = 0.0
-    for node_name in node_name_list:
-        duration = node_time[node_name]
-        calls = node_freq[node_name]
-        avg_time = duration / float(calls)
-        percentage = (duration / total) * 100.0
-        provider = node_provider.get(node_name, "")
-        before_percentage += percentage
-        lines.append(
-            f"{duration:10d}\t{percentage:5.2f}\t{before_percentage:5.2f}\t{avg_time:8.1f}\t{calls:5d}\t{provider:8s}\t{node_name}"
-        )
-
-    # Output items with run time ratio > thresholds, and sorted by duration in the descending order.
-    lines.append(f"\nTop expensive nodes with Time% >= {threshold*100:.2f}:")
-    lines.append("-" * 64)
-    lines.append("Total(μs)\tTime%\tAvg(μs)\tCalls\tProvider\tNode")
-    for node_name, duration in sorted(node_time.items(), key=lambda x: x[1], reverse=True):
-        ratio = duration / total
-        if ratio < threshold:
-            continue
-
-        calls = node_freq[node_name]
-        avg_time = duration / float(calls)
-        percentage = (duration / total) * 100.0
-        provider = node_provider.get(node_name, "")
-        lines.append(f"{duration:10d}\t{percentage:5.2f}\t{avg_time:8.1f}\t{calls:5d}\t{provider:8s}\t{node_name}")
-
-    return lines
-
-
-def group_node_results(sess_time, kernel_time_only, use_gpu):
-    """Group results by operator name.
-
-    Args:
-        sess_time (List[Dict]): profile data
-        kernel_time_only (bool): Only include items for kernel time.
-        use_gpu (bool): GPU is used in profiling or not.
-
-    Returns:
-        List[str]: lines of string for output.
-    """
-    op_kernel_time = {}
-    op_kernel_records = {}
-    total_kernel_time = 0
-
-    provider_op_kernel_time = {}
-    provider_op_kernel_records = {}
-    provider_kernel_time = {}
-
-    op_fence_time = {}
-    total_fence_time = 0
-
-    provider_counter = {}
-    for item in sess_time:
-        if item["cat"] == "Node" and "dur" in item and "args" in item and "op_name" in item["args"]:
-            op_name = item["args"]["op_name"]
-
-            # TODO: shall we have a separated group for nodes with subgraph?
-            if op_name in NODES_TYPE_CONTAINING_SUBGRAPH:
-                continue
-
-            if "provider" not in item["args"]:
-                if "fence" in item["name"]:
-                    if op_name in op_fence_time:
-                        op_fence_time[op_name] += item["dur"]
-                    else:
-                        op_fence_time[op_name] = item["dur"]
-                    total_fence_time += item["dur"]
-                continue
-
-            provider = item["args"].get("provider", "")
-            if provider in provider_counter:
-                provider_counter[provider] += 1
-            else:
-                provider_counter[provider] = 1
-
-            key = f"{provider}:{op_name}"
-            if key in provider_op_kernel_time:
-                provider_op_kernel_time[key] += item["dur"]
-                provider_op_kernel_records[key] += 1
-            else:
-                provider_op_kernel_time[key] = item["dur"]
-                provider_op_kernel_records[key] = 1
-
-            if provider in provider_kernel_time:
-                provider_kernel_time[provider] += item["dur"]
-            else:
-                provider_kernel_time[provider] = item["dur"]
-
-            if op_name in op_kernel_time:
-                op_kernel_time[op_name] += item["dur"]
-                op_kernel_records[op_name] += 1
-            else:
-                op_kernel_time[op_name] = item["dur"]
-                op_kernel_records[op_name] = 1
-
-            total_kernel_time += item["dur"]
-
-    lines = ["", "Grouped by operator"]
-    lines.append("-" * 64)
-    lines.append("Total(μs)\tTime%\tKernel(μs)\tKernel%\tCalls\tAvgKernel(μs)\tFence(μs)\tOperator")
-    for op_name, kernel_time in sorted(op_kernel_time.items(), key=lambda x: x[1], reverse=True):
-        fence_time = op_fence_time.get(op_name, 0)
-        kernel_time_ratio = kernel_time / total_kernel_time
-        total_time = kernel_time + fence_time
-        time_ratio = total_time / (total_kernel_time + total_fence_time)
-        kernel_calls = op_kernel_records[op_name]
-        avg_kernel_time = kernel_time / kernel_calls
-        lines.append(
-            f"{total_time:10d}\t{time_ratio * 100.0:5.2f}\t{kernel_time:11d}\t{kernel_time_ratio * 100.0:5.2f}\t{kernel_calls:5d}\t{avg_kernel_time:14.1f}\t{fence_time:10d}\t{op_name}"
-        )
-
-    lines += ["", "Grouped by provider + operator"]
-    lines.append("-" * 64)
-    lines.append("Kernel(μs)\tProvider%\tCalls\tAvgKernel(μs)\tProvider\tOperator")
-    for key, kernel_time in sorted(provider_op_kernel_time.items(), key=lambda x: x[1], reverse=True):
-        parts = key.split(":")
-        provider = parts[0]
-        op_name = parts[1]
-        short_ep = provider.replace("ExecutionProvider", "")
-        calls = provider_op_kernel_records[key]
-        avg_kernel_time = kernel_time / calls
-        provider_time_ratio = kernel_time / provider_kernel_time[provider]
-        lines.append(
-            f"{kernel_time:10d}\t{provider_time_ratio * 100.0:9.2f}\t{calls:5d}\t{avg_kernel_time:14.1f}\t{short_ep:8s}\t{op_name}"
-        )
-
-    return lines
-
-
 def get_dim_from_type_proto(dim):
     return getattr(dim, dim.WhichOneof("value")) if type(dim.WhichOneof("value")) == str else None  # noqa: E721
 
@@ -639,18 +360,6 @@ def create_longformer_inputs(onnx_model, batch_size, sequence_length, global_len
     return all_inputs
 
 
-def process_results(profile_file, args):
-    profile_records = load_profile_json(profile_file)
-
-    lines = parse_kernel_results(profile_records, args.threshold)
-
-    lines += parse_node_results(profile_records, args.kernel_time_only, args.threshold)
-
-    lines += group_node_results(profile_records, args.kernel_time_only, args.use_gpu)
-
-    return lines
-
-
 def run(args):
     num_threads = args.thread_num if args.thread_num > 0 else psutil.cpu_count(logical=False)
 
@@ -718,7 +427,7 @@ def run(args):
         profile_file = run(arguments)
     else:
         profile_file = arguments.input
-
+    import profile_result_processor import process_results
     results = process_results(profile_file, arguments)
     for line in results:
         print(line)
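A note on what PATCH 01 enables (illustrative; the caller below is hypothetical and not part of the series): because parse_arguments, load_profile_json and process_results move into profile_result_processor.py intact, the report generation can now be driven from another Python script without running a model, given a profile JSON produced by onnxruntime_perf_test or profiler.py:

    # hypothetical caller, not included in these patches
    from profile_result_processor import parse_arguments, process_results

    # parse_arguments() accepts an argv list, so the options namespace can be built in code.
    args = parse_arguments(["--input", "profile_2021-10-25_12-02-41.json", "--threshold", "0.02"])

    # process_results() loads the JSON and returns the report as a list of strings.
    for line in process_results(args.input, args):
        print(line)
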
From 0509b709dcc10e3da3157d9af96d9ac51b264c6e Mon Sep 17 00:00:00 2001
From: Jian Chen
Date: Sun, 5 Jan 2025 16:13:46 -0800
Subject: [PATCH 02/10] Separating result processor out from profiler.py
 without changing the behavior of the current profiler.py

---
 .../python/tools/transformers/profile_result_processor.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/onnxruntime/python/tools/transformers/profile_result_processor.py b/onnxruntime/python/tools/transformers/profile_result_processor.py
index 4e06e4c4948fd..df07f80be475d 100644
--- a/onnxruntime/python/tools/transformers/profile_result_processor.py
+++ b/onnxruntime/python/tools/transformers/profile_result_processor.py
@@ -336,6 +336,7 @@ def group_node_results(sess_time):
 
     return lines
 
+
 def process_results(profile_file, args):
     profile_records = load_profile_json(profile_file)
 
@@ -347,6 +348,7 @@ def process_results(profile_file, args):
 
     return lines
 
+
 if __name__ == "__main__":
     arguments = parse_arguments()
     print("Arguments", arguments)
@@ -360,4 +362,4 @@ def process_results(profile_file, args):
     profile_file = arguments.input
 
     results = process_results(profile_file, arguments)
 
     for line in results:
-        print(line)
\ No newline at end of file
+        print(line)

From e9d853dc00b7a82f854d80d448aa0499f3adfedc Mon Sep 17 00:00:00 2001
From: Jian Chen
Date: Mon, 6 Jan 2025 11:55:38 -0800
Subject: [PATCH 03/10] Add process_results function definition

---
 .../python/tools/transformers/profile_result_processor.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/onnxruntime/python/tools/transformers/profile_result_processor.py b/onnxruntime/python/tools/transformers/profile_result_processor.py
index df07f80be475d..185f9306f4c91 100644
--- a/onnxruntime/python/tools/transformers/profile_result_processor.py
+++ b/onnxruntime/python/tools/transformers/profile_result_processor.py
@@ -337,6 +337,7 @@ def group_node_results(sess_time):
 
     return lines
 
+
 def process_results(profile_file, args):
     profile_records = load_profile_json(profile_file)
 
From f39fea8e237f4a9c324daf4a8c5da782f9e47be8 Mon Sep 17 00:00:00 2001
From: Jian Chen
Date: Mon, 6 Jan 2025 12:05:05 -0800
Subject: [PATCH 04/10] Reformat code

---
 .../python/tools/transformers/profile_result_processor.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/onnxruntime/python/tools/transformers/profile_result_processor.py b/onnxruntime/python/tools/transformers/profile_result_processor.py
index 185f9306f4c91..8de073175a348 100644
--- a/onnxruntime/python/tools/transformers/profile_result_processor.py
+++ b/onnxruntime/python/tools/transformers/profile_result_processor.py
@@ -75,7 +75,6 @@ def parse_kernel_results(sess_time, threshold=0):
 
     Args:
         sess_time (List[Dict]): profile data
-        kernel_time_only (bool, optional): Only include items for kernel time. Defaults to False.
         threshold (int, optional): Minimum ratio of duration among all. Defaults to 0.
 
     Returns:
@@ -119,7 +118,7 @@ def parse_kernel_results(sess_time, threshold=0):
 
     # Output items with run time ratio > thresholds, and sorted by duration in the descending order.
     lines = []
-    lines.append(f"\nTop expensive kernels with Time% >= {threshold*100:.2f}:")
+    lines.append(f"\nTop expensive kernels with Time% >= {threshold * 100:.2f}:")
     lines.append("-" * 64)
     lines.append("Total(μs)\tTime%\tCalls\tAvg(μs)\tKernel")
     for kernel_name, duration in sorted(kernel_time.items(), key=lambda x: x[1], reverse=True):
@@ -220,7 +219,7 @@ def parse_node_results(sess_time, kernel_time_only=False, threshold=0):
     )
 
     # Output items with run time ratio > thresholds, and sorted by duration in the descending order.
-    lines.append(f"\nTop expensive nodes with Time% >= {threshold*100:.2f}:")
+    lines.append(f"\nTop expensive nodes with Time% >= {threshold * 100:.2f}:")
     lines.append("-" * 64)
     lines.append("Total(μs)\tTime%\tAvg(μs)\tCalls\tProvider\tNode")
     for node_name, duration in sorted(node_time.items(), key=lambda x: x[1], reverse=True):
@@ -242,8 +241,6 @@ def group_node_results(sess_time):
 
     Args:
         sess_time (List[Dict]): profile data
-        kernel_time_only (bool): Only include items for kernel time.
-        use_gpu (bool): GPU is used in profiling or not.
 
     Returns:
         List[str]: lines of string for output.
@@ -337,7 +334,6 @@ def group_node_results(sess_time):
 
     return lines
 
-
 def process_results(profile_file, args):
     profile_records = load_profile_json(profile_file)
 

From 179620d0113d8feb2b94174745174af4ee23d768 Mon Sep 17 00:00:00 2001
From: Jian Chen
Date: Mon, 6 Jan 2025 12:05:48 -0800
Subject: [PATCH 05/10] change import from

---
 onnxruntime/python/tools/transformers/profiler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onnxruntime/python/tools/transformers/profiler.py b/onnxruntime/python/tools/transformers/profiler.py
index cf9ef6c123691..2dd600f678535 100644
--- a/onnxruntime/python/tools/transformers/profiler.py
+++ b/onnxruntime/python/tools/transformers/profiler.py
@@ -427,7 +427,7 @@ def run(args):
         profile_file = run(arguments)
     else:
         profile_file = arguments.input
-    import profile_result_processor import process_results
+    from profile_result_processor import process_results
     results = process_results(profile_file, arguments)
     for line in results:
         print(line)
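For context on the PATCH 05 fix: the statement introduced by PATCH 01 fused two import statements into one line, which is not valid Python, so profiler.py would have raised a SyntaxError before reaching the result processing step. A minimal illustration of the forms involved (assuming profile_result_processor.py is on the import path):

    # import profile_result_processor import process_results  # invalid: SyntaxError
    from profile_result_processor import process_results       # valid: binds the function directly
    import profile_result_processor                            # also valid: call profile_result_processor.process_results(...)
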
From 91c83bdc70d23e9e947decd7a981b03675d693f4 Mon Sep 17 00:00:00 2001
From: Jian Chen
Date: Mon, 6 Jan 2025 12:08:38 -0800
Subject: [PATCH 06/10] remove unused argument basic_optimization

---
 .../python/tools/transformers/profile_result_processor.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/onnxruntime/python/tools/transformers/profile_result_processor.py b/onnxruntime/python/tools/transformers/profile_result_processor.py
index 8de073175a348..ba49c9c19820b 100644
--- a/onnxruntime/python/tools/transformers/profile_result_processor.py
+++ b/onnxruntime/python/tools/transformers/profile_result_processor.py
@@ -37,14 +37,6 @@ def parse_arguments(argv=None):
         help="Execution provider to use",
     )
 
-    parser.add_argument(
-        "--basic_optimization",
-        required=False,
-        action="store_true",
-        help="Enable only basic graph optimizations. By default, all optimizations are enabled in OnnxRuntime",
-    )
-    parser.set_defaults(basic_optimization=False)
-
     parser.add_argument(
         "--kernel_time_only",
         required=False,

From dfcb245d545d9e125237b2bfc68ae8915722eff8 Mon Sep 17 00:00:00 2001
From: Jian Chen
Date: Mon, 6 Jan 2025 12:15:00 -0800
Subject: [PATCH 07/10] Apply suggestions from code review

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 onnxruntime/python/tools/transformers/profiler.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/onnxruntime/python/tools/transformers/profiler.py b/onnxruntime/python/tools/transformers/profiler.py
index 2dd600f678535..fa0881099e4c3 100644
--- a/onnxruntime/python/tools/transformers/profiler.py
+++ b/onnxruntime/python/tools/transformers/profiler.py
@@ -428,6 +428,7 @@ def run(args):
         profile_file = run(arguments)
     else:
         profile_file = arguments.input
     from profile_result_processor import process_results
+
     results = process_results(profile_file, arguments)
     for line in results:

From 1c6b3d8b3473b71ced2c8b926bfa40fcfac789f4 Mon Sep 17 00:00:00 2001
From: Jian Chen
Date: Mon, 6 Jan 2025 12:36:42 -0800
Subject: [PATCH 08/10] lintrunner -a

---
 onnxruntime/python/tools/transformers/profiler.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/onnxruntime/python/tools/transformers/profiler.py b/onnxruntime/python/tools/transformers/profiler.py
index 2dd600f678535..71249a1121c60 100644
--- a/onnxruntime/python/tools/transformers/profiler.py
+++ b/onnxruntime/python/tools/transformers/profiler.py
@@ -13,8 +13,6 @@
     python profiler.py --input profile_2021-10-25_12-02-41.json
 """
 
-NODES_TYPE_CONTAINING_SUBGRAPH = ["Scan", "Loop", "If"]
-
 
 def parse_arguments(argv=None):
     parser = argparse.ArgumentParser()
@@ -174,6 +172,7 @@ def run_profile(onnx_model_path, use_gpu, provider, basic_optimization, thread_n
     profile_file = session.end_profiling()
     return profile_file
 
+
 def get_dim_from_type_proto(dim):
     return getattr(dim, dim.WhichOneof("value")) if type(dim.WhichOneof("value")) == str else None  # noqa: E721
 
@@ -428,6 +427,7 @@ def run(args):
         profile_file = run(arguments)
     else:
         profile_file = arguments.input
     from profile_result_processor import process_results
+
     results = process_results(profile_file, arguments)
     for line in results:
From dd995113c4c72ab6b8ddec83d00b91d9db6e1222 Mon Sep 17 00:00:00 2001
From: Jian Chen
Date: Mon, 6 Jan 2025 12:40:11 -0800
Subject: [PATCH 09/10] apply changes for: docstring should be the first lines

Adding Copyright notice
Prepend _ for private constants
Use a frozenset for quick lookup.
---
 .../tools/transformers/profile_result_processor.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/onnxruntime/python/tools/transformers/profile_result_processor.py b/onnxruntime/python/tools/transformers/profile_result_processor.py
index ba49c9c19820b..e5fbc7af3e0fe 100644
--- a/onnxruntime/python/tools/transformers/profile_result_processor.py
+++ b/onnxruntime/python/tools/transformers/profile_result_processor.py
@@ -1,13 +1,17 @@
-import argparse
-import json
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
 
-"""
-This profiler result processor prints out the kernel time spent on each Node of the model.
+"""This profiler result processor prints out the kernel time spent on each Node of the model.
 Example of importing profile result file from onnxruntime_perf_test:
     python profile_result_processor.py --input profile_2021-10-25_12-02-41.json
 """
 
-NODES_TYPE_CONTAINING_SUBGRAPH = ["Scan", "Loop", "If"]
+import argparse
+import json
+
+_NODES_TYPE_CONTAINING_SUBGRAPH = frozenset(("Scan", "Loop", "If"))
 
 
 def parse_arguments(argv=None):

From 9c1beadded0b3939a5f7a310586aa1cb8a1c0d14 Mon Sep 17 00:00:00 2001
From: Jian Chen
Date: Mon, 6 Jan 2025 12:42:30 -0800
Subject: [PATCH 10/10] _NODES_TYPE_CONTAINING_SUBGRAPH

---
 .../python/tools/transformers/profile_result_processor.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/onnxruntime/python/tools/transformers/profile_result_processor.py b/onnxruntime/python/tools/transformers/profile_result_processor.py
index e5fbc7af3e0fe..2a17d41774a42 100644
--- a/onnxruntime/python/tools/transformers/profile_result_processor.py
+++ b/onnxruntime/python/tools/transformers/profile_result_processor.py
@@ -92,7 +92,7 @@ def parse_kernel_results(sess_time, threshold=0):
             kernel_name = item["name"]
 
             op_name = item["args"]["op_name"]
-            if op_name in NODES_TYPE_CONTAINING_SUBGRAPH:
+            if op_name in _NODES_TYPE_CONTAINING_SUBGRAPH:
                 continue
 
             # Handle MemcpyHostToDevice and MemcpyDeviceToHost here
@@ -183,7 +183,7 @@ def parse_node_results(sess_time, kernel_time_only=False, threshold=0):
                 continue
 
             op_name = item["args"]["op_name"]
-            if op_name in NODES_TYPE_CONTAINING_SUBGRAPH:
+            if op_name in _NODES_TYPE_CONTAINING_SUBGRAPH:
                 continue
 
             if node_name in node_time:
@@ -258,7 +258,7 @@ def group_node_results(sess_time):
             op_name = item["args"]["op_name"]
 
             # TODO: shall we have a separated group for nodes with subgraph?
-            if op_name in _NODES_TYPE_CONTAINING_SUBGRAPH:
+            if op_name in _NODES_TYPE_CONTAINING_SUBGRAPH:
                 continue
 
             if "provider" not in item["args"]:
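End-to-end usage after the series (a sketch, not part of the patches): the processor runs standalone exactly as its docstring advertises,

    python profile_result_processor.py --input profile_2021-10-25_12-02-41.json --threshold 0.01

and if no profile file is at hand, one way to produce it is the standard onnxruntime profiling switch that the retained run_profile code in profiler.py also relies on. The model path and input feed below are placeholders, not taken from the patches:

    import numpy
    import onnxruntime

    sess_options = onnxruntime.SessionOptions()
    sess_options.enable_profiling = True  # emit a Chrome-trace style JSON file
    session = onnxruntime.InferenceSession("model.onnx", sess_options, providers=["CPUExecutionProvider"])
    session.run(None, {"input": numpy.zeros((1, 3), dtype=numpy.float32)})  # feed must match the model inputs
    profile_file = session.end_profiling()  # path of the JSON to pass via --input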