From c117213a7b5c7f78d4048e020c1c12f43ea21ca7 Mon Sep 17 00:00:00 2001
From: Jian Chen
Date: Sat, 4 Jan 2025 10:39:03 -0800
Subject: [PATCH 01/10] Separating result processor out from profiler.py
 without changing the behavior of the current profiler.py

---
 .../transformers/profile_result_processor.py | 363 ++++++++++++++++++
 .../python/tools/transformers/profiler.py    | 293 +-------------
 2 files changed, 364 insertions(+), 292 deletions(-)
 create mode 100644 onnxruntime/python/tools/transformers/profile_result_processor.py

diff --git a/onnxruntime/python/tools/transformers/profile_result_processor.py b/onnxruntime/python/tools/transformers/profile_result_processor.py
new file mode 100644
index 0000000000000..4e06e4c4948fd
--- /dev/null
+++ b/onnxruntime/python/tools/transformers/profile_result_processor.py
@@ -0,0 +1,363 @@
+import argparse
+import json
+
+"""
+This profiler result processor prints out the kernel time spent on each Node of the model.
+Example of importing profile result file from onnxruntime_perf_test:
+    python profile_result_processor.py --input profile_2021-10-25_12-02-41.json
+"""
+
+NODES_TYPE_CONTAINING_SUBGRAPH = ["Scan", "Loop", "If"]
+
+
+def parse_arguments(argv=None):
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "-i",
+        "--input",
+        required=False,
+        type=str,
+        help="Set the input file for reading the profile results",
+    )
+
+    parser.add_argument(
+        "--threshold",
+        required=False,
+        type=float,
+        default=0.01,
+        help="Threshold of run time ratio among all nodes. Nodes with larger ratio will show in top expensive nodes.",
+    )
+
+    parser.add_argument(
+        "--provider",
+        required=False,
+        type=str,
+        default="cuda",
+        help="Execution provider to use",
+    )
+
+    parser.add_argument(
+        "--basic_optimization",
+        required=False,
+        action="store_true",
+        help="Enable only basic graph optimizations. By default, all optimizations are enabled in OnnxRuntime",
+    )
+    parser.set_defaults(basic_optimization=False)
+
+    parser.add_argument(
+        "--kernel_time_only",
+        required=False,
+        action="store_true",
+        help="Only include the kernel time and no fence time",
+    )
+
+    parser.set_defaults(kernel_time_only=False)
+
+    parser.add_argument("-v", "--verbose", required=False, action="store_true")
+    parser.set_defaults(verbose=False)
+
+    return parser.parse_args(argv)
+
+
+def load_profile_json(profile_file):
+    print(f"loading profile output {profile_file} ...")
+
+    with open(profile_file) as opened_file:
+        sess_time = json.load(opened_file)
+
+    assert isinstance(sess_time, list)
+    return sess_time
+
+
+def parse_kernel_results(sess_time, threshold=0):
+    """Parse profile data and output nodes in two sections - nodes in the original order, and top expensive nodes.
+
+    Args:
+        sess_time (List[Dict]): profile data
+        kernel_time_only (bool, optional): Only include items for kernel time. Defaults to False.
+        threshold (int, optional): Minimum ratio of duration among all. Defaults to 0.
+
+    Returns:
+        List[str]: lines of string for output.
+    """
+    kernel_name_to_op_name = {}
+    kernel_time = {}
+    kernel_freq = {}
+    total = 0
+    session_init = False
+    for item in sess_time:
+        # Skip all MemcpyHostToDevice before session_initialization
+        if item["cat"] == "Session" and item["name"] == "session_initialization":
+            session_init = True
+        if not session_init:
+            continue
+
+        if item["cat"] == "Kernel" and "dur" in item and "args" in item and "op_name" in item["args"]:
+            kernel_name = item["name"]
+
+            op_name = item["args"]["op_name"]
+            if op_name in NODES_TYPE_CONTAINING_SUBGRAPH:
+                continue
+
+            # Handle MemcpyHostToDevice and MemcpyDeviceToHost here
+            if not op_name:
+                op_name = f"({kernel_name})"
+
+            if kernel_name in kernel_time:
+                kernel_time[kernel_name] += item["dur"]
+                kernel_freq[kernel_name] += 1
+            else:
+                kernel_time[kernel_name] = item["dur"]
+                kernel_freq[kernel_name] = 1
+                kernel_name_to_op_name[kernel_name] = op_name
+
+            total += item["dur"]
+
+    if not kernel_time:
+        return ["No kernel record found!"]
+
+    # Output items with run time ratio > thresholds, and sorted by duration in the descending order.
+    lines = []
+    lines.append(f"\nTop expensive kernels with Time% >= {threshold*100:.2f}:")
+    lines.append("-" * 64)
+    lines.append("Total(μs)\tTime%\tCalls\tAvg(μs)\tKernel")
+    for kernel_name, duration in sorted(kernel_time.items(), key=lambda x: x[1], reverse=True):
+        ratio = duration / total
+        if ratio < threshold:
+            continue
+
+        calls = kernel_freq[kernel_name]
+        avg_time = duration / float(calls)
+        lines.append(f"{duration:10d}\t{ratio * 100.0:5.2f}\t{calls:5d}\t{avg_time:8.1f}\t{kernel_name}")
+
+    # Group by operator
+    op_time = {}
+    for kernel_name, op_name in kernel_name_to_op_name.items():
+        duration = kernel_time[kernel_name]
+        if op_name in op_time:
+            op_time[op_name] += duration
+        else:
+            op_time[op_name] = duration
+
+    lines.append("\nGroup kernel time by operator:")
+    lines.append("-" * 64)
+    lines.append("Total(μs)\tTime%\tOperator")
+    for op_name, duration in sorted(op_time.items(), key=lambda x: x[1], reverse=True):
+        ratio = duration / total
+        lines.append(f"{duration:10d}\t{ratio * 100.0:5.2f}\t{op_name}")
+
+    return lines
+
+
+def parse_node_results(sess_time, kernel_time_only=False, threshold=0):
+    """Parse profile data and output nodes in two sections - nodes in the original order, and top expensive nodes.
+
+    Args:
+        sess_time (List[Dict]): profile data
+        kernel_time_only (bool, optional): Only include items for kernel time. Defaults to False.
+        threshold (int, optional): Minimum ratio of duration among all. Defaults to 0.
+
+    Returns:
+        List[str]: lines of string for output.
+    """
+    node_name_list = []
+    node_time = {}
+    node_freq = {}
+    node_provider = {}
+    total = 0
+    for item in sess_time:
+        if item["cat"] == "Node" and "dur" in item and "args" in item and "op_name" in item["args"]:
+            node_name = (
+                item["name"].replace("_kernel_time", "").replace("_fence_before", "").replace("_fence_after", "")
+            )
+
+            if "provider" in item["args"]:
+                if item["args"]["provider"] == "CPUExecutionProvider":
+                    device = "CPU"
+                elif item["args"]["provider"] == "CUDAExecutionProvider":
+                    device = "CUDA"
+                elif item["args"]["provider"] == "DmlExecutionProvider":
+                    device = "DML"
+
+                if node_name not in node_provider:
+                    node_provider[node_name] = device
+                else:
+                    assert node_provider[node_name] == device
+            elif kernel_time_only:
+                continue
+
+            op_name = item["args"]["op_name"]
+            if op_name in NODES_TYPE_CONTAINING_SUBGRAPH:
+                continue
+
+            if node_name in node_time:
+                node_time[node_name] += item["dur"]
+                node_freq[node_name] += 1
+            else:
+                node_time[node_name] = item["dur"]
+                node_freq[node_name] = 1
+                node_name_list.append(node_name)
+
+            total += item["dur"]
+
+    # Output items in the original order.
+    lines = [
+        "\nNodes in the original order:",
+        "-" * 64,
+        "Total(μs)\tTime%\tAcc %\tAvg(μs)\tCalls\tProvider\tNode",
+    ]
+    before_percentage = 0.0
+    for node_name in node_name_list:
+        duration = node_time[node_name]
+        calls = node_freq[node_name]
+        avg_time = duration / float(calls)
+        percentage = (duration / total) * 100.0
+        provider = node_provider.get(node_name, "")
+        before_percentage += percentage
+        lines.append(
+            f"{duration:10d}\t{percentage:5.2f}\t{before_percentage:5.2f}\t{avg_time:8.1f}\t{calls:5d}\t{provider:8s}\t{node_name}"
+        )
+
+    # Output items with run time ratio > thresholds, and sorted by duration in the descending order.
+    lines.append(f"\nTop expensive nodes with Time% >= {threshold*100:.2f}:")
+    lines.append("-" * 64)
+    lines.append("Total(μs)\tTime%\tAvg(μs)\tCalls\tProvider\tNode")
+    for node_name, duration in sorted(node_time.items(), key=lambda x: x[1], reverse=True):
+        ratio = duration / total
+        if ratio < threshold:
+            continue
+
+        calls = node_freq[node_name]
+        avg_time = duration / float(calls)
+        percentage = (duration / total) * 100.0
+        provider = node_provider.get(node_name, "")
+        lines.append(f"{duration:10d}\t{percentage:5.2f}\t{avg_time:8.1f}\t{calls:5d}\t{provider:8s}\t{node_name}")
+
+    return lines
+
+
+def group_node_results(sess_time):
+    """Group results by operator name.
+
+    Args:
+        sess_time (List[Dict]): profile data
+        kernel_time_only (bool): Only include items for kernel time.
+        use_gpu (bool): GPU is used in profiling or not.
+
+    Returns:
+        List[str]: lines of string for output.
+    """
+    op_kernel_time = {}
+    op_kernel_records = {}
+    total_kernel_time = 0
+
+    provider_op_kernel_time = {}
+    provider_op_kernel_records = {}
+    provider_kernel_time = {}
+
+    op_fence_time = {}
+    total_fence_time = 0
+
+    provider_counter = {}
+    for item in sess_time:
+        if item["cat"] == "Node" and "dur" in item and "args" in item and "op_name" in item["args"]:
+            op_name = item["args"]["op_name"]
+
+            # TODO: shall we have a separated group for nodes with subgraph?
+            if op_name in NODES_TYPE_CONTAINING_SUBGRAPH:
+                continue
+
+            if "provider" not in item["args"]:
+                if "fence" in item["name"]:
+                    if op_name in op_fence_time:
+                        op_fence_time[op_name] += item["dur"]
+                    else:
+                        op_fence_time[op_name] = item["dur"]
+                    total_fence_time += item["dur"]
+                continue
+
+            provider = item["args"].get("provider", "")
+            if provider in provider_counter:
+                provider_counter[provider] += 1
+            else:
+                provider_counter[provider] = 1
+
+            key = f"{provider}:{op_name}"
+            if key in provider_op_kernel_time:
+                provider_op_kernel_time[key] += item["dur"]
+                provider_op_kernel_records[key] += 1
+            else:
+                provider_op_kernel_time[key] = item["dur"]
+                provider_op_kernel_records[key] = 1
+
+            if provider in provider_kernel_time:
+                provider_kernel_time[provider] += item["dur"]
+            else:
+                provider_kernel_time[provider] = item["dur"]
+
+            if op_name in op_kernel_time:
+                op_kernel_time[op_name] += item["dur"]
+                op_kernel_records[op_name] += 1
+            else:
+                op_kernel_time[op_name] = item["dur"]
+                op_kernel_records[op_name] = 1
+
+            total_kernel_time += item["dur"]
+
+    lines = ["", "Grouped by operator"]
+    lines.append("-" * 64)
+    lines.append("Total(μs)\tTime%\tKernel(μs)\tKernel%\tCalls\tAvgKernel(μs)\tFence(μs)\tOperator")
+    for op_name, kernel_time in sorted(op_kernel_time.items(), key=lambda x: x[1], reverse=True):
+        fence_time = op_fence_time.get(op_name, 0)
+        kernel_time_ratio = kernel_time / total_kernel_time
+        total_time = kernel_time + fence_time
+        time_ratio = total_time / (total_kernel_time + total_fence_time)
+        kernel_calls = op_kernel_records[op_name]
+        avg_kernel_time = kernel_time / kernel_calls
+        lines.append(
+            f"{total_time:10d}\t{time_ratio * 100.0:5.2f}\t{kernel_time:11d}\t{kernel_time_ratio * 100.0:5.2f}\t{kernel_calls:5d}\t{avg_kernel_time:14.1f}\t{fence_time:10d}\t{op_name}"
+        )
+
+    lines += ["", "Grouped by provider + operator"]
+    lines.append("-" * 64)
+    lines.append("Kernel(μs)\tProvider%\tCalls\tAvgKernel(μs)\tProvider\tOperator")
+    for key, kernel_time in sorted(provider_op_kernel_time.items(), key=lambda x: x[1], reverse=True):
+        parts = key.split(":")
+        provider = parts[0]
+        op_name = parts[1]
+        short_ep = provider.replace("ExecutionProvider", "")
+        calls = provider_op_kernel_records[key]
+        avg_kernel_time = kernel_time / calls
+        provider_time_ratio = kernel_time / provider_kernel_time[provider]
+        lines.append(
+            f"{kernel_time:10d}\t{provider_time_ratio * 100.0:9.2f}\t{calls:5d}\t{avg_kernel_time:14.1f}\t{short_ep:8s}\t{op_name}"
+        )
+
+    return lines
+
+def process_results(profile_file, args):
+    profile_records = load_profile_json(profile_file)
+
+    lines = parse_kernel_results(profile_records, args.threshold)
+
+    lines += parse_node_results(profile_records, args.kernel_time_only, args.threshold)
+
+    lines += group_node_results(profile_records)
+
+    return lines
+
+if __name__ == "__main__":
+    arguments = parse_arguments()
+    print("Arguments", arguments)
+
+    from benchmark_helper import setup_logger
+
+    setup_logger(arguments.verbose)
+
+    profile_file = arguments.input
+
+    results = process_results(profile_file, arguments)
+
+    for line in results:
+        print(line)
\ No newline at end of file
diff --git a/onnxruntime/python/tools/transformers/profiler.py b/onnxruntime/python/tools/transformers/profiler.py
index 2306b579f92fe..cf9ef6c123691 100644
--- a/onnxruntime/python/tools/transformers/profiler.py
+++ b/onnxruntime/python/tools/transformers/profiler.py
@@ -1,5 +1,4 @@
 import argparse
-import json
 import os
 
 import numpy
@@ -175,284 +174,6 @@ def run_profile(onnx_model_path, use_gpu, provider, basic_optimization, thread_n
     profile_file = session.end_profiling()
     return profile_file
 
-
-def load_profile_json(profile_file):
-    print(f"loading profile output {profile_file} ...")
-
-    with open(profile_file) as opened_file:
-        sess_time = json.load(opened_file)
-
-    assert isinstance(sess_time, list)
-    return sess_time
-
-
-def parse_kernel_results(sess_time, threshold=0):
-    """Parse profile data and output nodes in two sections - nodes in the original order, and top expensive nodes.
-
-    Args:
-        sess_time (List[Dict]): profile data
-        kernel_time_only (bool, optional): Only include items for kernel time. Defaults to False.
-        threshold (int, optional): Minimum ratio of duration among all. Defaults to 0.
-
-    Returns:
-        List[str]: lines of string for output.
-    """
-    kernel_name_to_op_name = {}
-    kernel_time = {}
-    kernel_freq = {}
-    total = 0
-    session_init = False
-    for item in sess_time:
-        # Skip all MemcpyHostToDevice before session_initialization
-        if item["cat"] == "Session" and item["name"] == "session_initialization":
-            session_init = True
-        if not session_init:
-            continue
-
-        if item["cat"] == "Kernel" and "dur" in item and "args" in item and "op_name" in item["args"]:
-            kernel_name = item["name"]
-
-            op_name = item["args"]["op_name"]
-            if op_name in NODES_TYPE_CONTAINING_SUBGRAPH:
-                continue
-
-            # Handle MemcpyHostToDevice and MemcpyDeviceToHost here
-            if not op_name:
-                op_name = f"({kernel_name})"
-
-            if kernel_name in kernel_time:
-                kernel_time[kernel_name] += item["dur"]
-                kernel_freq[kernel_name] += 1
-            else:
-                kernel_time[kernel_name] = item["dur"]
-                kernel_freq[kernel_name] = 1
-                kernel_name_to_op_name[kernel_name] = op_name
-
-            total += item["dur"]
-
-    if not kernel_time:
-        return ["No kernel record found!"]
-
-    # Output items with run time ratio > thresholds, and sorted by duration in the descending order.
-    lines = []
-    lines.append(f"\nTop expensive kernels with Time% >= {threshold*100:.2f}:")
-    lines.append("-" * 64)
-    lines.append("Total(μs)\tTime%\tCalls\tAvg(μs)\tKernel")
-    for kernel_name, duration in sorted(kernel_time.items(), key=lambda x: x[1], reverse=True):
-        ratio = duration / total
-        if ratio < threshold:
-            continue
-
-        calls = kernel_freq[kernel_name]
-        avg_time = duration / float(calls)
-        lines.append(f"{duration:10d}\t{ratio * 100.0:5.2f}\t{calls:5d}\t{avg_time:8.1f}\t{kernel_name}")
-
-    # Group by operator
-    op_time = {}
-    for kernel_name, op_name in kernel_name_to_op_name.items():
-        duration = kernel_time[kernel_name]
-        if op_name in op_time:
-            op_time[op_name] += duration
-        else:
-            op_time[op_name] = duration
-
-    lines.append("\nGroup kernel time by operator:")
-    lines.append("-" * 64)
-    lines.append("Total(μs)\tTime%\tOperator")
-    for op_name, duration in sorted(op_time.items(), key=lambda x: x[1], reverse=True):
-        ratio = duration / total
-        lines.append(f"{duration:10d}\t{ratio * 100.0:5.2f}\t{op_name}")
-
-    return lines
-
-
-def parse_node_results(sess_time, kernel_time_only=False, threshold=0):
-    """Parse profile data and output nodes in two sections - nodes in the original order, and top expensive nodes.
-
-    Args:
-        sess_time (List[Dict]): profile data
-        kernel_time_only (bool, optional): Only include items for kernel time. Defaults to False.
-        threshold (int, optional): Minimum ratio of duration among all. Defaults to 0.
-
-    Returns:
-        List[str]: lines of string for output.
-    """
-    node_name_list = []
-    node_time = {}
-    node_freq = {}
-    node_provider = {}
-    total = 0
-    for item in sess_time:
-        if item["cat"] == "Node" and "dur" in item and "args" in item and "op_name" in item["args"]:
-            node_name = (
-                item["name"].replace("_kernel_time", "").replace("_fence_before", "").replace("_fence_after", "")
-            )
-
-            if "provider" in item["args"]:
-                if item["args"]["provider"] == "CPUExecutionProvider":
-                    device = "CPU"
-                elif item["args"]["provider"] == "CUDAExecutionProvider":
-                    device = "CUDA"
-                elif item["args"]["provider"] == "DmlExecutionProvider":
-                    device = "DML"
-
-                if node_name not in node_provider:
-                    node_provider[node_name] = device
-                else:
-                    assert node_provider[node_name] == device
-            elif kernel_time_only:
-                continue
-
-            op_name = item["args"]["op_name"]
-            if op_name in NODES_TYPE_CONTAINING_SUBGRAPH:
-                continue
-
-            if node_name in node_time:
-                node_time[node_name] += item["dur"]
-                node_freq[node_name] += 1
-            else:
-                node_time[node_name] = item["dur"]
-                node_freq[node_name] = 1
-                node_name_list.append(node_name)
-
-            total += item["dur"]
-
-    # Output items in the original order.
-    lines = [
-        "\nNodes in the original order:",
-        "-" * 64,
-        "Total(μs)\tTime%\tAcc %\tAvg(μs)\tCalls\tProvider\tNode",
-    ]
-    before_percentage = 0.0
-    for node_name in node_name_list:
-        duration = node_time[node_name]
-        calls = node_freq[node_name]
-        avg_time = duration / float(calls)
-        percentage = (duration / total) * 100.0
-        provider = node_provider.get(node_name, "")
-        before_percentage += percentage
-        lines.append(
-            f"{duration:10d}\t{percentage:5.2f}\t{before_percentage:5.2f}\t{avg_time:8.1f}\t{calls:5d}\t{provider:8s}\t{node_name}"
-        )
-
-    # Output items with run time ratio > thresholds, and sorted by duration in the descending order.
-    lines.append(f"\nTop expensive nodes with Time% >= {threshold*100:.2f}:")
-    lines.append("-" * 64)
-    lines.append("Total(μs)\tTime%\tAvg(μs)\tCalls\tProvider\tNode")
-    for node_name, duration in sorted(node_time.items(), key=lambda x: x[1], reverse=True):
-        ratio = duration / total
-        if ratio < threshold:
-            continue
-
-        calls = node_freq[node_name]
-        avg_time = duration / float(calls)
-        percentage = (duration / total) * 100.0
-        provider = node_provider.get(node_name, "")
-        lines.append(f"{duration:10d}\t{percentage:5.2f}\t{avg_time:8.1f}\t{calls:5d}\t{provider:8s}\t{node_name}")
-
-    return lines
-
-
-def group_node_results(sess_time, kernel_time_only, use_gpu):
-    """Group results by operator name.
-
-    Args:
-        sess_time (List[Dict]): profile data
-        kernel_time_only (bool): Only include items for kernel time.
-        use_gpu (bool): GPU is used in profiling or not.
-
-    Returns:
-        List[str]: lines of string for output.
-    """
-    op_kernel_time = {}
-    op_kernel_records = {}
-    total_kernel_time = 0
-
-    provider_op_kernel_time = {}
-    provider_op_kernel_records = {}
-    provider_kernel_time = {}
-
-    op_fence_time = {}
-    total_fence_time = 0
-
-    provider_counter = {}
-    for item in sess_time:
-        if item["cat"] == "Node" and "dur" in item and "args" in item and "op_name" in item["args"]:
-            op_name = item["args"]["op_name"]
-
-            # TODO: shall we have a separated group for nodes with subgraph?
-            if op_name in NODES_TYPE_CONTAINING_SUBGRAPH:
-                continue
-
-            if "provider" not in item["args"]:
-                if "fence" in item["name"]:
-                    if op_name in op_fence_time:
-                        op_fence_time[op_name] += item["dur"]
-                    else:
-                        op_fence_time[op_name] = item["dur"]
-                    total_fence_time += item["dur"]
-                continue
-
-            provider = item["args"].get("provider", "")
-            if provider in provider_counter:
-                provider_counter[provider] += 1
-            else:
-                provider_counter[provider] = 1
-
-            key = f"{provider}:{op_name}"
-            if key in provider_op_kernel_time:
-                provider_op_kernel_time[key] += item["dur"]
-                provider_op_kernel_records[key] += 1
-            else:
-                provider_op_kernel_time[key] = item["dur"]
-                provider_op_kernel_records[key] = 1
-
-            if provider in provider_kernel_time:
-                provider_kernel_time[provider] += item["dur"]
-            else:
-                provider_kernel_time[provider] = item["dur"]
-
-            if op_name in op_kernel_time:
-                op_kernel_time[op_name] += item["dur"]
-                op_kernel_records[op_name] += 1
-            else:
-                op_kernel_time[op_name] = item["dur"]
-                op_kernel_records[op_name] = 1
-
-            total_kernel_time += item["dur"]
-
-    lines = ["", "Grouped by operator"]
-    lines.append("-" * 64)
-    lines.append("Total(μs)\tTime%\tKernel(μs)\tKernel%\tCalls\tAvgKernel(μs)\tFence(μs)\tOperator")
-    for op_name, kernel_time in sorted(op_kernel_time.items(), key=lambda x: x[1], reverse=True):
-        fence_time = op_fence_time.get(op_name, 0)
-        kernel_time_ratio = kernel_time / total_kernel_time
-        total_time = kernel_time + fence_time
-        time_ratio = total_time / (total_kernel_time + total_fence_time)
-        kernel_calls = op_kernel_records[op_name]
-        avg_kernel_time = kernel_time / kernel_calls
-        lines.append(
-            f"{total_time:10d}\t{time_ratio * 100.0:5.2f}\t{kernel_time:11d}\t{kernel_time_ratio * 100.0:5.2f}\t{kernel_calls:5d}\t{avg_kernel_time:14.1f}\t{fence_time:10d}\t{op_name}"
-        )
-
-    lines += ["", "Grouped by provider + operator"]
-    lines.append("-" * 64)
-    lines.append("Kernel(μs)\tProvider%\tCalls\tAvgKernel(μs)\tProvider\tOperator")
-    for key, kernel_time in sorted(provider_op_kernel_time.items(), key=lambda x: x[1], reverse=True):
-        parts = key.split(":")
-        provider = parts[0]
-        op_name = parts[1]
-        short_ep = provider.replace("ExecutionProvider", "")
-        calls = provider_op_kernel_records[key]
-        avg_kernel_time = kernel_time / calls
-        provider_time_ratio = kernel_time / provider_kernel_time[provider]
-        lines.append(
-            f"{kernel_time:10d}\t{provider_time_ratio * 100.0:9.2f}\t{calls:5d}\t{avg_kernel_time:14.1f}\t{short_ep:8s}\t{op_name}"
-        )
-
-    return lines
-
-
 def get_dim_from_type_proto(dim):
     return getattr(dim, dim.WhichOneof("value")) if type(dim.WhichOneof("value")) == str else None  # noqa: E721
 
@@ -639,18 +360,6 @@ def create_longformer_inputs(onnx_model, batch_size, sequence_length, global_len
     return all_inputs
 
 
-def process_results(profile_file, args):
-    profile_records = load_profile_json(profile_file)
-
-    lines = parse_kernel_results(profile_records, args.threshold)
-
-    lines += parse_node_results(profile_records, args.kernel_time_only, args.threshold)
-
-    lines += group_node_results(profile_records, args.kernel_time_only, args.use_gpu)
-
-    return lines
-
-
 def run(args):
     num_threads = args.thread_num if args.thread_num > 0 else psutil.cpu_count(logical=False)
 
@@ -718,7 +427,7 @@ def run(args):
         profile_file = run(arguments)
     else:
         profile_file = arguments.input
-
+    import profile_result_processor import process_results
     results = process_results(profile_file, arguments)
     for line in results:
         print(line)
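A note on what PATCH 01 enables (illustrative; the caller below is hypothetical and not part of the series): because parse_arguments, load_profile_json and process_results move into profile_result_processor.py intact, the report generation can now be driven from another Python script without running a model, given a profile JSON produced by onnxruntime_perf_test or profiler.py:

    # hypothetical caller, not included in these patches
    from profile_result_processor import parse_arguments, process_results

    # parse_arguments() accepts an argv list, so the options namespace can be built in code.
    args = parse_arguments(["--input", "profile_2021-10-25_12-02-41.json", "--threshold", "0.02"])

    # process_results() loads the JSON and returns the report as a list of strings.
    for line in process_results(args.input, args):
        print(line)
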
From 0509b709dcc10e3da3157d9af96d9ac51b264c6e Mon Sep 17 00:00:00 2001
From: Jian Chen
Date: Sun, 5 Jan 2025 16:13:46 -0800
Subject: [PATCH 02/10] Separating result processor out from profiler.py
 without changing the behavior of the current profiler.py

---
 .../python/tools/transformers/profile_result_processor.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/onnxruntime/python/tools/transformers/profile_result_processor.py b/onnxruntime/python/tools/transformers/profile_result_processor.py
index 4e06e4c4948fd..df07f80be475d 100644
--- a/onnxruntime/python/tools/transformers/profile_result_processor.py
+++ b/onnxruntime/python/tools/transformers/profile_result_processor.py
@@ -336,6 +336,7 @@ def group_node_results(sess_time):
 
     return lines
 
+
 def process_results(profile_file, args):
     profile_records = load_profile_json(profile_file)
 
@@ -347,6 +348,7 @@ def process_results(profile_file, args):
 
     return lines
 
+
 if __name__ == "__main__":
     arguments = parse_arguments()
     print("Arguments", arguments)
@@ -360,4 +362,4 @@ def process_results(profile_file, args):
     profile_file = arguments.input
 
     results = process_results(profile_file, arguments)
 
     for line in results:
-        print(line)
\ No newline at end of file
+        print(line)

From e9d853dc00b7a82f854d80d448aa0499f3adfedc Mon Sep 17 00:00:00 2001
From: Jian Chen
Date: Mon, 6 Jan 2025 11:55:38 -0800
Subject: [PATCH 03/10] Add process_results function definition

---
 .../python/tools/transformers/profile_result_processor.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/onnxruntime/python/tools/transformers/profile_result_processor.py b/onnxruntime/python/tools/transformers/profile_result_processor.py
index df07f80be475d..185f9306f4c91 100644
--- a/onnxruntime/python/tools/transformers/profile_result_processor.py
+++ b/onnxruntime/python/tools/transformers/profile_result_processor.py
@@ -337,6 +337,7 @@ def group_node_results(sess_time):
 
     return lines
 
+
 def process_results(profile_file, args):
     profile_records = load_profile_json(profile_file)
 
From f39fea8e237f4a9c324daf4a8c5da782f9e47be8 Mon Sep 17 00:00:00 2001
From: Jian Chen
Date: Mon, 6 Jan 2025 12:05:05 -0800
Subject: [PATCH 04/10] Reformat code

---
 .../python/tools/transformers/profile_result_processor.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/onnxruntime/python/tools/transformers/profile_result_processor.py b/onnxruntime/python/tools/transformers/profile_result_processor.py
index 185f9306f4c91..8de073175a348 100644
--- a/onnxruntime/python/tools/transformers/profile_result_processor.py
+++ b/onnxruntime/python/tools/transformers/profile_result_processor.py
@@ -75,7 +75,6 @@ def parse_kernel_results(sess_time, threshold=0):
 
     Args:
         sess_time (List[Dict]): profile data
-        kernel_time_only (bool, optional): Only include items for kernel time. Defaults to False.
         threshold (int, optional): Minimum ratio of duration among all. Defaults to 0.
 
     Returns:
@@ -119,7 +118,7 @@ def parse_kernel_results(sess_time, threshold=0):
 
     # Output items with run time ratio > thresholds, and sorted by duration in the descending order.
     lines = []
-    lines.append(f"\nTop expensive kernels with Time% >= {threshold*100:.2f}:")
+    lines.append(f"\nTop expensive kernels with Time% >= {threshold * 100:.2f}:")
     lines.append("-" * 64)
     lines.append("Total(μs)\tTime%\tCalls\tAvg(μs)\tKernel")
     for kernel_name, duration in sorted(kernel_time.items(), key=lambda x: x[1], reverse=True):
@@ -220,7 +219,7 @@ def parse_node_results(sess_time, kernel_time_only=False, threshold=0):
     )
 
     # Output items with run time ratio > thresholds, and sorted by duration in the descending order.
-    lines.append(f"\nTop expensive nodes with Time% >= {threshold*100:.2f}:")
+    lines.append(f"\nTop expensive nodes with Time% >= {threshold * 100:.2f}:")
     lines.append("-" * 64)
     lines.append("Total(μs)\tTime%\tAvg(μs)\tCalls\tProvider\tNode")
     for node_name, duration in sorted(node_time.items(), key=lambda x: x[1], reverse=True):
@@ -242,8 +241,6 @@ def group_node_results(sess_time):
 
     Args:
         sess_time (List[Dict]): profile data
-        kernel_time_only (bool): Only include items for kernel time.
-        use_gpu (bool): GPU is used in profiling or not.
 
     Returns:
         List[str]: lines of string for output.
@@ -337,7 +334,6 @@ def group_node_results(sess_time):
 
     return lines
 
-
 def process_results(profile_file, args):
     profile_records = load_profile_json(profile_file)
 

From 179620d0113d8feb2b94174745174af4ee23d768 Mon Sep 17 00:00:00 2001
From: Jian Chen
Date: Mon, 6 Jan 2025 12:05:48 -0800
Subject: [PATCH 05/10] change import from

---
 onnxruntime/python/tools/transformers/profiler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onnxruntime/python/tools/transformers/profiler.py b/onnxruntime/python/tools/transformers/profiler.py
index cf9ef6c123691..2dd600f678535 100644
--- a/onnxruntime/python/tools/transformers/profiler.py
+++ b/onnxruntime/python/tools/transformers/profiler.py
@@ -427,7 +427,7 @@ def run(args):
         profile_file = run(arguments)
     else:
         profile_file = arguments.input
-    import profile_result_processor import process_results
+    from profile_result_processor import process_results
     results = process_results(profile_file, arguments)
     for line in results:
         print(line)
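For context on the PATCH 05 fix: the statement introduced by PATCH 01 fused two import statements into one line, which is not valid Python, so profiler.py would have raised a SyntaxError before reaching the result processing step. A minimal illustration of the forms involved (assuming profile_result_processor.py is on the import path):

    # import profile_result_processor import process_results  # invalid: SyntaxError
    from profile_result_processor import process_results       # valid: binds the function directly
    import profile_result_processor                            # also valid: call profile_result_processor.process_results(...)
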
From 91c83bdc70d23e9e947decd7a981b03675d693f4 Mon Sep 17 00:00:00 2001
From: Jian Chen
Date: Mon, 6 Jan 2025 12:08:38 -0800
Subject: [PATCH 06/10] remove unused argument basic_optimization

---
 .../python/tools/transformers/profile_result_processor.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/onnxruntime/python/tools/transformers/profile_result_processor.py b/onnxruntime/python/tools/transformers/profile_result_processor.py
index 8de073175a348..ba49c9c19820b 100644
--- a/onnxruntime/python/tools/transformers/profile_result_processor.py
+++ b/onnxruntime/python/tools/transformers/profile_result_processor.py
@@ -37,14 +37,6 @@ def parse_arguments(argv=None):
         help="Execution provider to use",
     )
 
-    parser.add_argument(
-        "--basic_optimization",
-        required=False,
-        action="store_true",
-        help="Enable only basic graph optimizations. By default, all optimizations are enabled in OnnxRuntime",
-    )
-    parser.set_defaults(basic_optimization=False)
-
     parser.add_argument(
         "--kernel_time_only",
         required=False,

From dfcb245d545d9e125237b2bfc68ae8915722eff8 Mon Sep 17 00:00:00 2001
From: Jian Chen
Date: Mon, 6 Jan 2025 12:15:00 -0800
Subject: [PATCH 07/10] Apply suggestions from code review

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 onnxruntime/python/tools/transformers/profiler.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/onnxruntime/python/tools/transformers/profiler.py b/onnxruntime/python/tools/transformers/profiler.py
index 2dd600f678535..fa0881099e4c3 100644
--- a/onnxruntime/python/tools/transformers/profiler.py
+++ b/onnxruntime/python/tools/transformers/profiler.py
@@ -428,6 +428,7 @@ def run(args):
         profile_file = run(arguments)
     else:
         profile_file = arguments.input
     from profile_result_processor import process_results
+
     results = process_results(profile_file, arguments)
     for line in results:

From 1c6b3d8b3473b71ced2c8b926bfa40fcfac789f4 Mon Sep 17 00:00:00 2001
From: Jian Chen
Date: Mon, 6 Jan 2025 12:36:42 -0800
Subject: [PATCH 08/10] lintrunner -a

---
 onnxruntime/python/tools/transformers/profiler.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/onnxruntime/python/tools/transformers/profiler.py b/onnxruntime/python/tools/transformers/profiler.py
index 2dd600f678535..71249a1121c60 100644
--- a/onnxruntime/python/tools/transformers/profiler.py
+++ b/onnxruntime/python/tools/transformers/profiler.py
@@ -13,8 +13,6 @@
     python profiler.py --input profile_2021-10-25_12-02-41.json
 """
 
-NODES_TYPE_CONTAINING_SUBGRAPH = ["Scan", "Loop", "If"]
-
 
 def parse_arguments(argv=None):
     parser = argparse.ArgumentParser()
@@ -174,6 +172,7 @@ def run_profile(onnx_model_path, use_gpu, provider, basic_optimization, thread_n
     profile_file = session.end_profiling()
     return profile_file
 
+
 def get_dim_from_type_proto(dim):
     return getattr(dim, dim.WhichOneof("value")) if type(dim.WhichOneof("value")) == str else None  # noqa: E721
 
@@ -428,6 +427,7 @@ def run(args):
         profile_file = run(arguments)
     else:
         profile_file = arguments.input
     from profile_result_processor import process_results
+
     results = process_results(profile_file, arguments)
     for line in results:
From dd995113c4c72ab6b8ddec83d00b91d9db6e1222 Mon Sep 17 00:00:00 2001
From: Jian Chen
Date: Mon, 6 Jan 2025 12:40:11 -0800
Subject: [PATCH 09/10] apply changes for: docstring should be the first lines

Adding Copyright notice
Prepend _ for private constants
Use a frozenset for quick lookup.
---
 .../tools/transformers/profile_result_processor.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/onnxruntime/python/tools/transformers/profile_result_processor.py b/onnxruntime/python/tools/transformers/profile_result_processor.py
index ba49c9c19820b..e5fbc7af3e0fe 100644
--- a/onnxruntime/python/tools/transformers/profile_result_processor.py
+++ b/onnxruntime/python/tools/transformers/profile_result_processor.py
@@ -1,13 +1,17 @@
-import argparse
-import json
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
 
-"""
-This profiler result processor prints out the kernel time spent on each Node of the model.
+"""This profiler result processor prints out the kernel time spent on each Node of the model.
 Example of importing profile result file from onnxruntime_perf_test:
     python profile_result_processor.py --input profile_2021-10-25_12-02-41.json
 """
 
-NODES_TYPE_CONTAINING_SUBGRAPH = ["Scan", "Loop", "If"]
+import argparse
+import json
+
+_NODES_TYPE_CONTAINING_SUBGRAPH = frozenset(("Scan", "Loop", "If"))
 
 
 def parse_arguments(argv=None):

From 9c1beadded0b3939a5f7a310586aa1cb8a1c0d14 Mon Sep 17 00:00:00 2001
From: Jian Chen
Date: Mon, 6 Jan 2025 12:42:30 -0800
Subject: [PATCH 10/10] _NODES_TYPE_CONTAINING_SUBGRAPH

---
 .../python/tools/transformers/profile_result_processor.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/onnxruntime/python/tools/transformers/profile_result_processor.py b/onnxruntime/python/tools/transformers/profile_result_processor.py
index e5fbc7af3e0fe..2a17d41774a42 100644
--- a/onnxruntime/python/tools/transformers/profile_result_processor.py
+++ b/onnxruntime/python/tools/transformers/profile_result_processor.py
@@ -92,7 +92,7 @@ def parse_kernel_results(sess_time, threshold=0):
             kernel_name = item["name"]
 
             op_name = item["args"]["op_name"]
-            if op_name in NODES_TYPE_CONTAINING_SUBGRAPH:
+            if op_name in _NODES_TYPE_CONTAINING_SUBGRAPH:
                 continue
 
             # Handle MemcpyHostToDevice and MemcpyDeviceToHost here
@@ -183,7 +183,7 @@ def parse_node_results(sess_time, kernel_time_only=False, threshold=0):
                 continue
 
             op_name = item["args"]["op_name"]
-            if op_name in NODES_TYPE_CONTAINING_SUBGRAPH:
+            if op_name in _NODES_TYPE_CONTAINING_SUBGRAPH:
                 continue
 
             if node_name in node_time:
@@ -258,7 +258,7 @@ def group_node_results(sess_time):
             op_name = item["args"]["op_name"]
 
             # TODO: shall we have a separated group for nodes with subgraph?
-            if op_name in _NODES_TYPE_CONTAINING_SUBGRAPH:
+            if op_name in _NODES_TYPE_CONTAINING_SUBGRAPH:
                 continue
 
             if "provider" not in item["args"]:
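End-to-end usage after the series (a sketch, not part of the patches): the processor runs standalone exactly as its docstring advertises,

    python profile_result_processor.py --input profile_2021-10-25_12-02-41.json --threshold 0.01

and if no profile file is at hand, one way to produce it is the standard onnxruntime profiling switch that the retained run_profile code in profiler.py also relies on. The model path and input feed below are placeholders, not taken from the patches:

    import numpy
    import onnxruntime

    sess_options = onnxruntime.SessionOptions()
    sess_options.enable_profiling = True  # emit a Chrome-trace style JSON file
    session = onnxruntime.InferenceSession("model.onnx", sess_options, providers=["CPUExecutionProvider"])
    session.run(None, {"input": numpy.zeros((1, 3), dtype=numpy.float32)})  # feed must match the model inputs
    profile_file = session.end_profiling()  # path of the JSON to pass via --input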