From e460d7b4650e47c578bebcb5cba37c82cbce8a4c Mon Sep 17 00:00:00 2001
From: Aswinmcw <azayasankaran@tenstorrent.com>
Date: Thu, 7 Nov 2024 10:30:55 +0000
Subject: [PATCH 1/3] #14406: Add perf test for reduce scatter

---
 .../operations/ccl/perf/perf_csv.py           |  9 +-
 ...n_profile.sh => run_all_gather_profile.sh} |  0
 .../ccl/perf/run_reduce_scatter_profile.sh    | 97 +++++++++++++++++++
 .../operations/ccl/perf/test_ccl_perf.py      | 70 +++++++++++++
 .../ccl/test_reduce_scatter_post_commit.py    | 37 ++++---
 5 files changed, 197 insertions(+), 16 deletions(-)
 rename tests/ttnn/unit_tests/operations/ccl/perf/{run_profile.sh => run_all_gather_profile.sh} (100%)
 create mode 100755 tests/ttnn/unit_tests/operations/ccl/perf/run_reduce_scatter_profile.sh

diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py b/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py
index 00be4435617..b5faf080149 100644
--- a/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py
+++ b/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py
@@ -56,7 +56,9 @@ def safe_parse_attributes(attributes):
     )
 
     df["dim"] = df["ATTRIBUTES"].apply(
-        lambda x: safe_parse_attributes(x).get("dim", "") if isinstance(safe_parse_attributes(x), dict) else ""
+        lambda x: safe_parse_attributes(x).get("dim", safe_parse_attributes(x).get("scatter_dim", ""))
+        if isinstance(safe_parse_attributes(x), dict)
+        else ""
     )
 
     df["num_links"] = df["ATTRIBUTES"].apply(
@@ -200,7 +202,10 @@ def calculate_bandwidth(row):
             min_val = round(group_df[column].min(), 2)
             largest_vals = group_df[column].nlargest(3)
             max_val = round(largest_vals.iloc[-1], 2)
-            avg_val = round(group_df[column][~group_df[column].isin(largest_vals.head(2))].mean(), 2)
+            if min_val == max_val:
+                avg_val = min_val
+            else:
+                avg_val = round(group_df[column][~group_df[column].isin(largest_vals.head(2))].mean(), 2)
 
             group_data[column] = f"{min_val} - {avg_val} - {max_val}"
 
diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/run_profile.sh b/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh
similarity index 100%
rename from tests/ttnn/unit_tests/operations/ccl/perf/run_profile.sh
rename to tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh
diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/run_reduce_scatter_profile.sh b/tests/ttnn/unit_tests/operations/ccl/perf/run_reduce_scatter_profile.sh
new file mode 100755
index 00000000000..23071225ac1
--- /dev/null
+++ b/tests/ttnn/unit_tests/operations/ccl/perf/run_reduce_scatter_profile.sh
@@ -0,0 +1,97 @@
+#!/bin/sh
+MODULE_DIR="tests/ttnn/unit_tests/operations/ccl/perf"  # put on PYTHONPATH further down so the embedded Python can import perf_csv; relative path, presumably resolved from the repository root
+
+# Defaults; the --debug and --target command-line flags override them.
+DEBUG=false
+TARGET="n300"
+
+# Print usage, the supported options, and example invocations of this script.
+show_help() {
+    echo "Usage: ./tests/ttnn/unit_tests/operations/ccl/perf/run_reduce_scatter_profile.sh [OPTIONS]"
+    echo
+    echo "Options:"
+    echo "  -d, --debug        Enable debug mode to show real-time output."
+    echo "  -t, --target       Specify the target configuration (t3000 or n300). Default is n300."
+    echo "  -h, --help         Display this help message."
+    echo
+    echo "Example:"
+    echo "  ./tests/ttnn/unit_tests/operations/ccl/perf/run_reduce_scatter_profile.sh --debug --target n300"
+    echo "  ./tests/ttnn/unit_tests/operations/ccl/perf/run_reduce_scatter_profile.sh -h"
+}
+
+# Parse command-line arguments; an unrecognized argument prints the help text and exits with status 1.
+while [ $# -gt 0 ]; do
+    case "$1" in
+        --debug|-d)
+            DEBUG=true
+            shift
+            ;;
+        --help|-h)
+            show_help
+            exit 0
+            ;;
+        --target|-t)
+            # The flag takes a value: reject a missing or empty following argument
+            if [ -z "$2" ]; then
+                echo "Error: No target specified after $1."
+                show_help
+                exit 1
+            fi
+
+            TARGET="$2"  # Override the default target configuration
+            shift 2
+
+            # Validate the target value (it is later substituted into the profiler run name and pytest test id)
+            if [ "$TARGET" != "t3000" ] && [ "$TARGET" != "n300" ]; then
+                echo "Error: Invalid target configuration: $TARGET. Must be either 't3000' or 'n300'."
+                exit 1
+            fi
+            ;;
+        *)
+            echo "Unknown option: $1"
+            show_help
+            exit 1
+            ;;
+    esac
+done
+
+# Profile the reduce-scatter perf test for $TARGET, then print a min/avg/max report from the generated OPs CSV.
+run_profile_and_extract_csv() {
+    command="./tt_metal/tools/profiler/profile_this.py -n reduce_scatter_$TARGET -c 'pytest tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py::test_reduce_scatter_on_$TARGET'"
+
+    if [ "$DEBUG" = true ]; then
+        echo "Running profiling command for target $TARGET in debug mode..."
+        full_output=$(eval "$command" 2>&1 | tee /dev/tty)  # tee mirrors the output to the terminal while it is captured
+    else
+        echo "Running profiling command for target $TARGET..."
+        full_output=$(eval "$command" 2>&1)
+    fi
+
+    # Extract the CSV path; keep only the last match so csv_path is always a single line
+    csv_path=$(echo "$full_output" | grep -oE 'OPs csv generated at: (.+\.csv)' | sed -E 's/OPs csv generated at: //' | tail -n 1)
+
+    if [ -n "$csv_path" ]; then
+        echo "CSV path found: $csv_path"
+        # Run the Python script to generate performance report (MODULE_DIR on PYTHONPATH makes perf_csv importable)
+        average_values=$(PYTHONPATH="$MODULE_DIR" python3 -c "
+import pandas as pd
+from perf_csv import perf_report
+from tabulate import tabulate
+
+# Generate the report and convert it to a DataFrame
+average_df = perf_report('$csv_path')
+# Print the DataFrame in a pretty table format
+print(tabulate(average_df, headers='keys', tablefmt='pretty'))
+")
+
+        # Print the output
+        echo "Min - Avg - Max by Common Runs:"
+        echo "$average_values"
+    else
+        echo "CSV path not found in the command output."
+        return 1  # report the failure through the exit status instead of exiting 0
+    fi
+}
+
+# Run the function
+run_profile_and_extract_csv
diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py b/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py
index c9a6c90ef31..9b0d332ff18 100644
--- a/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py
+++ b/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py
@@ -9,6 +9,9 @@
     run_all_gather_on_n300_impl,
     run_all_gather_on_t3000_impl_tight_loop,
 )
+from tests.ttnn.unit_tests.operations.ccl.test_reduce_scatter_post_commit import (
+    run_reduce_scatter_test,
+)
 
 
 @skip_for_grayskull("Requires eth connected devices to run")
@@ -128,3 +131,70 @@ def test_all_gather_on_t3000(
         enable_async=enable_async,
         trace_mode=True,
     )
+
+
+@skip_for_grayskull("Requires eth connected devices to run")
+@pytest.mark.parametrize(
+    "num_devices, num_links",
+    [
+        (8, 1),
+    ],
+)
+@pytest.mark.parametrize(
+    "per_chip_output_shape, scatter_dim, layout",
+    [
+        ([1, 8, 1024, 1024], 3, ttnn.TILE_LAYOUT),
+        ([1, 4, 1024, 1024], 3, ttnn.TILE_LAYOUT),
+        ([1, 4, 2048, 1024], 3, ttnn.TILE_LAYOUT),
+        ([1, 1, 32, 32], 3, ttnn.TILE_LAYOUT),
+        ([1, 1, 32, 64], 3, ttnn.TILE_LAYOUT),
+    ],
+)
+@pytest.mark.parametrize(
+    "input_dtype",
+    [
+        ttnn.bfloat16,
+        # ttnn.bfloat8_b,
+    ],
+)
+@pytest.mark.parametrize(
+    "mem_config",
+    [
+        ttnn.MemoryConfig(buffer_type=ttnn.BufferType.DRAM),  # the only memory config parametrized for this test
+    ],
+)
+@pytest.mark.parametrize("num_iters", [20])
+@pytest.mark.parametrize("math_op", [ttnn.ReduceType.Sum])
+@pytest.mark.parametrize("enable_async", [True])
+@pytest.mark.parametrize("device_params", [{"trace_region_size": 266240}], indirect=True)
+def test_reduce_scatter_on_t3000(
+    t3k_mesh_device,
+    num_devices,
+    per_chip_output_shape,
+    scatter_dim,
+    num_links,
+    math_op,
+    input_dtype,
+    layout,
+    mem_config,
+    use_program_cache,  # not parametrized above; presumably a pytest fixture (confirm in conftest), forwarded to the helper
+    function_level_defaults,
+    enable_async,
+    num_iters,
+):
+    run_reduce_scatter_test(
+        t3k_mesh_device,  # NOTE(review): positional order must match run_reduce_scatter_test's signature — confirm
+        num_devices,
+        per_chip_output_shape,
+        scatter_dim,
+        num_links,
+        math_op,
+        input_dtype,
+        layout,
+        mem_config,
+        use_program_cache,
+        function_level_defaults,
+        num_iters=num_iters,
+        enable_async=enable_async,
+        trace_mode=True,
+    )
diff --git a/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_post_commit.py b/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_post_commit.py
index 9fbc710ed7c..161bbccd419 100644
--- a/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_post_commit.py
+++ b/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_post_commit.py
@@ -32,9 +32,10 @@ def run_with_trace(
     num_links,
     math_op,
     output_mem_config,
-    n_worker,
-    n_buffer,
-    num_iters,
+    n_worker=None,
+    n_buffer=None,
+    num_iters=40,
+    topology=ttnn.Topology.Ring,
 ):
     # Compile Run
     logger.info("Compiling model")
@@ -46,6 +47,7 @@ def run_with_trace(
         memory_config=output_mem_config,
         num_workers=n_worker,
         num_buffers_per_channel=n_buffer,
+        topology=topology,
     )
     for device_id in t3k_mesh_device.get_device_ids():
         ttnn.synchronize_device(t3k_mesh_device.get_device(device_id))
@@ -62,6 +64,7 @@ def run_with_trace(
             memory_config=output_mem_config,
             num_workers=n_worker,
             num_buffers_per_channel=n_buffer,
+            topology=topology,
         )
     ttnn.end_trace_capture(t3k_mesh_device, trace_id, cq_id=0)
     for device_id in t3k_mesh_device.get_device_ids():
@@ -92,6 +95,7 @@ def run_reduce_scatter_test(
     enable_async=True,
     num_iters=1,
     topology=ttnn.Topology.Ring,
+    trace_mode=False,
 ):
     if len(mesh_device.get_device_ids()) < num_devices:
         pytest.skip(
@@ -135,19 +139,24 @@ def run_reduce_scatter_test(
 
     input_tensor_mesh = ttnn.aggregate_as_tensor(tt_input_tensors)
     # Run the op
-    for i in range(num_iters):
-        output_tensor_mesh = ttnn.reduce_scatter(
-            input_tensor_mesh,
-            scatter_dim=scatter_dim,
-            math_op=math_op,
-            num_links=num_links,
-            memory_config=mem_config,
-            topology=topology,
+    if trace_mode:
+        output_tensor_mesh = run_with_trace(
+            mesh_device, input_tensor_mesh, scatter_dim, num_links, math_op, mem_config, num_iters, topology
         )
+    else:
+        for i in range(num_iters):
+            output_tensor_mesh = ttnn.reduce_scatter(
+                input_tensor_mesh,
+                scatter_dim=scatter_dim,
+                math_op=math_op,
+                num_links=num_links,
+                memory_config=mem_config,
+                topology=topology,
+            )
 
-        for device_id in mesh_device.get_device_ids():
-            ttnn.synchronize_device(mesh_device.get_device(device_id))
-        logger.info(f"Done iteration {i}")
+            for device_id in mesh_device.get_device_ids():
+                ttnn.synchronize_device(mesh_device.get_device(device_id))
+            logger.info(f"Done iteration {i}")
 
     # ttnn.visualize_mesh_device(t3k_mesh_device, tensor=output_tensor_mesh)
     # Compute golden

From 3ba4c546bfb65aa70363e3e716217d638b969820 Mon Sep 17 00:00:00 2001
From: Aswinmcw <azayasankaran@tenstorrent.com>
Date: Thu, 7 Nov 2024 11:58:24 +0000
Subject: [PATCH 2/3] #14406: Add perf test for N300 reduce scatter

---
 .../operations/ccl/perf/test_ccl_perf.py      | 73 ++++++++++++++++++-
 .../ccl/test_reduce_scatter_post_commit.py    |  9 ++-
 2 files changed, 79 insertions(+), 3 deletions(-)

diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py b/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py
index 9b0d332ff18..1caf940289f 100644
--- a/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py
+++ b/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py
@@ -146,8 +146,8 @@ def test_all_gather_on_t3000(
         ([1, 8, 1024, 1024], 3, ttnn.TILE_LAYOUT),
         ([1, 4, 1024, 1024], 3, ttnn.TILE_LAYOUT),
         ([1, 4, 2048, 1024], 3, ttnn.TILE_LAYOUT),
-        ([1, 1, 32, 32], 3, ttnn.TILE_LAYOUT),
-        ([1, 1, 32, 64], 3, ttnn.TILE_LAYOUT),
+        ([1, 1, 32, 32 * 8], 3, ttnn.TILE_LAYOUT),
+        ([1, 1, 32, 64 * 8], 3, ttnn.TILE_LAYOUT),
     ],
 )
 @pytest.mark.parametrize(
@@ -166,6 +166,7 @@ def test_all_gather_on_t3000(
 @pytest.mark.parametrize("num_iters", [20])
 @pytest.mark.parametrize("math_op", [ttnn.ReduceType.Sum])
 @pytest.mark.parametrize("enable_async", [True])
+@pytest.mark.parametrize("topology", [ttnn.Topology.Linear, ttnn.Topology.Ring])
 @pytest.mark.parametrize("device_params", [{"trace_region_size": 266240}], indirect=True)
 def test_reduce_scatter_on_t3000(
     t3k_mesh_device,
@@ -181,6 +182,7 @@ def test_reduce_scatter_on_t3000(
     function_level_defaults,
     enable_async,
     num_iters,
+    topology,
 ):
     run_reduce_scatter_test(
         t3k_mesh_device,
@@ -196,5 +198,72 @@ def test_reduce_scatter_on_t3000(
         function_level_defaults,
         num_iters=num_iters,
         enable_async=enable_async,
+        topology=topology,
+        trace_mode=True,
+    )
+
+
+@skip_for_grayskull("Requires eth connected devices to run")
+@pytest.mark.parametrize(
+    "num_devices, num_links",
+    [
+        (2, 1),
+    ],
+)
+@pytest.mark.parametrize(
+    "per_chip_output_shape, scatter_dim, layout",
+    [
+        ([1, 1, 32, 4096], 3, ttnn.TILE_LAYOUT),
+        ([1, 1, 32, 2048], 3, ttnn.TILE_LAYOUT),
+        ([1, 1, 32, 1024], 3, ttnn.TILE_LAYOUT),
+    ],
+)
+@pytest.mark.parametrize(
+    "input_dtype",
+    [
+        ttnn.bfloat16,
+        ttnn.bfloat8_b,
+    ],
+)
+@pytest.mark.parametrize(
+    "mem_config",
+    [
+        ttnn.MemoryConfig(buffer_type=ttnn.BufferType.DRAM),
+        ttnn.MemoryConfig(buffer_type=ttnn.BufferType.L1),
+    ],
+)
+@pytest.mark.parametrize("num_iters", [20])
+@pytest.mark.parametrize("math_op", [ttnn.ReduceType.Sum])
+@pytest.mark.parametrize("enable_async", [True])
+@pytest.mark.parametrize("device_params", [{"trace_region_size": 266240}], indirect=True)
+def test_reduce_scatter_on_n300(  # perf variant: runs the shared reduce-scatter helper with trace_mode=True
+    n300_mesh_device,
+    num_devices,
+    per_chip_output_shape,
+    scatter_dim,
+    num_links,
+    math_op,
+    input_dtype,
+    layout,
+    mem_config,
+    use_program_cache,
+    function_level_defaults,
+    enable_async,
+    num_iters,
+):
+    run_reduce_scatter_test(  # topology is not passed here, so the helper's default (ttnn.Topology.Ring) applies
+        n300_mesh_device,  # NOTE(review): positional order must match run_reduce_scatter_test's signature — confirm
+        num_devices,
+        per_chip_output_shape,
+        scatter_dim,
+        num_links,
+        math_op,
+        input_dtype,
+        layout,
+        mem_config,
+        use_program_cache,
+        function_level_defaults,
+        num_iters=num_iters,
+        enable_async=enable_async,
         trace_mode=True,
     )
diff --git a/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_post_commit.py b/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_post_commit.py
index 161bbccd419..916682dd84e 100644
--- a/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_post_commit.py
+++ b/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_post_commit.py
@@ -141,7 +141,14 @@ def run_reduce_scatter_test(
     # Run the op
     if trace_mode:
         output_tensor_mesh = run_with_trace(
-            mesh_device, input_tensor_mesh, scatter_dim, num_links, math_op, mem_config, num_iters, topology
+            mesh_device,
+            input_tensor_mesh,
+            scatter_dim,
+            num_links,
+            math_op,
+            mem_config,
+            num_iters=num_iters,
+            topology=topology,
         )
     else:
         for i in range(num_iters):

From fefe768fed3bbff9697bdafe7850f0d56d7655fd Mon Sep 17 00:00:00 2001
From: Aswinmcw <azayasankaran@tenstorrent.com>
Date: Fri, 8 Nov 2024 06:06:17 +0000
Subject: [PATCH 3/3] #14406: Fix BW computation

---
 tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py    | 9 +++++----
 .../ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py | 1 -
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py b/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py
index b5faf080149..31f4636aa66 100644
--- a/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py
+++ b/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py
@@ -12,7 +12,7 @@ def perf_report(file_path):
 
     df = df.dropna(subset=["DEVICE ERISC KERNEL DURATION [ns]"])
     df = df[df["OP TO OP LATENCY [ns]"] != 0]
-    df = df[df["TRACE ID"].notna() & (df["TRACE ID"] != "")]
+    df = df[df["METAL TRACE ID"].notna() & (df["METAL TRACE ID"] != "")]
 
     def remove_keys_from_attributes(attributes):
         attributes = attributes.replace(";", ",").replace("'", '"')
@@ -156,15 +156,15 @@ def calculate_bandwidth(row):
                 op_bw = (output_tensor_volume * (n_chips - 1) / n_chips) / longest_device_fw_time
                 link_bw = (output_tensor_volume * (n_chips - 1) / n_chips) / longest_erisc_fw_time
             elif row["OP CODE"] == "ReduceScatter":
-                op_bw = (input_tensor_volume / n_chips) / longest_device_fw_time
-                link_bw = (input_tensor_volume * (n_chips - 1) / n_chips) / longest_erisc_fw_time
+                op_bw = input_tensor_volume / longest_device_fw_time
+                link_bw = input_tensor_volume / longest_erisc_fw_time
         elif row["topology"] == "Linear":
             if row["OP CODE"] == "AllGather":
                 op_bw = input_tensor_volume * n_chips / longest_device_fw_time
                 link_bw = input_tensor_volume * (n_chips - 1) / longest_erisc_fw_time
             elif row["OP CODE"] == "ReduceScatter":
                 op_bw = input_tensor_volume / longest_device_fw_time
-                link_bw = input_tensor_volume * (n_chips - 1) / n_chips / longest_erisc_fw_time
+                link_bw = input_tensor_volume / longest_erisc_fw_time
         return round(op_bw, 2), round(link_bw, 2)
 
     for i, (group, group_df) in enumerate(grouped, start=1):
@@ -196,6 +196,7 @@ def calculate_bandwidth(row):
             "output_mem_config": group_df["output_mem_config"].iloc[0] if "output_mem_config" in group_df else "",
             "topology": group_df["topology"].iloc[0],
             "Layout": group_df["Layout"].iloc[0] if "Layout" in group_df else "",
+            "Data Type": group_df["Data Type"].iloc[0] if "Data Type" in group_df else "",
         }
 
         for column in numeric_columns:
diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py b/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py
index 1caf940289f..1429eb0fce1 100644
--- a/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py
+++ b/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py
@@ -154,7 +154,6 @@ def test_all_gather_on_t3000(
     "input_dtype",
     [
         ttnn.bfloat16,
-        # ttnn.bfloat8_b,
     ],
 )
 @pytest.mark.parametrize(