From e460d7b4650e47c578bebcb5cba37c82cbce8a4c Mon Sep 17 00:00:00 2001 From: Aswinmcw Date: Thu, 7 Nov 2024 10:30:55 +0000 Subject: [PATCH 1/3] #14406: Add perf test for reduce scatter --- .../operations/ccl/perf/perf_csv.py | 9 +- ...n_profile.sh => run_all_gather_profile.sh} | 0 .../ccl/perf/run_reduce_scatter_profile.sh | 97 +++++++++++++++++++ .../operations/ccl/perf/test_ccl_perf.py | 70 +++++++++++++ .../ccl/test_reduce_scatter_post_commit.py | 37 ++++--- 5 files changed, 197 insertions(+), 16 deletions(-) rename tests/ttnn/unit_tests/operations/ccl/perf/{run_profile.sh => run_all_gather_profile.sh} (100%) create mode 100755 tests/ttnn/unit_tests/operations/ccl/perf/run_reduce_scatter_profile.sh diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py b/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py index 00be4435617..b5faf080149 100644 --- a/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py +++ b/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py @@ -56,7 +56,9 @@ def safe_parse_attributes(attributes): ) df["dim"] = df["ATTRIBUTES"].apply( - lambda x: safe_parse_attributes(x).get("dim", "") if isinstance(safe_parse_attributes(x), dict) else "" + lambda x: safe_parse_attributes(x).get("dim", safe_parse_attributes(x).get("scatter_dim", "")) + if isinstance(safe_parse_attributes(x), dict) + else "" ) df["num_links"] = df["ATTRIBUTES"].apply( @@ -200,7 +202,10 @@ def calculate_bandwidth(row): min_val = round(group_df[column].min(), 2) largest_vals = group_df[column].nlargest(3) max_val = round(largest_vals.iloc[-1], 2) - avg_val = round(group_df[column][~group_df[column].isin(largest_vals.head(2))].mean(), 2) + if min_val == max_val: + avg_val = min_val + else: + avg_val = round(group_df[column][~group_df[column].isin(largest_vals.head(2))].mean(), 2) group_data[column] = f"{min_val} - {avg_val} - {max_val}" diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/run_profile.sh b/tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh similarity index 100% rename from tests/ttnn/unit_tests/operations/ccl/perf/run_profile.sh rename to tests/ttnn/unit_tests/operations/ccl/perf/run_all_gather_profile.sh diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/run_reduce_scatter_profile.sh b/tests/ttnn/unit_tests/operations/ccl/perf/run_reduce_scatter_profile.sh new file mode 100755 index 00000000000..23071225ac1 --- /dev/null +++ b/tests/ttnn/unit_tests/operations/ccl/perf/run_reduce_scatter_profile.sh @@ -0,0 +1,97 @@ +#!/bin/sh +MODULE_DIR="tests/ttnn/unit_tests/operations/ccl/perf" + +# Defaults +DEBUG=false +TARGET="n300" + +# Function to display help +show_help() { + echo "Usage: ./tests/ttnn/unit_tests/operations/ccl/perf/run_profile.sh [OPTIONS]" + echo + echo "Options:" + echo " -d, --debug Enable debug mode to show real-time output." + echo " -t, --target Specify the target configuration (t3000 or n300). Default is n300." + echo " -h, --help Display this help message." + echo + echo "Example:" + echo " ./tests/ttnn/unit_tests/operations/ccl/perf/run_profile.sh --debug --target n300" + echo " ./tests/ttnn/unit_tests/operations/ccl/perf/run_profile.sh -h" +} + +# Parse command-line arguments +while [ $# -gt 0 ]; do + case "$1" in + --debug|-d) + DEBUG=true + shift + ;; + --help|-h) + show_help + exit 0 + ;; + --target|-t) + # Ensure there is an argument following the target flag + if [ -z "$2" ]; then + echo "Error: No target specified after $1." + show_help + exit 1 + fi + + TARGET="$2" # Set the target configuration + shift 2 + + # Validate the target value + if [ "$TARGET" != "t3000" ] && [ "$TARGET" != "n300" ]; then + echo "Error: Invalid target configuration: $TARGET. Must be either 't3000' or 'n300'." + exit 1 + fi + ;; + *) + echo "Unknown option: $1" + show_help + exit 1 + ;; + esac +done + +# Function to run the profiling command and extract the CSV path +run_profile_and_extract_csv() { + command="./tt_metal/tools/profiler/profile_this.py -n reduce_scatter_$TARGET -c 'pytest tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py::test_reduce_scatter_on_$TARGET'" + + if [ "$DEBUG" = true ]; then + echo "Running profiling command for target $TARGET in debug mode..." + full_output=$(eval $command 2>&1 | tee /dev/tty) + else + echo "Running profiling command for target $TARGET..." + full_output=$(eval $command 2>&1) + fi + + # Extract the CSV path + csv_path=$(echo "$full_output" | grep -oE 'OPs csv generated at: (.+\.csv)' | sed -E 's/OPs csv generated at: //') + + if [ -n "$csv_path" ]; then + echo "CSV path found: $csv_path" + + # Run the Python script to generate performance report + average_values=$(PYTHONPATH="$MODULE_DIR" python3 -c " +import pandas as pd +from perf_csv import perf_report +from tabulate import tabulate + +# Generate the report and convert it to a DataFrame +average_df = perf_report('$csv_path') +# Print the DataFrame in a pretty table format +print(tabulate(average_df, headers='keys', tablefmt='pretty')) +") + + # Print the output + echo "Min - Avg - Max by Common Runs:" + echo "$average_values" + else + echo "CSV path not found in the command output." + fi +} + +# Run the function +run_profile_and_extract_csv diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py b/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py index c9a6c90ef31..9b0d332ff18 100644 --- a/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py +++ b/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py @@ -9,6 +9,9 @@ run_all_gather_on_n300_impl, run_all_gather_on_t3000_impl_tight_loop, ) +from tests.ttnn.unit_tests.operations.ccl.test_reduce_scatter_post_commit import ( + run_reduce_scatter_test, +) @skip_for_grayskull("Requires eth connected devices to run") @@ -128,3 +131,70 @@ def test_all_gather_on_t3000( enable_async=enable_async, trace_mode=True, ) + + +@skip_for_grayskull("Requires eth connected devices to run") +@pytest.mark.parametrize( + "num_devices, num_links", + [ + (8, 1), + ], +) +@pytest.mark.parametrize( + "per_chip_output_shape, scatter_dim, layout", + [ + ([1, 8, 1024, 1024], 3, ttnn.TILE_LAYOUT), + ([1, 4, 1024, 1024], 3, ttnn.TILE_LAYOUT), + ([1, 4, 2048, 1024], 3, ttnn.TILE_LAYOUT), + ([1, 1, 32, 32], 3, ttnn.TILE_LAYOUT), + ([1, 1, 32, 64], 3, ttnn.TILE_LAYOUT), + ], +) +@pytest.mark.parametrize( + "input_dtype", + [ + ttnn.bfloat16, + # ttnn.bfloat8_b, + ], +) +@pytest.mark.parametrize( + "mem_config", + [ + ttnn.MemoryConfig(buffer_type=ttnn.BufferType.DRAM), + ], +) +@pytest.mark.parametrize("num_iters", [20]) +@pytest.mark.parametrize("math_op", [ttnn.ReduceType.Sum]) +@pytest.mark.parametrize("enable_async", [True]) +@pytest.mark.parametrize("device_params", [{"trace_region_size": 266240}], indirect=True) +def test_reduce_scatter_on_t3000( + t3k_mesh_device, + num_devices, + per_chip_output_shape, + scatter_dim, + num_links, + math_op, + input_dtype, + layout, + mem_config, + use_program_cache, + function_level_defaults, + enable_async, + num_iters, +): + run_reduce_scatter_test( + t3k_mesh_device, + num_devices, + per_chip_output_shape, + scatter_dim, + num_links, + math_op, + input_dtype, + layout, + mem_config, + use_program_cache, + function_level_defaults, + num_iters=num_iters, + enable_async=enable_async, + trace_mode=True, + ) diff --git a/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_post_commit.py b/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_post_commit.py index 9fbc710ed7c..161bbccd419 100644 --- a/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_post_commit.py +++ b/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_post_commit.py @@ -32,9 +32,10 @@ def run_with_trace( num_links, math_op, output_mem_config, - n_worker, - n_buffer, - num_iters, + n_worker=None, + n_buffer=None, + num_iters=40, + topology=ttnn.Topology.Ring, ): # Compile Run logger.info("Compiling model") @@ -46,6 +47,7 @@ def run_with_trace( memory_config=output_mem_config, num_workers=n_worker, num_buffers_per_channel=n_buffer, + topology=topology, ) for device_id in t3k_mesh_device.get_device_ids(): ttnn.synchronize_device(t3k_mesh_device.get_device(device_id)) @@ -62,6 +64,7 @@ def run_with_trace( memory_config=output_mem_config, num_workers=n_worker, num_buffers_per_channel=n_buffer, + topology=topology, ) ttnn.end_trace_capture(t3k_mesh_device, trace_id, cq_id=0) for device_id in t3k_mesh_device.get_device_ids(): @@ -92,6 +95,7 @@ def run_reduce_scatter_test( enable_async=True, num_iters=1, topology=ttnn.Topology.Ring, + trace_mode=False, ): if len(mesh_device.get_device_ids()) < num_devices: pytest.skip( @@ -135,19 +139,24 @@ def run_reduce_scatter_test( input_tensor_mesh = ttnn.aggregate_as_tensor(tt_input_tensors) # Run the op - for i in range(num_iters): - output_tensor_mesh = ttnn.reduce_scatter( - input_tensor_mesh, - scatter_dim=scatter_dim, - math_op=math_op, - num_links=num_links, - memory_config=mem_config, - topology=topology, + if trace_mode: + output_tensor_mesh = run_with_trace( + mesh_device, input_tensor_mesh, scatter_dim, num_links, math_op, mem_config, num_iters, topology ) + else: + for i in range(num_iters): + output_tensor_mesh = ttnn.reduce_scatter( + input_tensor_mesh, + scatter_dim=scatter_dim, + math_op=math_op, + num_links=num_links, + memory_config=mem_config, + topology=topology, + ) - for device_id in mesh_device.get_device_ids(): - ttnn.synchronize_device(mesh_device.get_device(device_id)) - logger.info(f"Done iteration {i}") + for device_id in mesh_device.get_device_ids(): + ttnn.synchronize_device(mesh_device.get_device(device_id)) + logger.info(f"Done iteration {i}") # ttnn.visualize_mesh_device(t3k_mesh_device, tensor=output_tensor_mesh) # Compute golden From 3ba4c546bfb65aa70363e3e716217d638b969820 Mon Sep 17 00:00:00 2001 From: Aswinmcw Date: Thu, 7 Nov 2024 11:58:24 +0000 Subject: [PATCH 2/3] #14406: Add perf test for N300 reduce scatter --- .../operations/ccl/perf/test_ccl_perf.py | 73 ++++++++++++++++++- .../ccl/test_reduce_scatter_post_commit.py | 9 ++- 2 files changed, 79 insertions(+), 3 deletions(-) diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py b/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py index 9b0d332ff18..1caf940289f 100644 --- a/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py +++ b/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py @@ -146,8 +146,8 @@ def test_all_gather_on_t3000( ([1, 8, 1024, 1024], 3, ttnn.TILE_LAYOUT), ([1, 4, 1024, 1024], 3, ttnn.TILE_LAYOUT), ([1, 4, 2048, 1024], 3, ttnn.TILE_LAYOUT), - ([1, 1, 32, 32], 3, ttnn.TILE_LAYOUT), - ([1, 1, 32, 64], 3, ttnn.TILE_LAYOUT), + ([1, 1, 32, 32 * 8], 3, ttnn.TILE_LAYOUT), + ([1, 1, 32, 64 * 8], 3, ttnn.TILE_LAYOUT), ], ) @pytest.mark.parametrize( @@ -166,6 +166,7 @@ def test_all_gather_on_t3000( @pytest.mark.parametrize("num_iters", [20]) @pytest.mark.parametrize("math_op", [ttnn.ReduceType.Sum]) @pytest.mark.parametrize("enable_async", [True]) +@pytest.mark.parametrize("topology", [ttnn.Topology.Linear, ttnn.Topology.Ring]) @pytest.mark.parametrize("device_params", [{"trace_region_size": 266240}], indirect=True) def test_reduce_scatter_on_t3000( t3k_mesh_device, @@ -181,6 +182,7 @@ def test_reduce_scatter_on_t3000( function_level_defaults, enable_async, num_iters, + topology, ): run_reduce_scatter_test( t3k_mesh_device, @@ -196,5 +198,72 @@ def test_reduce_scatter_on_t3000( function_level_defaults, num_iters=num_iters, enable_async=enable_async, + topology=topology, + trace_mode=True, + ) + + +@skip_for_grayskull("Requires eth connected devices to run") +@pytest.mark.parametrize( + "num_devices, num_links", + [ + (2, 1), + ], +) +@pytest.mark.parametrize( + "per_chip_output_shape, scatter_dim, layout", + [ + ([1, 1, 32, 4096], 3, ttnn.TILE_LAYOUT), + ([1, 1, 32, 2048], 3, ttnn.TILE_LAYOUT), + ([1, 1, 32, 1024], 3, ttnn.TILE_LAYOUT), + ], +) +@pytest.mark.parametrize( + "input_dtype", + [ + ttnn.bfloat16, + ttnn.bfloat8_b, + ], +) +@pytest.mark.parametrize( + "mem_config", + [ + ttnn.MemoryConfig(buffer_type=ttnn.BufferType.DRAM), + ttnn.MemoryConfig(buffer_type=ttnn.BufferType.L1), + ], +) +@pytest.mark.parametrize("num_iters", [20]) +@pytest.mark.parametrize("math_op", [ttnn.ReduceType.Sum]) +@pytest.mark.parametrize("enable_async", [True]) +@pytest.mark.parametrize("device_params", [{"trace_region_size": 266240}], indirect=True) +def test_reduce_scatter_on_n300( + n300_mesh_device, + num_devices, + per_chip_output_shape, + scatter_dim, + num_links, + math_op, + input_dtype, + layout, + mem_config, + use_program_cache, + function_level_defaults, + enable_async, + num_iters, +): + run_reduce_scatter_test( + n300_mesh_device, + num_devices, + per_chip_output_shape, + scatter_dim, + num_links, + math_op, + input_dtype, + layout, + mem_config, + use_program_cache, + function_level_defaults, + num_iters=num_iters, + enable_async=enable_async, trace_mode=True, ) diff --git a/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_post_commit.py b/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_post_commit.py index 161bbccd419..916682dd84e 100644 --- a/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_post_commit.py +++ b/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_post_commit.py @@ -141,7 +141,14 @@ def run_reduce_scatter_test( # Run the op if trace_mode: output_tensor_mesh = run_with_trace( - mesh_device, input_tensor_mesh, scatter_dim, num_links, math_op, mem_config, num_iters, topology + mesh_device, + input_tensor_mesh, + scatter_dim, + num_links, + math_op, + mem_config, + num_iters=num_iters, + topology=topology, ) else: for i in range(num_iters): From fefe768fed3bbff9697bdafe7850f0d56d7655fd Mon Sep 17 00:00:00 2001 From: Aswinmcw Date: Fri, 8 Nov 2024 06:06:17 +0000 Subject: [PATCH 3/3] #14406: Fix BW computation --- tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py | 9 +++++---- .../ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py | 1 - 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py b/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py index b5faf080149..31f4636aa66 100644 --- a/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py +++ b/tests/ttnn/unit_tests/operations/ccl/perf/perf_csv.py @@ -12,7 +12,7 @@ def perf_report(file_path): df = df.dropna(subset=["DEVICE ERISC KERNEL DURATION [ns]"]) df = df[df["OP TO OP LATENCY [ns]"] != 0] - df = df[df["TRACE ID"].notna() & (df["TRACE ID"] != "")] + df = df[df["METAL TRACE ID"].notna() & (df["METAL TRACE ID"] != "")] def remove_keys_from_attributes(attributes): attributes = attributes.replace(";", ",").replace("'", '"') @@ -156,15 +156,15 @@ def calculate_bandwidth(row): op_bw = (output_tensor_volume * (n_chips - 1) / n_chips) / longest_device_fw_time link_bw = (output_tensor_volume * (n_chips - 1) / n_chips) / longest_erisc_fw_time elif row["OP CODE"] == "ReduceScatter": - op_bw = (input_tensor_volume / n_chips) / longest_device_fw_time - link_bw = (input_tensor_volume * (n_chips - 1) / n_chips) / longest_erisc_fw_time + op_bw = input_tensor_volume / longest_device_fw_time + link_bw = input_tensor_volume / longest_erisc_fw_time elif row["topology"] == "Linear": if row["OP CODE"] == "AllGather": op_bw = input_tensor_volume * n_chips / longest_device_fw_time link_bw = input_tensor_volume * (n_chips - 1) / longest_erisc_fw_time elif row["OP CODE"] == "ReduceScatter": op_bw = input_tensor_volume / longest_device_fw_time - link_bw = input_tensor_volume * (n_chips - 1) / n_chips / longest_erisc_fw_time + link_bw = input_tensor_volume / longest_erisc_fw_time return round(op_bw, 2), round(link_bw, 2) for i, (group, group_df) in enumerate(grouped, start=1): @@ -196,6 +196,7 @@ def calculate_bandwidth(row): "output_mem_config": group_df["output_mem_config"].iloc[0] if "output_mem_config" in group_df else "", "topology": group_df["topology"].iloc[0], "Layout": group_df["Layout"].iloc[0] if "Layout" in group_df else "", + "Data Type": group_df["Data Type"].iloc[0] if "Data Type" in group_df else "", } for column in numeric_columns: diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py b/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py index 1caf940289f..1429eb0fce1 100644 --- a/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py +++ b/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py @@ -154,7 +154,6 @@ def test_all_gather_on_t3000( "input_dtype", [ ttnn.bfloat16, - # ttnn.bfloat8_b, ], ) @pytest.mark.parametrize(