From 3ba4c546bfb65aa70363e3e716217d638b969820 Mon Sep 17 00:00:00 2001
From: Aswinmcw
Date: Thu, 7 Nov 2024 11:58:24 +0000
Subject: [PATCH] #14406: Add perf test for N300 reduce scatter

---
 .../operations/ccl/perf/test_ccl_perf.py      | 73 ++++++++++++++++++-
 .../ccl/test_reduce_scatter_post_commit.py    |  9 ++-
 2 files changed, 79 insertions(+), 3 deletions(-)

diff --git a/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py b/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py
index 9b0d332ff18..1caf940289f 100644
--- a/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py
+++ b/tests/ttnn/unit_tests/operations/ccl/perf/test_ccl_perf.py
@@ -146,8 +146,8 @@ def test_all_gather_on_t3000(
         ([1, 8, 1024, 1024], 3, ttnn.TILE_LAYOUT),
         ([1, 4, 1024, 1024], 3, ttnn.TILE_LAYOUT),
         ([1, 4, 2048, 1024], 3, ttnn.TILE_LAYOUT),
-        ([1, 1, 32, 32], 3, ttnn.TILE_LAYOUT),
-        ([1, 1, 32, 64], 3, ttnn.TILE_LAYOUT),
+        ([1, 1, 32, 32 * 8], 3, ttnn.TILE_LAYOUT),
+        ([1, 1, 32, 64 * 8], 3, ttnn.TILE_LAYOUT),
     ],
 )
 @pytest.mark.parametrize(
@@ -166,35 +166,104 @@ def test_all_gather_on_t3000(
 @pytest.mark.parametrize("num_iters", [20])
 @pytest.mark.parametrize("math_op", [ttnn.ReduceType.Sum])
 @pytest.mark.parametrize("enable_async", [True])
+@pytest.mark.parametrize("topology", [ttnn.Topology.Linear, ttnn.Topology.Ring])
 @pytest.mark.parametrize("device_params", [{"trace_region_size": 266240}], indirect=True)
 def test_reduce_scatter_on_t3000(
     t3k_mesh_device,
     num_devices,
     per_chip_output_shape,
     scatter_dim,
     num_links,
     math_op,
     input_dtype,
     layout,
     mem_config,
     use_program_cache,
     function_level_defaults,
     enable_async,
     num_iters,
+    topology,
 ):
     run_reduce_scatter_test(
         t3k_mesh_device,
         num_devices,
         per_chip_output_shape,
         scatter_dim,
         num_links,
         math_op,
         input_dtype,
         layout,
         mem_config,
         use_program_cache,
         function_level_defaults,
         num_iters=num_iters,
         enable_async=enable_async,
+        topology=topology,
+        trace_mode=True,
+    )
+
+
+@skip_for_grayskull("Requires eth connected devices to run")
+@pytest.mark.parametrize(
+    "num_devices, num_links",
+    [
+        (2, 1),
+    ],
+)
+@pytest.mark.parametrize(
+    "per_chip_output_shape, scatter_dim, layout",
+    [
+        ([1, 1, 32, 4096], 3, ttnn.TILE_LAYOUT),
+        ([1, 1, 32, 2048], 3, ttnn.TILE_LAYOUT),
+        ([1, 1, 32, 1024], 3, ttnn.TILE_LAYOUT),
+    ],
+)
+@pytest.mark.parametrize(
+    "input_dtype",
+    [
+        ttnn.bfloat16,
+        ttnn.bfloat8_b,
+    ],
+)
+@pytest.mark.parametrize(
+    "mem_config",
+    [
+        ttnn.MemoryConfig(buffer_type=ttnn.BufferType.DRAM),
+        ttnn.MemoryConfig(buffer_type=ttnn.BufferType.L1),
+    ],
+)
+@pytest.mark.parametrize("num_iters", [20])
+@pytest.mark.parametrize("math_op", [ttnn.ReduceType.Sum])
+@pytest.mark.parametrize("enable_async", [True])
+@pytest.mark.parametrize("device_params", [{"trace_region_size": 266240}], indirect=True)
+def test_reduce_scatter_on_n300(
+    n300_mesh_device,
+    num_devices,
+    per_chip_output_shape,
+    scatter_dim,
+    num_links,
+    math_op,
+    input_dtype,
+    layout,
+    mem_config,
+    use_program_cache,
+    function_level_defaults,
+    enable_async,
+    num_iters,
+):
+    run_reduce_scatter_test(
+        n300_mesh_device,
+        num_devices,
+        per_chip_output_shape,
+        scatter_dim,
+        num_links,
+        math_op,
+        input_dtype,
+        layout,
+        mem_config,
+        use_program_cache,
+        function_level_defaults,
+        num_iters=num_iters,
+        enable_async=enable_async,
         trace_mode=True,
     )
diff --git a/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_post_commit.py b/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_post_commit.py
index 161bbccd419..916682dd84e 100644
--- a/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_post_commit.py
+++ b/tests/ttnn/unit_tests/operations/ccl/test_reduce_scatter_post_commit.py
@@ -141,7 +141,14 @@ def run_reduce_scatter_test(
     # Run the op
     if trace_mode:
         output_tensor_mesh = run_with_trace(
-            mesh_device, input_tensor_mesh, scatter_dim, num_links, math_op, mem_config, num_iters, topology
+            mesh_device,
+            input_tensor_mesh,
+            scatter_dim,
+            num_links,
+            math_op,
+            mem_config,
+            num_iters=num_iters,
+            topology=topology,
         )
     else:
         for i in range(num_iters):