Revert some commits to fix single-card pipelines (#17121)

### Ticket Device perf Model perf Nightly fast dispatch Demo tests were all failing ### Problem description Provide context for the problem. ### What's changed Describe the approach used to solve the problem. Summarize the changes made and its impact. ### Checklist - [ ] Post commit CI passes - [ ] Blackhole Post commit (if applicable) - [ ] Model regression CI testing passes (if applicable) - [ ] Device performance regression CI testing passes (if applicable) - [ ] **(For models and ops writers)** Full [new models](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml) tests passes - [ ] New/Existing tests provide coverage for changes --------- Co-authored-by: Raymond Kim <[email protected]>
tenstorrent · Jan 26, 2025 · a5796bc · a5796bc
1 parent a085f40
commit a5796bc
Show file tree

Hide file tree

Showing 42 changed files with 240 additions and 400 deletions.
diff --git a/models/experimental/functional_unet/tests/test_unet_trace.py b/models/experimental/functional_unet/tests/test_unet_trace.py
@@ -27,7 +27,7 @@
 
 
 @skip_for_grayskull("UNet not currently supported on GS")
-@pytest.mark.parametrize("device_params", [{"l1_small_size": 68864, "trace_region_size": 458752}], indirect=True)
+@pytest.mark.parametrize("device_params", [{"l1_small_size": 68864, "trace_region_size": 444416}], indirect=True)
 @pytest.mark.parametrize(
     "batch, groups, iterations",
     ((1, 2, 32),),
@@ -108,7 +108,7 @@ def test_unet_trace(
 
 @skip_for_grayskull("UNet not currently supported on GS")
 @pytest.mark.parametrize(
-    "device_params", [{"l1_small_size": 68864, "trace_region_size": 917504, "num_command_queues": 2}], indirect=True
+    "device_params", [{"l1_small_size": 68864, "trace_region_size": 442368, "num_command_queues": 2}], indirect=True
 )
 @pytest.mark.parametrize(
     "batch, groups, iterations",
@@ -343,7 +343,7 @@ def test_unet_trace_2cq_multi_device(
 
 @skip_for_grayskull("UNet not currently supported on GS")
 @pytest.mark.parametrize(
-    "device_params", [{"l1_small_size": 68864, "trace_region_size": 1376256, "num_command_queues": 2}], indirect=True
+    "device_params", [{"l1_small_size": 68864, "trace_region_size": 424960, "num_command_queues": 2}], indirect=True
 )
 @pytest.mark.parametrize(
     "batch, groups, iterations",

diff --git a/tech_reports/prog_examples/shard_data_rm/shard_data_rm.md b/tech_reports/prog_examples/shard_data_rm/shard_data_rm.md
@@ -71,12 +71,12 @@ uint32_t shard_size = shard_height * shard_width;
 uint32_t input_unit_size = sizeof(uint32_t);
 uint32_t shard_width_bytes = shard_width * data_size;
 uint32_t num_units_per_row = shard_width * input_unit_size;
-uint32_t padded_offset_bytes = align(input_unit_size, device->get_allocator_alignment(BufferType::L1));
+uint32_t padded_offset_bytes = align(input_unit_size, device->get_allocator_alignment());
 ```
 
 In order to shard the correct data segments to the respective core, we indicate the shard height, width, size, and other data for the kernel function.
 For this situation, 16 units of data will be sharded across 4 cores; each core will have 4 units of data in their corresponding circular buffer.
-The `padded_offset_bytes` is set to ensure that the correct address is read from the kernel function when moving data to the circular buffer; in this case, the addresses are aligned to L1 memory with explicit referencing to BufferType::L1.
+The `padded_offset_bytes` is set to ensure that the correct address is read from the kernel function when moving data to the circular buffer; in this case, the addresses are aligned to L1 memory.
 This example demonstrates height sharding; the shard height is therefore set to evenly distribute the number of vector values across the cores.
 If the sharding strategy was different (i.e. width sharding or block sharding), the appropriate values for both the shard height and width would need to be set.
 

diff --git a/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_argmax_int.py b/tests/tt_eager/python_api_testing/sweep_tests/pytests/tt_dnn/test_argmax_int.py
@@ -7,7 +7,6 @@
 import ttnn
 from loguru import logger
 from tests.tt_eager.python_api_testing.sweep_tests import comparison_funcs
-from tests.ttnn.utils_for_testing import assert_with_pcc
 
 
 @pytest.mark.parametrize(
@@ -43,9 +42,23 @@ def test_argmax(self, input_shapes, dim, memconfig, device):
         tt_output_tensor_on_device = ttnn.argmax(input_tensor, dim=dim)
         tt_out_tensor = tt_output_tensor_on_device.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch()
         golden_tensor = torch.argmax(input_data, dim=dim)
+        if dim == 1 or dim == -3 or dim == 0 or dim == -4:
+            tt_out_tensor = tt_out_tensor[0, :, 0 : input_shapes[2], 0 : input_shapes[3]]
+        else:
+            if input_shapes[1] != 1 or input_shapes[0] != 1:
+                if dim == 2 or dim == -2:
+                    tt_out_tensor = tt_out_tensor[0, :, :, 0 : input_shapes[3]]
+                else:
+                    tt_out_tensor = tt_out_tensor[0, :, :, 0 : input_shapes[2]]
+            else:
+                if dim == 2 or dim == -2:
+                    tt_out_tensor = tt_out_tensor[0, 0, 0, 0 : input_shapes[3]]
+                else:
+                    tt_out_tensor = tt_out_tensor[0, 0, 0, 0 : input_shapes[2]]
 
         pt_out_tensor = golden_tensor
-        assert_with_pcc(pt_out_tensor, tt_out_tensor)
+        tt_out_tensor = tt_output_tensor_on_device.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch()
+        comp_pass, comp_out = comparison_funcs.comp_pcc(pt_out_tensor, tt_out_tensor, pcc=0.99)
         comp_all, _ = comparison_funcs.comp_allclose(pt_out_tensor, tt_out_tensor, atol=0, rtol=0)
 
         # DEBUG
@@ -55,5 +68,8 @@ def test_argmax(self, input_shapes, dim, memconfig, device):
         # print(flat)
         # print(torch.topk(flat, 8))
 
+        logger.info(comp_pass)
         logger.info(comp_all)
-        assert comp_all
+        logger.info(comp_out)
+        status = comp_pass | comp_all
+        assert status
diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_min_max.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_min_max.py
@@ -53,6 +53,8 @@ def test_min_max_for_dim_hw(device, use_program_cache, shape_dim, kind, layout):
     if kind == "max":
         value = x.max()
     elif kind == "min":
+        if N * C % 32 != 0:
+            pytest.skip("global min with Tensor dimension N*C not multiple of 32 is not supported at this time.")
         value = x.min()
     elif kind == "mean":
         value = x.mean()

diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_sharded.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_sharded.py
@@ -164,15 +164,14 @@ def test_sharded_rm(
         ),
     )
 
-    yt = ttnn.interleaved_to_sharded(xt, grid_size, shard_size, shard_scheme, shard_orientation, keep_l1_aligned=True)
+    yt = ttnn.interleaved_to_sharded(xt, grid_size, shard_size, shard_scheme, shard_orientation)
 
     zt = ttnn.sharded_to_interleaved(
         yt,
         ttnn.MemoryConfig(
             memory_layout=ttnn.TensorMemoryLayout.INTERLEAVED,
             buffer_type=ttnn.BufferType.L1,
         ),
-        is_l1_aligned=True,
     )
 
     tt_og = xt.cpu().to_torch()

diff --git a/tests/tt_metal/tt_metal/api/allocator/test_l1_banking_allocator.cpp b/tests/tt_metal/tt_metal/api/allocator/test_l1_banking_allocator.cpp
@@ -19,7 +19,7 @@ uint64_t get_alloc_limit(const tt::tt_metal::IDevice* device) {
     auto dispatch_core_config = dispatch_core_manager::instance().get_dispatch_core_config(device->id());
     auto storage_core_bank_size =
         tt::get_storage_core_bank_size(device->id(), device->num_hw_cqs(), dispatch_core_config);
-    const uint32_t allocator_alignment = device->get_allocator_alignment(BufferType::L1);
+    const uint32_t allocator_alignment = device->get_allocator_alignment();
     const uint32_t interleaved_l1_bank_size = storage_core_bank_size.has_value()
                                                   ? storage_core_bank_size.value()
                                                   : (soc_desc.worker_l1_size - l1_unreserved_base);

diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_sub_device.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_sub_device.cpp
@@ -84,7 +84,7 @@ TEST_F(CommandQueueSingleCardFixture, TensixTestSubDeviceAllocations) {
     device->load_sub_device_manager(sub_device_manager_1);
 
     auto buffer_1 = CreateBuffer(shard_config_1, SubDeviceId{0});
-    EXPECT_TRUE(buffer_1->address() <= max_addr - buffer_1->aligned_page_size());
+    EXPECT_EQ(buffer_1->address(), max_addr - buffer_1->aligned_page_size());
     EnqueueWriteBuffer(device->command_queue(), buffer_1, input_1, false);
     std::vector<uint32_t> output_1;
     EnqueueReadBuffer(device->command_queue(), buffer_1, output_1, true);
@@ -105,7 +105,7 @@ TEST_F(CommandQueueSingleCardFixture, TensixTestSubDeviceAllocations) {
     device->load_sub_device_manager(sub_device_manager_2);
 
     auto buffer_3 = CreateBuffer(shard_config_2, SubDeviceId{1});
-    EXPECT_TRUE(buffer_3->address() <= max_addr - buffer_3->aligned_page_size());
+    EXPECT_EQ(buffer_3->address(), max_addr - buffer_3->aligned_page_size());
     EnqueueWriteBuffer(device->command_queue(), buffer_3, input_2, false);
     std::vector<uint32_t> output_2;
     EnqueueReadBuffer(device->command_queue(), buffer_3, output_2, true);
@@ -118,7 +118,7 @@ TEST_F(CommandQueueSingleCardFixture, TensixTestSubDeviceAllocations) {
     }
 
     auto buffer_4 = CreateBuffer(shard_config_1, SubDeviceId{0});
-    EXPECT_TRUE(buffer_4->address() <= max_addr - buffer_4->aligned_page_size());
+    EXPECT_EQ(buffer_4->address(), max_addr - buffer_4->aligned_page_size());
     EXPECT_THROW(CreateBuffer(interleaved_config, SubDeviceId{0}), std::exception);
 }
 

diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h
@@ -651,7 +651,7 @@ inline void generate_random_paged_payload(
         words_per_page);
 
     // Note: the dst address marches in unison regardless of whether or not a core is written to
-    uint32_t page_size_alignment_bytes = device->get_allocator_alignment(buf_type);
+    uint32_t page_size_alignment_bytes = device->get_allocator_alignment();
     for (uint32_t page_id = start_page; page_id < start_page + cmd.write_paged.pages; page_id++) {
         CoreCoord bank_core;
         uint32_t bank_id = page_id % num_banks;
@@ -931,9 +931,8 @@ inline void gen_dispatcher_paged_write_cmd(
     uint32_t start_page,
     uint32_t page_size,
     uint32_t pages) {
-    BufferType buffer_type = is_dram ? BufferType::DRAM : BufferType::L1;
-    uint32_t page_size_alignment_bytes = device->get_allocator_alignment(buffer_type);
-    uint32_t num_banks = device->num_banks(buffer_type);
+    uint32_t page_size_alignment_bytes = device->get_allocator_alignment();
+    uint32_t num_banks = device->num_banks(is_dram ? BufferType::DRAM : BufferType::L1);
     CoreType core_type = is_dram ? CoreType::DRAM : CoreType::WORKER;
 
     // Not safe to mix paged L1 and paged DRAM writes currently in this test since same book-keeping.

diff --git a/tests/ttnn/unit_tests/operations/test_core.py b/tests/ttnn/unit_tests/operations/test_core.py
@@ -527,9 +527,8 @@ def test_bh_alignment_i2s(
         memory_config=input_buffer_type,
         dtype=ttnn.bfloat16,
     )
-    # So far the sharded tensor alignment is controled by keep_l1_aligned flag, will remove it later after launch
-    x_t_sharded = ttnn.interleaved_to_sharded(x_t, shard_config, keep_l1_aligned=True)
-    x_t = ttnn.sharded_to_interleaved(x_t_sharded, output_buffer_type, is_l1_aligned=True)
+    x_t_sharded = ttnn.to_memory_config(x_t, shard_config)
+    x_t = ttnn.to_memory_config(x_t_sharded, output_buffer_type)
     output_data = ttnn.from_device(x_t)
     output_data = ttnn.to_torch(output_data)
     passing = torch.equal(input_data, output_data)

diff --git a/tests/ttnn/unit_tests/operations/test_group_norm.py b/tests/ttnn/unit_tests/operations/test_group_norm.py
@@ -292,7 +292,7 @@ def test_group_norm_with_block_sharded_v2_8x8_grid(device, N, C, H, W, num_group
     sharded_mem_config = ttnn.MemoryConfig(
         ttnn.types.TensorMemoryLayout.BLOCK_SHARDED, ttnn.types.BufferType.L1, shard_spec
     )
-    input_tensor = ttnn.interleaved_to_sharded(input_tensor, sharded_mem_config, keep_l1_aligned=True)
+    input_tensor = ttnn.to_memory_config(input_tensor, sharded_mem_config)
 
     # groupnorm
     output_tensor = ttnn.group_norm(
@@ -306,7 +306,7 @@ def test_group_norm_with_block_sharded_v2_8x8_grid(device, N, C, H, W, num_group
     )
 
     # output tensor
-    output_tensor = ttnn.sharded_to_interleaved(output_tensor, ttnn.L1_MEMORY_CONFIG, is_l1_aligned=True)
+    output_tensor = ttnn.to_memory_config(output_tensor, ttnn.L1_MEMORY_CONFIG)
     output_tensor = ttnn.from_device(output_tensor)
     output_tensor = ttnn.to_torch(output_tensor)
 

diff --git a/tests/ttnn/unit_tests/operations/test_max.py b/tests/ttnn/unit_tests/operations/test_max.py
@@ -8,7 +8,7 @@
 
 import ttnn
 from tests.ttnn.utils_for_testing import assert_with_pcc
-from models.utility_functions import torch_random, is_grayskull
+from models.utility_functions import torch_random
 
 
 @pytest.mark.parametrize("batch_size", [1, 16, 1, 16])
@@ -99,24 +99,11 @@ def test_max_global(device, batch_size, h, w):
         ((2, 32, 32, 64), -3),
         ((32, 32, 64), -3),
         ((1, 2, 3, 4), -1),
-        ((2, 22, 37, 41), -4),
-        ((2, 32, 64, 64), -3),
-        ((2, 22, 37, 41), -3),
-        ((2, 32, 64, 64), -2),
-        ((2, 22, 37, 41), -1),
-        ((2, 32, 64, 64), -1),
-        ((2, 22, 37), -3),
-        ((2, 22, 37), -2),
-        ((2, 22, 37), -1),
-        ((1, 6, 7), -3),
-        ((32, 6, 7), -3),
     ],
 )
 @pytest.mark.parametrize("keepdim", [True, False])
 def test_max_dim(device, input_shape_and_dim, keepdim):
     input_shape, max_dim = input_shape_and_dim
-    if is_grayskull() and (input_shape[-1] % 32 != 0 or input_shape[-2] % 32 != 0 or input_shape[max_dim] % 32 != 0):
-        pytest.skip("If not a tile size multiple, may fail on GS if run all the tests in this file. #17084")
 
     torch_input_tensor = torch_random(input_shape, -100, 100, dtype=torch.bfloat16)
     torch_output_tensor, _ = torch.max(torch_input_tensor, dim=max_dim, keepdim=keepdim)
@@ -129,5 +116,4 @@ def test_max_dim(device, input_shape_and_dim, keepdim):
 
     output_tensor = ttnn.to_torch(output_tensor)
 
-    pcc = 0.9999
-    assert_with_pcc(torch_output_tensor, output_tensor, pcc=pcc)
+    assert_with_pcc(torch_output_tensor, output_tensor)
diff --git a/tests/ttnn/unit_tests/operations/test_pad.py b/tests/ttnn/unit_tests/operations/test_pad.py
@@ -226,10 +226,8 @@ def test_pad_rm_sharded_stickwise(
     ttnn_input_tensor = ttnn.from_torch(
         torch_input_tensor, dtype=ttnn.float32, layout=ttnn.ROW_MAJOR_LAYOUT, device=device
     )
-    # Still relay on keep_l1_aligned = True to make it work with the current implementation
-    ttnn_sharded_input_tensor = ttnn.interleaved_to_sharded(
-        ttnn_input_tensor, input_shard_memory_config, keep_l1_aligned=True
-    )
+    ttnn_sharded_input_tensor = ttnn.to_memory_config(ttnn_input_tensor, input_shard_memory_config)
+
     padded_tensor = ttnn.pad(ttnn_sharded_input_tensor, pad_to_shape, input_tensor_start, pad_value)
 
     tt_output_tensor = ttnn.to_memory_config(padded_tensor, ttnn.L1_MEMORY_CONFIG)

diff --git a/tests/ttnn/unit_tests/operations/test_reduction_mean.py b/tests/ttnn/unit_tests/operations/test_reduction_mean.py
@@ -22,7 +22,6 @@ def test_mean(device, batch_size, h, w, dim):
     torch_output_tensor = torch.mean(torch_input_tensor, dim=dim, keepdim=True, dtype=torch.bfloat16)
 
     input_tensor = ttnn.from_torch(torch_input_tensor, layout=ttnn.TILE_LAYOUT, device=device)
-    ttnn.fill_implicit_tile_padding(input_tensor, 42)  # garbage padding to test that mean removes it
 
     output_tensor = ttnn.mean(input_tensor, dim=dim)
     output_tensor = ttnn.to_torch(output_tensor)

diff --git a/tt_metal/api/tt-metalium/allocator.hpp b/tt_metal/api/tt-metalium/allocator.hpp
@@ -120,8 +120,6 @@ const std::vector<uint32_t>& bank_ids_from_dram_channel(const Allocator& allocat
 const std::vector<uint32_t>& bank_ids_from_logical_core(
     const Allocator& allocator, BufferType buffer_type, const CoreCoord& logical_core);
 
-uint32_t get_alignment(const Allocator& alloator, const BufferType& buffer_type);
-
 Statistics get_statistics(const Allocator& allocator, const BufferType& buffer_type);
 
 void dump_memory_blocks(const Allocator& allocator, const BufferType& buffer_type, std::ofstream& out);

diff --git a/tt_metal/api/tt-metalium/allocator_types.hpp b/tt_metal/api/tt-metalium/allocator_types.hpp
@@ -43,7 +43,6 @@ struct AllocatorConfig {
     size_t dram_bank_size = 0;
     std::vector<size_t> dram_bank_offsets = {};
     uint32_t dram_unreserved_base = 0;
-    uint32_t dram_alignment = 0;
     //! worker specific configuration
     uint32_t l1_unreserved_base = 0;
     CoreRangeSet worker_grid = {};
@@ -57,7 +56,7 @@ struct AllocatorConfig {
     BankMapping l1_bank_remap =
         {};  // for remapping which l1 bank points to which bank if we assume normal row-major assignment
     CoreRangeSet compute_grid = {};
-    uint32_t l1_alignment = 0;
+    uint32_t alignment = 0;
     bool disable_interleaved = false;
     void reset();
     ~AllocatorConfig() { reset(); }