# #13609: Uplift dram and l1 allocators to use dram/l1 specific alignment (#13762)

### Ticket
#13609

### Problem description
Using the maximum of the DRAM and L1 alignments for both DRAM and L1 buffers was
causing PCC mismatches in interleaved-to-sharded (i2s) and sharded-to-interleaved (s2i).

### What's changed
Use the L1- or DRAM-specific alignment for the respective allocations. This requires
some ops to be uplifted to handle the re-alignment;
@yugaoTT and @ntarafdar will add the corresponding op changes.
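
To make the change concrete, here is a minimal editorial sketch (not part of this commit) of why the old scheme over-padded L1 pages. The 64 B DRAM and 16 B L1 alignments are assumed, architecture-specific values, and `align_up` is a stand-in helper rather than a Metalium API:

```cpp
#include <algorithm>
#include <cstdint>

// Round `size` up to the nearest multiple of `alignment` (a power of two).
constexpr uint32_t align_up(uint32_t size, uint32_t alignment) {
    return (size + alignment - 1) & ~(alignment - 1);
}

int main() {
    constexpr uint32_t dram_alignment = 64;  // assumed DRAM requirement
    constexpr uint32_t l1_alignment = 16;    // assumed L1 requirement
    constexpr uint32_t page_size = 40;       // example page size in bytes

    // Before: both buffer types were padded to max(DRAM, L1) alignment.
    constexpr uint32_t old_l1_page = align_up(page_size, std::max(dram_alignment, l1_alignment));
    // After: L1 pages are padded only to the L1 alignment.
    constexpr uint32_t new_l1_page = align_up(page_size, l1_alignment);

    // The tighter packing (48 B vs 64 B here) changes the data layout that
    // ops such as i2s/s2i observe, hence the re-alignment uplifts.
    static_assert(old_l1_page == 64 && new_l1_page == 48);
    return 0;
}
```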

### Checklist
**The post-commit pipelines below were triggered 12/03:**
- [x] [Post commit CI passes](https://github.com/tenstorrent/tt-metal/actions/runs/12956459836)
- [x] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/runs/12956461469)
- [x] [Run microbenchmarks](https://github.com/tenstorrent/tt-metal/actions/runs/12956474089)
- [x] [T3000 frequent tests](https://github.com/tenstorrent/tt-metal/actions/runs/12956470735)
- [x] [ttnn sweeps](https://github.com/tenstorrent/tt-metal/actions/runs/12892223851)

---------

Co-authored-by: Saad Jameel <[email protected]>
Co-authored-by: Ligang Long <[email protected]>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
4 people authored Jan 25, 2025
1 parent d6d8ba8 commit b130161
Showing 35 changed files with 368 additions and 211 deletions.
**models/experimental/functional_unet/tests/test_unet_trace.py** (3 additions & 3 deletions)

```diff
@@ -27,7 +27,7 @@
 
 
 @skip_for_grayskull("UNet not currently supported on GS")
-@pytest.mark.parametrize("device_params", [{"l1_small_size": 68864, "trace_region_size": 444416}], indirect=True)
+@pytest.mark.parametrize("device_params", [{"l1_small_size": 68864, "trace_region_size": 458752}], indirect=True)
 @pytest.mark.parametrize(
     "batch, groups, iterations",
     ((1, 2, 32),),
@@ -108,7 +108,7 @@ def test_unet_trace(
 
 @skip_for_grayskull("UNet not currently supported on GS")
 @pytest.mark.parametrize(
-    "device_params", [{"l1_small_size": 68864, "trace_region_size": 442368, "num_command_queues": 2}], indirect=True
+    "device_params", [{"l1_small_size": 68864, "trace_region_size": 917504, "num_command_queues": 2}], indirect=True
 )
 @pytest.mark.parametrize(
     "batch, groups, iterations",
@@ -343,7 +343,7 @@ def test_unet_trace_2cq_multi_device(
 
 @skip_for_grayskull("UNet not currently supported on GS")
 @pytest.mark.parametrize(
-    "device_params", [{"l1_small_size": 68864, "trace_region_size": 424960, "num_command_queues": 2}], indirect=True
+    "device_params", [{"l1_small_size": 68864, "trace_region_size": 1376256, "num_command_queues": 2}], indirect=True
 )
 @pytest.mark.parametrize(
     "batch, groups, iterations",
```
**tech_reports/prog_examples/shard_data_rm/shard_data_rm.md** (2 additions & 2 deletions)

````diff
@@ -71,12 +71,12 @@ uint32_t shard_size = shard_height * shard_width;
 uint32_t input_unit_size = sizeof(uint32_t);
 uint32_t shard_width_bytes = shard_width * data_size;
 uint32_t num_units_per_row = shard_width * input_unit_size;
-uint32_t padded_offset_bytes = align(input_unit_size, device->get_allocator_alignment());
+uint32_t padded_offset_bytes = align(input_unit_size, device->get_allocator_alignment(BufferType::L1));
 ```
 In order to shard the correct data segments to the respective core, we indicate the shard height, width, size, and other data for the kernel function.
 For this situation, 16 units of data will be sharded across 4 cores; each core will have 4 units of data in their corresponding circular buffer.
-The `padded_offset_bytes` is set to ensure that the correct address is read from the kernel function when moving data to the circular buffer; in this case, the addresses are aligned to L1 memory.
+The `padded_offset_bytes` is set to ensure that the correct address is read from the kernel function when moving data to the circular buffer; in this case, the addresses are aligned to L1 memory by explicitly referencing `BufferType::L1`.
 This example demonstrates height sharding; the shard height is therefore set to evenly distribute the number of vector values across the cores.
 If the sharding strategy was different (i.e. width sharding or block sharding), the appropriate values for both the shard height and width would need to be set.
````
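
A quick editorial check of the padding arithmetic above (not repository code; `align` mimics the helper used in the example, and the 16 B L1 alignment is an assumed value):

```cpp
#include <cstdint>

// Stand-in for the align() helper above: round `v` up to a multiple of `alignment`.
constexpr uint32_t align(uint32_t v, uint32_t alignment) {
    return (v + alignment - 1) & ~(alignment - 1);
}

// A 4-byte input unit padded to an assumed 16 B L1 alignment occupies one full
// 16 B slot, so consecutive units land at 16-byte strides in the circular buffer.
static_assert(align(sizeof(uint32_t), 16) == 16);
```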
```diff
@@ -164,14 +164,15 @@ def test_sharded_rm(
         ),
     )
 
-    yt = ttnn.interleaved_to_sharded(xt, grid_size, shard_size, shard_scheme, shard_orientation)
+    yt = ttnn.interleaved_to_sharded(xt, grid_size, shard_size, shard_scheme, shard_orientation, keep_l1_aligned=True)
 
     zt = ttnn.sharded_to_interleaved(
         yt,
         ttnn.MemoryConfig(
             memory_layout=ttnn.TensorMemoryLayout.INTERLEAVED,
             buffer_type=ttnn.BufferType.L1,
         ),
+        is_l1_aligned=True,
     )
 
     tt_og = xt.cpu().to_torch()
```
```diff
@@ -19,7 +19,7 @@ uint64_t get_alloc_limit(const tt::tt_metal::IDevice* device) {
     auto dispatch_core_config = dispatch_core_manager::instance().get_dispatch_core_config(device->id());
     auto storage_core_bank_size =
         tt::get_storage_core_bank_size(device->id(), device->num_hw_cqs(), dispatch_core_config);
-    const uint32_t allocator_alignment = device->get_allocator_alignment();
+    const uint32_t allocator_alignment = device->get_allocator_alignment(BufferType::L1);
     const uint32_t interleaved_l1_bank_size = storage_core_bank_size.has_value()
                                                   ? storage_core_bank_size.value()
                                                   : (soc_desc.worker_l1_size - l1_unreserved_base);
```
```diff
@@ -84,7 +84,7 @@ TEST_F(CommandQueueSingleCardFixture, TensixTestSubDeviceAllocations) {
     device->load_sub_device_manager(sub_device_manager_1);
 
     auto buffer_1 = CreateBuffer(shard_config_1, SubDeviceId{0});
-    EXPECT_EQ(buffer_1->address(), max_addr - buffer_1->aligned_page_size());
+    EXPECT_TRUE(buffer_1->address() <= max_addr - buffer_1->aligned_page_size());
     EnqueueWriteBuffer(device->command_queue(), buffer_1, input_1, false);
     std::vector<uint32_t> output_1;
     EnqueueReadBuffer(device->command_queue(), buffer_1, output_1, true);
@@ -105,7 +105,7 @@ TEST_F(CommandQueueSingleCardFixture, TensixTestSubDeviceAllocations) {
     device->load_sub_device_manager(sub_device_manager_2);
 
     auto buffer_3 = CreateBuffer(shard_config_2, SubDeviceId{1});
-    EXPECT_EQ(buffer_3->address(), max_addr - buffer_3->aligned_page_size());
+    EXPECT_TRUE(buffer_3->address() <= max_addr - buffer_3->aligned_page_size());
     EnqueueWriteBuffer(device->command_queue(), buffer_3, input_2, false);
     std::vector<uint32_t> output_2;
     EnqueueReadBuffer(device->command_queue(), buffer_3, output_2, true);
@@ -118,7 +118,7 @@ TEST_F(CommandQueueSingleCardFixture, TensixTestSubDeviceAllocations) {
     }
 
     auto buffer_4 = CreateBuffer(shard_config_1, SubDeviceId{0});
-    EXPECT_EQ(buffer_4->address(), max_addr - buffer_4->aligned_page_size());
+    EXPECT_TRUE(buffer_4->address() <= max_addr - buffer_4->aligned_page_size());
     EXPECT_THROW(CreateBuffer(interleaved_config, SubDeviceId{0}), std::exception);
 }
```
**tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h** (4 additions & 3 deletions)

```diff
@@ -651,7 +651,7 @@ inline void generate_random_paged_payload(
         words_per_page);
 
     // Note: the dst address marches in unison regardless of whether or not a core is written to
-    uint32_t page_size_alignment_bytes = device->get_allocator_alignment();
+    uint32_t page_size_alignment_bytes = device->get_allocator_alignment(buf_type);
     for (uint32_t page_id = start_page; page_id < start_page + cmd.write_paged.pages; page_id++) {
         CoreCoord bank_core;
         uint32_t bank_id = page_id % num_banks;
@@ -931,8 +931,9 @@ inline void gen_dispatcher_paged_write_cmd(
     uint32_t start_page,
     uint32_t page_size,
     uint32_t pages) {
-    uint32_t page_size_alignment_bytes = device->get_allocator_alignment();
-    uint32_t num_banks = device->num_banks(is_dram ? BufferType::DRAM : BufferType::L1);
+    BufferType buffer_type = is_dram ? BufferType::DRAM : BufferType::L1;
+    uint32_t page_size_alignment_bytes = device->get_allocator_alignment(buffer_type);
+    uint32_t num_banks = device->num_banks(buffer_type);
     CoreType core_type = is_dram ? CoreType::DRAM : CoreType::WORKER;
 
     // Not safe to mix paged L1 and paged DRAM writes currently in this test since same book-keeping.
```
**tests/ttnn/unit_tests/operations/test_core.py** (3 additions & 2 deletions)

```diff
@@ -527,8 +527,9 @@ def test_bh_alignment_i2s(
         memory_config=input_buffer_type,
         dtype=ttnn.bfloat16,
     )
-    x_t_sharded = ttnn.to_memory_config(x_t, shard_config)
-    x_t = ttnn.to_memory_config(x_t_sharded, output_buffer_type)
+    # For now, sharded-tensor alignment is controlled by the keep_l1_aligned flag; it will be removed after launch
+    x_t_sharded = ttnn.interleaved_to_sharded(x_t, shard_config, keep_l1_aligned=True)
+    x_t = ttnn.sharded_to_interleaved(x_t_sharded, output_buffer_type, is_l1_aligned=True)
     output_data = ttnn.from_device(x_t)
     output_data = ttnn.to_torch(output_data)
     passing = torch.equal(input_data, output_data)
```
**tests/ttnn/unit_tests/operations/test_group_norm.py** (2 additions & 2 deletions)

```diff
@@ -292,7 +292,7 @@ def test_group_norm_with_block_sharded_v2_8x8_grid(device, N, C, H, W, num_group
     sharded_mem_config = ttnn.MemoryConfig(
         ttnn.types.TensorMemoryLayout.BLOCK_SHARDED, ttnn.types.BufferType.L1, shard_spec
     )
-    input_tensor = ttnn.to_memory_config(input_tensor, sharded_mem_config)
+    input_tensor = ttnn.interleaved_to_sharded(input_tensor, sharded_mem_config, keep_l1_aligned=True)
 
     # groupnorm
     output_tensor = ttnn.group_norm(
@@ -306,7 +306,7 @@ def test_group_norm_with_block_sharded_v2_8x8_grid(device, N, C, H, W, num_group
     )
 
     # output tensor
-    output_tensor = ttnn.to_memory_config(output_tensor, ttnn.L1_MEMORY_CONFIG)
+    output_tensor = ttnn.sharded_to_interleaved(output_tensor, ttnn.L1_MEMORY_CONFIG, is_l1_aligned=True)
     output_tensor = ttnn.from_device(output_tensor)
     output_tensor = ttnn.to_torch(output_tensor)
```
**tests/ttnn/unit_tests/operations/test_pad.py** (4 additions & 2 deletions)

```diff
@@ -226,8 +226,10 @@ def test_pad_rm_sharded_stickwise(
     ttnn_input_tensor = ttnn.from_torch(
         torch_input_tensor, dtype=ttnn.float32, layout=ttnn.ROW_MAJOR_LAYOUT, device=device
    )
-    ttnn_sharded_input_tensor = ttnn.to_memory_config(ttnn_input_tensor, input_shard_memory_config)
-
+    # Still rely on keep_l1_aligned=True to make this work with the current implementation
+    ttnn_sharded_input_tensor = ttnn.interleaved_to_sharded(
+        ttnn_input_tensor, input_shard_memory_config, keep_l1_aligned=True
+    )
     padded_tensor = ttnn.pad(ttnn_sharded_input_tensor, pad_to_shape, input_tensor_start, pad_value)
 
     tt_output_tensor = ttnn.to_memory_config(padded_tensor, ttnn.L1_MEMORY_CONFIG)
```
**tt_metal/api/tt-metalium/allocator.hpp** (2 additions & 0 deletions)

```diff
@@ -120,6 +120,8 @@ const std::vector<uint32_t>& bank_ids_from_dram_channel(const Allocator& allocat
 const std::vector<uint32_t>& bank_ids_from_logical_core(
     const Allocator& allocator, BufferType buffer_type, const CoreCoord& logical_core);
 
+uint32_t get_alignment(const Allocator& allocator, const BufferType& buffer_type);
+
 Statistics get_statistics(const Allocator& allocator, const BufferType& buffer_type);
 
 void dump_memory_blocks(const Allocator& allocator, const BufferType& buffer_type, std::ofstream& out);
```
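
For context, the new free function presumably selects between the two config alignments by buffer type. The sketch below is editorial, not the committed implementation; the `config` member access and the DRAM-backed treatment of trace buffers are assumptions:

```cpp
#include <tt-metalium/allocator.hpp>  // assumed include path, mirroring the header above

namespace tt::tt_metal {

// Hypothetical dispatch: DRAM-backed buffer types use the DRAM alignment,
// everything else (L1, L1_SMALL, ...) uses the L1 alignment.
uint32_t get_alignment_sketch(const Allocator& allocator, const BufferType& buffer_type) {
    switch (buffer_type) {
        case BufferType::DRAM:
        case BufferType::TRACE:  // trace buffers are allocated in DRAM
            return allocator.config.dram_alignment;
        default:
            return allocator.config.l1_alignment;
    }
}

}  // namespace tt::tt_metal
```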
**tt_metal/api/tt-metalium/allocator_types.hpp** (2 additions & 1 deletion)

```diff
@@ -43,6 +43,7 @@ struct AllocatorConfig {
     size_t dram_bank_size = 0;
     std::vector<size_t> dram_bank_offsets = {};
     uint32_t dram_unreserved_base = 0;
+    uint32_t dram_alignment = 0;
     //! worker specific configuration
     uint32_t l1_unreserved_base = 0;
     CoreRangeSet worker_grid = {};
@@ -56,7 +57,7 @@ struct AllocatorConfig {
     BankMapping l1_bank_remap =
         {};  // for remapping which l1 bank points to which bank if we assume normal row-major assignment
     CoreRangeSet compute_grid = {};
-    uint32_t alignment = 0;
+    uint32_t l1_alignment = 0;
     bool disable_interleaved = false;
     void reset();
     ~AllocatorConfig() { reset(); }
```
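
With the split fields, the two alignments can now be configured independently per memory type. A small usage sketch (illustrative values; the include path and namespace are assumed):

```cpp
#include <tt-metalium/allocator_types.hpp>  // assumed include path

using tt::tt_metal::AllocatorConfig;  // assumed namespace

// Illustrative only: real values come from the hardware abstraction layer.
AllocatorConfig make_example_config() {
    AllocatorConfig config;
    config.dram_alignment = 64;  // assumed DRAM alignment
    config.l1_alignment = 16;    // assumed L1 alignment
    return config;
}
```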
*(Diffs for the remaining changed files are not shown.)*
