# #13609: Uplift dram and l1 allocators to use dram/l1 specific alignment (#13762)

### Ticket
#13609

### Problem description
Using the maximum of the DRAM and L1 alignments for both DRAM and L1 buffers was
causing PCC mismatches in interleaved-to-sharded (i2s) and sharded-to-interleaved (s2i).

### What's changed
Use the L1- or DRAM-specific alignment for the respective allocations. This requires
some ops to be uplifted to handle the re-alignment;
@yugaoTT and @ntarafdar will add the corresponding op changes.
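
To make the change concrete, here is a minimal editorial sketch (not part of this commit) of why the old scheme over-padded L1 pages. The 64 B DRAM and 16 B L1 alignments are assumed, architecture-specific values, and `align_up` is a stand-in helper rather than a Metalium API:

```cpp
#include <algorithm>
#include <cstdint>

// Round `size` up to the nearest multiple of `alignment` (a power of two).
constexpr uint32_t align_up(uint32_t size, uint32_t alignment) {
    return (size + alignment - 1) & ~(alignment - 1);
}

int main() {
    constexpr uint32_t dram_alignment = 64;  // assumed DRAM requirement
    constexpr uint32_t l1_alignment = 16;    // assumed L1 requirement
    constexpr uint32_t page_size = 40;       // example page size in bytes

    // Before: both buffer types were padded to max(DRAM, L1) alignment.
    constexpr uint32_t old_l1_page = align_up(page_size, std::max(dram_alignment, l1_alignment));
    // After: L1 pages are padded only to the L1 alignment.
    constexpr uint32_t new_l1_page = align_up(page_size, l1_alignment);

    // The tighter packing (48 B vs 64 B here) changes the data layout that
    // ops such as i2s/s2i observe, hence the re-alignment uplifts.
    static_assert(old_l1_page == 64 && new_l1_page == 48);
    return 0;
}
```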

### Checklist
**The post-commit pipelines below were triggered 12/03:**
- [x] [Post commit CI passes](https://github.com/tenstorrent/tt-metal/actions/runs/12956459836)
- [x] [Blackhole Post commit](https://github.com/tenstorrent/tt-metal/actions/runs/12956461469)
- [x] [Run microbenchmarks](https://github.com/tenstorrent/tt-metal/actions/runs/12956474089)
- [x] [T3000 frequent tests](https://github.com/tenstorrent/tt-metal/actions/runs/12956470735)
- [x] [ttnn sweeps](https://github.com/tenstorrent/tt-metal/actions/runs/12892223851)

---------

Co-authored-by: Saad Jameel <[email protected]>
Co-authored-by: Ligang Long <[email protected]>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
4 people authored Jan 25, 2025
1 parent d6d8ba8 commit b130161
Showing 35 changed files with 368 additions and 211 deletions.
**models/experimental/functional_unet/tests/test_unet_trace.py** (3 additions & 3 deletions)

```diff
@@ -27,7 +27,7 @@
 
 
 @skip_for_grayskull("UNet not currently supported on GS")
-@pytest.mark.parametrize("device_params", [{"l1_small_size": 68864, "trace_region_size": 444416}], indirect=True)
+@pytest.mark.parametrize("device_params", [{"l1_small_size": 68864, "trace_region_size": 458752}], indirect=True)
 @pytest.mark.parametrize(
     "batch, groups, iterations",
     ((1, 2, 32),),
@@ -108,7 +108,7 @@ def test_unet_trace(
 
 @skip_for_grayskull("UNet not currently supported on GS")
 @pytest.mark.parametrize(
-    "device_params", [{"l1_small_size": 68864, "trace_region_size": 442368, "num_command_queues": 2}], indirect=True
+    "device_params", [{"l1_small_size": 68864, "trace_region_size": 917504, "num_command_queues": 2}], indirect=True
 )
 @pytest.mark.parametrize(
     "batch, groups, iterations",
@@ -343,7 +343,7 @@ def test_unet_trace_2cq_multi_device(
 
 @skip_for_grayskull("UNet not currently supported on GS")
 @pytest.mark.parametrize(
-    "device_params", [{"l1_small_size": 68864, "trace_region_size": 424960, "num_command_queues": 2}], indirect=True
+    "device_params", [{"l1_small_size": 68864, "trace_region_size": 1376256, "num_command_queues": 2}], indirect=True
 )
 @pytest.mark.parametrize(
     "batch, groups, iterations",
```
**tech_reports/prog_examples/shard_data_rm/shard_data_rm.md** (2 additions & 2 deletions)

````diff
@@ -71,12 +71,12 @@ uint32_t shard_size = shard_height * shard_width;
 uint32_t input_unit_size = sizeof(uint32_t);
 uint32_t shard_width_bytes = shard_width * data_size;
 uint32_t num_units_per_row = shard_width * input_unit_size;
-uint32_t padded_offset_bytes = align(input_unit_size, device->get_allocator_alignment());
+uint32_t padded_offset_bytes = align(input_unit_size, device->get_allocator_alignment(BufferType::L1));
 ```
 In order to shard the correct data segments to the respective core, we indicate the shard height, width, size, and other data for the kernel function.
 For this situation, 16 units of data will be sharded across 4 cores; each core will have 4 units of data in their corresponding circular buffer.
-The `padded_offset_bytes` is set to ensure that the correct address is read from the kernel function when moving data to the circular buffer; in this case, the addresses are aligned to L1 memory.
+The `padded_offset_bytes` is set to ensure that the correct address is read from the kernel function when moving data to the circular buffer; in this case, the addresses are aligned to L1 memory by explicitly referencing `BufferType::L1`.
 This example demonstrates height sharding; the shard height is therefore set to evenly distribute the number of vector values across the cores.
 If the sharding strategy was different (i.e. width sharding or block sharding), the appropriate values for both the shard height and width would need to be set.
````
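
A quick editorial check of the padding arithmetic above (not repository code; `align` mimics the helper used in the example, and the 16 B L1 alignment is an assumed value):

```cpp
#include <cstdint>

// Stand-in for the align() helper above: round `v` up to a multiple of `alignment`.
constexpr uint32_t align(uint32_t v, uint32_t alignment) {
    return (v + alignment - 1) & ~(alignment - 1);
}

// A 4-byte input unit padded to an assumed 16 B L1 alignment occupies one full
// 16 B slot, so consecutive units land at 16-byte strides in the circular buffer.
static_assert(align(sizeof(uint32_t), 16) == 16);
```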
```diff
@@ -164,14 +164,15 @@ def test_sharded_rm(
         ),
     )
 
-    yt = ttnn.interleaved_to_sharded(xt, grid_size, shard_size, shard_scheme, shard_orientation)
+    yt = ttnn.interleaved_to_sharded(xt, grid_size, shard_size, shard_scheme, shard_orientation, keep_l1_aligned=True)
 
     zt = ttnn.sharded_to_interleaved(
         yt,
         ttnn.MemoryConfig(
             memory_layout=ttnn.TensorMemoryLayout.INTERLEAVED,
             buffer_type=ttnn.BufferType.L1,
         ),
+        is_l1_aligned=True,
     )
 
     tt_og = xt.cpu().to_torch()
```
```diff
@@ -19,7 +19,7 @@ uint64_t get_alloc_limit(const tt::tt_metal::IDevice* device) {
     auto dispatch_core_config = dispatch_core_manager::instance().get_dispatch_core_config(device->id());
     auto storage_core_bank_size =
         tt::get_storage_core_bank_size(device->id(), device->num_hw_cqs(), dispatch_core_config);
-    const uint32_t allocator_alignment = device->get_allocator_alignment();
+    const uint32_t allocator_alignment = device->get_allocator_alignment(BufferType::L1);
     const uint32_t interleaved_l1_bank_size = storage_core_bank_size.has_value()
                                                   ? storage_core_bank_size.value()
                                                   : (soc_desc.worker_l1_size - l1_unreserved_base);
```
```diff
@@ -84,7 +84,7 @@ TEST_F(CommandQueueSingleCardFixture, TensixTestSubDeviceAllocations) {
     device->load_sub_device_manager(sub_device_manager_1);
 
     auto buffer_1 = CreateBuffer(shard_config_1, SubDeviceId{0});
-    EXPECT_EQ(buffer_1->address(), max_addr - buffer_1->aligned_page_size());
+    EXPECT_TRUE(buffer_1->address() <= max_addr - buffer_1->aligned_page_size());
     EnqueueWriteBuffer(device->command_queue(), buffer_1, input_1, false);
     std::vector<uint32_t> output_1;
     EnqueueReadBuffer(device->command_queue(), buffer_1, output_1, true);
@@ -105,7 +105,7 @@ TEST_F(CommandQueueSingleCardFixture, TensixTestSubDeviceAllocations) {
     device->load_sub_device_manager(sub_device_manager_2);
 
     auto buffer_3 = CreateBuffer(shard_config_2, SubDeviceId{1});
-    EXPECT_EQ(buffer_3->address(), max_addr - buffer_3->aligned_page_size());
+    EXPECT_TRUE(buffer_3->address() <= max_addr - buffer_3->aligned_page_size());
     EnqueueWriteBuffer(device->command_queue(), buffer_3, input_2, false);
     std::vector<uint32_t> output_2;
     EnqueueReadBuffer(device->command_queue(), buffer_3, output_2, true);
@@ -118,7 +118,7 @@ TEST_F(CommandQueueSingleCardFixture, TensixTestSubDeviceAllocations) {
     }
 
     auto buffer_4 = CreateBuffer(shard_config_1, SubDeviceId{0});
-    EXPECT_EQ(buffer_4->address(), max_addr - buffer_4->aligned_page_size());
+    EXPECT_TRUE(buffer_4->address() <= max_addr - buffer_4->aligned_page_size());
     EXPECT_THROW(CreateBuffer(interleaved_config, SubDeviceId{0}), std::exception);
 }
```
**tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h** (4 additions & 3 deletions)

```diff
@@ -651,7 +651,7 @@ inline void generate_random_paged_payload(
         words_per_page);
 
     // Note: the dst address marches in unison regardless of whether or not a core is written to
-    uint32_t page_size_alignment_bytes = device->get_allocator_alignment();
+    uint32_t page_size_alignment_bytes = device->get_allocator_alignment(buf_type);
     for (uint32_t page_id = start_page; page_id < start_page + cmd.write_paged.pages; page_id++) {
         CoreCoord bank_core;
         uint32_t bank_id = page_id % num_banks;
@@ -931,8 +931,9 @@ inline void gen_dispatcher_paged_write_cmd(
     uint32_t start_page,
     uint32_t page_size,
     uint32_t pages) {
-    uint32_t page_size_alignment_bytes = device->get_allocator_alignment();
-    uint32_t num_banks = device->num_banks(is_dram ? BufferType::DRAM : BufferType::L1);
+    BufferType buffer_type = is_dram ? BufferType::DRAM : BufferType::L1;
+    uint32_t page_size_alignment_bytes = device->get_allocator_alignment(buffer_type);
+    uint32_t num_banks = device->num_banks(buffer_type);
     CoreType core_type = is_dram ? CoreType::DRAM : CoreType::WORKER;
 
     // Not safe to mix paged L1 and paged DRAM writes currently in this test since same book-keeping.
```
**tests/ttnn/unit_tests/operations/test_core.py** (3 additions & 2 deletions)

```diff
@@ -527,8 +527,9 @@ def test_bh_alignment_i2s(
         memory_config=input_buffer_type,
         dtype=ttnn.bfloat16,
     )
-    x_t_sharded = ttnn.to_memory_config(x_t, shard_config)
-    x_t = ttnn.to_memory_config(x_t_sharded, output_buffer_type)
+    # For now, sharded-tensor alignment is controlled by the keep_l1_aligned flag; it will be removed after launch
+    x_t_sharded = ttnn.interleaved_to_sharded(x_t, shard_config, keep_l1_aligned=True)
+    x_t = ttnn.sharded_to_interleaved(x_t_sharded, output_buffer_type, is_l1_aligned=True)
     output_data = ttnn.from_device(x_t)
     output_data = ttnn.to_torch(output_data)
     passing = torch.equal(input_data, output_data)
```
**tests/ttnn/unit_tests/operations/test_group_norm.py** (2 additions & 2 deletions)

```diff
@@ -292,7 +292,7 @@ def test_group_norm_with_block_sharded_v2_8x8_grid(device, N, C, H, W, num_group
     sharded_mem_config = ttnn.MemoryConfig(
         ttnn.types.TensorMemoryLayout.BLOCK_SHARDED, ttnn.types.BufferType.L1, shard_spec
     )
-    input_tensor = ttnn.to_memory_config(input_tensor, sharded_mem_config)
+    input_tensor = ttnn.interleaved_to_sharded(input_tensor, sharded_mem_config, keep_l1_aligned=True)
 
     # groupnorm
     output_tensor = ttnn.group_norm(
@@ -306,7 +306,7 @@ def test_group_norm_with_block_sharded_v2_8x8_grid(device, N, C, H, W, num_group
     )
 
     # output tensor
-    output_tensor = ttnn.to_memory_config(output_tensor, ttnn.L1_MEMORY_CONFIG)
+    output_tensor = ttnn.sharded_to_interleaved(output_tensor, ttnn.L1_MEMORY_CONFIG, is_l1_aligned=True)
     output_tensor = ttnn.from_device(output_tensor)
     output_tensor = ttnn.to_torch(output_tensor)
```
**tests/ttnn/unit_tests/operations/test_pad.py** (4 additions & 2 deletions)

```diff
@@ -226,8 +226,10 @@ def test_pad_rm_sharded_stickwise(
     ttnn_input_tensor = ttnn.from_torch(
         torch_input_tensor, dtype=ttnn.float32, layout=ttnn.ROW_MAJOR_LAYOUT, device=device
    )
-    ttnn_sharded_input_tensor = ttnn.to_memory_config(ttnn_input_tensor, input_shard_memory_config)
-
+    # Still rely on keep_l1_aligned=True to make this work with the current implementation
+    ttnn_sharded_input_tensor = ttnn.interleaved_to_sharded(
+        ttnn_input_tensor, input_shard_memory_config, keep_l1_aligned=True
+    )
     padded_tensor = ttnn.pad(ttnn_sharded_input_tensor, pad_to_shape, input_tensor_start, pad_value)
 
     tt_output_tensor = ttnn.to_memory_config(padded_tensor, ttnn.L1_MEMORY_CONFIG)
```
**tt_metal/api/tt-metalium/allocator.hpp** (2 additions & 0 deletions)

```diff
@@ -120,6 +120,8 @@ const std::vector<uint32_t>& bank_ids_from_dram_channel(const Allocator& allocat
 const std::vector<uint32_t>& bank_ids_from_logical_core(
     const Allocator& allocator, BufferType buffer_type, const CoreCoord& logical_core);
 
+uint32_t get_alignment(const Allocator& allocator, const BufferType& buffer_type);
+
 Statistics get_statistics(const Allocator& allocator, const BufferType& buffer_type);
 
 void dump_memory_blocks(const Allocator& allocator, const BufferType& buffer_type, std::ofstream& out);
```
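
For context, the new free function presumably selects between the two config alignments by buffer type. The sketch below is editorial, not the committed implementation; the `config` member access and the DRAM-backed treatment of trace buffers are assumptions:

```cpp
#include <tt-metalium/allocator.hpp>  // assumed include path, mirroring the header above

namespace tt::tt_metal {

// Hypothetical dispatch: DRAM-backed buffer types use the DRAM alignment,
// everything else (L1, L1_SMALL, ...) uses the L1 alignment.
uint32_t get_alignment_sketch(const Allocator& allocator, const BufferType& buffer_type) {
    switch (buffer_type) {
        case BufferType::DRAM:
        case BufferType::TRACE:  // trace buffers are allocated in DRAM
            return allocator.config.dram_alignment;
        default:
            return allocator.config.l1_alignment;
    }
}

}  // namespace tt::tt_metal
```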
**tt_metal/api/tt-metalium/allocator_types.hpp** (2 additions & 1 deletion)

```diff
@@ -43,6 +43,7 @@ struct AllocatorConfig {
     size_t dram_bank_size = 0;
     std::vector<size_t> dram_bank_offsets = {};
     uint32_t dram_unreserved_base = 0;
+    uint32_t dram_alignment = 0;
     //! worker specific configuration
     uint32_t l1_unreserved_base = 0;
     CoreRangeSet worker_grid = {};
@@ -56,7 +57,7 @@ struct AllocatorConfig {
     BankMapping l1_bank_remap =
         {};  // for remapping which l1 bank points to which bank if we assume normal row-major assignment
     CoreRangeSet compute_grid = {};
-    uint32_t alignment = 0;
+    uint32_t l1_alignment = 0;
     bool disable_interleaved = false;
     void reset();
     ~AllocatorConfig() { reset(); }
```
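
With the split fields, the two alignments can now be configured independently per memory type. A small usage sketch (illustrative values; the include path and namespace are assumed):

```cpp
#include <tt-metalium/allocator_types.hpp>  // assumed include path

using tt::tt_metal::AllocatorConfig;  // assumed namespace

// Illustrative only: real values come from the hardware abstraction layer.
AllocatorConfig make_example_config() {
    AllocatorConfig config;
    config.dram_alignment = 64;  // assumed DRAM alignment
    config.l1_alignment = 16;    // assumed L1 alignment
    return config;
}
```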
*(Diffs for the remaining changed files are not shown.)*
