Skip to content

Commit

Permalink
Revert some commits to fix single-card pipelines (#17121)
Browse files Browse the repository at this point in the history
### Ticket

Device perf
Model perf
Nightly fast dispatch
Demo tests

were all failing

### Problem description
Provide context for the problem.

### What's changed
Describe the approach used to solve the problem.
Summarize the changes made and its impact.

### Checklist
- [ ] Post commit CI passes
- [ ] Blackhole Post commit (if applicable)
- [ ] Model regression CI testing passes (if applicable)
- [ ] Device performance regression CI testing passes (if applicable)
- [ ] **(For models and ops writers)** Full [new
models](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml)
tests passes
- [ ] New/Existing tests provide coverage for changes

---------

Co-authored-by: Raymond Kim <[email protected]>
  • Loading branch information
tt-rkim and rayraykay authored Jan 26, 2025
1 parent a085f40 commit a5796bc
Show file tree
Hide file tree
Showing 42 changed files with 240 additions and 400 deletions.
6 changes: 3 additions & 3 deletions models/experimental/functional_unet/tests/test_unet_trace.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@


@skip_for_grayskull("UNet not currently supported on GS")
@pytest.mark.parametrize("device_params", [{"l1_small_size": 68864, "trace_region_size": 458752}], indirect=True)
@pytest.mark.parametrize("device_params", [{"l1_small_size": 68864, "trace_region_size": 444416}], indirect=True)
@pytest.mark.parametrize(
"batch, groups, iterations",
((1, 2, 32),),
Expand Down Expand Up @@ -108,7 +108,7 @@ def test_unet_trace(

@skip_for_grayskull("UNet not currently supported on GS")
@pytest.mark.parametrize(
"device_params", [{"l1_small_size": 68864, "trace_region_size": 917504, "num_command_queues": 2}], indirect=True
"device_params", [{"l1_small_size": 68864, "trace_region_size": 442368, "num_command_queues": 2}], indirect=True
)
@pytest.mark.parametrize(
"batch, groups, iterations",
Expand Down Expand Up @@ -343,7 +343,7 @@ def test_unet_trace_2cq_multi_device(

@skip_for_grayskull("UNet not currently supported on GS")
@pytest.mark.parametrize(
"device_params", [{"l1_small_size": 68864, "trace_region_size": 1376256, "num_command_queues": 2}], indirect=True
"device_params", [{"l1_small_size": 68864, "trace_region_size": 424960, "num_command_queues": 2}], indirect=True
)
@pytest.mark.parametrize(
"batch, groups, iterations",
Expand Down
4 changes: 2 additions & 2 deletions tech_reports/prog_examples/shard_data_rm/shard_data_rm.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,12 +71,12 @@ uint32_t shard_size = shard_height * shard_width;
uint32_t input_unit_size = sizeof(uint32_t);
uint32_t shard_width_bytes = shard_width * data_size;
uint32_t num_units_per_row = shard_width * input_unit_size;
uint32_t padded_offset_bytes = align(input_unit_size, device->get_allocator_alignment(BufferType::L1));
uint32_t padded_offset_bytes = align(input_unit_size, device->get_allocator_alignment());
```

In order to shard the correct data segments to the respective core, we indicate the shard height, width, size, and other data for the kernel function.
For this situation, 16 units of data will be sharded across 4 cores; each core will have 4 units of data in their corresponding circular buffer.
The `padded_offset_bytes` is set to ensure that the correct address is read from the kernel function when moving data to the circular buffer; in this case, the addresses are aligned to L1 memory with explicit referencing to BufferType::L1.
The `padded_offset_bytes` is set to ensure that the correct address is read from the kernel function when moving data to the circular buffer; in this case, the addresses are aligned to L1 memory.
This example demonstrates height sharding; the shard height is therefore set to evenly distribute the number of vector values across the cores.
If the sharding strategy was different (i.e. width sharding or block sharding), the appropriate values for both the shard height and width would need to be set.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import ttnn
from loguru import logger
from tests.tt_eager.python_api_testing.sweep_tests import comparison_funcs
from tests.ttnn.utils_for_testing import assert_with_pcc


@pytest.mark.parametrize(
Expand Down Expand Up @@ -43,9 +42,23 @@ def test_argmax(self, input_shapes, dim, memconfig, device):
tt_output_tensor_on_device = ttnn.argmax(input_tensor, dim=dim)
tt_out_tensor = tt_output_tensor_on_device.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch()
golden_tensor = torch.argmax(input_data, dim=dim)
if dim == 1 or dim == -3 or dim == 0 or dim == -4:
tt_out_tensor = tt_out_tensor[0, :, 0 : input_shapes[2], 0 : input_shapes[3]]
else:
if input_shapes[1] != 1 or input_shapes[0] != 1:
if dim == 2 or dim == -2:
tt_out_tensor = tt_out_tensor[0, :, :, 0 : input_shapes[3]]
else:
tt_out_tensor = tt_out_tensor[0, :, :, 0 : input_shapes[2]]
else:
if dim == 2 or dim == -2:
tt_out_tensor = tt_out_tensor[0, 0, 0, 0 : input_shapes[3]]
else:
tt_out_tensor = tt_out_tensor[0, 0, 0, 0 : input_shapes[2]]

pt_out_tensor = golden_tensor
assert_with_pcc(pt_out_tensor, tt_out_tensor)
tt_out_tensor = tt_output_tensor_on_device.cpu().to(ttnn.ROW_MAJOR_LAYOUT).to_torch()
comp_pass, comp_out = comparison_funcs.comp_pcc(pt_out_tensor, tt_out_tensor, pcc=0.99)
comp_all, _ = comparison_funcs.comp_allclose(pt_out_tensor, tt_out_tensor, atol=0, rtol=0)

# DEBUG
Expand All @@ -55,5 +68,8 @@ def test_argmax(self, input_shapes, dim, memconfig, device):
# print(flat)
# print(torch.topk(flat, 8))

logger.info(comp_pass)
logger.info(comp_all)
assert comp_all
logger.info(comp_out)
status = comp_pass | comp_all
assert status
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ def test_min_max_for_dim_hw(device, use_program_cache, shape_dim, kind, layout):
if kind == "max":
value = x.max()
elif kind == "min":
if N * C % 32 != 0:
pytest.skip("global min with Tensor dimension N*C not multiple of 32 is not supported at this time.")
value = x.min()
elif kind == "mean":
value = x.mean()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -164,15 +164,14 @@ def test_sharded_rm(
),
)

yt = ttnn.interleaved_to_sharded(xt, grid_size, shard_size, shard_scheme, shard_orientation, keep_l1_aligned=True)
yt = ttnn.interleaved_to_sharded(xt, grid_size, shard_size, shard_scheme, shard_orientation)

zt = ttnn.sharded_to_interleaved(
yt,
ttnn.MemoryConfig(
memory_layout=ttnn.TensorMemoryLayout.INTERLEAVED,
buffer_type=ttnn.BufferType.L1,
),
is_l1_aligned=True,
)

tt_og = xt.cpu().to_torch()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ uint64_t get_alloc_limit(const tt::tt_metal::IDevice* device) {
auto dispatch_core_config = dispatch_core_manager::instance().get_dispatch_core_config(device->id());
auto storage_core_bank_size =
tt::get_storage_core_bank_size(device->id(), device->num_hw_cqs(), dispatch_core_config);
const uint32_t allocator_alignment = device->get_allocator_alignment(BufferType::L1);
const uint32_t allocator_alignment = device->get_allocator_alignment();
const uint32_t interleaved_l1_bank_size = storage_core_bank_size.has_value()
? storage_core_bank_size.value()
: (soc_desc.worker_l1_size - l1_unreserved_base);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ TEST_F(CommandQueueSingleCardFixture, TensixTestSubDeviceAllocations) {
device->load_sub_device_manager(sub_device_manager_1);

auto buffer_1 = CreateBuffer(shard_config_1, SubDeviceId{0});
EXPECT_TRUE(buffer_1->address() <= max_addr - buffer_1->aligned_page_size());
EXPECT_EQ(buffer_1->address(), max_addr - buffer_1->aligned_page_size());
EnqueueWriteBuffer(device->command_queue(), buffer_1, input_1, false);
std::vector<uint32_t> output_1;
EnqueueReadBuffer(device->command_queue(), buffer_1, output_1, true);
Expand All @@ -105,7 +105,7 @@ TEST_F(CommandQueueSingleCardFixture, TensixTestSubDeviceAllocations) {
device->load_sub_device_manager(sub_device_manager_2);

auto buffer_3 = CreateBuffer(shard_config_2, SubDeviceId{1});
EXPECT_TRUE(buffer_3->address() <= max_addr - buffer_3->aligned_page_size());
EXPECT_EQ(buffer_3->address(), max_addr - buffer_3->aligned_page_size());
EnqueueWriteBuffer(device->command_queue(), buffer_3, input_2, false);
std::vector<uint32_t> output_2;
EnqueueReadBuffer(device->command_queue(), buffer_3, output_2, true);
Expand All @@ -118,7 +118,7 @@ TEST_F(CommandQueueSingleCardFixture, TensixTestSubDeviceAllocations) {
}

auto buffer_4 = CreateBuffer(shard_config_1, SubDeviceId{0});
EXPECT_TRUE(buffer_4->address() <= max_addr - buffer_4->aligned_page_size());
EXPECT_EQ(buffer_4->address(), max_addr - buffer_4->aligned_page_size());
EXPECT_THROW(CreateBuffer(interleaved_config, SubDeviceId{0}), std::exception);
}

Expand Down
7 changes: 3 additions & 4 deletions tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -651,7 +651,7 @@ inline void generate_random_paged_payload(
words_per_page);

// Note: the dst address marches in unison regardless of whether or not a core is written to
uint32_t page_size_alignment_bytes = device->get_allocator_alignment(buf_type);
uint32_t page_size_alignment_bytes = device->get_allocator_alignment();
for (uint32_t page_id = start_page; page_id < start_page + cmd.write_paged.pages; page_id++) {
CoreCoord bank_core;
uint32_t bank_id = page_id % num_banks;
Expand Down Expand Up @@ -931,9 +931,8 @@ inline void gen_dispatcher_paged_write_cmd(
uint32_t start_page,
uint32_t page_size,
uint32_t pages) {
BufferType buffer_type = is_dram ? BufferType::DRAM : BufferType::L1;
uint32_t page_size_alignment_bytes = device->get_allocator_alignment(buffer_type);
uint32_t num_banks = device->num_banks(buffer_type);
uint32_t page_size_alignment_bytes = device->get_allocator_alignment();
uint32_t num_banks = device->num_banks(is_dram ? BufferType::DRAM : BufferType::L1);
CoreType core_type = is_dram ? CoreType::DRAM : CoreType::WORKER;

// Not safe to mix paged L1 and paged DRAM writes currently in this test since same book-keeping.
Expand Down
5 changes: 2 additions & 3 deletions tests/ttnn/unit_tests/operations/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -527,9 +527,8 @@ def test_bh_alignment_i2s(
memory_config=input_buffer_type,
dtype=ttnn.bfloat16,
)
# So far the sharded tensor alignment is controled by keep_l1_aligned flag, will remove it later after launch
x_t_sharded = ttnn.interleaved_to_sharded(x_t, shard_config, keep_l1_aligned=True)
x_t = ttnn.sharded_to_interleaved(x_t_sharded, output_buffer_type, is_l1_aligned=True)
x_t_sharded = ttnn.to_memory_config(x_t, shard_config)
x_t = ttnn.to_memory_config(x_t_sharded, output_buffer_type)
output_data = ttnn.from_device(x_t)
output_data = ttnn.to_torch(output_data)
passing = torch.equal(input_data, output_data)
Expand Down
4 changes: 2 additions & 2 deletions tests/ttnn/unit_tests/operations/test_group_norm.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,7 @@ def test_group_norm_with_block_sharded_v2_8x8_grid(device, N, C, H, W, num_group
sharded_mem_config = ttnn.MemoryConfig(
ttnn.types.TensorMemoryLayout.BLOCK_SHARDED, ttnn.types.BufferType.L1, shard_spec
)
input_tensor = ttnn.interleaved_to_sharded(input_tensor, sharded_mem_config, keep_l1_aligned=True)
input_tensor = ttnn.to_memory_config(input_tensor, sharded_mem_config)

# groupnorm
output_tensor = ttnn.group_norm(
Expand All @@ -306,7 +306,7 @@ def test_group_norm_with_block_sharded_v2_8x8_grid(device, N, C, H, W, num_group
)

# output tensor
output_tensor = ttnn.sharded_to_interleaved(output_tensor, ttnn.L1_MEMORY_CONFIG, is_l1_aligned=True)
output_tensor = ttnn.to_memory_config(output_tensor, ttnn.L1_MEMORY_CONFIG)
output_tensor = ttnn.from_device(output_tensor)
output_tensor = ttnn.to_torch(output_tensor)

Expand Down
18 changes: 2 additions & 16 deletions tests/ttnn/unit_tests/operations/test_max.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

import ttnn
from tests.ttnn.utils_for_testing import assert_with_pcc
from models.utility_functions import torch_random, is_grayskull
from models.utility_functions import torch_random


@pytest.mark.parametrize("batch_size", [1, 16, 1, 16])
Expand Down Expand Up @@ -99,24 +99,11 @@ def test_max_global(device, batch_size, h, w):
((2, 32, 32, 64), -3),
((32, 32, 64), -3),
((1, 2, 3, 4), -1),
((2, 22, 37, 41), -4),
((2, 32, 64, 64), -3),
((2, 22, 37, 41), -3),
((2, 32, 64, 64), -2),
((2, 22, 37, 41), -1),
((2, 32, 64, 64), -1),
((2, 22, 37), -3),
((2, 22, 37), -2),
((2, 22, 37), -1),
((1, 6, 7), -3),
((32, 6, 7), -3),
],
)
@pytest.mark.parametrize("keepdim", [True, False])
def test_max_dim(device, input_shape_and_dim, keepdim):
input_shape, max_dim = input_shape_and_dim
if is_grayskull() and (input_shape[-1] % 32 != 0 or input_shape[-2] % 32 != 0 or input_shape[max_dim] % 32 != 0):
pytest.skip("If not a tile size multiple, may fail on GS if run all the tests in this file. #17084")

torch_input_tensor = torch_random(input_shape, -100, 100, dtype=torch.bfloat16)
torch_output_tensor, _ = torch.max(torch_input_tensor, dim=max_dim, keepdim=keepdim)
Expand All @@ -129,5 +116,4 @@ def test_max_dim(device, input_shape_and_dim, keepdim):

output_tensor = ttnn.to_torch(output_tensor)

pcc = 0.9999
assert_with_pcc(torch_output_tensor, output_tensor, pcc=pcc)
assert_with_pcc(torch_output_tensor, output_tensor)
6 changes: 2 additions & 4 deletions tests/ttnn/unit_tests/operations/test_pad.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,10 +226,8 @@ def test_pad_rm_sharded_stickwise(
ttnn_input_tensor = ttnn.from_torch(
torch_input_tensor, dtype=ttnn.float32, layout=ttnn.ROW_MAJOR_LAYOUT, device=device
)
# Still relay on keep_l1_aligned = True to make it work with the current implementation
ttnn_sharded_input_tensor = ttnn.interleaved_to_sharded(
ttnn_input_tensor, input_shard_memory_config, keep_l1_aligned=True
)
ttnn_sharded_input_tensor = ttnn.to_memory_config(ttnn_input_tensor, input_shard_memory_config)

padded_tensor = ttnn.pad(ttnn_sharded_input_tensor, pad_to_shape, input_tensor_start, pad_value)

tt_output_tensor = ttnn.to_memory_config(padded_tensor, ttnn.L1_MEMORY_CONFIG)
Expand Down
1 change: 0 additions & 1 deletion tests/ttnn/unit_tests/operations/test_reduction_mean.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ def test_mean(device, batch_size, h, w, dim):
torch_output_tensor = torch.mean(torch_input_tensor, dim=dim, keepdim=True, dtype=torch.bfloat16)

input_tensor = ttnn.from_torch(torch_input_tensor, layout=ttnn.TILE_LAYOUT, device=device)
ttnn.fill_implicit_tile_padding(input_tensor, 42) # garbage padding to test that mean removes it

output_tensor = ttnn.mean(input_tensor, dim=dim)
output_tensor = ttnn.to_torch(output_tensor)
Expand Down
2 changes: 0 additions & 2 deletions tt_metal/api/tt-metalium/allocator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,8 +120,6 @@ const std::vector<uint32_t>& bank_ids_from_dram_channel(const Allocator& allocat
const std::vector<uint32_t>& bank_ids_from_logical_core(
const Allocator& allocator, BufferType buffer_type, const CoreCoord& logical_core);

uint32_t get_alignment(const Allocator& alloator, const BufferType& buffer_type);

Statistics get_statistics(const Allocator& allocator, const BufferType& buffer_type);

void dump_memory_blocks(const Allocator& allocator, const BufferType& buffer_type, std::ofstream& out);
Expand Down
3 changes: 1 addition & 2 deletions tt_metal/api/tt-metalium/allocator_types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ struct AllocatorConfig {
size_t dram_bank_size = 0;
std::vector<size_t> dram_bank_offsets = {};
uint32_t dram_unreserved_base = 0;
uint32_t dram_alignment = 0;
//! worker specific configuration
uint32_t l1_unreserved_base = 0;
CoreRangeSet worker_grid = {};
Expand All @@ -57,7 +56,7 @@ struct AllocatorConfig {
BankMapping l1_bank_remap =
{}; // for remapping which l1 bank points to which bank if we assume normal row-major assignment
CoreRangeSet compute_grid = {};
uint32_t l1_alignment = 0;
uint32_t alignment = 0;
bool disable_interleaved = false;
void reset();
~AllocatorConfig() { reset(); }
Expand Down
Loading

0 comments on commit a5796bc

Please sign in to comment.