Commit 8115fd5: #0: in place halo
wransom-TT committed Mar 4, 2025
1 parent f09f173 commit 8115fd5
Showing 26 changed files with 671 additions and 76 deletions.
@@ -139,6 +139,7 @@ def create_unet_model_parameters(
parameters.c8["use_activation_double_buffer"] = True
parameters.c8["use_split_reader"] = True
parameters.c8["input_channels_alignment"] = 16
parameters.c8["in_place"] = True
parameters.c8_2["conv_blocking_and_parallelization_config_override"] = {"act_block_h": 8 * 32}
parameters.c8_2["use_activation_double_buffer"] = True
parameters.c8_2["use_split_reader"] = True
@@ -149,6 +149,7 @@ def __init__(
output_layout=output_layout,
input_channels_alignment=conv.input_channels_alignment if "input_channels_alignment" in conv else 32,
reshard_if_not_optimal=reshard_if_not_optimal,
in_place=conv.in_place if "in_place" in conv else False,
)
self.compute_config = ttnn.init_device_compute_kernel_config(
device.arch(),
7 changes: 4 additions & 3 deletions tests/tt_eager/ops/test_sliding_window_ops.cpp
@@ -57,13 +57,14 @@ uint32_t compare_conv_out_with_golden(
uint32_t validate_generate_halo_kernel_config(
tt::tt_metal::IDevice* device,
const std::vector<ShardBoundary>& shard_boundaries,
const std::tuple<vector<vector<uint16_t>>, std::vector<std::vector<uint16_t>>, std::vector<std::vector<uint16_t>>>&
halo_kernel_config,
const std::
tuple<vector<vector<uint16_t>>, std::vector<std::vector<uint16_t>>, std::vector<std::vector<uint16_t>>, int>&
halo_kernel_config,
const vector<bool>& pad_metadata,
bool remote_read = false,
bool is_block_sharded = false,
bool transpose_mcast = false) {
auto [flattened_pad_config, flattened_local_config, flattened_remote_config] = halo_kernel_config;
auto [flattened_pad_config, flattened_local_config, flattened_remote_config, max_ref_size] = halo_kernel_config;

uint32_t padded_input_tensor_buf_idx = 0;
uint32_t invalid_pads = 0, invalid_indices = 0;
84 changes: 82 additions & 2 deletions tests/ttnn/unit_tests/operations/test_maxpool2d.py
@@ -11,8 +11,14 @@
from models.utility_functions import is_wormhole_b0, is_grayskull, is_x2_harvested
from tests.ttnn.utils_for_testing import assert_with_pcc

import numpy as np
import matplotlib.pyplot as plt
import copy

import ttnn

output_shards = [[] for _ in range(64)]


def run_max_pool(
act_shape,
@@ -25,6 +31,7 @@ def run_max_pool(
memory_config=None,
shard_scheme=None,
ceil_mode=False,
in_place_halo=False,
):
in_n, in_c, in_h, in_w = act_shape
kernel_h, kernel_w = kernel_size
@@ -126,6 +133,18 @@
if in_c / cores_x < 16:
pytest.skip("Block sharding requires large enough channels to shard (at least 16 per core)")

if in_place_halo == True:
if dtype == ttnn.bfloat8_b:
pytest.skip("In place halo not supported for BFP8_B")
if is_grayskull():
pytest.skip("In place halo not supported on Grayskull")
if (
kernel_size == (13, 13)
and in_c >= 640
and (shard_scheme == ttnn.TensorMemoryLayout.HEIGHT_SHARDED or shard_scheme == None)
):
pytest.skip("This case runs out of memory with in place")

torch.manual_seed(0)
torch.set_printoptions(precision=3, sci_mode=False, linewidth=500, threshold=10000, edgeitems=32)

@@ -192,8 +211,41 @@ def run_max_pool(
memory_config=memory_config,
applied_shard_scheme=shard_scheme,
ceil_mode=ceil_mode,
in_place_halo=in_place_halo,
)

# for core_id in range(0, 64):
# output_shard = torch.Tensor(ttnn.to_torch(output.extract_shard(core_id).pad_to_tile(0.0).cpu()))
# output_shards[core_id].append(output_shard)
# if len(output_shards[0]) == 2:
# for core_id in range(0, 64):
# print(f"Core ID: {core_id}")
# # coord = ttnn.CoreCoord(x, y)
# gold_shard = output_shards[core_id][0]
# opt_shard = output_shards[core_id][1]
# diff = gold_shard[0][0] - opt_shard[0][0]

# print(gold_shard[0, 0, :, :1])
# print(opt_shard[0, 0, :, :1])
# print("--")

# diff = diff.to(torch.float32)
# # Replace -inf with zeros only where both gold_shard[0][0] and opt_shard[0][0] are -inf
# mask = (gold_shard[0][0] == -float("inf")) & (opt_shard[0][0] == -float("inf"))
# diff = torch.where(mask, torch.tensor(0.0, dtype=diff.dtype), diff)

# # Plot the difference
# plt.imshow(diff[:, -32:], cmap="viridis")
# plt.colorbar()
# plt.title("Difference between output shards")

# # Save the plot
# filename = "output_shard_pics/output_shard_difference_core_" + str(core_id) + ".png"
# plt.savefig(filename)

# # Clear the plot to avoid overlap in subsequent runs
# plt.clf()

output_host = output.cpu()
output_pytorch_padded = torch.Tensor(ttnn.to_torch(output_host))
output_pytorch = output_pytorch_padded[:, :, :, :in_c]
@@ -290,7 +342,7 @@ def run_max_pool(
[1, 640, 32, 32],
[1, 576, 32, 32],
[1, 384, 32, 32],
# C=16 test
# C=16 test2
[1, 16, 10, 10],
)
),
@@ -337,7 +389,16 @@ def run_max_pool(
True,
],
)
def test_run_max_pool(act_shape, kernel_size, padding, stride, dilation, device, dtype, use_program_cache, ceil_mode):
@pytest.mark.parametrize(
"in_place_halo",
[
False,
True,
],
)
def test_run_max_pool(
act_shape, kernel_size, padding, stride, dilation, device, dtype, use_program_cache, ceil_mode, in_place_halo
):
run_max_pool(
act_shape,
kernel_size,
Expand All @@ -348,6 +409,7 @@ def test_run_max_pool(act_shape, kernel_size, padding, stride, dilation, device,
dtype,
shard_scheme=ttnn.TensorMemoryLayout.HEIGHT_SHARDED,
ceil_mode=ceil_mode,
in_place_halo=in_place_halo,
)


@@ -417,6 +479,13 @@ def test_run_max_pool(act_shape, kernel_size, padding, stride, dilation, device,
True,
],
)
@pytest.mark.parametrize(
"in_place_halo",
[
False,
True,
],
)
def test_run_max_pool_width_shard(
act_shape,
kernel_size,
Expand All @@ -427,6 +496,7 @@ def test_run_max_pool_width_shard(
dtype,
use_program_cache,
ceil_mode,
in_place_halo,
):
run_max_pool(
act_shape,
Expand All @@ -438,6 +508,7 @@ def test_run_max_pool_width_shard(
dtype,
shard_scheme=ttnn.TensorMemoryLayout.WIDTH_SHARDED,
ceil_mode=ceil_mode,
in_place_halo=in_place_halo,
)


@@ -527,6 +598,13 @@ def test_run_max_pool_width_shard(
True,
],
)
@pytest.mark.parametrize(
"in_place_halo",
[
False,
True,
],
)
def test_run_max_pool_block_shard(
act_shape,
kernel_size,
Expand All @@ -537,6 +615,7 @@ def test_run_max_pool_block_shard(
dtype,
use_program_cache,
ceil_mode,
in_place_halo,
):
run_max_pool(
act_shape,
Expand All @@ -548,6 +627,7 @@ def test_run_max_pool_block_shard(
dtype,
shard_scheme=ttnn.TensorMemoryLayout.BLOCK_SHARDED,
ceil_mode=ceil_mode,
in_place_halo=in_place_halo,
)
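
The tests above simply thread the new in_place_halo flag through run_max_pool into the pooling call; only the trailing keywords (memory_config, applied_shard_scheme, ceil_mode, in_place_halo) are visible at that call site in this diff. For orientation, here is a minimal, hypothetical sketch of a direct call; the geometry keywords (input_tensor, batch_size, input_h, input_w, channels, kernel_size, stride, padding, dilation) are assumptions based on the existing harness rather than part of this commit, and, per the skips above, the flag is not exercised with bfloat8_b inputs or on Grayskull.

import torch
import ttnn

def maxpool_with_in_place_halo(device, torch_nchw, kernel=(3, 3), stride=(2, 2), pad=(1, 1)):
    n, c, h, w = torch_nchw.shape
    # The pooling op consumes row-major NHWC activations flattened to [1, 1, N*H*W, C]
    act = torch.permute(torch_nchw, (0, 2, 3, 1)).reshape(1, 1, n * h * w, c)
    tt_act = ttnn.from_torch(act, dtype=ttnn.bfloat16, device=device)
    return ttnn.max_pool2d(
        input_tensor=tt_act,
        batch_size=n,
        input_h=h,
        input_w=w,
        channels=c,
        kernel_size=list(kernel),
        stride=list(stride),
        padding=list(pad),
        dilation=[1, 1],
        ceil_mode=False,
        in_place_halo=True,  # new flag from this commit: reuse the input tensor's storage for the halo'd input
    )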


40 changes: 23 additions & 17 deletions tests/ttnn/unit_tests/operations/test_new_conv2d.py
@@ -73,6 +73,7 @@ def run_conv(
output_mesh_composer=None,
enable_split_reader=False,
activation="",
in_place_halo=False,
):
if isinstance(device, ttnn.MeshDevice):
assert input_mesh_mapper is not None, "Expected mesh mapper for input tensor when using device mesh"
@@ -136,6 +137,7 @@ def run_conv(
enable_act_double_buffer=False,
enable_split_reader=enable_split_reader,
enable_subblock_padding=False,
in_place=in_place_halo,
output_layout=output_layout,
activation=activation,
)
@@ -1761,24 +1763,24 @@ def test_unet_conv_groups_2_wh(
)
@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True)
@pytest.mark.parametrize(
"output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, shard_layout, config_override, use_shallow_conv_variant",
"output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, shard_layout, config_override, use_shallow_conv_variant, in_place_halo",
(
(16, 4, 1056, 160, 3, 3, 1, 1, 1, 1, HS, {"act_block_h": 2 * 32}, True),
(16, 16, 1056, 160, 3, 3, 1, 1, 1, 1, HS, {"act_block_h": 2 * 32}, True),
(16, 16, 528, 80, 3, 3, 1, 1, 1, 1, HS, {"act_block_h": 2 * 32}, True),
(32, 16, 264, 40, 3, 3, 1, 1, 1, 1, HS, None, False),
(32, 32, 264, 40, 3, 3, 1, 1, 1, 1, HS, None, False),
(32, 32, 132, 20, 3, 3, 1, 1, 1, 1, HS, None, False),
(64, 32, 66, 10, 3, 3, 1, 1, 1, 1, HS, None, False),
(64, 64, 66, 10, 3, 3, 1, 1, 1, 1, HS, None, False),
(32, 96, 132, 20, 3, 3, 1, 1, 1, 1, HS, None, False),
(32, 32, 132, 20, 3, 3, 1, 1, 1, 1, HS, None, False),
(32, 64, 264, 40, 3, 3, 1, 1, 1, 1, HS, None, False),
(32, 32, 264, 40, 3, 3, 1, 1, 1, 1, HS, None, False),
# (16, 48, 528, 80, 3, 3, 1, 1, 1, 1, HS, {"act_block_h": 2 * 32}, True), # OOM - need inplace convolution
(16, 16, 528, 80, 3, 3, 1, 1, 1, 1, HS, {"act_block_h": 2 * 32}, True),
# (16, 32, 1056, 160, 3, 3, 1, 1, 1, 1, HS, {"act_block_h": 2 * 32}, True), # OOM - need inplace convolution
(1, 16, 1056, 160, 1, 1, 1, 1, 0, 0, HS, {"act_block_h": 2 * 32}, False),
(16, 4, 1056, 160, 3, 3, 1, 1, 1, 1, HS, {"act_block_h": 2 * 32}, True, False),
(16, 16, 1056, 160, 3, 3, 1, 1, 1, 1, HS, {"act_block_h": 2 * 32}, True, False),
(16, 16, 528, 80, 3, 3, 1, 1, 1, 1, HS, {"act_block_h": 2 * 32}, True, False),
(32, 16, 264, 40, 3, 3, 1, 1, 1, 1, HS, None, False, False),
(32, 32, 264, 40, 3, 3, 1, 1, 1, 1, HS, None, False, False),
(32, 32, 132, 20, 3, 3, 1, 1, 1, 1, HS, None, False, False),
(64, 32, 66, 10, 3, 3, 1, 1, 1, 1, HS, None, False, False),
(64, 64, 66, 10, 3, 3, 1, 1, 1, 1, HS, None, False, False),
(32, 96, 132, 20, 3, 3, 1, 1, 1, 1, HS, None, False, False),
(32, 32, 132, 20, 3, 3, 1, 1, 1, 1, HS, None, False, False),
(32, 64, 264, 40, 3, 3, 1, 1, 1, 1, HS, None, False, False),
(32, 32, 264, 40, 3, 3, 1, 1, 1, 1, HS, None, False, False),
(16, 48, 528, 80, 3, 3, 1, 1, 1, 1, HS, {"act_block_h": 2 * 32}, True, True),
(16, 16, 528, 80, 3, 3, 1, 1, 1, 1, HS, {"act_block_h": 2 * 32}, True, False),
(16, 32, 1056, 160, 3, 3, 1, 1, 1, 1, HS, {"act_block_h": 2 * 32}, True, True),
(1, 16, 1056, 160, 1, 1, 1, 1, 0, 0, HS, {"act_block_h": 2 * 32}, False, False),
),
)
@pytest.mark.parametrize(
@@ -1814,13 +1816,16 @@ def test_unet_conv_groups_4_6_wh(
use_shallow_conv_variant,
output_layout,
groups,
in_place_halo,
):
if (device.compute_with_storage_grid_size().x, device.compute_with_storage_grid_size().y) == (8, 7):
pytest.skip("Test is not supported on n300 (8,7) grid")
if output_layout == ttnn.ROW_MAJOR_LAYOUT and activations_dtype == ttnn.bfloat8_b:
pytest.skip("Row major layout not compatible with bfloat8_b")
if output_layout == ttnn.ROW_MAJOR_LAYOUT and input_height >= 1056:
pytest.skip("OOM")
if input_channels == 32 and input_height == 1056 and groups == 6:
pytest.skip("OOM, need fused untilize with halo")
run_conv(
device,
torch_tensor_map,
Expand All @@ -1843,6 +1848,7 @@ def test_unet_conv_groups_4_6_wh(
use_shallow_conv_variant=use_shallow_conv_variant,
output_layout=output_layout,
groups=groups,
in_place_halo=in_place_halo,
)
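
run_conv now forwards in_place_halo into the conv configuration it builds (the in_place=in_place_halo keyword above). Here is a minimal sketch of that construction, assuming the keywords land on ttnn.Conv2dConfig as the pybind changes later in this commit suggest; every keyword shown appears in the diff, while the concrete values are illustrative placeholders.

import ttnn

def make_conv_config(in_place_halo: bool, enable_split_reader: bool = False):
    # Mirrors the construction inside run_conv(); in_place is the only new field.
    return ttnn.Conv2dConfig(
        enable_act_double_buffer=False,
        enable_split_reader=enable_split_reader,
        enable_subblock_padding=False,
        in_place=in_place_halo,       # reuse the input tensor's storage for the halo'd activation
        output_layout=ttnn.TILE_LAYOUT,
        activation="",
    )

The resulting config would then be handed to the convolution call as before (presumably via its conv_config argument); only the allocation of the halo'd activation changes.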


3 changes: 2 additions & 1 deletion ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp
@@ -181,7 +181,8 @@ Result conv2d(
parallel_config.shard_orientation == ShardOrientation::COL_MAJOR,
0,
input_tensor_post_tm.memory_config(),
true);
true,
conv_config.in_place);

if (conv_config.deallocate_activation) {
input_tensor_post_tm.deallocate(/*force*/ true);
5 changes: 4 additions & 1 deletion ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp
@@ -335,6 +335,7 @@ void py_bind_conv2d(py::module& module) {
bool,
bool,
bool,
bool,
bool>(),
py::kw_only(),
py::arg("dtype") = DataType::BFLOAT16,
@@ -354,7 +355,8 @@
py::arg("enable_act_double_buffer") = false,
py::arg("enable_weights_double_buffer") = false,
py::arg("enable_split_reader") = false,
py::arg("enable_subblock_padding") = false);
py::arg("enable_subblock_padding") = false,
py::arg("in_place") = false);
py_conv_config.def_readwrite("dtype", &Conv2dConfig::dtype);
py_conv_config.def_readwrite("weights_dtype", &Conv2dConfig::weights_dtype);
py_conv_config.def_readwrite("activation", &Conv2dConfig::activation);
@@ -373,6 +375,7 @@
py_conv_config.def_readwrite("enable_weights_double_buffer", &Conv2dConfig::enable_weights_double_buffer);
py_conv_config.def_readwrite("enable_split_reader", &Conv2dConfig::enable_split_reader);
py_conv_config.def_readwrite("enable_subblock_padding", &Conv2dConfig::enable_subblock_padding);
py_conv_config.def_readwrite("in_place", &Conv2dConfig::in_place);

py_conv_config.def("__repr__", [](const Conv2dConfig& config) { return fmt::format("{}", config); });

10 changes: 8 additions & 2 deletions ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp
@@ -73,6 +73,10 @@ struct Conv2dConfig {
bool enable_split_reader = false;

bool enable_subblock_padding = false;

// Re-use input tensor storage when creating output tensor
bool in_place = false;

static constexpr auto attribute_names = std::make_tuple(
"dtype",
"weights_dtype",
Expand All @@ -91,7 +95,8 @@ struct Conv2dConfig {
"enable_act_double_buffer",
"enable_weights_double_buffer",
"enable_split_reader",
"enable_subblock_padding");
"enable_subblock_padding",
"in_place");
const auto attribute_values() const {
return std::make_tuple(
std::cref(this->dtype),
Expand All @@ -111,7 +116,8 @@ struct Conv2dConfig {
std::cref(this->enable_act_double_buffer),
std::cref(this->enable_weights_double_buffer),
std::cref(this->enable_split_reader),
std::cref(this->enable_subblock_padding));
std::cref(this->enable_subblock_padding),
std::cref(this->in_place));
}
};
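
Together with the pybind changes above, the new Conv2dConfig::in_place field is exposed both as a constructor keyword (py::arg("in_place") = false) and as a read/write attribute. A tiny usage sketch follows, assuming, as the visible defaults suggest, that every constructor argument is optional.

import ttnn

cfg = ttnn.Conv2dConfig()    # in_place defaults to False, matching the struct definition above
cfg.in_place = True          # equivalent to passing in_place=True at construction
print(cfg)                   # __repr__ is bound above, so the config (including in_place) can be inspected directly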

(Diffs for the remaining changed files are not shown.)
