Commit 8115fd5: #0: in place halo
wransom-TT committed Mar 4, 2025
1 parent f09f173 commit 8115fd5
Showing 26 changed files with 671 additions and 76 deletions.
@@ -139,6 +139,7 @@ def create_unet_model_parameters(
parameters.c8["use_activation_double_buffer"] = True
parameters.c8["use_split_reader"] = True
parameters.c8["input_channels_alignment"] = 16
parameters.c8["in_place"] = True
parameters.c8_2["conv_blocking_and_parallelization_config_override"] = {"act_block_h": 8 * 32}
parameters.c8_2["use_activation_double_buffer"] = True
parameters.c8_2["use_split_reader"] = True
@@ -149,6 +149,7 @@ def __init__(
output_layout=output_layout,
input_channels_alignment=conv.input_channels_alignment if "input_channels_alignment" in conv else 32,
reshard_if_not_optimal=reshard_if_not_optimal,
in_place=conv.in_place if "in_place" in conv else False,
)
self.compute_config = ttnn.init_device_compute_kernel_config(
device.arch(),
7 changes: 4 additions & 3 deletions tests/tt_eager/ops/test_sliding_window_ops.cpp
@@ -57,13 +57,14 @@ uint32_t compare_conv_out_with_golden(
uint32_t validate_generate_halo_kernel_config(
tt::tt_metal::IDevice* device,
const std::vector<ShardBoundary>& shard_boundaries,
const std::tuple<vector<vector<uint16_t>>, std::vector<std::vector<uint16_t>>, std::vector<std::vector<uint16_t>>>&
halo_kernel_config,
const std::
tuple<vector<vector<uint16_t>>, std::vector<std::vector<uint16_t>>, std::vector<std::vector<uint16_t>>, int>&
halo_kernel_config,
const vector<bool>& pad_metadata,
bool remote_read = false,
bool is_block_sharded = false,
bool transpose_mcast = false) {
auto [flattened_pad_config, flattened_local_config, flattened_remote_config] = halo_kernel_config;
auto [flattened_pad_config, flattened_local_config, flattened_remote_config, max_ref_size] = halo_kernel_config;

uint32_t padded_input_tensor_buf_idx = 0;
uint32_t invalid_pads = 0, invalid_indices = 0;
84 changes: 82 additions & 2 deletions tests/ttnn/unit_tests/operations/test_maxpool2d.py
@@ -11,8 +11,14 @@
from models.utility_functions import is_wormhole_b0, is_grayskull, is_x2_harvested
from tests.ttnn.utils_for_testing import assert_with_pcc

import numpy as np
import matplotlib.pyplot as plt
import copy

import ttnn

output_shards = [[] for _ in range(64)]


def run_max_pool(
act_shape,
@@ -25,6 +31,7 @@ def run_max_pool(
memory_config=None,
shard_scheme=None,
ceil_mode=False,
in_place_halo=False,
):
in_n, in_c, in_h, in_w = act_shape
kernel_h, kernel_w = kernel_size
@@ -126,6 +133,18 @@
if in_c / cores_x < 16:
pytest.skip("Block sharding requires large enough channels to shard (at least 16 per core)")

if in_place_halo == True:
if dtype == ttnn.bfloat8_b:
pytest.skip("In place halo not supported for BFP8_B")
if is_grayskull():
pytest.skip("In place halo not supported on Grayskull")
if (
kernel_size == (13, 13)
and in_c >= 640
and (shard_scheme == ttnn.TensorMemoryLayout.HEIGHT_SHARDED or shard_scheme == None)
):
pytest.skip("This case runs out of memory with in place")

torch.manual_seed(0)
torch.set_printoptions(precision=3, sci_mode=False, linewidth=500, threshold=10000, edgeitems=32)

@@ -192,8 +211,41 @@ def run_max_pool(
memory_config=memory_config,
applied_shard_scheme=shard_scheme,
ceil_mode=ceil_mode,
in_place_halo=in_place_halo,
)

# for core_id in range(0, 64):
# output_shard = torch.Tensor(ttnn.to_torch(output.extract_shard(core_id).pad_to_tile(0.0).cpu()))
# output_shards[core_id].append(output_shard)
# if len(output_shards[0]) == 2:
# for core_id in range(0, 64):
# print(f"Core ID: {core_id}")
# # coord = ttnn.CoreCoord(x, y)
# gold_shard = output_shards[core_id][0]
# opt_shard = output_shards[core_id][1]
# diff = gold_shard[0][0] - opt_shard[0][0]

# print(gold_shard[0, 0, :, :1])
# print(opt_shard[0, 0, :, :1])
# print("--")

# diff = diff.to(torch.float32)
# # Replace -inf with zeros only where both gold_shard[0][0] and opt_shard[0][0] are -inf
# mask = (gold_shard[0][0] == -float("inf")) & (opt_shard[0][0] == -float("inf"))
# diff = torch.where(mask, torch.tensor(0.0, dtype=diff.dtype), diff)

# # Plot the difference
# plt.imshow(diff[:, -32:], cmap="viridis")
# plt.colorbar()
# plt.title("Difference between output shards")

# # Save the plot
# filename = "output_shard_pics/output_shard_difference_core_" + str(core_id) + ".png"
# plt.savefig(filename)

# # Clear the plot to avoid overlap in subsequent runs
# plt.clf()

output_host = output.cpu()
output_pytorch_padded = torch.Tensor(ttnn.to_torch(output_host))
output_pytorch = output_pytorch_padded[:, :, :, :in_c]
@@ -290,7 +342,7 @@ def run_max_pool(
[1, 640, 32, 32],
[1, 576, 32, 32],
[1, 384, 32, 32],
# C=16 test
# C=16 test2
[1, 16, 10, 10],
)
),
@@ -337,7 +389,16 @@ def run_max_pool(
True,
],
)
def test_run_max_pool(act_shape, kernel_size, padding, stride, dilation, device, dtype, use_program_cache, ceil_mode):
@pytest.mark.parametrize(
"in_place_halo",
[
False,
True,
],
)
def test_run_max_pool(
act_shape, kernel_size, padding, stride, dilation, device, dtype, use_program_cache, ceil_mode, in_place_halo
):
run_max_pool(
act_shape,
kernel_size,
Expand All @@ -348,6 +409,7 @@ def test_run_max_pool(act_shape, kernel_size, padding, stride, dilation, device,
dtype,
shard_scheme=ttnn.TensorMemoryLayout.HEIGHT_SHARDED,
ceil_mode=ceil_mode,
in_place_halo=in_place_halo,
)


@@ -417,6 +479,13 @@ def test_run_max_pool(act_shape, kernel_size, padding, stride, dilation, device,
True,
],
)
@pytest.mark.parametrize(
"in_place_halo",
[
False,
True,
],
)
def test_run_max_pool_width_shard(
act_shape,
kernel_size,
Expand All @@ -427,6 +496,7 @@ def test_run_max_pool_width_shard(
dtype,
use_program_cache,
ceil_mode,
in_place_halo,
):
run_max_pool(
act_shape,
Expand All @@ -438,6 +508,7 @@ def test_run_max_pool_width_shard(
dtype,
shard_scheme=ttnn.TensorMemoryLayout.WIDTH_SHARDED,
ceil_mode=ceil_mode,
in_place_halo=in_place_halo,
)


@@ -527,6 +598,13 @@ def test_run_max_pool_width_shard(
True,
],
)
@pytest.mark.parametrize(
"in_place_halo",
[
False,
True,
],
)
def test_run_max_pool_block_shard(
act_shape,
kernel_size,
Expand All @@ -537,6 +615,7 @@ def test_run_max_pool_block_shard(
dtype,
use_program_cache,
ceil_mode,
in_place_halo,
):
run_max_pool(
act_shape,
Expand All @@ -548,6 +627,7 @@ def test_run_max_pool_block_shard(
dtype,
shard_scheme=ttnn.TensorMemoryLayout.BLOCK_SHARDED,
ceil_mode=ceil_mode,
in_place_halo=in_place_halo,
)
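
The tests above simply thread the new in_place_halo flag through run_max_pool into the pooling call; only the trailing keywords (memory_config, applied_shard_scheme, ceil_mode, in_place_halo) are visible at that call site in this diff. For orientation, here is a minimal, hypothetical sketch of a direct call; the geometry keywords (input_tensor, batch_size, input_h, input_w, channels, kernel_size, stride, padding, dilation) are assumptions based on the existing harness rather than part of this commit, and, per the skips above, the flag is not exercised with bfloat8_b inputs or on Grayskull.

import torch
import ttnn

def maxpool_with_in_place_halo(device, torch_nchw, kernel=(3, 3), stride=(2, 2), pad=(1, 1)):
    n, c, h, w = torch_nchw.shape
    # The pooling op consumes row-major NHWC activations flattened to [1, 1, N*H*W, C]
    act = torch.permute(torch_nchw, (0, 2, 3, 1)).reshape(1, 1, n * h * w, c)
    tt_act = ttnn.from_torch(act, dtype=ttnn.bfloat16, device=device)
    return ttnn.max_pool2d(
        input_tensor=tt_act,
        batch_size=n,
        input_h=h,
        input_w=w,
        channels=c,
        kernel_size=list(kernel),
        stride=list(stride),
        padding=list(pad),
        dilation=[1, 1],
        ceil_mode=False,
        in_place_halo=True,  # new flag from this commit: reuse the input tensor's storage for the halo'd input
    )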


40 changes: 23 additions & 17 deletions tests/ttnn/unit_tests/operations/test_new_conv2d.py
@@ -73,6 +73,7 @@ def run_conv(
output_mesh_composer=None,
enable_split_reader=False,
activation="",
in_place_halo=False,
):
if isinstance(device, ttnn.MeshDevice):
assert input_mesh_mapper is not None, "Expected mesh mapper for input tensor when using device mesh"
@@ -136,6 +137,7 @@ def run_conv(
enable_act_double_buffer=False,
enable_split_reader=enable_split_reader,
enable_subblock_padding=False,
in_place=in_place_halo,
output_layout=output_layout,
activation=activation,
)
@@ -1761,24 +1763,24 @@ def test_unet_conv_groups_2_wh(
)
@pytest.mark.parametrize("device_params", [{"l1_small_size": 16384}], indirect=True)
@pytest.mark.parametrize(
"output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, shard_layout, config_override, use_shallow_conv_variant",
"output_channels, input_channels, input_height, input_width, filter_height, filter_width, stride_h, stride_w, pad_h, pad_w, shard_layout, config_override, use_shallow_conv_variant, in_place_halo",
(
(16, 4, 1056, 160, 3, 3, 1, 1, 1, 1, HS, {"act_block_h": 2 * 32}, True),
(16, 16, 1056, 160, 3, 3, 1, 1, 1, 1, HS, {"act_block_h": 2 * 32}, True),
(16, 16, 528, 80, 3, 3, 1, 1, 1, 1, HS, {"act_block_h": 2 * 32}, True),
(32, 16, 264, 40, 3, 3, 1, 1, 1, 1, HS, None, False),
(32, 32, 264, 40, 3, 3, 1, 1, 1, 1, HS, None, False),
(32, 32, 132, 20, 3, 3, 1, 1, 1, 1, HS, None, False),
(64, 32, 66, 10, 3, 3, 1, 1, 1, 1, HS, None, False),
(64, 64, 66, 10, 3, 3, 1, 1, 1, 1, HS, None, False),
(32, 96, 132, 20, 3, 3, 1, 1, 1, 1, HS, None, False),
(32, 32, 132, 20, 3, 3, 1, 1, 1, 1, HS, None, False),
(32, 64, 264, 40, 3, 3, 1, 1, 1, 1, HS, None, False),
(32, 32, 264, 40, 3, 3, 1, 1, 1, 1, HS, None, False),
# (16, 48, 528, 80, 3, 3, 1, 1, 1, 1, HS, {"act_block_h": 2 * 32}, True), # OOM - need inplace convolution
(16, 16, 528, 80, 3, 3, 1, 1, 1, 1, HS, {"act_block_h": 2 * 32}, True),
# (16, 32, 1056, 160, 3, 3, 1, 1, 1, 1, HS, {"act_block_h": 2 * 32}, True), # OOM - need inplace convolution
(1, 16, 1056, 160, 1, 1, 1, 1, 0, 0, HS, {"act_block_h": 2 * 32}, False),
(16, 4, 1056, 160, 3, 3, 1, 1, 1, 1, HS, {"act_block_h": 2 * 32}, True, False),
(16, 16, 1056, 160, 3, 3, 1, 1, 1, 1, HS, {"act_block_h": 2 * 32}, True, False),
(16, 16, 528, 80, 3, 3, 1, 1, 1, 1, HS, {"act_block_h": 2 * 32}, True, False),
(32, 16, 264, 40, 3, 3, 1, 1, 1, 1, HS, None, False, False),
(32, 32, 264, 40, 3, 3, 1, 1, 1, 1, HS, None, False, False),
(32, 32, 132, 20, 3, 3, 1, 1, 1, 1, HS, None, False, False),
(64, 32, 66, 10, 3, 3, 1, 1, 1, 1, HS, None, False, False),
(64, 64, 66, 10, 3, 3, 1, 1, 1, 1, HS, None, False, False),
(32, 96, 132, 20, 3, 3, 1, 1, 1, 1, HS, None, False, False),
(32, 32, 132, 20, 3, 3, 1, 1, 1, 1, HS, None, False, False),
(32, 64, 264, 40, 3, 3, 1, 1, 1, 1, HS, None, False, False),
(32, 32, 264, 40, 3, 3, 1, 1, 1, 1, HS, None, False, False),
(16, 48, 528, 80, 3, 3, 1, 1, 1, 1, HS, {"act_block_h": 2 * 32}, True, True),
(16, 16, 528, 80, 3, 3, 1, 1, 1, 1, HS, {"act_block_h": 2 * 32}, True, False),
(16, 32, 1056, 160, 3, 3, 1, 1, 1, 1, HS, {"act_block_h": 2 * 32}, True, True),
(1, 16, 1056, 160, 1, 1, 1, 1, 0, 0, HS, {"act_block_h": 2 * 32}, False, False),
),
)
@pytest.mark.parametrize(
@@ -1814,13 +1816,16 @@ def test_unet_conv_groups_4_6_wh(
use_shallow_conv_variant,
output_layout,
groups,
in_place_halo,
):
if (device.compute_with_storage_grid_size().x, device.compute_with_storage_grid_size().y) == (8, 7):
pytest.skip("Test is not supported on n300 (8,7) grid")
if output_layout == ttnn.ROW_MAJOR_LAYOUT and activations_dtype == ttnn.bfloat8_b:
pytest.skip("Row major layout not compatible with bfloat8_b")
if output_layout == ttnn.ROW_MAJOR_LAYOUT and input_height >= 1056:
pytest.skip("OOM")
if input_channels == 32 and input_height == 1056 and groups == 6:
pytest.skip("OOM, need fused untilize with halo")
run_conv(
device,
torch_tensor_map,
Expand All @@ -1843,6 +1848,7 @@ def test_unet_conv_groups_4_6_wh(
use_shallow_conv_variant=use_shallow_conv_variant,
output_layout=output_layout,
groups=groups,
in_place_halo=in_place_halo,
)
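
run_conv now forwards in_place_halo into the conv configuration it builds (the in_place=in_place_halo keyword above). Here is a minimal sketch of that construction, assuming the keywords land on ttnn.Conv2dConfig as the pybind changes later in this commit suggest; every keyword shown appears in the diff, while the concrete values are illustrative placeholders.

import ttnn

def make_conv_config(in_place_halo: bool, enable_split_reader: bool = False):
    # Mirrors the construction inside run_conv(); in_place is the only new field.
    return ttnn.Conv2dConfig(
        enable_act_double_buffer=False,
        enable_split_reader=enable_split_reader,
        enable_subblock_padding=False,
        in_place=in_place_halo,       # reuse the input tensor's storage for the halo'd activation
        output_layout=ttnn.TILE_LAYOUT,
        activation="",
    )

The resulting config would then be handed to the convolution call as before (presumably via its conv_config argument); only the allocation of the halo'd activation changes.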


3 changes: 2 additions & 1 deletion ttnn/cpp/ttnn/operations/conv/conv2d/conv2d.cpp
@@ -181,7 +181,8 @@ Result conv2d(
parallel_config.shard_orientation == ShardOrientation::COL_MAJOR,
0,
input_tensor_post_tm.memory_config(),
true);
true,
conv_config.in_place);

if (conv_config.deallocate_activation) {
input_tensor_post_tm.deallocate(/*force*/ true);
5 changes: 4 additions & 1 deletion ttnn/cpp/ttnn/operations/conv/conv2d/conv2d_pybind.cpp
@@ -335,6 +335,7 @@ void py_bind_conv2d(py::module& module) {
bool,
bool,
bool,
bool,
bool>(),
py::kw_only(),
py::arg("dtype") = DataType::BFLOAT16,
@@ -354,7 +355,8 @@
py::arg("enable_act_double_buffer") = false,
py::arg("enable_weights_double_buffer") = false,
py::arg("enable_split_reader") = false,
py::arg("enable_subblock_padding") = false);
py::arg("enable_subblock_padding") = false,
py::arg("in_place") = false);
py_conv_config.def_readwrite("dtype", &Conv2dConfig::dtype);
py_conv_config.def_readwrite("weights_dtype", &Conv2dConfig::weights_dtype);
py_conv_config.def_readwrite("activation", &Conv2dConfig::activation);
@@ -373,6 +375,7 @@
py_conv_config.def_readwrite("enable_weights_double_buffer", &Conv2dConfig::enable_weights_double_buffer);
py_conv_config.def_readwrite("enable_split_reader", &Conv2dConfig::enable_split_reader);
py_conv_config.def_readwrite("enable_subblock_padding", &Conv2dConfig::enable_subblock_padding);
py_conv_config.def_readwrite("in_place", &Conv2dConfig::in_place);

py_conv_config.def("__repr__", [](const Conv2dConfig& config) { return fmt::format("{}", config); });

10 changes: 8 additions & 2 deletions ttnn/cpp/ttnn/operations/conv/conv2d/device/conv2d_op.hpp
@@ -73,6 +73,10 @@ struct Conv2dConfig {
bool enable_split_reader = false;

bool enable_subblock_padding = false;

// Re-use input tensor storage when creating output tensor
bool in_place = false;

static constexpr auto attribute_names = std::make_tuple(
"dtype",
"weights_dtype",
Expand All @@ -91,7 +95,8 @@ struct Conv2dConfig {
"enable_act_double_buffer",
"enable_weights_double_buffer",
"enable_split_reader",
"enable_subblock_padding");
"enable_subblock_padding",
"in_place");
const auto attribute_values() const {
return std::make_tuple(
std::cref(this->dtype),
Expand All @@ -111,7 +116,8 @@ struct Conv2dConfig {
std::cref(this->enable_act_double_buffer),
std::cref(this->enable_weights_double_buffer),
std::cref(this->enable_split_reader),
std::cref(this->enable_subblock_padding));
std::cref(this->enable_subblock_padding),
std::cref(this->in_place));
}
};
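
Together with the pybind changes above, the new Conv2dConfig::in_place field is exposed both as a constructor keyword (py::arg("in_place") = false) and as a read/write attribute. A tiny usage sketch follows, assuming, as the visible defaults suggest, that every constructor argument is optional.

import ttnn

cfg = ttnn.Conv2dConfig()    # in_place defaults to False, matching the struct definition above
cfg.in_place = True          # equivalent to passing in_place=True at construction
print(cfg)                   # __repr__ is bound above, so the config (including in_place) can be inspected directly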

(Diffs for the remaining changed files are not shown.)
