# Quick fix for single card device perf (#17752)
### Ticket
Link to GitHub issue: N/A

### Problem description
Parallelizing untilize-with-unpadding over the width caused a device-performance
regression for some models. This PR fixes that regression.

### What's changed
The width-parallelized path is now used only when height parallelization cannot
work, i.e. when the circular buffers for a full row of tiles would not fit in
free L1 (see the sketch below). Block parallelization with a better threshold
will be added in the future.
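
The decision boils down to comparing the estimated circular-buffer footprint against free L1. Below is a minimal standalone sketch of that logic; `Device`, its fields, and `pick_parallelization` are simplified stand-ins invented for this example (the real checks live in `get_max_l1_space` and `enough_available_space` in the diff below):

```cpp
#include <cstdint>
#include <optional>

// Illustrative stand-in for the device queries used by get_max_l1_space()
// in the diff; not the real tt-metal API.
struct Device {
    std::optional<uint32_t> lowest_occupied_compute_l1_address;
    uint32_t l1_size_per_core;
    uint32_t l1_base_allocator_addr;
};

// L1 bytes available for circular buffers: everything between the allocator
// base and the lowest address already occupied by compute buffers.
uint32_t max_l1_space(const Device& dev) {
    uint32_t top = dev.lowest_occupied_compute_l1_address.value_or(dev.l1_size_per_core);
    return top - dev.l1_base_allocator_addr;
}

// The input and output CBs each hold one tile per tile along the row;
// both must fit in the free L1 region at once.
bool enough_available_space(const Device& dev,
                            uint32_t input_tile_size,
                            uint32_t output_tile_size,
                            uint32_t num_tiles) {
    uint32_t estimated_cb_size = (input_tile_size + output_tile_size) * num_tiles;
    return max_l1_space(dev) > estimated_cb_size;
}

// Dispatch: height parallelization stays the default; the width-parallel
// kernel is only a fallback when the per-row CBs cannot fit (this PR's fix).
enum class Parallelization { Height, Width };

Parallelization pick_parallelization(const Device& dev,
                                     uint32_t tile_size,
                                     uint32_t num_tiles_per_row) {
    bool enough_space_height =
        enough_available_space(dev, tile_size, tile_size, num_tiles_per_row);
    return enough_space_height ? Parallelization::Height : Parallelization::Width;
}
```

The conservative ordering is deliberate: height parallelization remains the default, and the width-parallel kernel is used only when it cannot fit.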

### Checklist
- [x] [All post
commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml)
CI passes
https://github.com/tenstorrent/tt-metal/actions/runs/13212247605
- [ ] [Blackhole Post
commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml)
CI passes (if applicable)
- [ ] [Model
regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml)
CI passes (if applicable)
- [x] [Device performance
regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml)
CI passes (if applicable)
https://github.com/tenstorrent/tt-metal/actions/runs/13207121716
- [ ] **(For models and ops writers)** Full [new models
tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml)
CI passes (if applicable)
- [ ] New/Existing tests provide coverage for changes
nardoTT authored Feb 8, 2025
1 parent e4ecf87 commit a4b0687
Showing 5 changed files with 54 additions and 8 deletions.
@@ -109,7 +109,7 @@ operation::ProgramWithCallbacks UntilizeWithUnpadding::create_program(
     auto& output_tensor = output_tensors.at(0);
     if (input_tensors.at(0).memory_config().is_sharded() || this->use_multicore) {
         return detail::untilize_with_unpadding_multi_core(
-            input_tensor_a, output_tensor, this->use_pack_untilize, this->fp32_dest_acc_en);
+            input_tensor_a, output_tensor, this->use_pack_untilize, this->fp32_dest_acc_en, this->enough_space_height);
     } else {
         return detail::untilize_with_unpadding_single_core(
             input_tensor_a, output_tensor, this->use_pack_untilize, this->fp32_dest_acc_en);
@@ -17,6 +17,8 @@ struct UntilizeWithUnpadding {
     const bool use_multicore;
     const bool use_pack_untilize;
     const bool fp32_dest_acc_en;
+    const bool enough_space_width;
+    const bool enough_space_height;
 
     void validate(const std::vector<Tensor>& input_tensors) const;
     std::vector<ttnn::TensorSpec> compute_output_specs(const std::vector<Tensor>& input_tensors) const;
@@ -365,7 +365,7 @@ operation::ProgramWithCallbacks untilize_with_unpadding_multi_core_col_interleaved(
 }
 
 operation::ProgramWithCallbacks untilize_with_unpadding_multi_core_interleaved(
-    const Tensor& a, Tensor& output, bool use_pack_untilize, bool fp32_dest_acc_en) {
+    const Tensor& a, Tensor& output, bool use_pack_untilize, bool fp32_dest_acc_en, bool enough_space_height) {
     tt::tt_metal::Program program{};
 
     tt::DataFormat input_cb_data_format = datatype_to_dataformat_converter(a.get_dtype());
@@ -383,7 +383,7 @@ operation::ProgramWithCallbacks untilize_with_unpadding_multi_core_interleaved(
     uint32_t num_tiles_per_row = a.get_padded_shape()[-1] / TILE_WIDTH;
 
     uint32_t num_tiles_per_col = a.get_padded_shape()[-2] / TILE_HEIGHT;
-    if (num_tiles_per_row > num_tiles_per_col) {
+    if (!enough_space_height) {
         return untilize_with_unpadding_multi_core_col_interleaved(a, output, use_pack_untilize, fp32_dest_acc_en);
     }
 
@@ -839,11 +839,12 @@ operation::ProgramWithCallbacks untilize_with_unpadding_multi_core_sharded(
 }
 
 operation::ProgramWithCallbacks untilize_with_unpadding_multi_core(
-    const Tensor& a, Tensor& output, bool use_pack_untilize, bool fp32_dest_acc_en) {
+    const Tensor& a, Tensor& output, bool use_pack_untilize, bool fp32_dest_acc_en, bool enough_space_height) {
     if (a.memory_config().is_sharded()) {
         return untilize_with_unpadding_multi_core_sharded(a, output, use_pack_untilize, fp32_dest_acc_en);
     } else {
-        return untilize_with_unpadding_multi_core_interleaved(a, output, use_pack_untilize, fp32_dest_acc_en);
+        return untilize_with_unpadding_multi_core_interleaved(
+            a, output, use_pack_untilize, fp32_dest_acc_en, enough_space_height);
     }
 }
@@ -12,13 +12,13 @@ tt::tt_metal::operation::ProgramWithCallbacks untilize_with_unpadding_single_core(
     const Tensor& a, Tensor& output, bool use_pack_untilize, bool fp32_dest_acc_en);
 
 tt::tt_metal::operation::ProgramWithCallbacks untilize_with_unpadding_multi_core_interleaved(
-    const Tensor& a, Tensor& output, bool use_pack_untilize, bool fp32_dest_acc_en);
+    const Tensor& a, Tensor& output, bool use_pack_untilize, bool fp32_dest_acc_en, bool enough_space_height);
 
 // This purely supports input block shard -> output interleaved for now
 tt::tt_metal::operation::ProgramWithCallbacks untilize_with_unpadding_multi_core_sharded(
     const Tensor& a, Tensor& output, bool use_pack_untilize, bool fp32_dest_acc_en);
 
 tt::tt_metal::operation::ProgramWithCallbacks untilize_with_unpadding_multi_core(
-    const Tensor& a, Tensor& output, bool use_pack_untilize, bool fp32_dest_acc_en);
+    const Tensor& a, Tensor& output, bool use_pack_untilize, bool fp32_dest_acc_en, bool enough_space_height);
 
 } // namespace ttnn::operations::data_movement::detail
@@ -32,6 +32,35 @@ ttnn::Shape squeeze_vector_shape(ttnn::Shape output_shape) {
 
 namespace ttnn::operations::data_movement {
 
+inline uint32_t get_estimated_size_of_cbs(
+    const Tensor& input_tensor_a,
+    const uint32_t input_single_tile_size,
+    const uint32_t output_single_tile_size,
+    const uint32_t num_tiles_per_row) {
+    uint32_t cb_src0_size = input_single_tile_size * num_tiles_per_row;
+    uint32_t cb_output_size = output_single_tile_size * num_tiles_per_row;
+    return cb_src0_size + cb_output_size;
+}
+
+inline uint32_t get_max_l1_space(const Tensor& input_tensor_a) {
+    auto device = input_tensor_a.device();
+    auto lowest_address = device->lowest_occupied_compute_l1_address();
+    uint32_t max_l1_space = lowest_address.has_value() ? lowest_address.value() : device->l1_size_per_core();
+    max_l1_space = max_l1_space - device->allocator()->get_base_allocator_addr(HalMemType::L1);
+    return max_l1_space;
+}
+
+inline bool enough_available_space(
+    const Tensor& input_tensor_a,
+    const uint32_t input_single_tile_size,
+    const uint32_t output_single_tile_size,
+    const uint32_t num_tiles_per_row) {
+    uint32_t max_l1_space = get_max_l1_space(input_tensor_a);
+    uint32_t estimated_size_of_cbs =
+        get_estimated_size_of_cbs(input_tensor_a, input_single_tile_size, output_single_tile_size, num_tiles_per_row);
+    return max_l1_space > estimated_size_of_cbs;
+}
+
 using OwnedUntilizeValArgs = std::tuple<ttnn::Tensor>;
 using BaseUntilizeValType = std::function<ttnn::Tensor(const ttnn::Tensor&)>;
 
@@ -82,14 +111,28 @@ ttnn::Tensor ExecuteUntilizeWithUnpadding::invoke(
         output_end = ttnn::Shape(std::move(output_end_vector));
     }
 
+    auto input_cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(input_tensor.get_dtype());
+    uint32_t input_single_tile_size = tt::tt_metal::detail::TileSize(input_cb_data_format);
+
+    uint32_t num_tiles_per_row = input_tensor.get_padded_shape()[-1] / tt::constants::TILE_WIDTH;
+    uint32_t num_tiles_per_col = input_tensor.get_padded_shape()[-2] / tt::constants::TILE_HEIGHT;
+
+    uint32_t output_single_tile_size = input_single_tile_size;
+    bool enough_space_width =
+        enough_available_space(input_tensor, input_single_tile_size, output_single_tile_size, num_tiles_per_col);
+    bool enough_space_height =
+        enough_available_space(input_tensor, input_single_tile_size, output_single_tile_size, num_tiles_per_row);
+
     auto base_untilize = [=](const ttnn::Tensor& input_tensor) {
         return operation::run(
             UntilizeWithUnpadding{// output_end,
                                   ttnn::Shape(output_end),
                                   memory_config.value_or(input_tensor.memory_config()),
                                   use_multicore,
                                   use_pack_untilize,
-                                  fp32_dest_acc_en},
+                                  fp32_dest_acc_en,
+                                  enough_space_width,
+                                  enough_space_height},
             {input_tensor},
             {},
             {},
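
As a back-of-the-envelope illustration of the new guard (the numbers here are assumptions for the example, not values queried from a real device): a 32×32 bfloat16 tile is about 2 KiB, so a tensor padded to 128 tiles per row needs roughly 128 × 2 KiB for the input CB plus the same for the output CB, about 512 KiB in total. With, say, 1 MiB of free L1, `enough_space_height` is true and the height-parallel kernel runs; at 512 tiles per row the estimate grows to 2 MiB, the check fails, and the op falls back to the width-parallel (`col_interleaved`) kernel.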
