# Quick fix for single card device perf (#17752)
### Ticket
Link to GitHub issue: N/A

### Problem description
Parallelizing untilize-with-unpadding over the width caused a device-performance
regression for some models. This PR fixes that regression.

### What's changed
The width-parallelized path is now used only when height parallelization cannot
work, i.e. when the circular buffers for a full row of tiles would not fit in
free L1 (see the sketch below). Block parallelization with a better threshold
will be added in the future.
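
The decision boils down to comparing the estimated circular-buffer footprint against free L1. Below is a minimal standalone sketch of that logic; `Device`, its fields, and `pick_parallelization` are simplified stand-ins invented for this example (the real checks live in `get_max_l1_space` and `enough_available_space` in the diff below):

```cpp
#include <cstdint>
#include <optional>

// Illustrative stand-in for the device queries used by get_max_l1_space()
// in the diff; not the real tt-metal API.
struct Device {
    std::optional<uint32_t> lowest_occupied_compute_l1_address;
    uint32_t l1_size_per_core;
    uint32_t l1_base_allocator_addr;
};

// L1 bytes available for circular buffers: everything between the allocator
// base and the lowest address already occupied by compute buffers.
uint32_t max_l1_space(const Device& dev) {
    uint32_t top = dev.lowest_occupied_compute_l1_address.value_or(dev.l1_size_per_core);
    return top - dev.l1_base_allocator_addr;
}

// The input and output CBs each hold one tile per tile along the row;
// both must fit in the free L1 region at once.
bool enough_available_space(const Device& dev,
                            uint32_t input_tile_size,
                            uint32_t output_tile_size,
                            uint32_t num_tiles) {
    uint32_t estimated_cb_size = (input_tile_size + output_tile_size) * num_tiles;
    return max_l1_space(dev) > estimated_cb_size;
}

// Dispatch: height parallelization stays the default; the width-parallel
// kernel is only a fallback when the per-row CBs cannot fit (this PR's fix).
enum class Parallelization { Height, Width };

Parallelization pick_parallelization(const Device& dev,
                                     uint32_t tile_size,
                                     uint32_t num_tiles_per_row) {
    bool enough_space_height =
        enough_available_space(dev, tile_size, tile_size, num_tiles_per_row);
    return enough_space_height ? Parallelization::Height : Parallelization::Width;
}
```

The conservative ordering is deliberate: height parallelization remains the default, and the width-parallel kernel is used only when it cannot fit.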

### Checklist
- [x] [All post
commit](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml)
CI passes
https://github.com/tenstorrent/tt-metal/actions/runs/13212247605
- [ ] [Blackhole Post
commit](https://github.com/tenstorrent/tt-metal/actions/workflows/blackhole-post-commit.yaml)
CI passes (if applicable)
- [ ] [Model
regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-models.yaml)
CI passes (if applicable)
- [x] [Device performance
regression](https://github.com/tenstorrent/tt-metal/actions/workflows/perf-device-models.yaml)
CI passes (if applicable)
https://github.com/tenstorrent/tt-metal/actions/runs/13207121716
- [ ] **(For models and ops writers)** Full [new models
tests](https://github.com/tenstorrent/tt-metal/actions/workflows/full-new-models-suite.yaml)
CI passes (if applicable)
- [ ] New/Existing tests provide coverage for changes
nardoTT authored Feb 8, 2025
1 parent e4ecf87 commit a4b0687
Showing 5 changed files with 54 additions and 8 deletions.
@@ -109,7 +109,7 @@ operation::ProgramWithCallbacks UntilizeWithUnpadding::create_program(
     auto& output_tensor = output_tensors.at(0);
     if (input_tensors.at(0).memory_config().is_sharded() || this->use_multicore) {
         return detail::untilize_with_unpadding_multi_core(
-            input_tensor_a, output_tensor, this->use_pack_untilize, this->fp32_dest_acc_en);
+            input_tensor_a, output_tensor, this->use_pack_untilize, this->fp32_dest_acc_en, this->enough_space_height);
     } else {
         return detail::untilize_with_unpadding_single_core(
             input_tensor_a, output_tensor, this->use_pack_untilize, this->fp32_dest_acc_en);
@@ -17,6 +17,8 @@ struct UntilizeWithUnpadding {
     const bool use_multicore;
     const bool use_pack_untilize;
     const bool fp32_dest_acc_en;
+    const bool enough_space_width;
+    const bool enough_space_height;
 
     void validate(const std::vector<Tensor>& input_tensors) const;
     std::vector<ttnn::TensorSpec> compute_output_specs(const std::vector<Tensor>& input_tensors) const;
@@ -365,7 +365,7 @@ operation::ProgramWithCallbacks untilize_with_unpadding_multi_core_col_interleaved(
 }
 
 operation::ProgramWithCallbacks untilize_with_unpadding_multi_core_interleaved(
-    const Tensor& a, Tensor& output, bool use_pack_untilize, bool fp32_dest_acc_en) {
+    const Tensor& a, Tensor& output, bool use_pack_untilize, bool fp32_dest_acc_en, bool enough_space_height) {
     tt::tt_metal::Program program{};
 
     tt::DataFormat input_cb_data_format = datatype_to_dataformat_converter(a.get_dtype());
@@ -383,7 +383,7 @@ operation::ProgramWithCallbacks untilize_with_unpadding_multi_core_interleaved(
     uint32_t num_tiles_per_row = a.get_padded_shape()[-1] / TILE_WIDTH;
 
     uint32_t num_tiles_per_col = a.get_padded_shape()[-2] / TILE_HEIGHT;
-    if (num_tiles_per_row > num_tiles_per_col) {
+    if (!enough_space_height) {
         return untilize_with_unpadding_multi_core_col_interleaved(a, output, use_pack_untilize, fp32_dest_acc_en);
     }
 
@@ -839,11 +839,12 @@ operation::ProgramWithCallbacks untilize_with_unpadding_multi_core_sharded(
 }
 
 operation::ProgramWithCallbacks untilize_with_unpadding_multi_core(
-    const Tensor& a, Tensor& output, bool use_pack_untilize, bool fp32_dest_acc_en) {
+    const Tensor& a, Tensor& output, bool use_pack_untilize, bool fp32_dest_acc_en, bool enough_space_height) {
     if (a.memory_config().is_sharded()) {
         return untilize_with_unpadding_multi_core_sharded(a, output, use_pack_untilize, fp32_dest_acc_en);
     } else {
-        return untilize_with_unpadding_multi_core_interleaved(a, output, use_pack_untilize, fp32_dest_acc_en);
+        return untilize_with_unpadding_multi_core_interleaved(
+            a, output, use_pack_untilize, fp32_dest_acc_en, enough_space_height);
     }
 }
@@ -12,13 +12,13 @@ tt::tt_metal::operation::ProgramWithCallbacks untilize_with_unpadding_single_core(
     const Tensor& a, Tensor& output, bool use_pack_untilize, bool fp32_dest_acc_en);
 
 tt::tt_metal::operation::ProgramWithCallbacks untilize_with_unpadding_multi_core_interleaved(
-    const Tensor& a, Tensor& output, bool use_pack_untilize, bool fp32_dest_acc_en);
+    const Tensor& a, Tensor& output, bool use_pack_untilize, bool fp32_dest_acc_en, bool enough_space_height);
 
 // This purely supports input block shard -> output interleaved for now
 tt::tt_metal::operation::ProgramWithCallbacks untilize_with_unpadding_multi_core_sharded(
     const Tensor& a, Tensor& output, bool use_pack_untilize, bool fp32_dest_acc_en);
 
 tt::tt_metal::operation::ProgramWithCallbacks untilize_with_unpadding_multi_core(
-    const Tensor& a, Tensor& output, bool use_pack_untilize, bool fp32_dest_acc_en);
+    const Tensor& a, Tensor& output, bool use_pack_untilize, bool fp32_dest_acc_en, bool enough_space_height);
 
 } // namespace ttnn::operations::data_movement::detail
@@ -32,6 +32,35 @@ ttnn::Shape squeeze_vector_shape(ttnn::Shape output_shape) {
 
 namespace ttnn::operations::data_movement {
 
+inline uint32_t get_estimated_size_of_cbs(
+    const Tensor& input_tensor_a,
+    const uint32_t input_single_tile_size,
+    const uint32_t output_single_tile_size,
+    const uint32_t num_tiles_per_row) {
+    uint32_t cb_src0_size = input_single_tile_size * num_tiles_per_row;
+    uint32_t cb_output_size = output_single_tile_size * num_tiles_per_row;
+    return cb_src0_size + cb_output_size;
+}
+
+inline uint32_t get_max_l1_space(const Tensor& input_tensor_a) {
+    auto device = input_tensor_a.device();
+    auto lowest_address = device->lowest_occupied_compute_l1_address();
+    uint32_t max_l1_space = lowest_address.has_value() ? lowest_address.value() : device->l1_size_per_core();
+    max_l1_space = max_l1_space - device->allocator()->get_base_allocator_addr(HalMemType::L1);
+    return max_l1_space;
+}
+
+inline bool enough_available_space(
+    const Tensor& input_tensor_a,
+    const uint32_t input_single_tile_size,
+    const uint32_t output_single_tile_size,
+    const uint32_t num_tiles_per_row) {
+    uint32_t max_l1_space = get_max_l1_space(input_tensor_a);
+    uint32_t estimated_size_of_cbs =
+        get_estimated_size_of_cbs(input_tensor_a, input_single_tile_size, output_single_tile_size, num_tiles_per_row);
+    return max_l1_space > estimated_size_of_cbs;
+}
+
 using OwnedUntilizeValArgs = std::tuple<ttnn::Tensor>;
 using BaseUntilizeValType = std::function<ttnn::Tensor(const ttnn::Tensor&)>;
 
@@ -82,14 +111,28 @@ ttnn::Tensor ExecuteUntilizeWithUnpadding::invoke(
         output_end = ttnn::Shape(std::move(output_end_vector));
     }
 
+    auto input_cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(input_tensor.get_dtype());
+    uint32_t input_single_tile_size = tt::tt_metal::detail::TileSize(input_cb_data_format);
+
+    uint32_t num_tiles_per_row = input_tensor.get_padded_shape()[-1] / tt::constants::TILE_WIDTH;
+    uint32_t num_tiles_per_col = input_tensor.get_padded_shape()[-2] / tt::constants::TILE_HEIGHT;
+
+    uint32_t output_single_tile_size = input_single_tile_size;
+    bool enough_space_width =
+        enough_available_space(input_tensor, input_single_tile_size, output_single_tile_size, num_tiles_per_col);
+    bool enough_space_height =
+        enough_available_space(input_tensor, input_single_tile_size, output_single_tile_size, num_tiles_per_row);
+
     auto base_untilize = [=](const ttnn::Tensor& input_tensor) {
         return operation::run(
             UntilizeWithUnpadding{// output_end,
                                   ttnn::Shape(output_end),
                                   memory_config.value_or(input_tensor.memory_config()),
                                   use_multicore,
                                   use_pack_untilize,
-                                  fp32_dest_acc_en},
+                                  fp32_dest_acc_en,
+                                  enough_space_width,
+                                  enough_space_height},
             {input_tensor},
             {},
             {},
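
As a back-of-the-envelope illustration of the new guard (the numbers here are assumptions for the example, not values queried from a real device): a 32×32 bfloat16 tile is about 2 KiB, so a tensor padded to 128 tiles per row needs roughly 128 × 2 KiB for the input CB plus the same for the output CB, about 512 KiB in total. With, say, 1 MiB of free L1, `enough_space_height` is true and the height-parallel kernel runs; at 512 tiles per row the estimate grows to 2 MiB, the check fails, and the op falls back to the width-parallel (`col_interleaved`) kernel.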
