diff --git a/tests/tt_eager/python_api_testing/unit_testing/misc/test_fold_op.py b/tests/tt_eager/python_api_testing/unit_testing/misc/test_fold_op.py
index 5f883e90ae41..9ae39a0fbaa3 100644
--- a/tests/tt_eager/python_api_testing/unit_testing/misc/test_fold_op.py
+++ b/tests/tt_eager/python_api_testing/unit_testing/misc/test_fold_op.py
@@ -162,8 +162,27 @@ def test_fold_with_permute_reshape_on_device(device, n, c, h, w, pad_h, pad_w, s
         torch_input_tensor, pad_h, pad_w, stride_h, stride_w
     )
     torch_output_tensor = torch.permute(torch_output_tensor, (0, 2, 3, 1))
-    tt_output_tensor = pad_and_fold_with_permute_and_reshape_on_device(
-        device, torch_input_tensor, pad_h, pad_w, stride_h, stride_w
+    # pad on host
+    n, c, h, w = torch_input_tensor.shape
+    C = _nearest_y(c, 4)
+    padded_h = h + pad_h * 2
+    padded_w = w + pad_w * 2
+    w_pad32 = padded_w + (32 - padded_w % 32) % 32
+    pad_w_right = w_pad32 - (w + pad_w)
+    torch_input_tensor_padded = torch.nn.functional.pad(torch_input_tensor, (pad_w, pad_w_right, pad_h, pad_h))
+    # on device
+    tt_input_tensor = ttnn.from_torch(
+        torch_input_tensor_padded, layout=ttnn.ROW_MAJOR_LAYOUT, device=device, memory_config=ttnn.L1_MEMORY_CONFIG
+    )
+    tt_output_tensor = ttl.tensor.fold(
+        tt_input_tensor,
+        stride_h,
+        stride_w,
+        use_transpose_as_fold=True,
+        output_shape=(n, padded_h // stride_h, padded_w // stride_w, C * (stride_h * stride_w)),
+        pad_c=C - c,
+        pad_h=pad_h,
+        pad_w=0,
     )
     tt_output_tensor = ttnn.to_torch(tt_output_tensor)
     assert_with_pcc(torch_output_tensor, tt_output_tensor, 1)
diff --git a/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/fold/fold_op.cpp b/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/fold/fold_op.cpp
index 093155e5acd0..186e70853071 100644
--- a/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/fold/fold_op.cpp
+++ b/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/fold/fold_op.cpp
@@ -6,7 +6,95 @@
 
 #include "ttnn/run_operation.hpp"
 
+
+#include "ttnn/operations/data_movement/transpose/transpose.hpp"
+#include "ttnn/cpp/ttnn/operations/data_movement/slice/slice.hpp"
+#include "ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/reshape/reshape_op.hpp"
+#include "ttnn/cpp/ttnn/operations/data_movement/pad/pad.hpp"
+
 namespace tt::tt_metal {
+
+std::vector<Tensor> fold_with_transpose_(
+    const Tensor& input, const std::optional<const Shape>& output_shape, uint8_t stride_h, uint8_t stride_w, uint8_t pad_c, uint8_t pad_h, uint8_t pad_w) {
+
+    Device * device;
+
+    // Get the device
+    if (input.storage_type() != StorageType::DEVICE) {
+        device = AutoFormat::GetDefaultDevice();
+        TT_ASSERT(device != nullptr, "Requires setting default device if no inputs to op are on device");
+    } else {
+        device = input.device();
+    }
+
+    uint32_t n = input.shape()[0], c = input.shape()[1], h = input.shape()[2], w = input.shape()[3];
+    auto padded_c = c + pad_c; // end padding only
+    auto padded_h = h + pad_h * 2; // front and end padding
+    auto padded_w = w + pad_w * 2; // front and end padding
+    auto padded_h32 = round_up(padded_h, TILE_HEIGHT);
+    auto padded_w32 = round_up(padded_w, TILE_HEIGHT);
+
+    auto L1_mem_config = tt::tt_metal::MemoryConfig{.memory_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED, .buffer_type=BufferType::L1};
+
+    tt::log_debug("input: {}", input.shape());
+
+    // pad input tensor
+    tt::tt_metal::Array4D padded_shape = {n, padded_c, padded_h32, padded_w32};
+    auto pad_output = ttnn::pad(input, padded_shape, tt::tt_metal::Array4D({0, 0, 0, 0}), 0);
+
+    tt::log_debug("pad_output: {}", pad_output.shape());
+
+    // transpose
+    auto transpose_hw_output = ttnn::transpose(pad_output, 2, 3, L1_mem_config);
+
+    tt::log_debug("transpose_hw_output: {}", transpose_hw_output.shape());
+
+    // transpose
+    auto transpose_hc_output = ttnn::transpose(transpose_hw_output, 1, 2, L1_mem_config);
+
+    tt::log_debug("transpose_hc_output: {}", transpose_hc_output.shape());
+
+    // reshape
+    n = transpose_hc_output.shape()[0], w = transpose_hc_output.shape()[1], c = transpose_hc_output.shape()[2], h = transpose_hc_output.shape()[3];
+    auto reshape_hc_output = tt::tt_metal::reshape(transpose_hc_output, n, (w / stride_w), (c * stride_w), h, L1_mem_config);
+
+    tt::log_debug("reshape_hc_output: {}", reshape_hc_output.shape());
+
+    // transpose
+    auto transpose_hw_output2 = ttnn::transpose(reshape_hc_output, 2, 3, L1_mem_config);
+
+    tt::log_debug("transpose_hw_output2: {}", transpose_hw_output2.shape());
+
+    // reshape
+    n = transpose_hw_output2.shape()[0], w = transpose_hw_output2.shape()[1], h = transpose_hw_output2.shape()[2], c = transpose_hw_output2.shape()[3];
+    auto reshape_hw_output = tt::tt_metal::reshape(transpose_hw_output2, n, w, (h / stride_h), (c * stride_h), L1_mem_config);
+
+    tt::log_debug("reshape_hw_output: {}", reshape_hw_output.shape());
+
+    // transpose
+    auto transpose_hc_output2 = ttnn::transpose(reshape_hw_output, 1, 2, L1_mem_config);
+
+    tt::log_debug("transpose_hc_output2: {}", transpose_hc_output2.shape());
+
+    std::vector<Tensor> output_tensors;
+    if (output_shape.has_value()) {
+        // slice
+        n = output_shape.value()[0], w = output_shape.value()[1], h = output_shape.value()[2], c = output_shape.value()[3];
+        tt::tt_metal::Array4D slice_output_tensor_start = {0, 0, 0, 0};
+        tt::tt_metal::Array4D slice_output_tensor_end = {n - 1, w - 1, h - 1, c - 1};
+        auto slice_output = ttnn::slice(transpose_hc_output2, slice_output_tensor_start, slice_output_tensor_end, L1_mem_config);
+
+        output_tensors.emplace_back(slice_output);
+
+        tt::log_debug("slice_output: {}", slice_output.shape());
+    } else {
+        output_tensors.emplace_back(transpose_hc_output2);
+    }
+
+    return output_tensors;
+
+}
+
 FoldOpParallelizationStrategy Fold::get_parallelization_strategy(const std::vector<Tensor> &input_tensors) const {
     if (is_sharded) {
         return FoldOpParallelizationStrategy::SHARDED_MULTI_CORE;
@@ -85,9 +173,12 @@ operation::ProgramWithCallbacks Fold::create_program(
     }
 }
 
-Tensor fold(const Tensor &input_tensor, uint8_t stride_h, uint8_t stride_w) {
+Tensor fold(const Tensor &input_tensor, uint8_t stride_h, uint8_t stride_w, bool use_transpose_as_fold, const std::optional<const Shape>& output_shape, uint8_t pad_c, uint8_t pad_h, uint8_t pad_w) {
     bool is_sharded = input_tensor.is_sharded();
 
+    if (use_transpose_as_fold) {
+        return operation::decorate_as_composite(__func__, fold_with_transpose_)(input_tensor, output_shape, stride_h, stride_w, pad_c, pad_h, pad_w).at(0);
+    }
     return operation::run(Fold{.stride_h = stride_h, .stride_w = stride_w, .is_sharded = is_sharded}, {input_tensor})
         .at(0);
 }
diff --git a/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/fold/fold_op.hpp b/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/fold/fold_op.hpp
index cf048120c0f6..f547d7ede0a7 100644
--- a/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/fold/fold_op.hpp
+++ b/ttnn/cpp/ttnn/deprecated/tt_dnn/op_library/fold/fold_op.hpp
@@ -36,5 +36,5 @@ operation::ProgramWithCallbacks fold_single_core(
 operation::ProgramWithCallbacks fold_multi_core(
     const Tensor &input, const Tensor &output, uint8_t stride_h, uint8_t stride_w);
 
-Tensor fold(const Tensor &input_tensor_a, uint8_t stride_h, uint8_t stride_w);
+Tensor fold(const Tensor &input_tensor_a, uint8_t stride_h, uint8_t stride_w, bool use_transpose_as_fold = false, const std::optional<const Shape>& output_shape = std::nullopt, uint8_t pad_c = 0, uint8_t pad_h = 0, uint8_t pad_w = 0);
 } // namespace tt::tt_metal
diff --git a/ttnn/cpp/ttnn/deprecated/tt_lib/csrc/tt_lib_bindings_tensor_dm_ops.cpp b/ttnn/cpp/ttnn/deprecated/tt_lib/csrc/tt_lib_bindings_tensor_dm_ops.cpp
index 5a6737460549..5535f226cc6d 100644
--- a/ttnn/cpp/ttnn/deprecated/tt_lib/csrc/tt_lib_bindings_tensor_dm_ops.cpp
+++ b/ttnn/cpp/ttnn/deprecated/tt_lib/csrc/tt_lib_bindings_tensor_dm_ops.cpp
@@ -209,7 +209,7 @@ namespace tt::tt_metal::detail{
         )doc");
 
     m_tensor.def("fold", &fold,
-        py::arg("input").noconvert(), py::arg("stride_h"), py::arg("stride_w"), R"doc(
+        py::arg("input").noconvert(), py::arg("stride_h"), py::arg("stride_w"), py::arg("use_transpose_as_fold")=false, py::arg("output_shape")=std::nullopt, py::arg("pad_c")=0, py::arg("pad_h")=0, py::arg("pad_w")=0, R"doc(
         Fold TT Tensor.
 
         Input tensor must be on TT accelerator device, in ROW_MAJOR.
diff --git a/ttnn/cpp/ttnn/operations/data_movement/pad/pad.cpp b/ttnn/cpp/ttnn/operations/data_movement/pad/pad.cpp
index a214e34964c5..bb00c45604e4 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/pad/pad.cpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/pad/pad.cpp
@@ -70,9 +70,6 @@ static ttnn::Tensor pad_impl(
     TT_FATAL(
         padding.size() == original_rank, "ttnn.pad: padding must be the same length as the input tensor rank");
-    TT_FATAL(
-        input_tensor.get_layout() != ttnn::ROW_MAJOR_LAYOUT,
-        "ttnn.pad: row-major tensors have to use fallback because the kernel currently causes a PCC error");
 
     // Unsqueeze Tensor to 4D if it is not already
     ttnn::Tensor input_tensor_4D = ttnn::unsqueeze_to_4D(input_tensor);
 
@@ -94,12 +91,14 @@ static ttnn::Tensor pad_impl(
             front_padding_is_zero,
             "ttnn.pad: on device padding does not support front padding");
 
-        const int target_height = output_padded_shape[padding.size() - 2];
-        const int target_width = output_padded_shape[padding.size() - 1];
-        TT_FATAL(
-            target_height % ttnn::TILE_SIZE == 0 || target_width % ttnn::TILE_SIZE == 0,
-            "ttnn.pad: for tiled tensors padding end must be a multiple of the tile size on height and width for a "
-            "tensor in tile layout");
+        if (input_tensor.get_layout() == ttnn::TILE_LAYOUT) {
+            const int target_height = output_padded_shape[padding.size() - 2];
+            const int target_width = output_padded_shape[padding.size() - 1];
+            TT_FATAL(
+                target_height % ttnn::TILE_SIZE == 0 || target_width % ttnn::TILE_SIZE == 0,
+                "ttnn.pad: for tiled tensors padding end must be a multiple of the tile size on height and width for a "
+                "tensor in tile layout");
+        }
 
         // Performing actual padding
         ShapeType pad_front_array;
diff --git a/ttnn/cpp/ttnn/operations/data_movement/slice/device/kernels/dataflow/slice_reader_unary_unpad_dims_rm_interleaved_start_id.cpp b/ttnn/cpp/ttnn/operations/data_movement/slice/device/kernels/dataflow/slice_reader_unary_unpad_dims_rm_interleaved_start_id.cpp
index a5631b32648e..0fc1de871bca 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/slice/device/kernels/dataflow/slice_reader_unary_unpad_dims_rm_interleaved_start_id.cpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/slice/device/kernels/dataflow/slice_reader_unary_unpad_dims_rm_interleaved_start_id.cpp
@@ -56,56 +56,3 @@ void kernel_main() {
         cb_push_back(cb_id_in0, num_read_per_barrier);
     }
 }
-
-
-// // SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
-// // -// // SPDX-License-Identifier: Apache-2.0 - -// #include -// #include "dataflow_api.h" - -// void kernel_main() { - -// const uint32_t src_addr = get_arg_val(0); -// const uint32_t padded_stick_size = get_arg_val(1); -// const uint32_t unpadded_stick_size = get_arg_val(2); -// const uint32_t num_dims = get_arg_val(3); -// const uint32_t start_id = get_arg_val(4); -// const uint32_t num_sticks = get_arg_val(5); - -// tt_l1_ptr uint32_t * num_unpadded_sticks = (tt_l1_ptr uint32_t*)(get_arg_addr(6)); -// volatile tt_l1_ptr uint32_t * num_padded_sticks = num_unpadded_sticks + num_dims; -// volatile tt_l1_ptr uint32_t * id_per_dim = num_padded_sticks + num_dims; - -// constexpr bool src0_is_dram = get_compile_time_arg_val(0) == 1; - -// const InterleavedAddrGen s0 = { -// .bank_base_address = src_addr, -// .page_size = padded_stick_size -// }; - -// constexpr uint32_t cb_id_in0 = 0; - -// uint32_t src_stick_id = start_id; - -// for(uint32_t i = 0; i < num_sticks; i++) { -// // Copy Input -// cb_reserve_back(cb_id_in0, 1); -// uint32_t src_buffer_l1_addr = get_write_ptr(cb_id_in0); -// uint64_t src_noc_addr = get_noc_addr(src_stick_id, s0); -// noc_async_read(src_noc_addr, src_buffer_l1_addr, unpadded_stick_size); -// noc_async_read_barrier(); -// cb_push_back(cb_id_in0, 1); -// src_stick_id++; -// for(uint32_t j = 0; j < num_dims; j++) { -// id_per_dim[j]++; -// if (id_per_dim[j] == num_unpadded_sticks[j]) { -// id_per_dim[j] = 0; -// src_stick_id += num_padded_sticks[j]; -// } else { -// break; -// } -// } -// } -// } diff --git a/ttnn/cpp/ttnn/operations/data_movement/slice/device/kernels/dataflow/slice_writer_unary_stick_layout_interleaved_start_id.cpp b/ttnn/cpp/ttnn/operations/data_movement/slice/device/kernels/dataflow/slice_writer_unary_stick_layout_interleaved_start_id.cpp index e99efc9e6f75..d7fe0d13bd77 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/slice/device/kernels/dataflow/slice_writer_unary_stick_layout_interleaved_start_id.cpp +++ b/ttnn/cpp/ttnn/operations/data_movement/slice/device/kernels/dataflow/slice_writer_unary_stick_layout_interleaved_start_id.cpp @@ -43,39 +43,3 @@ void kernel_main() { cb_pop_front(cb_id_out0, num_read_per_barrier); } } - - -// // SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
-// // -// // SPDX-License-Identifier: Apache-2.0 - -// #include -// #include "dataflow_api.h" - -// void kernel_main() { - - -// uint32_t dst_addr = get_arg_val(0); -// uint32_t stick_size = get_arg_val(1); -// uint32_t num_sticks = get_arg_val(2); -// uint32_t start_id = get_arg_val(3); - -// constexpr uint32_t cb_id_out0 = get_compile_time_arg_val(0); -// constexpr bool dst0_is_dram = get_compile_time_arg_val(1) == 1; - - -// const InterleavedAddrGen s0 = { -// .bank_base_address = dst_addr, -// .page_size = stick_size -// }; - - -// for (uint32_t i = start_id; i < start_id + num_sticks; i++) { -// cb_wait_front(cb_id_out0, 1); -// uint32_t l1_read_addr = get_read_ptr(cb_id_out0); -// uint64_t dst_noc_addr = get_noc_addr(i, s0); -// noc_async_write(l1_read_addr, dst_noc_addr, stick_size); -// noc_async_write_barrier(); -// cb_pop_front(cb_id_out0, 1); -// } -// } diff --git a/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_program_factory.hpp b/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_program_factory.hpp index 2e3ddd43af52..10821e644a1e 100644 --- a/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_program_factory.hpp +++ b/ttnn/cpp/ttnn/operations/data_movement/slice/device/slice_program_factory.hpp @@ -17,8 +17,6 @@ namespace tt { namespace tt_metal { - - inline std::vector, std::vector>> get_slice_runtime_args_rm( const Tensor& input_tensor, Tensor& output_tensor, @@ -548,527 +546,3 @@ operation::ProgramWithCallbacks slice_multi_core( } // namespace tt_metal } // namespace tt - -// // SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. -// // -// // SPDX-License-Identifier: Apache-2.0 -// #pragma once - -// #include "optional" -// #include "ttnn/deprecated/tt_dnn/op_library/math.hpp" -// #include "ttnn/deprecated/tt_dnn/op_library/work_split.hpp" -// #include "tt_metal/common/constants.hpp" -// #include "tt_metal/detail/util.hpp" -// #include "tt_metal/host_api.hpp" - -// #include "slice_op.hpp" -// using namespace tt::constants; - -// namespace tt { - -// namespace tt_metal { - -// inline std::vector, std::vector>> get_slice_runtime_args_rm( -// const Tensor& input_tensor, -// Tensor& output_tensor, -// const Shape& output_tensor_start, -// uint32_t num_cores_total, -// uint32_t num_cores, -// uint32_t num_cores_y, -// CoreRangeSet core_group_1, -// CoreRangeSet core_group_2, -// uint32_t num_sticks_per_core_group_1, -// uint32_t num_sticks_per_core_group_2) { -// tt_metal::Device* device = input_tensor.device(); - -// auto input_buffer = input_tensor.buffer(); -// auto output_buffer = output_tensor.buffer(); -// auto input_shape = input_tensor.get_legacy_shape(); -// auto output_shape = output_tensor.get_legacy_shape(); - -// uint32_t padded_row_size_bytes = input_shape[-1] * input_tensor.element_size(); -// uint32_t unpadded_row_size_bytes = output_shape[-1] * input_tensor.element_size(); - -// std::uint32_t num_dims = static_cast(input_shape.rank()); -// std::vector num_unpadded_sticks_per_dim(num_dims); -// std::vector num_padded_sticks_per_dim(num_dims); -// std::vector id_per_dim(num_dims); - -// std::vector accumulated_total_per_dim(num_dims); - -// // TODO: Remove first element of these arrays and update kernel accordingly -// // This currently just matches tile version where we iterate over the row as well -// num_unpadded_sticks_per_dim[0] = 1; -// num_padded_sticks_per_dim[0] = 0; -// accumulated_total_per_dim[0] = 1; - -// for (int32_t i = 1; i < num_dims; i++) { -// uint32_t num_unpadded_dim = output_shape[-(i + 1)]; -// uint32_t 
num_total_dim = input_shape[-(i + 1)]; -// uint32_t num_padded_dim = (num_total_dim - num_unpadded_dim) * accumulated_total_per_dim[i - 1]; -// num_unpadded_sticks_per_dim[i] = num_unpadded_dim; -// num_padded_sticks_per_dim[i] = num_padded_dim; -// accumulated_total_per_dim[i] = num_total_dim * accumulated_total_per_dim[i - 1]; -// } - -// vector common_reader_kernel_args = { -// input_tensor.buffer()->address() + output_tensor_start[-1] * output_tensor.element_size(), -// padded_row_size_bytes, -// unpadded_row_size_bytes, -// num_dims, -// 0, -// 0}; -// common_reader_kernel_args.insert( -// common_reader_kernel_args.end(), num_unpadded_sticks_per_dim.begin(), num_unpadded_sticks_per_dim.end()); -// common_reader_kernel_args.insert( -// common_reader_kernel_args.end(), num_padded_sticks_per_dim.begin(), num_padded_sticks_per_dim.end()); - -// std::vector, std::vector>> ret_val(num_cores_total); - -// uint32_t start_offset = ttnn::operations::data_movement::get_rm_start_offset(input_tensor, ttnn::Shape(output_tensor_start)); -// for (uint32_t i = 0, num_sticks_written = 0; i < num_cores_total; i++) { -// CoreCoord core = {i / num_cores_y, i % num_cores_y}; -// uint32_t num_sticks_per_core; -// if (core_group_1.core_coord_in_core_ranges(core)) { -// num_sticks_per_core = num_sticks_per_core_group_1; -// } else if (core_group_2.core_coord_in_core_ranges(core)) { -// num_sticks_per_core = num_sticks_per_core_group_2; -// } else { -// // no-op -// num_sticks_per_core = 0; -// } - -// id_per_dim[0] = num_sticks_written % num_unpadded_sticks_per_dim[0]; -// uint32_t unpadded_written = num_sticks_written / num_unpadded_sticks_per_dim[0]; -// uint32_t start_id = id_per_dim[0] + start_offset; - -// for (uint32_t j = 1; j < num_dims; j++) { -// id_per_dim[j] = unpadded_written % num_unpadded_sticks_per_dim[j]; -// unpadded_written = unpadded_written / num_unpadded_sticks_per_dim[j]; -// start_id += id_per_dim[j] * accumulated_total_per_dim[j - 1]; -// } -// vector reader_kernel_args = common_reader_kernel_args; -// // -// uint32_t addr_offset = 4; // input buffer addr, padded_row_size_bytes, unpadded_row_size_bytes, num_dims -// reader_kernel_args[addr_offset++] = start_id; -// reader_kernel_args[addr_offset] = num_sticks_per_core; -// reader_kernel_args.insert(reader_kernel_args.end(), id_per_dim.begin(), id_per_dim.end()); - -// vector writer_kernel_args = { -// output_buffer->address(), unpadded_row_size_bytes, num_sticks_per_core, num_sticks_written, 0}; -// num_sticks_written += num_sticks_per_core; -// ret_val[i] = {reader_kernel_args, writer_kernel_args}; -// } - -// return ret_val; -// } - -// operation::ProgramWithCallbacks slice_rm_multi_core( -// const Tensor& a, Tensor& output, const Shape& output_tensor_start, const Shape& output_tensor_end) { -// const Shape output_shape = output.get_legacy_shape(); - -// tt_metal::Program program = tt_metal::CreateProgram(); - -// // This should allocate a DRAM buffer on the device -// tt_metal::Device* device = a.device(); - -// uint32_t num_unpadded_sticks = output.volume() / output.get_legacy_shape()[-1]; - -// auto compute_with_storage_grid_size = device->compute_with_storage_grid_size(); -// uint32_t num_cores_x = compute_with_storage_grid_size.x; -// uint32_t num_cores_y = compute_with_storage_grid_size.y; - -// CoreRange total_cores({0, 0}, {num_cores_x - 1, num_cores_y - 1}); -// uint32_t num_cores_total = num_cores_x * num_cores_y; -// auto [num_cores, all_cores, core_group_1, core_group_2, num_sticks_per_core_group_1, 
num_sticks_per_core_group_2] = -// split_work_to_cores(compute_with_storage_grid_size, num_unpadded_sticks); - -// tt_metal::Buffer* src0_buffer = a.buffer(); - -// tt::DataFormat cb_data_format = tt_metal::datatype_to_dataformat_converter(a.get_dtype()); - -// uint32_t padded_row_size_bytes = a.get_legacy_shape()[-1] * a.element_size(); -// uint32_t unpadded_row_size_bytes = output_shape[-1] * a.element_size(); - -// tt_metal::Buffer* dst_buffer = output.buffer(); -// TT_ASSERT(dst_buffer != nullptr, "Output buffer should be allocated on device!"); - -// uint32_t src_stick_size = padded_row_size_bytes; -// uint32_t dst_stick_size = unpadded_row_size_bytes; - -// uint32_t src0_cb_index = 0; -// uint32_t num_input_pages = 2; - -// uint32_t cb_page_size = round_up(unpadded_row_size_bytes, TILE_WIDTH); -// tt_metal::CircularBufferConfig cb_src0_config = -// tt_metal::CircularBufferConfig(num_input_pages * cb_page_size, {{src0_cb_index, cb_data_format}}) -// .set_page_size(src0_cb_index, cb_page_size); -// auto cb_src0 = tt_metal::CreateCircularBuffer(program, total_cores, cb_src0_config); - -// bool src0_is_dram = src0_buffer->buffer_type() == tt_metal::BufferType::DRAM ? 1 : 0; -// std::vector reader_compile_time_args_vec = {(std::uint32_t)src0_is_dram}; -// bool dst_is_dram = dst_buffer->buffer_type() == tt_metal::BufferType::DRAM ? 1 : 0; - -// std::vector writer_compile_time_args_vec = {(std::uint32_t)src0_cb_index, (std::uint32_t)dst_is_dram}; - -// tt_metal::KernelHandle unary_reader_kernel_id = tt_metal::CreateKernel( -// program, -// "ttnn/cpp/ttnn/operations/data_movement/slice/device/kernels/dataflow/slice_reader_unary_unpad_dims_rm_interleaved_start_id.cpp", -// total_cores, -// tt_metal::ReaderDataMovementConfig(reader_compile_time_args_vec)); - -// tt_metal::KernelHandle unary_writer_kernel_id = tt_metal::CreateKernel( -// program, -// "ttnn/cpp/ttnn/operations/data_movement/slice/device/kernels/dataflow/slice_writer_unary_stick_layout_interleaved_start_id.cpp", -// total_cores, -// tt_metal::WriterDataMovementConfig(writer_compile_time_args_vec)); - -// auto all_runtime_args = get_slice_runtime_args_rm( -// a, -// output, -// output_tensor_start, -// num_cores_total, -// num_cores, -// num_cores_y, -// core_group_1, -// core_group_2, -// num_sticks_per_core_group_1, -// num_sticks_per_core_group_2); - -// for (uint32_t i = 0, num_sticks_written = 0; i < num_cores_total; i++) { -// CoreCoord core = {i / num_cores_y, i % num_cores_y}; -// tt_metal::SetRuntimeArgs(program, unary_reader_kernel_id, core, all_runtime_args[i].first); - -// tt_metal::SetRuntimeArgs(program, unary_writer_kernel_id, core, all_runtime_args[i].second); -// } - -// auto override_runtime_args_callback = -// [unary_reader_kernel_id, unary_writer_kernel_id, compute_with_storage_grid_size]( -// const void* operation, -// const Program& program, -// const std::vector& input_tensors, -// const std::vector>&, -// const std::vector& output_tensors) { -// auto src_tensor = input_tensors.at(0); -// auto dst_tensor = output_tensors.at(0); -// uint32_t num_cores_x = compute_with_storage_grid_size.x; -// uint32_t num_cores_y = compute_with_storage_grid_size.y; -// uint32_t num_cores_total = num_cores_x * num_cores_y; -// uint32_t num_unpadded_sticks = dst_tensor.volume() / dst_tensor.get_legacy_shape()[-1]; -// auto -// [num_cores, -// all_cores, -// core_group_1, -// core_group_2, -// num_sticks_per_core_group_1, -// num_sticks_per_core_group_2] = -// split_work_to_cores(compute_with_storage_grid_size, 
num_unpadded_sticks); - -// const auto tensor_start = static_cast(operation)->slice_start; -// auto all_runtime_args = get_slice_runtime_args_rm( -// src_tensor, -// dst_tensor, -// tensor_start, -// num_cores_total, -// num_cores, -// num_cores_y, -// core_group_1, -// core_group_2, -// num_sticks_per_core_group_1, -// num_sticks_per_core_group_2); - -// for (uint32_t i = 0, num_tiles_written = 0; i < num_cores_total; i++) { -// CoreCoord core = {i / num_cores_y, i % num_cores_y}; - -// { SetRuntimeArgs(program, unary_reader_kernel_id, core, all_runtime_args[i].first); } - -// { SetRuntimeArgs(program, unary_writer_kernel_id, core, all_runtime_args[i].second); } -// } -// }; - -// return {.program = std::move(program), .override_runtime_arguments_callback = override_runtime_args_callback}; -// } - -// template -// inline __attribute__((always_inline)) void set_slice_runtime_args_tile( -// const Tensor& input_tensor, -// const Tensor& output_tensor, -// const Shape& output_tensor_start, -// const uint32_t& num_cores_total, -// const uint32_t& num_cores, -// const std::vector& cores, -// const uint32_t& num_cores_group_1, -// const uint32_t& num_cores_group_2, -// const uint32_t& num_tiles_per_core_group_1, -// const uint32_t& num_tiles_per_core_group_2, -// const Program& program, -// const tt_metal::KernelHandle& unary_reader_kernel_id, -// const tt_metal::KernelHandle& unary_writer_kernel_id, -// std::vector& accumulated_total_per_dim) { -// const auto input_buffer = input_tensor.buffer(); -// const auto output_buffer = output_tensor.buffer(); -// const auto& input_shape = input_tensor.get_legacy_shape(); -// const auto& output_shape = output_tensor.get_legacy_shape(); - -// std::uint32_t num_dims = static_cast(input_shape.rank()); - -// uint32_t num_unpadded_Xt = output_shape[-1] / TILE_WIDTH; -// uint32_t num_total_Xt = input_shape[-1] / TILE_WIDTH; -// uint32_t num_padded_Xt = num_total_Xt - num_unpadded_Xt; -// uint32_t num_unpadded_Yt = output_shape[-2] / TILE_HEIGHT; -// uint32_t num_total_Yt = input_shape[-2] / TILE_HEIGHT; -// uint32_t num_padded_Yt = (num_total_Yt - num_unpadded_Yt) * num_total_Xt; - -// const auto set_common_reader_args = [&]( -// uint32_t* reader_common_args, -// uint32_t* num_unpadded_tiles_per_dim, -// uint32_t* num_padded_tiles_per_dim) __attribute__((always_inline)) { -// reader_common_args[0] = input_buffer->address(); -// num_unpadded_tiles_per_dim[0] = num_unpadded_Xt; -// num_unpadded_tiles_per_dim[1] = num_unpadded_Yt; -// num_padded_tiles_per_dim[0] = num_padded_Xt; -// num_padded_tiles_per_dim[1] = num_padded_Yt; -// accumulated_total_per_dim[0] = num_total_Xt; -// accumulated_total_per_dim[1] = num_total_Yt * num_total_Xt; -// for (int32_t i = 2; i < num_dims; ++i) { -// uint32_t num_unpadded_dim = output_shape[-(i + 1)]; -// uint32_t num_total_dim = input_shape[-(i + 1)]; -// uint32_t num_padded_dim = (num_total_dim - num_unpadded_dim) * accumulated_total_per_dim[i - 1]; -// num_unpadded_tiles_per_dim[i] = num_unpadded_dim; -// num_padded_tiles_per_dim[i] = num_padded_dim; -// accumulated_total_per_dim[i] = num_total_dim * accumulated_total_per_dim[i - 1]; -// } -// }; - -// const auto set_reader_rt_args = [&]( -// uint32_t* reader_rt_args, -// const uint32_t* num_unpadded_tiles_per_dim, -// const uint32_t* num_padded_tiles_per_dim, -// const uint32_t& num_tiles_per_core, -// const uint32_t& start_offset, -// const uint32_t& num_tiles_written) __attribute__((always_inline)) { -// reader_rt_args[2] = num_tiles_written % 
num_unpadded_tiles_per_dim[0]; -// uint32_t unpadded_written = num_tiles_written / num_unpadded_tiles_per_dim[0]; -// uint32_t start_id = reader_rt_args[2] + start_offset; -// for (uint32_t j = 1; j < num_dims; ++j) { -// reader_rt_args[2 + j] = unpadded_written % num_unpadded_tiles_per_dim[j]; -// unpadded_written = unpadded_written / num_unpadded_tiles_per_dim[j]; -// start_id += reader_rt_args[2 + j] * accumulated_total_per_dim[j - 1]; -// } -// reader_rt_args[0] = start_id; -// reader_rt_args[1] = num_tiles_per_core; -// }; - -// if constexpr (initialize_args) { -// std::vector reader_common_args(1 + num_dims * 2); -// uint32_t* num_unpadded_tiles_per_dim = reader_common_args.data() + 1; -// uint32_t* num_padded_tiles_per_dim = num_unpadded_tiles_per_dim + num_dims; -// set_common_reader_args(reader_common_args.data(), num_unpadded_tiles_per_dim, num_padded_tiles_per_dim); -// SetCommonRuntimeArgs(program, unary_reader_kernel_id, reader_common_args); -// } -// auto& reader_common_args = GetCommonRuntimeArgs(program, unary_reader_kernel_id); -// uint32_t* num_unpadded_tiles_per_dim = reader_common_args.data() + 1; -// uint32_t* num_padded_tiles_per_dim = num_unpadded_tiles_per_dim + num_dims; -// if constexpr (!initialize_args) { -// set_common_reader_args(reader_common_args.data(), num_unpadded_tiles_per_dim, num_padded_tiles_per_dim); -// } - -// uint32_t start_offset = ttnn::operations::data_movement::get_tiled_start_offset(input_tensor, ttnn::Shape(output_tensor_start)); - -// auto& reader_kernel_args_by_core = GetRuntimeArgs(program, unary_reader_kernel_id); -// auto& writer_kernel_args_by_core = GetRuntimeArgs(program, unary_writer_kernel_id); -// const uint32_t num_used_cores = num_cores_group_1 + num_cores_group_2; -// for (uint32_t i = 0, num_tiles_written = 0; i < num_cores_total; ++i) { -// const CoreCoord& core = cores[i]; -// uint32_t num_tiles_per_core; -// if (i < num_cores_group_1) { -// num_tiles_per_core = num_tiles_per_core_group_1; -// } else if (i < num_used_cores) { -// num_tiles_per_core = num_tiles_per_core_group_2; -// } else { -// // no-op -// if constexpr (initialize_args) { -// std::vector reader_kernel_args(2 + num_dims, 0); -// std::vector writer_kernel_args(3, 0); -// tt_metal::SetRuntimeArgs(program, unary_reader_kernel_id, core, reader_kernel_args); -// tt_metal::SetRuntimeArgs(program, unary_writer_kernel_id, core, writer_kernel_args); -// } else { -// auto& reader_kernel_args = reader_kernel_args_by_core[core.x][core.y]; -// reader_kernel_args[1] = 0; -// auto& writer_kernel_args = writer_kernel_args_by_core[core.x][core.y]; -// writer_kernel_args[1] = 0; -// } -// continue; -// } - -// if constexpr (initialize_args) { -// std::vector reader_kernel_args(2 + num_dims); -// set_reader_rt_args( -// reader_kernel_args.data(), -// num_unpadded_tiles_per_dim, -// num_padded_tiles_per_dim, -// num_tiles_per_core, -// start_offset, -// num_tiles_written); -// SetRuntimeArgs(program, unary_reader_kernel_id, core, reader_kernel_args); -// } else { -// auto& reader_kernel_args = reader_kernel_args_by_core[core.x][core.y]; -// set_reader_rt_args( -// reader_kernel_args.data(), -// num_unpadded_tiles_per_dim, -// num_padded_tiles_per_dim, -// num_tiles_per_core, -// start_offset, -// num_tiles_written); -// } - -// if constexpr (initialize_args) { -// vector writer_kernel_args = {output_buffer->address(), num_tiles_per_core, num_tiles_written}; -// tt_metal::SetRuntimeArgs(program, unary_writer_kernel_id, core, writer_kernel_args); -// } else { -// auto& 
writer_kernel_args = writer_kernel_args_by_core[core.x][core.y]; -// writer_kernel_args[0] = output_buffer->address(); -// writer_kernel_args[1] = num_tiles_per_core; -// writer_kernel_args[2] = num_tiles_written; -// } -// num_tiles_written += num_tiles_per_core; -// } -// } - -// operation::ProgramWithCallbacks slice_tile_multi_core( -// const Tensor& a, Tensor& output, const Shape& output_tensor_start, const Shape& output_tensor_end) { -// const Shape output_shape = output.get_legacy_shape(); - -// tt_metal::Program program = tt_metal::CreateProgram(); - -// // This should allocate a DRAM buffer on the device -// tt_metal::Device* device = a.device(); - -// uint32_t num_unpadded_tiles = output.volume() / TILE_HW; - -// auto compute_with_storage_grid_size = device->compute_with_storage_grid_size(); -// uint32_t num_cores_x = compute_with_storage_grid_size.x; -// uint32_t num_cores_y = compute_with_storage_grid_size.y; -// auto num_cores_total = num_cores_x * num_cores_y; -// CoreRange total_cores({0, 0}, {num_cores_x - 1, num_cores_y - 1}); - -// auto [num_cores, all_cores, core_group_1, core_group_2, num_tiles_per_core_group_1, num_tiles_per_core_group_2] = -// split_work_to_cores(compute_with_storage_grid_size, num_unpadded_tiles); - -// tt_metal::Buffer* src0_buffer = a.buffer(); - -// tt_metal::Buffer* dst_buffer = output.buffer(); -// TT_ASSERT(dst_buffer != nullptr, "Output buffer should be allocated on device!"); - -// tt::DataFormat cb_data_format = tt_metal::datatype_to_dataformat_converter(a.get_dtype()); -// uint32_t single_tile_size = tt_metal::detail::TileSize(cb_data_format); - -// uint32_t src0_cb_index = 0; -// uint32_t num_input_tiles = 2; -// tt_metal::CircularBufferConfig cb_src0_config = -// tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, cb_data_format}}) -// .set_page_size(src0_cb_index, single_tile_size); -// auto cb_src0 = tt_metal::CreateCircularBuffer(program, total_cores, cb_src0_config); - -// std::uint32_t num_dims = static_cast(a.get_legacy_shape().rank()); - -// // Reader compile-time args -// // Data is 32 byte aligned -// bool src0_is_dram = src0_buffer->buffer_type() == tt_metal::BufferType::DRAM ? 1 : 0; -// bool dst_is_dram = dst_buffer->buffer_type() == tt_metal::BufferType::DRAM ? 
1 : 0; -// std::vector reader_compile_time_args = { -// static_cast(src0_cb_index), -// static_cast(num_dims), -// static_cast(src0_is_dram), -// }; -// std::vector writer_compile_time_args = { -// static_cast(src0_cb_index), static_cast(dst_is_dram)}; - -// // Tilized reader -// tt_metal::KernelHandle unary_reader_kernel_id = tt_metal::CreateKernel( -// program, -// "ttnn/cpp/ttnn/operations/data_movement/slice/device/kernels/dataflow/reader_unary_unpad_dims_interleaved_start_id.cpp", -// total_cores, -// tt_metal::ReaderDataMovementConfig(reader_compile_time_args)); - -// tt_metal::KernelHandle unary_writer_kernel_id = tt_metal::CreateKernel( -// program, -// "ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp", -// total_cores, -// tt_metal::WriterDataMovementConfig(writer_compile_time_args)); - -// const auto cores = grid_to_cores(num_cores_total, num_cores_x, num_cores_y, false); - -// std::vector accumulated_total_per_dim(num_dims); -// set_slice_runtime_args_tile( -// a, -// output, -// output_tensor_start, -// num_cores_total, -// num_cores, -// cores, -// core_group_1.num_cores(), -// core_group_2.num_cores(), -// num_tiles_per_core_group_1, -// num_tiles_per_core_group_2, -// program, -// unary_reader_kernel_id, -// unary_writer_kernel_id, -// accumulated_total_per_dim); - -// auto override_runtime_args_callback = [unary_reader_kernel_id, -// unary_writer_kernel_id, -// compute_with_storage_grid_size, -// cores, -// accumulated_total_per_dim]( -// const void* operation, -// const Program& program, -// const std::vector& input_tensors, -// const std::vector>&, -// const std::vector& output_tensors) mutable { -// const Tensor& src_tensor = input_tensors[0]; -// const Tensor& dst_tensor = output_tensors[0]; -// uint32_t num_unpadded_tiles = dst_tensor.volume() / TILE_HW; - -// uint32_t num_cores_x = compute_with_storage_grid_size.x; -// uint32_t num_cores_y = compute_with_storage_grid_size.y; -// uint32_t num_cores_total = cores.size(); - -// auto -// [num_cores, all_cores, core_group_1, core_group_2, num_tiles_per_core_group_1, num_tiles_per_core_group_2] = -// split_work_to_cores(compute_with_storage_grid_size, num_unpadded_tiles); - -// const auto& tensor_start = static_cast(operation)->slice_start; -// set_slice_runtime_args_tile( -// src_tensor, -// dst_tensor, -// tensor_start, -// num_cores_total, -// num_cores, -// cores, -// core_group_1.num_cores(), -// core_group_2.num_cores(), -// num_tiles_per_core_group_1, -// num_tiles_per_core_group_2, -// program, -// unary_reader_kernel_id, -// unary_writer_kernel_id, -// accumulated_total_per_dim); -// }; - -// return {.program = std::move(program), .override_runtime_arguments_callback = override_runtime_args_callback}; -// } - -// operation::ProgramWithCallbacks slice_multi_core( -// const Tensor& a, Tensor& output, const Shape& output_tensor_start, const Shape& output_tensor_end) { -// switch (a.get_layout()) { -// case Layout::ROW_MAJOR: return slice_rm_multi_core(a, output, output_tensor_start, output_tensor_end); -// case Layout::TILE: return slice_tile_multi_core(a, output, output_tensor_start, output_tensor_end); -// default: TT_ASSERT(false, "Unsupported Layout"); -// } -// return {}; -// } - -// } // namespace tt_metal - -// } // namespace tt
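
Note on the new use_transpose_as_fold path: fold_with_transpose_ in this patch implements fold (space-to-depth) as pad, transpose(2,3), transpose(1,2), reshape, transpose(2,3), reshape, transpose(1,2), then an optional slice back to output_shape. The following is a minimal PyTorch sketch of that same index shuffle for reference; the helper name is made up here, and the tile-size (32) alignment padding and the trailing ttnn::slice that the device path needs are omitted, since plain torch has no tile-layout constraint. It assumes the padded H and W are divisible by the strides.

import torch


def fold_with_transposes_reference(x, stride_h, stride_w, pad_c=0, pad_h=0, pad_w=0):
    # x is NCHW; pad W and H on both sides, C at the end only, mirroring fold_with_transpose_.
    x = torch.nn.functional.pad(x, (pad_w, pad_w, pad_h, pad_h, 0, pad_c))
    n, c, h, w = x.shape
    x = x.transpose(2, 3)                             # (N, C, W, H)
    x = x.transpose(1, 2)                             # (N, W, C, H)
    x = x.reshape(n, w // stride_w, c * stride_w, h)  # fold stride_w columns into channels
    x = x.transpose(2, 3)                             # (N, W/stride_w, H, C*stride_w)
    x = x.reshape(n, w // stride_w, h // stride_h, c * stride_w * stride_h)
    x = x.transpose(1, 2)                             # (N, H/stride_h, W/stride_w, C*stride_h*stride_w)
    return x

For example, a (1, 3, 224, 224) input with pad_h=3, pad_w=3, pad_c=1 and stride 2 (the ResNet50 conv1 configuration) comes out as (1, 115, 115, 16), matching the output_shape formula (n, padded_h // stride_h, padded_w // stride_w, C * (stride_h * stride_w)) used in the test.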