diff --git a/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/reader_pad_dims_rm_sharded_stickwise.cpp b/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/reader_pad_dims_rm_sharded_stickwise.cpp
index 42e18f0f3653..378c5a7f70b5 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/reader_pad_dims_rm_sharded_stickwise.cpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/pad/device/kernels/dataflow/reader_pad_dims_rm_sharded_stickwise.cpp
@@ -23,29 +23,25 @@ void kernel_main() {
     constexpr uint32_t unpadded_stick_step = get_compile_time_arg_val(7);
     constexpr uint32_t padded_stick_step = get_compile_time_arg_val(8);
 
-    auto output_offset_bytes = get_arg_val<uint32_t>(0);
     uint32_t input_shard_base_addr = get_write_ptr(input_shard_cb);
     uint32_t output_shard_base_addr = get_write_ptr(output_shard_cb);
 
-    // for the first stick, we need to add the front padding by adjusting where
-    // we start writing into the output shard
-    uint32_t output_shard_addr = output_shard_base_addr + output_offset_bytes;
-
     auto input_stick_ptr = reinterpret_cast<volatile tt_l1_ptr uint8_t*>(input_shard_base_addr);
-    auto output_stick_ptr = reinterpret_cast<volatile tt_l1_ptr uint8_t*>(output_shard_addr);
+    auto output_stick_ptr = reinterpret_cast<volatile tt_l1_ptr uint8_t*>(output_shard_base_addr);
 
     // fill the sticks that aren't entirely padding with data from the input tensor
     for (uint32_t h = 0; h < unpadded_shard_height; h++) {
         cb_wait_front(output_shard_cb, 1);  // wait for writer to fill this stick with padding
 
-        // read the input stick into the padded output stick starting after the
-        // front padding
-
         // FIXME: this isn't aligned. we need to do a memcpy for now. we can try
         // to do a noc_async_read later on with a trick.
+        //
+        // currently small noc transfers are slow, but once runtime drops an
+        // optimization (upcoming as of 12/12/2024) this might be worth
+        // investigating.
-        // noc_async_read(output_stick_addr + W_front_pad_bytes, input_stick_addr, unpadded_stick_bytes);
-
-        // NOTE: memcpy is safe here because the input/output tensors have disjoint buffers.
+
+        // read the input stick into the padded output stick starting after the
+        // front padding
         for (uint32_t i = 0; i < unpadded_stick_bytes; i++) {
             output_stick_ptr[W_front_pad_bytes + i] = input_stick_ptr[i];
         }
diff --git a/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp b/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp
index 1433121dbfd2..0bff06eafe0c 100644
--- a/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp
+++ b/ttnn/cpp/ttnn/operations/data_movement/pad/device/pad_program_factory.cpp
@@ -1696,26 +1696,8 @@ operation::ProgramWithCallbacks pad_rm_sharded_width_only(
         all_cores_padded,
         tt::tt_metal::WriterDataMovementConfig(writer_ct_args));
 
-    uint32_t first_shard_output_offset = 0;
-
-    auto [which_shard, shard_coord] = tensor_coord_to_shard_coord(
-        output_tensor_shape.logical_shape().view(), shard_spec_padded.shape, input_tensor_start.view());
-
-    TT_FATAL(which_shard == 0, "pad_rm_sharded_stickwise: input tensor start should be in the first shard.");
-    uint32_t h_in_shard = shard_coord[0];
-    first_shard_output_offset = h_in_shard * tt::round_up(padded_stick_bytes, dram_alignment_bytes);
-
-    auto all_cores_padded_vec = corerange_to_cores(all_cores_padded, std::nullopt, true);
-
-    // Set runtime args for all cores
-    for (const auto& core : all_cores_padded_vec) {
-        // First core gets first_shard_output_offset, others get 0
-        uint32_t offset = (core == *all_cores_padded_vec.begin()) ? first_shard_output_offset : 0;
-        tt::tt_metal::SetRuntimeArgs(program, reader_kernel_id, core, {offset});
-        tt::tt_metal::SetRuntimeArgs(program, writer_kernel_id, core, {});
-    }
-
-    // FIXME: need to update runtime args?
+    tt::tt_metal::SetRuntimeArgs(program, reader_kernel_id, all_cores_padded, {});
+    tt::tt_metal::SetRuntimeArgs(program, writer_kernel_id, all_cores_padded, {});
 
     auto override_runtime_args_callback = [
         input_shard_cb,
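For readers unfamiliar with the kernel-side workaround above: the reader copies each input stick byte by byte because the L1 stick addresses are not guaranteed to satisfy the alignment requirements of `noc_async_read`, and the writer has already filled the whole padded stick with the pad value (hence the `cb_wait_front` handshake). Below is a minimal host-side sketch of that copy over a single stick; the sizes and pad value are invented for illustration, and only the copy loop mirrors the kernel:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Model of one padded "stick": [front pad | input data | back pad].
// The writer kernel pre-fills the whole stick with padding; the reader
// then copies the input bytes in after the front pad, like the
// memcpy-style loop in reader_pad_dims_rm_sharded_stickwise.cpp.
int main() {
    constexpr uint32_t unpadded_stick_bytes = 12;  // invented size
    constexpr uint32_t W_front_pad_bytes = 4;      // invented front pad
    constexpr uint32_t padded_stick_bytes = 20;    // front pad + data + back pad

    std::vector<uint8_t> input_stick(unpadded_stick_bytes, 0xAB);
    std::vector<uint8_t> output_stick(padded_stick_bytes, 0x00);  // writer's pad fill

    // Byte-wise copy, as in the kernel; no alignment assumptions needed.
    for (uint32_t i = 0; i < unpadded_stick_bytes; i++) {
        output_stick[W_front_pad_bytes + i] = input_stick[i];
    }

    assert(output_stick[0] == 0x00);                        // front pad intact
    assert(output_stick[W_front_pad_bytes] == 0xAB);        // data starts after it
    assert(output_stick[padded_stick_bytes - 1] == 0x00);   // back pad intact
    return 0;
}
```

On the host side, the simplification works because `tt::tt_metal::SetRuntimeArgs` accepts the `CoreRangeSet` (`all_cores_padded`) directly, as the new lines show: once the kernel no longer reads `output_offset_bytes`, every core gets the same empty argument list, so the per-core loop and the first-core offset special case can be deleted.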