Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#15572: Rewrite of Reshape OP from scratch #15572

Merged
merged 37 commits into from
Dec 5, 2024
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
5d343d4
#15269: reshape fully on device now
jvegaTT Nov 28, 2024
e2bbda5
#15558 edited comment to mention this issue
jvegaTT Nov 28, 2024
43b0bd2
#0: move tt_memmove to common library and ensure tilize/untilize is o…
jvegaTT Nov 29, 2024
8a94c1b
#0: added corrector for implied shape dimensions
jvegaTT Nov 29, 2024
4fd8c45
#13889: Added test to prove this issue is resolved
jvegaTT Nov 29, 2024
285d1fa
#12153: Adding test to verify that issue is resolved
jvegaTT Nov 29, 2024
7d40412
#15048: being more careful about bandaid for issues #15137 and #13338
jvegaTT Nov 29, 2024
7335c11
#14676: Adding test to verify that this issue is resolved
jvegaTT Nov 29, 2024
6110d47
#0: adding libraries for memmove to common
jvegaTT Nov 29, 2024
cbc2830
#14513: Adding test to prove issue is resolved
jvegaTT Nov 29, 2024
98c4b01
Merge branch 'main' into jvega/reshape_rm_on_device
jvegaTT Nov 29, 2024
055dc0f
#15269: added multi-core support
jvegaTT Nov 29, 2024
1d058c7
Merge branch 'jvega/reshape_rm_on_device' of github.com:tenstorrent/t…
jvegaTT Nov 29, 2024
fac56d8
#0: addressing PR comments
jvegaTT Nov 29, 2024
90b6255
Merge branch 'main' into jvega/reshape_rm_on_device
jvegaTT Nov 29, 2024
7ff95d2
#0: small oops
jvegaTT Nov 29, 2024
2edceb5
#15269: Host code optimizations
jvegaTT Dec 2, 2024
ad28b55
#15269: Move compute buffers to compile time
jvegaTT Dec 2, 2024
5501324
#15269: adding override_runtime_args_callback
jvegaTT Dec 2, 2024
109615f
#15269: improve the tt_memmove to use read or write noc as per user n…
jvegaTT Dec 2, 2024
631b1c5
#15269: improve the tt_memmove to use read or write datamover
jvegaTT Dec 2, 2024
10de9a2
#15269: add broken multi risk code
jvegaTT Dec 2, 2024
b709d9c
#15269: employing pow2 optimization
jvegaTT Dec 2, 2024
4fa2bf3
#15269: Added optimization to do one less copy on aligned only transfers
jvegaTT Dec 3, 2024
72f2d5c
#15269: added packet form of noc_async_read and write
jvegaTT Dec 3, 2024
09482f3
#15269: further small optimizations
jvegaTT Dec 3, 2024
3e30253
Merge branch 'main' into jvega/reshape_rm_on_device
jvegaTT Dec 3, 2024
323ace7
#15702: Added a skip for grayskull due to issue 15702
jvegaTT Dec 4, 2024
81c282d
#15269: updating mnist device perf targets
jvegaTT Dec 4, 2024
82c0eb7
#15269: updating other perf targets
jvegaTT Dec 4, 2024
d835144
#15269: removing broken unused kernel code
jvegaTT Dec 4, 2024
16dd8ad
Merge branch 'main' into jvega/reshape_rm_on_device
jvegaTT Dec 4, 2024
dad9a4f
#0: pre commit formatting change
jvegaTT Dec 4, 2024
0ea118b
#0 addressing artem PR review changes on 15572 PR
jvegaTT Dec 4, 2024
66a28c1
#15269: updating vgg device targets
jvegaTT Dec 4, 2024
0e411b0
#0: addressing austin PR review changes on 15572 PR
jvegaTT Dec 4, 2024
b8ce2b0
Merge branch 'main' into jvega/reshape_rm_on_device
jvegaTT Dec 4, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions tests/ttnn/unit_tests/test_reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,10 @@ def test_reshape_tile_layout_only_change_shape(device):
((1, 1445, 192), (1445, 192)),
((1, 256), (1, 1, 256)),
((16, 1, 32), (16, 1, 32)),
((1, 32, 4608), (1, 32, 16, 3, 96)), # issue 13889
((2888, 49, 96), (8, 19, 19, 7, 7, 96)), # issue 12153
((128, 1, 1, 128), (128, 128)), # issue 14676
((5, 4, 208, 156), (3, 13, 8, 2080)), # issue 14513
],
)
@pytest.mark.parametrize("layout", [ttnn.ROW_MAJOR_LAYOUT, ttnn.TILE_LAYOUT])
Expand All @@ -304,6 +308,26 @@ def test_reshape_tile_with_padding(input_shape, output_shape, layout, device):
ttnn_output = ttnn.reshape(input_tensor, output_shape)
assert layout == ttnn_output.layout
output = ttnn.to_torch(ttnn_output)
assert_with_pcc(torch_result, output, 0.9999)


# issue 15048
def test_broken_reshape(device):
    """Regression test for issue 15048: reshaping a (1, 56, 56, 64) tiled
    tensor to (1, 1, 56*56, 64) on device previously produced wrong results."""
    original_shape = (1, 56, 56, 64)
    flattened_shape = (1, 1, 56 * 56, 64)

    reference_input = torch.randn(original_shape, dtype=torch.bfloat16)
    expected = reference_input.reshape(flattened_shape)

    device_input = ttnn.from_torch(
        reference_input,
        dtype=ttnn.bfloat16,
        layout=ttnn.TILE_LAYOUT,
        device=device,
        memory_config=ttnn.DRAM_MEMORY_CONFIG,
    )
    actual = ttnn.to_torch(ttnn.reshape(device_input, flattened_shape))

    assert_with_pcc(expected, actual, 0.9999)

Expand Down
2 changes: 2 additions & 0 deletions ttnn/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ set(ALL_TTNN_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/reshape_on_device/device/reshape_program_factory.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/reshape_view/reshape.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/reshape_view/reshape_pybind.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/reshape_view/device/reshape_rm_op.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/reshape_view/device/host/reshape_rm_host_prep.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/unsqueeze/unsqueeze.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/unsqueeze/unsqueeze_pybind.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpp/ttnn/operations/data_movement/squeeze/squeeze.cpp
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,45 @@
// It's best to copy and paste the functions in rather than include the header as code size will likely explode
// Best to separate in to cpp/hpp at some point to avoid the code size explosion but need to figure out the linking
// issues
#include <stdio.h>
#include <cstring>
#define MASK_64 0xFFFFFFFFFFFFFFC0
#define OFFSET_64 0x000000000000003F
#define MASK_16 0xFFFFFFFFFFFFFFF0
#define OFFSET_16 0x000000000000000F

namespace tt::data_movement::common {

template <bool guaranteed_16B_alligned, bool copy_async>
FORCE_INLINE void tt_memmove(const uint32_t dst_l1_addr, const uint64_t src_l1_addr, const uint32_t bytes) {
    // Performs a memory copy of `bytes` bytes between two L1 addresses on the
    // local core, using noc_async_read to move the data whenever the NoC
    // alignment requirements allow it and falling back to a software memmove
    // otherwise.
    //
    // Template parameters:
    //   guaranteed_16B_alligned: set to true only if the caller externally
    //     guarantees both source and destination are 16B aligned (dangerous).
    //   copy_async: set to true to perform the copy asynchronously; the caller
    //     must then issue noc_async_read_barrier() before using the data.
    //
    // NOTE(review): the 16B sub-alignment check relies on the hardcoded
    // MASK_16/OFFSET_16 constants; these should eventually be derived from the
    // arch's L1_ALIGNMENT/DRAM_ALIGNMENT so new architectures are handled
    // automatically (see PR discussion).
    if constexpr (guaranteed_16B_alligned) {
        noc_async_read(get_noc_addr(src_l1_addr), dst_l1_addr, bytes);
        if constexpr (!copy_async) {
            noc_async_read_barrier();
        }
    } else {
        if ((dst_l1_addr & OFFSET_16) == (src_l1_addr & OFFSET_16)) {
            // Source and destination share the same 16B sub-alignment, so the
            // NoC read path is usable.
            noc_async_read(get_noc_addr(src_l1_addr), dst_l1_addr, bytes);
            if constexpr (!copy_async) {
                noc_async_read_barrier();
            }
        } else {
            // BUGFIX: convert the 64-bit source address through uintptr_t
            // instead of a C-style (void*) cast of a uint64_t, which is a
            // truncating conversion on the 32-bit RISC-V data-movement cores.
            memmove(
                reinterpret_cast<void*>(static_cast<uintptr_t>(dst_l1_addr)),
                reinterpret_cast<void*>(static_cast<uintptr_t>(src_l1_addr)),
                static_cast<size_t>(bytes));
        }
    }
}

// this function is useful for converting bfloat16 values to float32
FORCE_INLINE float bfloat16_to_float32(uint16_t bfloat16_data) {
uint32_t bits = static_cast<uint32_t>(bfloat16_data) << 16;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0


/*
Function reads from RM and writes to RM

Assumptions:

Compile-time arguments (set by the host in reshape_rm_host_prep.cpp):
0. src0_is_dram: 1 if the source tensor lives in DRAM, else 0
1. src_aligned_to_64: 1 if the source page size is a multiple of 64 bytes, else 0
2. src_aligned_to_16: 1 if the source page size is a multiple of 16 bytes, else 0
3. dst_aligned_to_16: 1 if the destination page size is a multiple of 16 bytes, else 0
//Extra read alignment handling is needed when the source is DRAM and its page
//size is not a multiple of 64 bytes

Runtime arguments
0. src_addr: source address
1. dst_addr: destination address
2. source_page_size_bytes: source page size in bytes
3. dest_page_size_bytes: destination page size in bytes
4. source_read_size_bytes: source read size in bytes (page size padded up for alignment)
5. read_start_page: first source page to read
6. read_end_page: last source page to read
7. write_start_page: first destination page to write
8. cb_id_in0: circular buffer index staging one source page
9. cb_id_in1: circular buffer index staging one destination page
*/
#include <stdint.h>
#include "dataflow_api.h"
#include "debug/dprint.h" // required in all kernels using DPRINT
#include "ttnn/cpp/ttnn/operations/data_movement/common/kernels/common.hpp"

void kernel_main() {
//We are guranteed to be in 2D going to 2D

const uint32_t src_addr = get_arg_val<uint32_t>(0);
const uint32_t dst_addr = get_arg_val<uint32_t>(1);
const uint32_t source_page_size_bytes = get_arg_val<uint32_t>(2);
const uint32_t dest_page_size_bytes = get_arg_val<uint32_t>(3);
//If DDR this is source_page_size_bytes + 64 (rounded up to next 64B), if L1 this is source_page_size_bytes + 16 (rounded up to next 16B)
const uint32_t source_read_size_bytes = get_arg_val<uint32_t>(4);
const uint32_t read_start_page = get_arg_val<uint32_t>(5);
const uint32_t read_end_page = get_arg_val<uint32_t>(6);
const uint32_t write_start_page = get_arg_val<uint32_t>(7);
//cb_id_in0 is a circular buffer with 1 source_page_size_bytes page if no alignment needed
//source_read_size_bytes otherwise
const uint32_t cb_id_in0 = get_arg_val<uint32_t>(8);
//cb_id_in1 is a circular buffer with 1 dest_page_size_bytes+16 (rounded up to next 64B) page
const uint32_t cb_id_in1 = get_arg_val<uint32_t>(9);


constexpr bool tensor_is_dram = get_compile_time_arg_val(0) == 1;
#define src_aligned_to_64 get_compile_time_arg_val<uint32_t>(1) == 1
#define src_aligned_to_16 get_compile_time_arg_val<uint32_t>(2) == 1
#define dst_aligned_to_16 get_compile_time_arg_val<uint32_t>(3) == 1


const InterleavedAddrGen<tensor_is_dram> s = {
.bank_base_address = src_addr,
.page_size = source_page_size_bytes
};

const InterleavedAddrGen<tensor_is_dram> d = {
.bank_base_address = dst_addr,
.page_size = dest_page_size_bytes
};


uint32_t read_offset = 0;
uint32_t write_page = write_start_page;
uint32_t readable = 0;
uint32_t transaction = 0;
uint32_t writable = dest_page_size_bytes;
//cb_id_in0 is a CB source_read_size_bytes page size, 1 page
//cb_id_in1 is a CB dest_page_size_bytes + allignment_to_64 page size, 1 page
cb_reserve_back(cb_id_in0, 1);
cb_reserve_back(cb_id_in1, 1);
const uint32_t source_buffer = get_write_ptr(cb_id_in0);
const uint32_t dest_buffer = get_write_ptr(cb_id_in1);

uint64_t dst_noc_addr = get_noc_addr(write_page, d);
#if (dst_aligned_to_16)
uint32_t write_offset = 0;
#else
uint32_t write_offset = dst_noc_addr&OFFSET_16;
uint32_t begin_write_offset = write_offset;
#endif
for (uint32_t i = read_start_page; i <= read_end_page; i++) {
//Read from source
uint64_t src_noc_addr = s.get_noc_addr(i,0);

#if (src_aligned_to_64 || ((!tensor_is_dram) && src_aligned_to_16))
//Aligned to 64 bytes or 16 bytes but L1
noc_async_read(src_noc_addr, source_buffer, source_page_size_bytes);
read_offset = 0;
#elif (tensor_is_dram)
//DDR but not alligned to 64 (potentially also not alligned to 16)
noc_async_read(src_noc_addr&MASK_64, source_buffer, source_read_size_bytes);
read_offset = src_noc_addr&OFFSET_64;
#else
//L1 but not alligned to 16
noc_async_read(src_noc_addr&MASK_16, source_buffer, source_read_size_bytes);
read_offset = src_noc_addr&OFFSET_16;
#endif
readable = source_page_size_bytes;
noc_async_read_barrier();

//Write to dest
while (readable > 0)
{
noc_async_write_barrier();
if (readable < writable)
{
tt::data_movement::common::tt_memmove<false,true>(dest_buffer+write_offset, source_buffer + read_offset, readable);
writable = writable -readable;
write_offset = write_offset + readable;
readable = 0;
}
else if (readable == writable)
{
tt::data_movement::common::tt_memmove<false,false>(dest_buffer+write_offset, source_buffer + read_offset, readable);
#if ((dst_aligned_to_16))
noc_async_write(dest_buffer,dst_noc_addr, dest_page_size_bytes);
#else
noc_async_write(dest_buffer+begin_write_offset,dst_noc_addr, dest_page_size_bytes);
#endif
writable = dest_page_size_bytes;
readable = 0;
if (i == read_end_page-1)
{
cb_push_back(cb_id_in0, 1);
cb_push_back(cb_id_in1, 1);
return;
}
write_page++;
dst_noc_addr = get_noc_addr(write_page, d);
#if ((dst_aligned_to_16))
write_offset=0;
#else
write_offset = dst_noc_addr&OFFSET_16;
begin_write_offset = write_offset;
#endif
}
else
{
//writable < readable

tt::data_movement::common::tt_memmove<false,false>(dest_buffer+write_offset, source_buffer + read_offset, writable);
#if ((dst_aligned_to_16))
noc_async_write(dest_buffer,dst_noc_addr, dest_page_size_bytes);
#else
noc_async_write(dest_buffer+begin_write_offset,dst_noc_addr, dest_page_size_bytes);
#endif
readable = readable - writable;
read_offset = read_offset + writable;
write_page++;
dst_noc_addr = get_noc_addr(write_page, d);
#if ((dst_aligned_to_16))
write_offset=0;
#else
write_offset = dst_noc_addr&OFFSET_16;
begin_write_offset = write_offset;
#endif
writable = dest_page_size_bytes;
}
}
}
cb_push_back(cb_id_in0, 1);
cb_push_back(cb_id_in1, 1);
return;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <math.h>

#include "ttnn/operations/cb_utils.hpp"
#include "ttnn/operations/math.hpp"
#include "ttnn/operation.hpp"
#include "ttnn/operations/core/work_split/work_split_tilize.hpp"
#include "tt_metal/common/constants.hpp"
#include "tt_metal/detail/util.hpp"
#include "tt_metal/host_api.hpp"
#include "ttnn/cpp/ttnn/operations/data_movement/reshape_view/reshape_common.hpp"

#include <optional>
#include <variant>

#include "ttnn/tensor/tensor.hpp"
#include "ttnn/core.hpp"
#include "ttnn/device_operation.hpp"
#include "ttnn/types.hpp"
#include "ttnn/decorators.hpp"

#define MASK_64 0xFFFFFFFFFFFFFFC0
#define OFFSET_64 0x000000000000003F
#define MASK_16 0xFFFFFFFFFFFFFFF0
#define OFFSET_16 0x000000000000000F

namespace ttnn::operations::data_movement::rm_reshape{

// Builds the program that performs a row-major reshape by streaming source
// pages through L1 and re-packing them into destination-sized pages (the
// device side is rm_reshape_interleaved.cpp).
operation::ProgramWithCallbacks rm_reshape_preparer(const Tensor& input, const Tensor& output)
{
    tt::tt_metal::Program program = tt::tt_metal::CreateProgram();
    //get datum size
    tt::DataFormat cb_data_format = tt::tt_metal::datatype_to_dataformat_converter(input.get_dtype());
    const uint32_t data_size = input.element_size();
    // NOTE(review): single-core only — all work is placed on core (0,0).
    CoreRange core({0, 0}, {0, 0});

    tt::tt_metal::Device *device = input.device();  // currently unused
    ttnn::Shape input_log_shape = ttnn::Shape(input.get_logical_shape().view());
    ttnn::Shape output_log_shape = ttnn::Shape(output.get_logical_shape().view());
    tt::log_debug("row major reshape");
    tt::log_debug("input shape: {}", input_log_shape);
    tt::log_debug("output shape: {}", output_log_shape);
    tt::log_debug("data size: {}", data_size);
    // One page = one row (innermost dimension) of the row-major tensor, in bytes.
    uint32_t source_page_size_bytes = input_log_shape[-1] * data_size;
    uint32_t dest_page_size_bytes = output_log_shape[-1] * data_size;
    // Page size rounded down to a 64B boundary plus 128B of slack so the kernel
    // can start its read at the preceding 64B-aligned address.
    // NOTE(review): 64B/16B alignment values are hardcoded here and in the
    // MASK_* macros — should be derived from the arch's DRAM/L1 alignment
    // constants (see PR discussion).
    uint32_t source_read_size_bytes = ((source_page_size_bytes-1) & MASK_64) + 128;
    uint32_t read_start_page = 0;
    uint32_t read_end_page = input_log_shape[-2];
    uint32_t write_start_page = 0;
    tt::tt_metal::Buffer *src_buffer = input.buffer();
    tt::tt_metal::Buffer *dst_buffer = output.buffer();
    TT_ASSERT(dst_buffer != nullptr, "Output buffer should be allocated on device!");

    // CB 0 stages one (alignment-padded) source page; CB 1 stages one
    // destination page plus alignment slack.
    const uint32_t cb_size0 = source_read_size_bytes;
    const uint32_t cb_size1 = ((dest_page_size_bytes-1)&MASK_64) + 80;

    uint32_t src0_cb_index = 0;
    uint32_t src1_cb_index = 1;
    tt::tt_metal::CircularBufferConfig cb_src0_config = tt::tt_metal::CircularBufferConfig(cb_size0*2, {{src0_cb_index, cb_data_format}})
        .set_page_size(src0_cb_index, cb_size0);
    auto cb_src0 = tt::tt_metal::CreateCircularBuffer(program, core, cb_src0_config);
    tt::tt_metal::CircularBufferConfig cb_src1_config = tt::tt_metal::CircularBufferConfig(cb_size1, {{src1_cb_index, cb_data_format}})
        .set_page_size(src1_cb_index, cb_size1);
    auto cb_src1 = tt::tt_metal::CreateCircularBuffer(program, core, cb_src1_config);
    //set the runtime args
    //set the compile time args
    uint32_t src0_is_dram = src_buffer->buffer_type() == tt::tt_metal::BufferType::DRAM ? 1 : 0;
    // Compile-time args consumed by the kernel, in order:
    // {src_is_dram, src 64B-aligned, src 16B-aligned, dst 16B-aligned}
    std::vector<uint32_t> compile_time_args = {
        (std::uint32_t) src0_is_dram,
        (std::uint32_t) (source_page_size_bytes%64==0) ? 1 : 0,
        (std::uint32_t) (source_page_size_bytes%16==0) ? 1 : 0,
        (std::uint32_t) (dest_page_size_bytes%16==0) ? 1 : 0,
    };

    // NOTE(review): the kernel path contains "device/device" — confirm this
    // matches the actual file location in the tree.
    tt::tt_metal::KernelHandle reader_kernel_id = tt::tt_metal::CreateKernel(
        program,
        "ttnn/cpp/ttnn/operations/data_movement/reshape_view/device/device/rm_reshape_interleaved.cpp",
        core,
        tt::tt_metal::ReaderDataMovementConfig(compile_time_args));
    // Runtime args 0-9 (buffer addresses, page sizes, page ranges, CB ids);
    // the order must match the get_arg_val calls in kernel_main.
    std::vector<uint32_t> reader_runtime_args = {
        src_buffer->address(),
        dst_buffer->address(),
        source_page_size_bytes,
        dest_page_size_bytes,
        source_read_size_bytes,
        read_start_page,
        read_end_page,
        write_start_page,
        src0_cb_index,
        src1_cb_index
    };
    tt::tt_metal::SetRuntimeArgs(
        program,
        reader_kernel_id,
        core,
        reader_runtime_args
    );
    // NOTE(review): no override_runtime_args_callback is installed, so a cached
    // program re-run with re-allocated buffers would keep stale addresses in
    // args 0/1 — confirm against how this factory is invoked.
    return {.program=std::move(program)};
}
}; // namespace ttnn::operations::data_movement::rm_reshape
Loading
Loading