diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h index c7106cd43ab..0766d4e5de2 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/common.h @@ -1035,7 +1035,6 @@ inline bool gen_rnd_dispatcher_packed_write_large_cmd(Device *device, sub_cmd.addr = device_data.get_result_data_addr(range.start_coord); sub_cmd.length = xfer_size_bytes; sub_cmd.num_mcast_dests = (range.end_coord.x - range.start_coord.x + 1) * (range.end_coord.y - range.start_coord.y + 1); - sub_cmd.flags = CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_FLAG_UNLINK; for (uint32_t i = 0; i < sizeof(CQDispatchWritePackedLargeSubCmd) / sizeof(uint32_t); i++) { cmds.push_back(((uint32_t *)&sub_cmd)[i]); diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index 45e986f704f..1c00ed7b2c5 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -931,8 +931,7 @@ void EnqueueProgramCommand::assemble_device_commands( .noc_xy_addr = noc_encoding, .addr = dst_addr, .length = (uint16_t)write_length, - .num_mcast_dests = (uint8_t)num_mcast_dests, - .flags = CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_FLAG_NONE}); + .num_mcast_dests = (uint16_t)num_mcast_dests}); RecordDispatchData( program, DISPATCH_DATA_BINARY, write_length, kg_transfer_info.riscvs[kernel_idx]); dst_addr += write_length; @@ -948,10 +947,6 @@ void EnqueueProgramCommand::assemble_device_commands( } } } - // Unlink the last subcmd of the current core range - if (!write_linear) { - kernel_bins_dispatch_subcmds.back().back().flags |= CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_FLAG_UNLINK; - } } for (uint32_t i = 0; i < kernel_bins_dispatch_subcmds.size(); ++i) { cmd_sequence_sizeB += align( diff --git a/tt_metal/impl/dispatch/cq_commands.hpp b/tt_metal/impl/dispatch/cq_commands.hpp index 3f71158ed65..c94e8e6762f 100644 --- a/tt_metal/impl/dispatch/cq_commands.hpp +++ b/tt_metal/impl/dispatch/cq_commands.hpp @@ -178,6 +178,7 @@ struct CQDispatchWritePagedCmd { constexpr uint32_t CQ_DISPATCH_CMD_PACKED_WRITE_FLAG_NONE = 0x00; constexpr uint32_t CQ_DISPATCH_CMD_PACKED_WRITE_FLAG_MCAST = 0x01; constexpr uint32_t CQ_DISPATCH_CMD_PACKED_WRITE_FLAG_NO_STRIDE = 0x02; +constexpr uint32_t CQ_DISPATCH_CMD_PACKED_WRITE_RELAY = 0x03; struct CQDispatchWritePackedCmd { uint8_t flags; // see above @@ -196,14 +197,11 @@ struct CQDispatchWritePackedMulticastSubCmd { uint32_t num_mcast_dests; } __attribute__((packed)); -constexpr uint32_t CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_FLAG_NONE = 0x00; -constexpr uint32_t CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_FLAG_UNLINK = 0x01; struct CQDispatchWritePackedLargeSubCmd { uint32_t noc_xy_addr; uint32_t addr; uint16_t length; // multiples of L1 cache line alignment - uint8_t num_mcast_dests; - uint8_t flags; + uint16_t num_mcast_dests; } __attribute__((packed)); constexpr inline __attribute__((always_inline)) uint32_t get_packed_write_max_multicast_sub_cmds(uint32_t packed_write_max_unicast_sub_cmds) { diff --git a/tt_metal/impl/dispatch/kernels/cq_common.hpp b/tt_metal/impl/dispatch/kernels/cq_common.hpp index 2fbe886dfe7..6b0227f4eae 100644 --- a/tt_metal/impl/dispatch/kernels/cq_common.hpp +++ b/tt_metal/impl/dispatch/kernels/cq_common.hpp @@ -117,30 +117,18 @@ void cq_noc_async_write_with_state(uint32_t src_addr, uint64_t dst_addr, uint32_ // More generic version of cq_noc_async_write_with_state: Allows writing an abitrary amount of data, when the NOC config (dst_noc, // VC..) have been specified. -template FORCE_INLINE -uint32_t cq_noc_async_write_with_state_any_len(uint32_t src_addr, uint64_t dst_addr, uint32_t size = 0, uint32_t ndests = 1) { - if (size > NOC_MAX_BURST_SIZE) { +void cq_noc_async_write_with_state_any_len(uint32_t src_addr, uint64_t dst_addr, uint32_t size = 0, uint32_t ndests = 1) { + while(size > NOC_MAX_BURST_SIZE) { cq_noc_async_write_with_state(src_addr, dst_addr, NOC_MAX_BURST_SIZE, ndests); src_addr += NOC_MAX_BURST_SIZE; dst_addr += NOC_MAX_BURST_SIZE; size -= NOC_MAX_BURST_SIZE; - while(size > NOC_MAX_BURST_SIZE) { - cq_noc_async_write_with_state(src_addr, dst_addr, NOC_MAX_BURST_SIZE, ndests); - src_addr += NOC_MAX_BURST_SIZE; - dst_addr += NOC_MAX_BURST_SIZE; - size -= NOC_MAX_BURST_SIZE; - } - } - if constexpr (write_last_packet) { - cq_noc_async_write_with_state(src_addr, dst_addr, size, ndests); - return 0; - } else { - return size; } + cq_noc_async_write_with_state(src_addr, dst_addr, size, ndests); } -template +template FORCE_INLINE void cq_noc_async_write_init_state(uint32_t src_addr, uint64_t dst_addr, uint32_t size = 0) { @@ -151,8 +139,9 @@ void cq_noc_async_write_init_state(uint32_t src_addr, uint64_t dst_addr, uint32_ } DEBUG_STATUS("NSID"); - constexpr bool multicast_path_reserve = true; + constexpr bool multicast_path_reserve = mcast; constexpr bool posted = false; + constexpr bool linked = false; constexpr uint32_t vc = mcast ? NOC_DISPATCH_MULTICAST_WRITE_VC : NOC_UNICAST_WRITE_VC; constexpr uint32_t noc_cmd_field = diff --git a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp index 8c9f90303b6..a5180c4a939 100644 --- a/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp +++ b/tt_metal/impl/dispatch/kernels/cq_dispatch.cpp @@ -662,6 +662,8 @@ void process_write_packed_large() { uint32_t data_ptr = cmd_ptr + sizeof(CQDispatchCmd) + count * sizeof(CQDispatchWritePackedLargeSubCmd); data_ptr = round_up_pow2(data_ptr, L1_ALIGNMENT); + cq_noc_async_write_init_state(0, 0); + constexpr uint32_t sub_cmd_size = sizeof(CQDispatchWritePackedLargeSubCmd); careful_copy_from_l1_to_local_cache( (volatile uint32_t tt_l1_ptr *)(cmd_ptr + sizeof(CQDispatchCmd)), count * sub_cmd_size / sizeof(uint32_t), l1_cache); @@ -670,25 +672,18 @@ void process_write_packed_large() { uint32_t mcasts = noc_nonposted_writes_acked[noc_index]; CQDispatchWritePackedLargeSubCmd *sub_cmd_ptr = (CQDispatchWritePackedLargeSubCmd *)l1_cache; - bool init_state = true; while (count != 0) { + uint32_t dst_noc = sub_cmd_ptr->noc_xy_addr; uint32_t dst_addr = sub_cmd_ptr->addr + local_write_offset; uint32_t length = sub_cmd_ptr->length; uint32_t num_dests = sub_cmd_ptr->num_mcast_dests; uint32_t pad_size = align(length, alignment) - length; - uint32_t unlink = sub_cmd_ptr->flags & CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_FLAG_UNLINK; - - // Only re-init state after we have unlinked the last transaction - // Otherwise we assume NOC coord hasn't changed - // TODO: If we are able to send 0 length txn to unset link, we don't need a flag and can compare dst_noc to prev to determine linking - if (init_state) { - uint32_t dst_noc = sub_cmd_ptr->noc_xy_addr; - // TODO: Linking should be set to true once atomic txn is handled properly - cq_noc_async_write_init_state(0, get_noc_addr_helper(dst_noc, dst_addr)); - } sub_cmd_ptr++; + // Note: expect to only have 1 or a few pages, so this doesn't optimize writing length + cq_noc_async_write_with_state(0, get_noc_addr_helper(dst_noc, dst_addr)); + while (length != 0) { // More data needs to be written, but we've exhausted the CB. Acquire more pages. if (data_ptr == cb_fence) { @@ -710,30 +705,15 @@ void process_write_packed_large() { } // Transfer size is min(remaining_length, data_available_in_cb) uint32_t available_data = cb_fence - data_ptr; - uint32_t xfer_size; - if (length > available_data) { - xfer_size = available_data; - cq_noc_async_write_with_state_any_len(data_ptr, dst_addr, xfer_size, num_dests); - } else { - xfer_size = length; - if (unlink) { - uint32_t rem_xfer_size = cq_noc_async_write_with_state_any_len(data_ptr, dst_addr, xfer_size, num_dests); - // Unset Link flag - cq_noc_async_write_init_state(0, 0, 0); - uint32_t data_offset = xfer_size - rem_xfer_size; - cq_noc_async_write_with_state(data_ptr + data_offset, dst_addr + data_offset, rem_xfer_size, num_dests); - } else { - cq_noc_async_write_with_state_any_len(data_ptr, dst_addr, xfer_size, num_dests); - } - } - writes += div_up(xfer_size, NOC_MAX_BURST_SIZE); + uint32_t xfer_size = (length > available_data) ? available_data : length; + cq_noc_async_write_with_state_any_len(data_ptr, dst_addr, xfer_size, num_dests); + uint32_t num_noc_packets_written = div_up(xfer_size, NOC_MAX_BURST_SIZE); + writes += num_noc_packets_written; length -= xfer_size; data_ptr += xfer_size; dst_addr += xfer_size; } - init_state = unlink; - // Release pages for prefetcher // Releasing here requires the sub_cmds to be read into local memory above block_noc_writes_to_clear[rd_block_idx] += writes;