Skip to content

Commit

Permalink
Revert "#11014: Add a flag to CQDispatchWritePackedLargeSubCmd to specify unlinking mcasts"
Browse files Browse the repository at this point in the history

This reverts commit ace8746.
  • Loading branch information
tt-aho committed Aug 18, 2024
1 parent d12d7ac commit d079bd3
Show file tree
Hide file tree
Showing 5 changed files with 19 additions and 58 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1035,7 +1035,6 @@ inline bool gen_rnd_dispatcher_packed_write_large_cmd(Device *device,
sub_cmd.addr = device_data.get_result_data_addr(range.start_coord);
sub_cmd.length = xfer_size_bytes;
sub_cmd.num_mcast_dests = (range.end_coord.x - range.start_coord.x + 1) * (range.end_coord.y - range.start_coord.y + 1);
sub_cmd.flags = CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_FLAG_UNLINK;

for (uint32_t i = 0; i < sizeof(CQDispatchWritePackedLargeSubCmd) / sizeof(uint32_t); i++) {
cmds.push_back(((uint32_t *)&sub_cmd)[i]);
Expand Down
7 changes: 1 addition & 6 deletions tt_metal/impl/dispatch/command_queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -931,8 +931,7 @@ void EnqueueProgramCommand::assemble_device_commands(
.noc_xy_addr = noc_encoding,
.addr = dst_addr,
.length = (uint16_t)write_length,
.num_mcast_dests = (uint8_t)num_mcast_dests,
.flags = CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_FLAG_NONE});
.num_mcast_dests = (uint16_t)num_mcast_dests});
RecordDispatchData(
program, DISPATCH_DATA_BINARY, write_length, kg_transfer_info.riscvs[kernel_idx]);
dst_addr += write_length;
Expand All @@ -948,10 +947,6 @@ void EnqueueProgramCommand::assemble_device_commands(
}
}
}
// Unlink the last subcmd of the current core range
if (!write_linear) {
kernel_bins_dispatch_subcmds.back().back().flags |= CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_FLAG_UNLINK;
}
}
for (uint32_t i = 0; i < kernel_bins_dispatch_subcmds.size(); ++i) {
cmd_sequence_sizeB += align(
Expand Down
6 changes: 2 additions & 4 deletions tt_metal/impl/dispatch/cq_commands.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@ struct CQDispatchWritePagedCmd {
constexpr uint32_t CQ_DISPATCH_CMD_PACKED_WRITE_FLAG_NONE = 0x00;
constexpr uint32_t CQ_DISPATCH_CMD_PACKED_WRITE_FLAG_MCAST = 0x01;
constexpr uint32_t CQ_DISPATCH_CMD_PACKED_WRITE_FLAG_NO_STRIDE = 0x02;
constexpr uint32_t CQ_DISPATCH_CMD_PACKED_WRITE_RELAY = 0x03;

struct CQDispatchWritePackedCmd {
uint8_t flags; // see above
Expand All @@ -196,14 +197,11 @@ struct CQDispatchWritePackedMulticastSubCmd {
uint32_t num_mcast_dests;
} __attribute__((packed));

constexpr uint32_t CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_FLAG_NONE = 0x00;
constexpr uint32_t CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_FLAG_UNLINK = 0x01;
struct CQDispatchWritePackedLargeSubCmd {
uint32_t noc_xy_addr;
uint32_t addr;
uint16_t length; // multiples of L1 cache line alignment
uint8_t num_mcast_dests;
uint8_t flags;
uint16_t num_mcast_dests;
} __attribute__((packed));

constexpr inline __attribute__((always_inline)) uint32_t get_packed_write_max_multicast_sub_cmds(uint32_t packed_write_max_unicast_sub_cmds) {
Expand Down
23 changes: 6 additions & 17 deletions tt_metal/impl/dispatch/kernels/cq_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,30 +117,18 @@ void cq_noc_async_write_with_state(uint32_t src_addr, uint64_t dst_addr, uint32_

// More generic version of cq_noc_async_write_with_state: Allows writing an arbitrary amount of data, when the NOC config (dst_noc,
// VC..) has been specified.
template<bool write_last_packet = true>
FORCE_INLINE
uint32_t cq_noc_async_write_with_state_any_len(uint32_t src_addr, uint64_t dst_addr, uint32_t size = 0, uint32_t ndests = 1) {
if (size > NOC_MAX_BURST_SIZE) {
void cq_noc_async_write_with_state_any_len(uint32_t src_addr, uint64_t dst_addr, uint32_t size = 0, uint32_t ndests = 1) {
while(size > NOC_MAX_BURST_SIZE) {
cq_noc_async_write_with_state<CQ_NOC_SnDL>(src_addr, dst_addr, NOC_MAX_BURST_SIZE, ndests);
src_addr += NOC_MAX_BURST_SIZE;
dst_addr += NOC_MAX_BURST_SIZE;
size -= NOC_MAX_BURST_SIZE;
while(size > NOC_MAX_BURST_SIZE) {
cq_noc_async_write_with_state<CQ_NOC_SnDl>(src_addr, dst_addr, NOC_MAX_BURST_SIZE, ndests);
src_addr += NOC_MAX_BURST_SIZE;
dst_addr += NOC_MAX_BURST_SIZE;
size -= NOC_MAX_BURST_SIZE;
}
}
if constexpr (write_last_packet) {
cq_noc_async_write_with_state<CQ_NOC_SnDL>(src_addr, dst_addr, size, ndests);
return 0;
} else {
return size;
}
cq_noc_async_write_with_state<CQ_NOC_SnDL>(src_addr, dst_addr, size, ndests);
}

template<enum CQNocFlags flags, bool mcast = false, bool linked = false>
template<enum CQNocFlags flags, bool mcast = false>
FORCE_INLINE
void cq_noc_async_write_init_state(uint32_t src_addr, uint64_t dst_addr, uint32_t size = 0) {

Expand All @@ -151,8 +139,9 @@ void cq_noc_async_write_init_state(uint32_t src_addr, uint64_t dst_addr, uint32_
}
DEBUG_STATUS("NSID");

constexpr bool multicast_path_reserve = true;
constexpr bool multicast_path_reserve = mcast;
constexpr bool posted = false;
constexpr bool linked = false;
constexpr uint32_t vc = mcast ? NOC_DISPATCH_MULTICAST_WRITE_VC : NOC_UNICAST_WRITE_VC;

constexpr uint32_t noc_cmd_field =
Expand Down
40 changes: 10 additions & 30 deletions tt_metal/impl/dispatch/kernels/cq_dispatch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -662,6 +662,8 @@ void process_write_packed_large() {
uint32_t data_ptr = cmd_ptr + sizeof(CQDispatchCmd) + count * sizeof(CQDispatchWritePackedLargeSubCmd);
data_ptr = round_up_pow2(data_ptr, L1_ALIGNMENT);

cq_noc_async_write_init_state<CQ_NOC_sndl, true>(0, 0);

constexpr uint32_t sub_cmd_size = sizeof(CQDispatchWritePackedLargeSubCmd);
careful_copy_from_l1_to_local_cache<l1_to_local_cache_copy_chunk, l1_cache_elements_rounded>(
(volatile uint32_t tt_l1_ptr *)(cmd_ptr + sizeof(CQDispatchCmd)), count * sub_cmd_size / sizeof(uint32_t), l1_cache);
Expand All @@ -670,25 +672,18 @@ void process_write_packed_large() {
uint32_t mcasts = noc_nonposted_writes_acked[noc_index];
CQDispatchWritePackedLargeSubCmd *sub_cmd_ptr = (CQDispatchWritePackedLargeSubCmd *)l1_cache;

bool init_state = true;
while (count != 0) {
uint32_t dst_noc = sub_cmd_ptr->noc_xy_addr;
uint32_t dst_addr = sub_cmd_ptr->addr + local_write_offset;
uint32_t length = sub_cmd_ptr->length;
uint32_t num_dests = sub_cmd_ptr->num_mcast_dests;
uint32_t pad_size = align(length, alignment) - length;
uint32_t unlink = sub_cmd_ptr->flags & CQ_DISPATCH_CMD_PACKED_WRITE_LARGE_FLAG_UNLINK;

// Only re-init state after we have unlinked the last transaction
// Otherwise we assume NOC coord hasn't changed
// TODO: If we are able to send 0 length txn to unset link, we don't need a flag and can compare dst_noc to prev to determine linking
if (init_state) {
uint32_t dst_noc = sub_cmd_ptr->noc_xy_addr;
// TODO: Linking should be set to true once atomic txn is handled properly
cq_noc_async_write_init_state<CQ_NOC_sNdl, true, false>(0, get_noc_addr_helper(dst_noc, dst_addr));
}

sub_cmd_ptr++;

// Note: expect to only have 1 or a few pages, so this doesn't optimize writing length
cq_noc_async_write_with_state<CQ_NOC_sNdl, CQ_NOC_WAIT, CQ_NOC_send>(0, get_noc_addr_helper(dst_noc, dst_addr));

while (length != 0) {
// More data needs to be written, but we've exhausted the CB. Acquire more pages.
if (data_ptr == cb_fence) {
Expand All @@ -710,30 +705,15 @@ void process_write_packed_large() {
}
// Transfer size is min(remaining_length, data_available_in_cb)
uint32_t available_data = cb_fence - data_ptr;
uint32_t xfer_size;
if (length > available_data) {
xfer_size = available_data;
cq_noc_async_write_with_state_any_len(data_ptr, dst_addr, xfer_size, num_dests);
} else {
xfer_size = length;
if (unlink) {
uint32_t rem_xfer_size = cq_noc_async_write_with_state_any_len<false>(data_ptr, dst_addr, xfer_size, num_dests);
// Unset Link flag
cq_noc_async_write_init_state<CQ_NOC_sndl, true, false>(0, 0, 0);
uint32_t data_offset = xfer_size - rem_xfer_size;
cq_noc_async_write_with_state<CQ_NOC_SnDL, CQ_NOC_wait>(data_ptr + data_offset, dst_addr + data_offset, rem_xfer_size, num_dests);
} else {
cq_noc_async_write_with_state_any_len(data_ptr, dst_addr, xfer_size, num_dests);
}
}
writes += div_up(xfer_size, NOC_MAX_BURST_SIZE);
uint32_t xfer_size = (length > available_data) ? available_data : length;
cq_noc_async_write_with_state_any_len(data_ptr, dst_addr, xfer_size, num_dests);
uint32_t num_noc_packets_written = div_up(xfer_size, NOC_MAX_BURST_SIZE);
writes += num_noc_packets_written;
length -= xfer_size;
data_ptr += xfer_size;
dst_addr += xfer_size;
}

init_state = unlink;

// Release pages for prefetcher
// Releasing here requires the sub_cmds to be read into local memory above
block_noc_writes_to_clear[rd_block_idx] += writes;
Expand Down

0 comments on commit d079bd3

Please sign in to comment.