Skip to content

Commit

Permalink
optimize size field placement in edm fabric packet header
Browse files Browse the repository at this point in the history
Make it in the same spot for all command types to remove any sort of conditional lookup to determine packet size.

Also switch the size to now represent payload size only (excludes header). This simplifies some caller code as well.
  • Loading branch information
SeanNijjar committed Feb 7, 2025
1 parent 4de85a5 commit 960bbde
Show file tree
Hide file tree
Showing 13 changed files with 115 additions and 118 deletions.
10 changes: 5 additions & 5 deletions tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,8 @@ void kernel_main() {
noc_async_write(source_l1_buffer_address, dest_addr, packet_payload_size_bytes);
if (fabric_connection.has_forward_connection()) {
DeviceZoneScopedN("WR-FWD");
mcast_fwd_packet_header->to_noc_unicast_write(NocUnicastCommandHeader{
noc0_dest_addr, packet_payload_size_bytes + sizeof(tt::fabric::PacketHeader)});
mcast_fwd_packet_header->to_noc_unicast_write(
NocUnicastCommandHeader{noc0_dest_addr}, packet_payload_size_bytes);
{
DeviceZoneScopedN("WR-FWD-WAIT");
fabric_connection.get_forward_connection().wait_for_empty_write_slot();
Expand All @@ -155,8 +155,8 @@ void kernel_main() {

if (fabric_connection.has_backward_connection()) {
DeviceZoneScopedN("WR-BWD");
mcast_bwd_packet_header->to_noc_unicast_write(NocUnicastCommandHeader{
noc0_dest_addr, packet_payload_size_bytes + sizeof(tt::fabric::PacketHeader)});
mcast_bwd_packet_header->to_noc_unicast_write(
NocUnicastCommandHeader{noc0_dest_addr}, packet_payload_size_bytes);
{
DeviceZoneScopedN("WR-BWD-WAIT");
fabric_connection.get_backward_connection().wait_for_empty_write_slot();
Expand All @@ -179,7 +179,7 @@ void kernel_main() {
DeviceZoneScopedN("UNICAST-WRITE");
auto& fabric_conn =
unicast_is_fwd ? fabric_connection.get_forward_connection() : fabric_connection.get_backward_connection();
unicast_packet_header->to_noc_unicast_write(NocUnicastCommandHeader{noc0_dest_addr, packet_payload_size_bytes});
unicast_packet_header->to_noc_unicast_write(NocUnicastCommandHeader{noc0_dest_addr}, packet_payload_size_bytes);
fabric_conn.wait_for_empty_write_slot();
fabric_conn.send_payload_without_header_non_blocking_from_address(
source_l1_buffer_address, packet_payload_size_bytes);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -122,20 +122,18 @@ void kernel_main() {

// bit of a hack to extract X/Y
const auto dest_noc_address = get_noc_addr(p, dest_addr_gen, 0, NORMALIZED_NOC_INDEX);
const size_t packet_size = page_size + sizeof(tt::fabric::PacketHeader);
const size_t packet_size = page_size;
auto packet_addr = get_read_ptr(cb_id_in0);
auto& packet_header = *reinterpret_cast<tt::fabric::PacketHeader*>(packet_addr);
if constexpr (mcast_mode) {
packet_header
.to_chip_multicast(tt::fabric::MulticastRoutingCommandHeader{config.mcast.distance, config.mcast.range})
.to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{
dest_noc_address, (pages_to_send * page_size) + sizeof(tt::fabric::PacketHeader)});
packet_header.reserved2 = 0x1111; // debug only
.to_noc_unicast_write(
tt::fabric::NocUnicastCommandHeader{dest_noc_address}, (pages_to_send * page_size));
} else {
packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{config.unicast.distance})
.to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{
dest_noc_address, (pages_to_send * page_size) + sizeof(tt::fabric::PacketHeader)});
packet_header.reserved2 = 0x1111; // debug only
.to_noc_unicast_write(
tt::fabric::NocUnicastCommandHeader{dest_noc_address}, (pages_to_send * page_size));
}

sender.send_payload_blocking_from_address(packet_addr, packet_size);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,12 +59,10 @@ auto forward_to_fabric_from_cb(
if constexpr (mcast_mode) {
packet_header
.to_chip_multicast(tt::fabric::MulticastRoutingCommandHeader{config.mcast.distance, config.mcast.range})
.to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{
noc0_dest_address, (pages_to_send * page_size) + sizeof(tt::fabric::PacketHeader)});
.to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_address}, (pages_to_send * page_size));
} else {
packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{config.unicast.distance})
.to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{
noc0_dest_address, (pages_to_send * page_size) + sizeof(tt::fabric::PacketHeader)});
.to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_address}, (pages_to_send * page_size));
}

uint64_t buffer_address = sender.edm_buffer_addr + (*sender.buffer_index_ptr * (sender.buffer_size_bytes + sizeof(eth_channel_sync_t)));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,9 @@ bool terminate_fabric_endpoints_farthest_to_nearest (
reinterpret_cast<volatile uint32_t*>(a_packet_header_addr)[sizeof(tt::fabric::PacketHeader) >> 2] = tt::fabric::TerminationSignal::GRACEFULLY_TERMINATE;
sender.wait_for_empty_write_slot();
packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{static_cast<uint8_t>(distance)})
.to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{
termination_sig_noc_addr, sizeof(tt::fabric::PacketHeader) + sizeof(uint32_t)});
.to_noc_unicast_write(
tt::fabric::NocUnicastCommandHeader{termination_sig_noc_addr},
sizeof(tt::fabric::PacketHeader) + sizeof(uint32_t));
sender.send_payload_blocking_from_address(a_packet_header_addr, packet_header.get_payload_size_including_header());
noc_async_writes_flushed();
}
Expand Down
11 changes: 11 additions & 0 deletions tests/ttnn/unit_tests/operations/ccl/test_new_all_gather.py
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,17 @@ def test_all_gather(
None,
ttnn.TensorMemoryLayout.HEIGHT_SHARDED,
),
(
4,
[1, 4, 32, 1280],
3,
ttnn.TILE_LAYOUT,
(32, 128),
ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(1, 4))}),
None,
None,
ttnn.TensorMemoryLayout.HEIGHT_SHARDED,
),
],
)
@pytest.mark.parametrize("num_links", [1])
Expand Down
6 changes: 6 additions & 0 deletions tt_metal/hw/inc/ethernet/tunneling.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,12 @@ void eth_write_remote_reg(uint32_t q_num, uint32_t reg_addr, uint32_t val) {
eth_txq_reg_write(q_num, ETH_TXQ_REMOTE_REG_DATA, val);
eth_txq_reg_write(q_num, ETH_TXQ_CMD, ETH_TXQ_CMD_START_REG);
}
// Issues a register-write command on Ethernet TX queue `q_num` by programming
// three TX queue registers in sequence. No queue state is read or polled here.
//   q_num    - TX queue whose registers are written
//   reg_addr - value written to ETH_TXQ_DEST_ADDR
//   val      - value written to ETH_TXQ_REMOTE_REG_DATA
// NOTE(review): judging by the name, this presumably omits a TX-queue busy check
// that eth_write_remote_reg performs, leaving the caller responsible for making
// sure the queue can accept a command — confirm against eth_write_remote_reg.
FORCE_INLINE
void eth_write_remote_reg_no_txq_check(uint32_t q_num, uint32_t reg_addr, uint32_t val) {
// Stage the destination address and the data before issuing the command.
eth_txq_reg_write(q_num, ETH_TXQ_DEST_ADDR, reg_addr);
eth_txq_reg_write(q_num, ETH_TXQ_REMOTE_REG_DATA, val);
// Written last: ETH_TXQ_CMD_START_REG goes into the command register.
eth_txq_reg_write(q_num, ETH_TXQ_CMD, ETH_TXQ_CMD_START_REG);
}

void check_and_context_switch() {
uint32_t start_time = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,7 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write(
pkt_hdr->reserved2 = my_chip_id;
#endif

size_t packet_send_size_bytes = payload_size_bytes + sizeof(tt::fabric::PacketHeader);
pkt_hdr->to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_noc_addr, packet_send_size_bytes});
pkt_hdr->to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_noc_addr}, payload_size_bytes);

switch (current_cmd_header.dest_type) {
case ttnn::ccl::cmd::CclCommandDestType::CHIP_UNICAST: {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -438,16 +438,14 @@ void try_advance_inline_write_or_atomic_inc(command_context_t<Addrgen>& cmd_ctx)

ASSERT(cmd_ctx.packet_header_buffer_addr != 0);
auto* pkt_hdr = reinterpret_cast<tt::fabric::PacketHeader*>(cmd_ctx.packet_header_buffer_addr);
#ifdef DEBUG_PRINT_ENABLED
pkt_hdr->reserved2 = my_chip_id;
#endif

uint64_t dest_noc_addr_for_pkt = safe_get_noc_addr(dest_noc0_x, dest_noc0_y, dest_bank_addr, 0);
if (cmd_ctx.current_cmd_header.code == ttnn::ccl::cmd::CclCommandCode::ATOMIC_INC) {
pkt_hdr->to_noc_unicast_atomic_inc(
tt::fabric::NocUnicastAtomicIncCommandHeader{dest_noc_addr_for_pkt, static_cast<uint16_t>(value), 32});
} else {
pkt_hdr->to_noc_unicast_write(
tt::fabric::NocUnicastCommandHeader{dest_noc_addr_for_pkt, static_cast<uint16_t>(value)});
pkt_hdr->to_noc_unicast_inline_write(
tt::fabric::NocUnicastInlineWriteCommandHeader{dest_noc_addr_for_pkt, static_cast<uint16_t>(value)});
}

switch (cmd_ctx.current_cmd_header.dest_type) {
Expand Down Expand Up @@ -563,13 +561,8 @@ void write_and_advance_local_read_address_for_fabric_write(
const size_t payload_l1_address = l1_read_addr;

auto pkt_hdr = reinterpret_cast<volatile tt::fabric::PacketHeader*>(packet_header_buffer_addr);
#ifdef DEBUG_PRINT_ENABLED
pkt_hdr->reserved2 = my_chip_id;
#endif

size_t packet_send_size_bytes = payload_size_bytes + sizeof(tt::fabric::PacketHeader);
pkt_hdr->to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{
noc0_dest_noc_addr, packet_send_size_bytes});
pkt_hdr->to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_noc_addr}, payload_size_bytes);

switch (current_cmd_header.dest_type) {
case ttnn::ccl::cmd::CclCommandDestType::CHIP_UNICAST: {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,9 +118,7 @@ void mcast_contig_pages_to_noc_address(
pkt_hdr
.to_chip_multicast(
tt::fabric::MulticastRoutingCommandHeader{1, static_cast<uint8_t>(forward_direction_num_hops)})
.to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{
noc0_dest_addr,
packet_send_size_bytes});
.to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_addr}, packet_send_size_bytes);
forward_fabric_sender.wait_for_empty_write_slot();
forward_fabric_sender.send_payload_flush_blocking_from_address(l1_read_addr, packet_send_size_bytes);
}
Expand All @@ -131,9 +129,7 @@ void mcast_contig_pages_to_noc_address(
pkt_hdr
.to_chip_multicast(
tt::fabric::MulticastRoutingCommandHeader{1, static_cast<uint8_t>(backward_direction_num_hops)})
.to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{
noc0_dest_addr,
packet_send_size_bytes});
.to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_addr}, packet_send_size_bytes);
backward_fabric_sender.wait_for_empty_write_slot();
backward_fabric_sender.send_payload_non_blocking_from_address(l1_read_addr, packet_send_size_bytes);
}
Expand Down
Loading

0 comments on commit 960bbde

Please sign in to comment.