From 1352cf639dd625c47f9b9d5e43a98f7fba6ef051 Mon Sep 17 00:00:00 2001
From: Sean Nijjar
Date: Fri, 7 Feb 2025 01:35:06 +0000
Subject: [PATCH 1/7] optimize size field placement in edm fabric packet header

Place it in the same spot for all command types to remove any
conditional lookup to determine packet size. Also switch the size to
represent payload size only (excludes header). This simplifies some
caller code as well.

> Conflicts:
>	ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp
>	ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp
>	ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp
---
 .../gtests/ccl/kernels/edm_fabric_writer.cpp  | 10 +-
 ...c_erisc_datamover_sender_worker_sender.cpp | 12 +--
 .../fabric_worker_sender_multi_input.cpp      |  6 +-
 .../ccl/kernels/test_kernels.common.hpp       |  5 +-
 .../operations/ccl/test_new_all_gather.py     | 11 +++
 tt_metal/hw/inc/ethernet/tunneling.h          |  6 ++
 .../kernel_common/kernel_writers.hpp          |  3 +-
 .../kernels/ccl_send_reader_two_input.cpp     | 15 +--
 .../ccl/common/kernels/ccl_send_utils.hpp     |  8 +-
 .../edm_fabric/fabric_edm_packet_header.hpp   | 91 +++++++++----------
 .../fabric_edm_packet_transmission.hpp        | 27 ++++--
 .../edm_fabric/fabric_erisc_datamover.cpp     | 34 +++----
 .../device/kernels/minimal_ccl_common.hpp     |  7 +-
 13 files changed, 116 insertions(+), 119 deletions(-)

diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp
index cd142bef8fd..2a41846b929 100644
--- a/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp
+++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp
@@ -140,8 +140,8 @@ void kernel_main() {
         noc_async_write(source_l1_buffer_address, dest_addr, packet_payload_size_bytes);
         if (fabric_connection.has_forward_connection()) {
             DeviceZoneScopedN("WR-FWD");
-            mcast_fwd_packet_header->to_noc_unicast_write(NocUnicastCommandHeader{
-                noc0_dest_addr, packet_payload_size_bytes + sizeof(tt::fabric::PacketHeader)});
+            mcast_fwd_packet_header->to_noc_unicast_write(
+                NocUnicastCommandHeader{noc0_dest_addr}, packet_payload_size_bytes);
             {
                 DeviceZoneScopedN("WR-FWD-WAIT");
                 fabric_connection.get_forward_connection().wait_for_empty_write_slot();
@@ -155,8 +155,8 @@
 
         if (fabric_connection.has_backward_connection()) {
             DeviceZoneScopedN("WR-BWD");
-            mcast_bwd_packet_header->to_noc_unicast_write(NocUnicastCommandHeader{
-                noc0_dest_addr, packet_payload_size_bytes + sizeof(tt::fabric::PacketHeader)});
+            mcast_bwd_packet_header->to_noc_unicast_write(
+                NocUnicastCommandHeader{noc0_dest_addr}, packet_payload_size_bytes);
             {
                 DeviceZoneScopedN("WR-BWD-WAIT");
                 fabric_connection.get_backward_connection().wait_for_empty_write_slot();
@@ -179,7 +179,7 @@
         DeviceZoneScopedN("UNICAST-WRITE");
         auto& fabric_conn = unicast_is_fwd ? 
fabric_connection.get_forward_connection() : fabric_connection.get_backward_connection(); - unicast_packet_header->to_noc_unicast_write(NocUnicastCommandHeader{noc0_dest_addr, packet_payload_size_bytes}); + unicast_packet_header->to_noc_unicast_write(NocUnicastCommandHeader{noc0_dest_addr}, packet_payload_size_bytes); fabric_conn.wait_for_empty_write_slot(); fabric_conn.send_payload_without_header_non_blocking_from_address( source_l1_buffer_address, packet_payload_size_bytes); diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp index d0b384fc55f..98895a2c1de 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp @@ -122,20 +122,18 @@ void kernel_main() { // bit of a hack to extract X/Y const auto dest_noc_address = get_noc_addr(p, dest_addr_gen, 0, NORMALIZED_NOC_INDEX); - const size_t packet_size = page_size + sizeof(tt::fabric::PacketHeader); + const size_t packet_size = page_size; auto packet_addr = get_read_ptr(cb_id_in0); auto& packet_header = *reinterpret_cast(packet_addr); if constexpr (mcast_mode) { packet_header .to_chip_multicast(tt::fabric::MulticastRoutingCommandHeader{config.mcast.distance, config.mcast.range}) - .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{ - dest_noc_address, (pages_to_send * page_size) + sizeof(tt::fabric::PacketHeader)}); - packet_header.reserved2 = 0x1111; // debug only + .to_noc_unicast_write( + tt::fabric::NocUnicastCommandHeader{dest_noc_address}, (pages_to_send * page_size)); } else { packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{config.unicast.distance}) - .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{ - dest_noc_address, (pages_to_send * page_size) + sizeof(tt::fabric::PacketHeader)}); - packet_header.reserved2 = 0x1111; // debug only + .to_noc_unicast_write( + tt::fabric::NocUnicastCommandHeader{dest_noc_address}, (pages_to_send * page_size)); } sender.send_payload_blocking_from_address(packet_addr, packet_size); diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp index 98a60766922..704b516c48c 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp @@ -59,12 +59,10 @@ auto forward_to_fabric_from_cb( if constexpr (mcast_mode) { packet_header .to_chip_multicast(tt::fabric::MulticastRoutingCommandHeader{config.mcast.distance, config.mcast.range}) - .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{ - noc0_dest_address, (pages_to_send * page_size) + sizeof(tt::fabric::PacketHeader)}); + .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_address}, (pages_to_send * page_size)); } else { packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{config.unicast.distance}) - .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{ - noc0_dest_address, (pages_to_send * page_size) + sizeof(tt::fabric::PacketHeader)}); + .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_address}, (pages_to_send * page_size)); } uint64_t buffer_address = sender.edm_buffer_addr + (*sender.buffer_index_ptr * (sender.buffer_size_bytes + sizeof(eth_channel_sync_t))); 
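
The call-shape change repeated across the hunks above can be seen in isolation. Below is a minimal standalone C++ sketch (mock types, not the actual tt-metal headers) of what this patch does to the layout: the payload size lives at one fixed offset in the packet header for every command type and counts payload bytes only, so callers no longer add sizeof(PacketHeader) at each call site:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // Mock of the command header after this patch: addressing only, no size field.
    struct NocUnicastCommandHeader {
        uint64_t noc_address;
    };

    // Mock packet header: the size is stored once, in the same spot for all
    // command types, and it excludes the header itself.
    struct PacketHeader {
        uint16_t payload_size_bytes = 0;
        NocUnicastCommandHeader unicast_write{};

        PacketHeader& to_noc_unicast_write(const NocUnicastCommandHeader& hdr, size_t payload_size) {
            unicast_write = hdr;
            payload_size_bytes = static_cast<uint16_t>(payload_size);
            return *this;
        }
        size_t get_payload_size_excluding_header() const { return payload_size_bytes; }
        size_t get_payload_size_including_header() const { return payload_size_bytes + sizeof(PacketHeader); }
    };

    int main() {
        PacketHeader hdr;
        // Old call shape (removed): to_noc_unicast_write({addr, payload + sizeof(PacketHeader)})
        // New call shape: the size is a separate argument and excludes the header.
        hdr.to_noc_unicast_write(NocUnicastCommandHeader{0x1000ULL}, 1024);
        std::printf("on-wire bytes: %zu\n", hdr.get_payload_size_including_header());
        return 0;
    }

Because every command variant now reads the size from the same field, the receiver no longer needs the per-command switch that get_payload_size_excluding_header() used to do.
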
diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/test_kernels.common.hpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/test_kernels.common.hpp index cae2798e893..4ef80a5f68f 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/test_kernels.common.hpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/test_kernels.common.hpp @@ -33,8 +33,9 @@ bool terminate_fabric_endpoints_farthest_to_nearest ( reinterpret_cast(a_packet_header_addr)[sizeof(tt::fabric::PacketHeader) >> 2] = tt::fabric::TerminationSignal::GRACEFULLY_TERMINATE; sender.wait_for_empty_write_slot(); packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{static_cast(distance)}) - .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{ - termination_sig_noc_addr, sizeof(tt::fabric::PacketHeader) + sizeof(uint32_t)}); + .to_noc_unicast_write( + tt::fabric::NocUnicastCommandHeader{termination_sig_noc_addr}, + sizeof(tt::fabric::PacketHeader) + sizeof(uint32_t)); sender.send_payload_blocking_from_address(a_packet_header_addr, packet_header.get_payload_size_including_header()); noc_async_writes_flushed(); } diff --git a/tests/ttnn/unit_tests/operations/ccl/test_new_all_gather.py b/tests/ttnn/unit_tests/operations/ccl/test_new_all_gather.py index 08d359325c2..a9816aefdcb 100644 --- a/tests/ttnn/unit_tests/operations/ccl/test_new_all_gather.py +++ b/tests/ttnn/unit_tests/operations/ccl/test_new_all_gather.py @@ -464,6 +464,17 @@ def test_all_gather( None, ttnn.TensorMemoryLayout.HEIGHT_SHARDED, ), + ( + 4, + [1, 4, 32, 1280], + 3, + ttnn.TILE_LAYOUT, + (32, 128), + ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(1, 4))}), + None, + None, + ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + ), ], ) @pytest.mark.parametrize("num_links", [1]) diff --git a/tt_metal/hw/inc/ethernet/tunneling.h b/tt_metal/hw/inc/ethernet/tunneling.h index 37d1422d2f6..a4070cbb24b 100644 --- a/tt_metal/hw/inc/ethernet/tunneling.h +++ b/tt_metal/hw/inc/ethernet/tunneling.h @@ -96,6 +96,12 @@ void eth_write_remote_reg(uint32_t q_num, uint32_t reg_addr, uint32_t val) { eth_txq_reg_write(q_num, ETH_TXQ_REMOTE_REG_DATA, val); eth_txq_reg_write(q_num, ETH_TXQ_CMD, ETH_TXQ_CMD_START_REG); } +FORCE_INLINE +void eth_write_remote_reg_no_txq_check(uint32_t q_num, uint32_t reg_addr, uint32_t val) { + eth_txq_reg_write(q_num, ETH_TXQ_DEST_ADDR, reg_addr); + eth_txq_reg_write(q_num, ETH_TXQ_REMOTE_REG_DATA, val); + eth_txq_reg_write(q_num, ETH_TXQ_CMD, ETH_TXQ_CMD_START_REG); +} void check_and_context_switch() { uint32_t start_time = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); diff --git a/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/kernel_writers.hpp b/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/kernel_writers.hpp index b69b5caaad2..0df726fd332 100644 --- a/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/kernel_writers.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/kernel_writers.hpp @@ -33,8 +33,7 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write( pkt_hdr->reserved2 = my_chip_id; #endif - size_t packet_send_size_bytes = payload_size_bytes + sizeof(tt::fabric::PacketHeader); - pkt_hdr->to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_noc_addr, packet_send_size_bytes}); + pkt_hdr->to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_noc_addr}, payload_size_bytes); switch (current_cmd_header.dest_type) { case ttnn::ccl::cmd::CclCommandDestType::CHIP_UNICAST: { diff --git 
a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp index 4225247db41..50c565ca9f6 100644 --- a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp @@ -438,16 +438,14 @@ void try_advance_inline_write_or_atomic_inc(command_context_t& cmd_ctx) ASSERT(cmd_ctx.packet_header_buffer_addr != 0); auto* pkt_hdr = reinterpret_cast(cmd_ctx.packet_header_buffer_addr); -#ifdef DEBUG_PRINT_ENABLED - pkt_hdr->reserved2 = my_chip_id; -#endif + uint64_t dest_noc_addr_for_pkt = safe_get_noc_addr(dest_noc0_x, dest_noc0_y, dest_bank_addr, 0); if (cmd_ctx.current_cmd_header.code == ttnn::ccl::cmd::CclCommandCode::ATOMIC_INC) { pkt_hdr->to_noc_unicast_atomic_inc( tt::fabric::NocUnicastAtomicIncCommandHeader{dest_noc_addr_for_pkt, static_cast(value), 32}); } else { - pkt_hdr->to_noc_unicast_write( - tt::fabric::NocUnicastCommandHeader{dest_noc_addr_for_pkt, static_cast(value)}); + pkt_hdr->to_noc_unicast_inline_write( + tt::fabric::NocUnicastInlineWriteCommandHeader{dest_noc_addr_for_pkt, static_cast(value)}); } switch (cmd_ctx.current_cmd_header.dest_type) { @@ -563,13 +561,8 @@ void write_and_advance_local_read_address_for_fabric_write( const size_t payload_l1_address = l1_read_addr; auto pkt_hdr = reinterpret_cast(packet_header_buffer_addr); -#ifdef DEBUG_PRINT_ENABLED - pkt_hdr->reserved2 = my_chip_id; -#endif - size_t packet_send_size_bytes = payload_size_bytes + sizeof(tt::fabric::PacketHeader); - pkt_hdr->to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{ - noc0_dest_noc_addr, packet_send_size_bytes}); + pkt_hdr->to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_noc_addr}, payload_size_bytes); switch (current_cmd_header.dest_type) { case ttnn::ccl::cmd::CclCommandDestType::CHIP_UNICAST: { diff --git a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_utils.hpp b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_utils.hpp index 0f662c4bfd4..904cd775a9a 100644 --- a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_utils.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_utils.hpp @@ -118,9 +118,7 @@ void mcast_contig_pages_to_noc_address( pkt_hdr .to_chip_multicast( tt::fabric::MulticastRoutingCommandHeader{1, static_cast(forward_direction_num_hops)}) - .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{ - noc0_dest_addr, - packet_send_size_bytes}); + .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_addr}, packet_send_size_bytes); forward_fabric_sender.wait_for_empty_write_slot(); forward_fabric_sender.send_payload_flush_blocking_from_address(l1_read_addr, packet_send_size_bytes); } @@ -131,9 +129,7 @@ void mcast_contig_pages_to_noc_address( pkt_hdr .to_chip_multicast( tt::fabric::MulticastRoutingCommandHeader{1, static_cast(backward_direction_num_hops)}) - .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{ - noc0_dest_addr, - packet_send_size_bytes}); + .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_addr}, packet_send_size_bytes); backward_fabric_sender.wait_for_empty_write_slot(); backward_fabric_sender.send_payload_non_blocking_from_address(l1_read_addr, packet_send_size_bytes); } diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp index be4f8c42ce4..3d13532e569 100644 --- 
a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp @@ -19,13 +19,13 @@ enum TerminationSignal : uint32_t { IMMEDIATELY_TERMINATE = 2 }; - -// 2 bits +// 3 bits enum NocSendType : uint8_t { NOC_UNICAST_WRITE = 0, - NOC_MULTICAST_WRITE = 1, - NOC_UNICAST_ATOMIC_INC = 2, - NOC_MULTICAST_ATOMIC_INC = 3 + NOC_UNICAST_INLINE_WRITE = 1, + NOC_MULTICAST_WRITE = 2, + NOC_UNICAST_ATOMIC_INC = 3, + NOC_MULTICAST_ATOMIC_INC = 4 }; // How to send the payload across the cluster // 1 bit @@ -34,7 +34,6 @@ enum ChipSendType : uint8_t { CHIP_MULTICAST = 1, }; - struct UnicastRoutingCommandHeader { uint8_t distance_in_hops; }; @@ -52,11 +51,10 @@ static_assert(sizeof(RoutingFields) == sizeof(UnicastRoutingCommandHeader), "Rou struct NocUnicastCommandHeader { uint64_t noc_address; - uint32_t size; - // ignores header size - inline uint32_t get_payload_only_size() const { - return size; - } +}; +struct NocUnicastInlineWriteCommandHeader { + uint64_t noc_address; + uint32_t value; }; struct NocUnicastAtomicIncCommandHeader { NocUnicastAtomicIncCommandHeader(uint64_t noc_address, uint16_t val, uint16_t wrap) @@ -68,16 +66,10 @@ struct NocUnicastAtomicIncCommandHeader { }; struct NocMulticastCommandHeader { uint32_t address; - uint32_t size; uint8_t noc_x_start; uint8_t noc_y_start; uint8_t mcast_rect_size_x; uint8_t mcast_rect_size_y; - - // ignores header size - inline uint32_t get_payload_only_size() const { - return size; - } }; struct NocMulticastAtomicIncCommandHeader { uint32_t address; @@ -88,12 +80,14 @@ struct NocMulticastAtomicIncCommandHeader { uint8_t size_x; uint8_t size_y; }; -static_assert(sizeof(NocUnicastCommandHeader) == 16, "NocUnicastCommandHeader size is not 1 byte"); -static_assert(sizeof(NocMulticastCommandHeader) == 12, "NocMulticastCommandHeader size is not 1 byte"); +static_assert(sizeof(NocUnicastCommandHeader) == 8, "NocUnicastCommandHeader size is not 1 byte"); +static_assert(sizeof(NocMulticastCommandHeader) == 8, "NocMulticastCommandHeader size is not 1 byte"); +static_assert(sizeof(NocUnicastInlineWriteCommandHeader) == 16, "NocMulticastCommandHeader size is not 1 byte"); static_assert(sizeof(NocUnicastAtomicIncCommandHeader) == 16, "NocUnicastCommandHeader size is not 1 byte"); static_assert(sizeof(NocMulticastAtomicIncCommandHeader) == 12, "NocAtomicIncCommandHeader size is not 1 byte"); union NocCommandFields{ NocUnicastCommandHeader unicast_write; + NocUnicastInlineWriteCommandHeader unicast_inline_write; NocMulticastCommandHeader mcast_write; NocUnicastAtomicIncCommandHeader unicast_seminc; NocMulticastAtomicIncCommandHeader mcast_seminc; @@ -106,16 +100,16 @@ struct PacketHeader { // -> unicast_write, mcast_write, unicast_seminc, mcast_seminc // For now, kept it separate so I could do reads which would be handled differently // but for our purposes we shouldn't need read so we should be able to omit the support - NocSendType noc_send_type : 2; + NocSendType noc_send_type : 3; ChipSendType chip_send_type : 1; - uint8_t reserved : 1; + // Used only by the EDM sender and receiver channels. Populated by EDM sender channel to // indicate to the receiver channel what channel was the source of this packet. Reserved // otherwise. 
uint8_t src_ch_id : 4; RoutingFields routing_fields; - uint16_t reserved2; // can be tagged with src device for debug + uint16_t payload_size_bytes; // excludes header size NocCommandFields command_fields; // size = 16B due to uint64_t alignment // Sort of hack to work-around DRAM read alignment issues that must be 32B aligned @@ -134,23 +128,9 @@ struct PacketHeader { inline void set_routing_fields(RoutingFields &fields) { this->routing_fields = fields; } inline void set_command_fields(NocCommandFields &fields) { this->command_fields = fields; } + // Returns size of payload in bytes - TODO: convert to words (4B) size_t get_payload_size_excluding_header() volatile const { - switch(this->noc_send_type) { - case NOC_UNICAST_WRITE: { - return this->command_fields.unicast_write.size - sizeof(PacketHeader); - } break; - case NOC_MULTICAST_WRITE: { - return this->command_fields.mcast_write.size - sizeof(PacketHeader); - } break; - case NOC_UNICAST_ATOMIC_INC: - case NOC_MULTICAST_ATOMIC_INC: - return 0; - default: - #if defined(KERNEL_BUILD) || defined(FW_BUILD) - ASSERT(false); - #endif - return 0; - }; + return this->payload_size_bytes; } inline size_t get_payload_size_including_header() volatile const { return get_payload_size_excluding_header() + sizeof(PacketHeader); @@ -167,26 +147,36 @@ struct PacketHeader { return *this; } - inline PacketHeader &to_noc_unicast_write(NocUnicastCommandHeader const &noc_unicast_command_header) { + inline PacketHeader &to_noc_unicast_write(NocUnicastCommandHeader const &noc_unicast_command_header, size_t payload_size_bytes) { this->noc_send_type = NOC_UNICAST_WRITE; this->command_fields.unicast_write = noc_unicast_command_header; + this->payload_size_bytes = payload_size_bytes; + return *this; + } + inline PacketHeader &to_noc_unicast_inline_write(NocUnicastInlineWriteCommandHeader const &noc_unicast_command_header) { + this->noc_send_type = NOC_UNICAST_INLINE_WRITE; + this->command_fields.unicast_inline_write = noc_unicast_command_header; + this->payload_size_bytes = 0; return *this; } - inline PacketHeader &to_noc_multicast_write(NocMulticastCommandHeader const &noc_multicast_command_header) { + inline PacketHeader &to_noc_multicast_write(NocMulticastCommandHeader const &noc_multicast_command_header, size_t payload_size_bytes) { this->noc_send_type = NOC_MULTICAST_WRITE; this->command_fields.mcast_write = noc_multicast_command_header; + this->payload_size_bytes = payload_size_bytes; return *this; } inline PacketHeader &to_noc_unicast_atomic_inc(NocUnicastAtomicIncCommandHeader const &noc_unicast_atomic_inc_command_header) { this->noc_send_type = NOC_UNICAST_ATOMIC_INC; this->command_fields.unicast_seminc = noc_unicast_atomic_inc_command_header; + this->payload_size_bytes = 0; return *this; } - inline PacketHeader &to_noc_multicast_atomic_inc(NocMulticastAtomicIncCommandHeader const &noc_multicast_command_header) { + inline PacketHeader &to_noc_multicast_atomic_inc(NocMulticastAtomicIncCommandHeader const &noc_multicast_command_header, size_t payload_size_bytes) { #if defined(KERNEL_BUILD) || defined(FW_BUILD) ASSERT(false); while (1) {}; #endif + this->payload_size_bytes = payload_size_bytes; return *this; } @@ -201,20 +191,27 @@ struct PacketHeader { this->routing_fields.chip_mcast.start_distance_in_hops = chip_multicast_command_header.start_distance_in_hops; return this; } - inline volatile PacketHeader *to_noc_unicast_write(NocUnicastCommandHeader const &noc_unicast_command_header) volatile { + inline volatile PacketHeader 
*to_noc_unicast_write(NocUnicastCommandHeader const &noc_unicast_command_header, size_t payload_size_bytes) volatile { this->noc_send_type = NOC_UNICAST_WRITE; this->command_fields.unicast_write.noc_address = noc_unicast_command_header.noc_address; - this->command_fields.unicast_write.size = noc_unicast_command_header.size; + this->payload_size_bytes = payload_size_bytes; return this; } - inline volatile PacketHeader *to_noc_multicast(NocMulticastCommandHeader const &noc_multicast_command_header) volatile { + inline volatile PacketHeader &to_noc_unicast_inline_write(NocUnicastInlineWriteCommandHeader const &noc_unicast_command_header) volatile { + this->noc_send_type = NOC_UNICAST_INLINE_WRITE; + this->command_fields.unicast_inline_write.noc_address = noc_unicast_command_header.noc_address; + this->command_fields.unicast_inline_write.value = noc_unicast_command_header.value; + this->payload_size_bytes = 0; + return *this; + } + inline volatile PacketHeader *to_noc_multicast(NocMulticastCommandHeader const &noc_multicast_command_header, size_t payload_size_bytes) volatile { this->noc_send_type = NOC_MULTICAST_WRITE; this->command_fields.mcast_write.mcast_rect_size_x = noc_multicast_command_header.mcast_rect_size_x; this->command_fields.mcast_write.mcast_rect_size_y = noc_multicast_command_header.mcast_rect_size_y; this->command_fields.mcast_write.noc_x_start = noc_multicast_command_header.noc_x_start; this->command_fields.mcast_write.noc_y_start = noc_multicast_command_header.noc_y_start; - this->command_fields.mcast_write.size = noc_multicast_command_header.size; + this->payload_size_bytes = payload_size_bytes; this->command_fields.mcast_write.address = noc_multicast_command_header.address; return this; @@ -225,11 +222,12 @@ struct PacketHeader { this->command_fields.unicast_seminc.noc_address = noc_unicast_atomic_inc_command_header.noc_address; this->command_fields.unicast_seminc.val = noc_unicast_atomic_inc_command_header.val; this->command_fields.unicast_seminc.wrap = noc_unicast_atomic_inc_command_header.wrap; + this->payload_size_bytes = 0; return this; } inline volatile PacketHeader *to_noc_multicast_atomic_inc( - NocMulticastAtomicIncCommandHeader const &noc_multicast_atomic_inc_command_header) volatile { + NocMulticastAtomicIncCommandHeader const &noc_multicast_atomic_inc_command_header, size_t payload_size_bytes) volatile { this->noc_send_type = NOC_MULTICAST_ATOMIC_INC; this->command_fields.mcast_seminc.address = noc_multicast_atomic_inc_command_header.address; this->command_fields.mcast_seminc.noc_x_start = noc_multicast_atomic_inc_command_header.noc_x_start; @@ -238,6 +236,7 @@ struct PacketHeader { this->command_fields.mcast_seminc.size_y = noc_multicast_atomic_inc_command_header.size_y; this->command_fields.mcast_seminc.val = noc_multicast_atomic_inc_command_header.val; this->command_fields.mcast_seminc.wrap = noc_multicast_atomic_inc_command_header.wrap; + this->payload_size_bytes = payload_size_bytes; return this; } diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp index 16d003b1c71..355930b41ef 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp @@ -17,6 +17,7 @@ static constexpr size_t DESTINATION_HOP_COUNT = 1; static constexpr size_t LAST_MCAST_DESTINATION = 1; void print_pkt_hdr_routing_fields(volatile 
tt::fabric::PacketHeader *const packet_start) { +#ifdef DEBUG_PRINT_ENABLED switch (packet_start->chip_send_type) { case tt::fabric::CHIP_UNICAST: { DPRINT << "C_UNI: dist:" << (uint32_t) packet_start->routing_fields.chip_unicast.distance_in_hops << "\n"; @@ -28,13 +29,14 @@ void print_pkt_hdr_routing_fields(volatile tt::fabric::PacketHeader *const packe break; } }; +#endif } void print_pkt_header_noc_fields(volatile tt::fabric::PacketHeader *const packet_start) { +#ifdef DEBUG_PRINT_ENABLED switch (packet_start->noc_send_type) { case tt::fabric::NocSendType::NOC_UNICAST_WRITE: { - DPRINT << "N_WR addr:"<<(uint64_t)packet_start->command_fields.unicast_write.noc_address << - ", size:" << (uint32_t) packet_start->command_fields.unicast_write.size << "\n"; + DPRINT << "N_WR addr:"<<(uint64_t)packet_start->command_fields.unicast_write.noc_address << "\n"; } break; case tt::fabric::NocSendType::NOC_UNICAST_ATOMIC_INC: { DPRINT << "N_WR addr:"<<(uint64_t)packet_start->command_fields.unicast_seminc.noc_address << @@ -45,15 +47,19 @@ void print_pkt_header_noc_fields(volatile tt::fabric::PacketHeader *const packet ASSERT(false); // unimplemented break; }; +#endif } void print_pkt_header(volatile tt::fabric::PacketHeader *const packet_start) { +#ifdef DEBUG_PRINT_ENABLED auto const& header = *packet_start; DPRINT << "PKT: nsnd_t:" << (uint32_t) packet_start->noc_send_type << ", csnd_t:" << (uint32_t) packet_start->chip_send_type << - ", src_chip:" << (uint32_t) packet_start->reserved2 << "\n"; + ", src_chip:" << (uint32_t) packet_start->src_ch_id << + ", payload_size_bytes:" << (uint32_t) packet_start->payload_size_bytes << "\n"; print_pkt_hdr_routing_fields(packet_start); print_pkt_header_noc_fields(packet_start); +#endif } @@ -63,12 +69,11 @@ void execute_chip_unicast_to_local_chip(volatile tt::fabric::PacketHeader *const uint32_t payload_start_address = reinterpret_cast(packet_start) + sizeof(tt::fabric::PacketHeader); tt::fabric::NocSendType noc_send_type = packet_start->noc_send_type; + auto const payload_size_bytes = header.payload_size_bytes; switch (noc_send_type) { case tt::fabric::NocSendType::NOC_UNICAST_WRITE: { auto const dest_address = header.command_fields.unicast_write.noc_address; - auto const size = header.command_fields.unicast_write.size - sizeof(tt::fabric::PacketHeader); - noc_async_write_one_packet_with_trid(payload_start_address, dest_address, size, transaction_id); - + noc_async_write_one_packet_with_trid(payload_start_address, dest_address, payload_size_bytes, transaction_id); } break; case tt::fabric::NocSendType::NOC_MULTICAST_WRITE: { @@ -80,9 +85,7 @@ void execute_chip_unicast_to_local_chip(volatile tt::fabric::PacketHeader *const header.command_fields.mcast_write.noc_y_start + header.command_fields.mcast_write.mcast_rect_size_y, header.command_fields.mcast_write.address); auto const num_dests = header.command_fields.mcast_write.mcast_rect_size_x * header.command_fields.mcast_write.mcast_rect_size_y; - auto const size = header.command_fields.mcast_write.size - sizeof(tt::fabric::PacketHeader); - noc_async_write_one_packet_with_trid(payload_start_address, mcast_dest_address, size, num_dests, transaction_id); - + noc_async_write_one_packet_with_trid(payload_start_address, mcast_dest_address, payload_size_bytes, num_dests, transaction_id); } break; case tt::fabric::NocSendType::NOC_UNICAST_ATOMIC_INC: { @@ -92,6 +95,12 @@ void execute_chip_unicast_to_local_chip(volatile tt::fabric::PacketHeader *const } break; + case tt::fabric::NocSendType::NOC_UNICAST_INLINE_WRITE: { + 
auto const dest_address = header.command_fields.unicast_inline_write.noc_address; + auto const value = header.command_fields.unicast_inline_write.value; + noc_inline_dw_write(dest_address, value); + } break; + case tt::fabric::NocSendType::NOC_MULTICAST_ATOMIC_INC: default: { ASSERT(false); diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp index e913c18f7aa..96f245b1a6b 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp @@ -4,7 +4,7 @@ #include "dataflow_api.h" -#include "tt_metal/hw/inc/ethernet/dataflow_api.h" +#include "tt_metal/hw/inc/ethernet/tunneling.h" #include "cpp/ttnn/operations/ccl/kernels/edm/edm_handshake.hpp" #include "cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_worker_adapters.hpp" #include "cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp" @@ -298,6 +298,7 @@ struct WriteTransactionIdTracker { TransactionIdCounter trid_counter; }; +static constexpr uint32_t DEFAULT_ETH_TXQ = 0; // senders update this stream constexpr uint32_t to_receiver_pkts_sent_id = 0; @@ -339,11 +340,11 @@ void increment_local_update_ptr_val(uint8_t stream_id, int32_t val) { template void remote_update_ptr_val(int32_t val) { constexpr uint32_t addr = STREAM_REG_ADDR(stream_id, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_REG_INDEX); - eth_write_remote_reg(addr, val << REMOTE_DEST_BUF_WORDS_FREE_INC); + internal_::eth_write_remote_reg_no_txq_check(DEFAULT_ETH_TXQ, addr, val << REMOTE_DEST_BUF_WORDS_FREE_INC); } void remote_update_ptr_val(uint32_t stream_id, int32_t val) { const uint32_t addr = STREAM_REG_ADDR(stream_id, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_REG_INDEX); - eth_write_remote_reg(addr, val << REMOTE_DEST_BUF_WORDS_FREE_INC); + internal_::eth_write_remote_reg_no_txq_check(DEFAULT_ETH_TXQ, addr, val << REMOTE_DEST_BUF_WORDS_FREE_INC); } template @@ -494,12 +495,7 @@ void send_channel_sync( ) { auto src_addr = sender_buffer_channel.get_bytes_sent_address(sender_wrptr.get_buffer_index()); auto dest_addr = receiver_buffer_channel.get_bytes_sent_address(remote_receiver_wrptr.get_buffer_index()); - eth_send_bytes_over_channel_payload_only_unsafe( - reinterpret_cast(src_addr), - reinterpret_cast(dest_addr), - sizeof(eth_channel_sync_t), - sizeof(eth_channel_sync_t), - sizeof(eth_channel_sync_t) >> ETH_BYTES_TO_WORDS_SHIFT); + internal_::eth_send_packet_bytes_unsafe(DEFAULT_ETH_TXQ, src_addr, dest_addr, sizeof(eth_channel_sync_t)); } template @@ -514,7 +510,7 @@ void send_next_data( auto &local_sender_wrptr = sender_worker_interface.local_wrptr; auto local_sender_wrptr_buffer_index = local_sender_wrptr.get_buffer_index(); - ASSERT(!eth_txq_is_busy()); + ASSERT(!internal_::eth_txq_is_busy(DEFAULT_ETH_TXQ)); // TODO: TUNING - experiment with only conditionally breaking the transfer up into multiple packets if we are // a certain threshold less than full packet @@ -525,25 +521,19 @@ void send_next_data( auto volatile *pkt_header = reinterpret_cast(sender_buffer_channel.get_buffer_address(local_sender_wrptr_buffer_index)); ASSERT(tt::fabric::is_valid(*const_cast(pkt_header))); - size_t payload_size = 0; - payload_size = pkt_header->get_payload_size_including_header(); + size_t payload_size_bytes = pkt_header->get_payload_size_including_header(); pkt_header->src_ch_id = sender_channel_index; auto src_addr = 
sender_buffer_channel.get_buffer_address(local_sender_wrptr_buffer_index); auto dest_addr = receiver_buffer_channel.get_buffer_address(remote_receiver_wrptr.get_buffer_index()); - eth_send_bytes_over_channel_payload_only_unsafe( - src_addr, - dest_addr, - payload_size, - payload_size, - payload_size >> ETH_BYTES_TO_WORDS_SHIFT); - + internal_::eth_send_packet_bytes_unsafe(DEFAULT_ETH_TXQ, src_addr, dest_addr, payload_size_bytes); // Note: We can only advance to the next buffer index if we have fully completed the send (both the payload and sync // messages) local_sender_wrptr.increment(); // update the remote reg static constexpr uint32_t words_to_forward = 1; + while (internal_::eth_txq_is_busy(DEFAULT_ETH_TXQ)) {}; remote_update_ptr_val(words_to_forward); remote_receiver_wrptr.increment(); } @@ -666,7 +656,7 @@ bool run_sender_channel_step( // when moving to stream regs to manage rd/wr ptrs // TODO: update to be stream reg based. Initialize to space available and simply check for non-zero bool receiver_has_space_for_packet = outbound_to_receiver_channel_pointers.has_space_for_packet(); - if (receiver_has_space_for_packet && !eth_txq_is_busy()) { + if (receiver_has_space_for_packet && !internal_::eth_txq_is_busy(DEFAULT_ETH_TXQ)) { bool has_unsent_packet = local_sender_channel_worker_interface.has_unsent_payload(); if (has_unsent_packet) { bool sender_backpressured_from_sender_side = !(local_sender_channel_worker_interface.local_rdptr.distance_behind(local_sender_channel_worker_interface.local_wrptr) < SENDER_NUM_BUFFERS); @@ -757,7 +747,7 @@ void run_receiver_channel_step( auto &ack_ptr = receiver_channel_pointers.ack_ptr; auto pkts_received_since_last_check = get_ptr_val(); bool pkts_received = pkts_received_since_last_check > 0; - bool can_send_over_eth = !eth_txq_is_busy(); + bool can_send_over_eth = !internal_::eth_txq_is_busy(DEFAULT_ETH_TXQ); ASSERT(receiver_channel_pointers.completion_ptr.distance_behind(ack_ptr) < RECEIVER_NUM_BUFFERS); if (pkts_received && can_send_over_eth) { // currently only support processing one packet at a time, so we only decrement by 1 @@ -803,7 +793,7 @@ void run_receiver_channel_step( auto &completion_ptr = receiver_channel_pointers.completion_ptr; bool unsent_completions = !completion_ptr.is_caught_up_to(wr_flush_ptr); if (unsent_completions) { - bool can_send_without_blocking = !eth_txq_is_busy(); + bool can_send_without_blocking = !internal_::eth_txq_is_busy(DEFAULT_ETH_TXQ); if (can_send_without_blocking) { // completion ptr incremented in callee receiver_send_completion_ack( diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/minimal_ccl_common.hpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/minimal_ccl_common.hpp index a281806cafc..641e6cee244 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/minimal_ccl_common.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/minimal_ccl_common.hpp @@ -20,11 +20,8 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write( const auto [dest_noc_xy, dest_addr] = get_noc_address_components(noc0_dest_noc_addr); const size_t payload_l1_address = l1_read_addr; - size_t packet_send_size_bytes = payload_size_bytes + sizeof(tt::fabric::PacketHeader); - pkt_hdr_forward->to_noc_unicast_write( - tt::fabric::NocUnicastCommandHeader{noc0_dest_noc_addr, packet_send_size_bytes}); - pkt_hdr_backward->to_noc_unicast_write( - 
tt::fabric::NocUnicastCommandHeader{noc0_dest_noc_addr, packet_send_size_bytes}); + pkt_hdr_forward->to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_noc_addr}, payload_size_bytes); + pkt_hdr_backward->to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_noc_addr}, payload_size_bytes); noc_async_write(payload_l1_address, safe_get_noc_addr(dest_noc_xy.x, dest_noc_xy.y, dest_addr), payload_size_bytes); if (fabric_connection.has_forward_connection()) { From 684b4e5b0cc230c87e3a8778a582516608a941bb Mon Sep 17 00:00:00 2001 From: Sean Nijjar Date: Fri, 7 Feb 2025 17:21:03 +0000 Subject: [PATCH 2/7] unsure why this is needed now... maybe local environment/build issue...? --- ttnn/cpp/pybind11/global_semaphore.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/ttnn/cpp/pybind11/global_semaphore.cpp b/ttnn/cpp/pybind11/global_semaphore.cpp index bf9f82673c7..bdc7a2d977b 100644 --- a/ttnn/cpp/pybind11/global_semaphore.cpp +++ b/ttnn/cpp/pybind11/global_semaphore.cpp @@ -7,6 +7,7 @@ #include #include "cpp/ttnn/global_semaphore.hpp" #include "pybind11/pybind11.h" +#include "pybind11/stl.h" namespace ttnn::global_semaphore { From 792459a7da796f67f2f1418e2172949d4d9317c1 Mon Sep 17 00:00:00 2001 From: Sean Nijjar Date: Fri, 7 Feb 2025 17:51:10 +0000 Subject: [PATCH 3/7] revert accidental fix revert --- tests/ttnn/unit_tests/operations/ccl/test_new_all_gather.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ttnn/unit_tests/operations/ccl/test_new_all_gather.py b/tests/ttnn/unit_tests/operations/ccl/test_new_all_gather.py index a9816aefdcb..41f1076a2af 100644 --- a/tests/ttnn/unit_tests/operations/ccl/test_new_all_gather.py +++ b/tests/ttnn/unit_tests/operations/ccl/test_new_all_gather.py @@ -469,7 +469,7 @@ def test_all_gather( [1, 4, 32, 1280], 3, ttnn.TILE_LAYOUT, - (32, 128), + (32, 320), ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(1, 4))}), None, None, From 8160088c1df2754d0c763fca59824a3d8c6cae53 Mon Sep 17 00:00:00 2001 From: Sean Nijjar Date: Fri, 7 Feb 2025 18:34:17 +0000 Subject: [PATCH 4/7] sq --- .../ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp index 96f245b1a6b..ee7bf23295c 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp @@ -780,8 +780,6 @@ void run_receiver_channel_step( bool unflushed_writes = !wr_flush_ptr.is_caught_up_to(wr_sent_ptr); if (unflushed_writes) { auto receiver_buffer_index = wr_flush_ptr.get_buffer_index(); - // Temporary patch for instability. Issue was not caught due to what appears to be a bug in CI - // not running all tests. 
Issue tracked here: https://github.com/tenstorrent/tt-metal/issues/17702 bool next_trid_flushed = receiver_channel_trid_tracker.transaction_flushed(receiver_buffer_index); if (next_trid_flushed) { local_receiver_channel.eth_clear_sender_channel_ack(receiver_buffer_index); From 9e4c555981476b25fa24f910475104c732e73791 Mon Sep 17 00:00:00 2001 From: Sean Nijjar Date: Fri, 7 Feb 2025 22:28:41 +0000 Subject: [PATCH 5/7] simplify the fabric routing fields for edm fabric packet header --- .../gtests/ccl/kernels/edm_fabric_writer.cpp | 2 +- ...c_erisc_datamover_sender_worker_sender.cpp | 4 +- .../fabric_worker_sender_multi_input.cpp | 4 +- .../ccl/kernels/test_kernels.common.hpp | 2 +- .../kernel_common/kernel_writers.hpp | 2 +- .../kernels/ccl_send_reader_two_input.cpp | 5 +- .../edm_fabric/fabric_edm_packet_header.hpp | 42 ++++++----- .../fabric_edm_packet_transmission.hpp | 68 ++++-------------- .../edm_fabric/fabric_erisc_datamover.cpp | 70 +++++++------------ 9 files changed, 71 insertions(+), 128 deletions(-) diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp index 2a41846b929..952a4963104 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp @@ -128,7 +128,7 @@ void kernel_main() { mcast_fwd_packet_header->to_chip_multicast(MulticastRoutingCommandHeader{1, static_cast(mcast_fwd_hops)}); mcast_bwd_packet_header->to_chip_multicast(MulticastRoutingCommandHeader{1, static_cast(mcast_bwd_hops)}); - unicast_packet_header->to_chip_unicast(UnicastRoutingCommandHeader{static_cast(unicast_hops)}); + unicast_packet_header->to_chip_unicast(static_cast(unicast_hops)); { DeviceZoneScopedN("MAIN-WRITE-ZONE"); diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp index 98895a2c1de..b1ff19c7bec 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp @@ -131,7 +131,7 @@ void kernel_main() { .to_noc_unicast_write( tt::fabric::NocUnicastCommandHeader{dest_noc_address}, (pages_to_send * page_size)); } else { - packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{config.unicast.distance}) + packet_header.to_chip_unicast(config.unicast.distance) .to_noc_unicast_write( tt::fabric::NocUnicastCommandHeader{dest_noc_address}, (pages_to_send * page_size)); } @@ -148,7 +148,7 @@ void kernel_main() { ASSERT(*last_message_semaphore_address == 0); uint64_t last_message_semaphore_noc0_addr = safe_get_noc_addr(my_x[0], my_y[0], (uint32_t)last_message_semaphore_address, 0); - packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{2}); + packet_header.to_chip_unicast(2); packet_header.to_noc_unicast_atomic_inc( tt::fabric::NocUnicastAtomicIncCommandHeader(last_message_semaphore_noc0_addr, 1, 32)); diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp index 704b516c48c..eaa14a0e40f 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp @@ -61,7 +61,7 @@ auto forward_to_fabric_from_cb( 
.to_chip_multicast(tt::fabric::MulticastRoutingCommandHeader{config.mcast.distance, config.mcast.range}) .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_address}, (pages_to_send * page_size)); } else { - packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{config.unicast.distance}) + packet_header.to_chip_unicast(config.unicast.distance) .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_address}, (pages_to_send * page_size)); } @@ -187,7 +187,7 @@ void kernel_main() { packet_header.reserved = 0xE; packet_header.reserved2 = 0xFFFF; uint64_t last_message_sem_noc_addr = get_noc_addr(my_x[0], my_y[0], last_message_semaphore_address); - packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{kLoopbackNumHopsToMyChip}); + packet_header.to_chip_unicast(kLoopbackNumHopsToMyChip); packet_header.to_noc_unicast_atomic_inc( tt::fabric::NocUnicastAtomicIncCommandHeader(last_message_sem_noc_addr, 1, 32)); diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/test_kernels.common.hpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/test_kernels.common.hpp index 4ef80a5f68f..ae5e9135a2b 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/test_kernels.common.hpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/test_kernels.common.hpp @@ -32,7 +32,7 @@ bool terminate_fabric_endpoints_farthest_to_nearest ( auto &packet_header = *reinterpret_cast(a_packet_header_addr); reinterpret_cast(a_packet_header_addr)[sizeof(tt::fabric::PacketHeader) >> 2] = tt::fabric::TerminationSignal::GRACEFULLY_TERMINATE; sender.wait_for_empty_write_slot(); - packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{static_cast(distance)}) + packet_header.to_chip_unicast(static_cast(distance)) .to_noc_unicast_write( tt::fabric::NocUnicastCommandHeader{termination_sig_noc_addr}, sizeof(tt::fabric::PacketHeader) + sizeof(uint32_t)); diff --git a/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/kernel_writers.hpp b/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/kernel_writers.hpp index 0df726fd332..fd6bae7f5ee 100644 --- a/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/kernel_writers.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/kernel_writers.hpp @@ -41,7 +41,7 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write( auto& fabric_conn = unicast_args.is_forward_direction ? 
fabric_connection.get_forward_connection() : fabric_connection.get_backward_connection(); - pkt_hdr->to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{unicast_args.distance_in_hops}); + pkt_hdr->to_chip_unicast(unicast_args.distance_in_hops); fabric_conn.wait_for_empty_write_slot(); fabric_conn.send_payload_without_header_non_blocking_from_address(l1_read_addr, payload_size_bytes); fabric_conn.send_payload_flush_blocking_from_address((uint32_t)pkt_hdr, sizeof(tt::fabric::PacketHeader)); diff --git a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp index 50c565ca9f6..731ed70359e 100644 --- a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp @@ -450,8 +450,7 @@ void try_advance_inline_write_or_atomic_inc(command_context_t& cmd_ctx) switch (cmd_ctx.current_cmd_header.dest_type) { case ttnn::ccl::cmd::CclCommandDestType::CHIP_UNICAST: { - pkt_hdr->to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{ - cmd_ctx.current_cmd_header.get_unicast_dest_args().distance_in_hops}); + pkt_hdr->to_chip_unicast(cmd_ctx.current_cmd_header.get_unicast_dest_args().distance_in_hops); auto& fabric_connection = cmd_ctx.current_cmd_header.get_unicast_dest_args().is_forward_direction ? cmd_ctx.fabric_connection.get_forward_connection() @@ -570,7 +569,7 @@ void write_and_advance_local_read_address_for_fabric_write( auto& fabric_conn = unicast_args.is_forward_direction ? fabric_connection.get_forward_connection() : fabric_connection.get_backward_connection(); - pkt_hdr->to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{unicast_args.distance_in_hops}); + pkt_hdr->to_chip_unicast(unicast_args.distance_in_hops); fabric_conn.wait_for_empty_write_slot(); fabric_conn.send_payload_without_header_non_blocking_from_address(l1_read_addr, payload_size_bytes); diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp index 3d13532e569..9a5cfcb40f9 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp @@ -6,6 +6,7 @@ #include #include +#include namespace tt::fabric { @@ -34,20 +35,26 @@ enum ChipSendType : uint8_t { CHIP_MULTICAST = 1, }; -struct UnicastRoutingCommandHeader { - uint8_t distance_in_hops; +struct RoutingFields { + static constexpr uint8_t START_DISTANCE_FIELD_BIT_WIDTH = 4; + static constexpr uint8_t RANGE_HOPS_FIELD_BIT_WIDTH = 4; + static constexpr uint8_t LAST_HOP_DISTANCE_VAL = 1; + static constexpr uint8_t LAST_CHIP_IN_MCAST_VAL = 1 << tt::fabric::RoutingFields::START_DISTANCE_FIELD_BIT_WIDTH; + static constexpr uint8_t HOP_DISTANCE_MASK = (1 << tt::fabric::RoutingFields::RANGE_HOPS_FIELD_BIT_WIDTH) - 1; + static constexpr uint8_t RANGE_MASK = ((1 << tt::fabric::RoutingFields::RANGE_HOPS_FIELD_BIT_WIDTH) - 1) + << tt::fabric::RoutingFields::START_DISTANCE_FIELD_BIT_WIDTH; + static constexpr uint8_t LAST_MCAST_VAL = LAST_CHIP_IN_MCAST_VAL | LAST_HOP_DISTANCE_VAL; + + uint8_t value; }; -static_assert(sizeof(UnicastRoutingCommandHeader) == 1, "UnicastRoutingCommandHeader size is not 1 byte"); +static_assert(sizeof(RoutingFields) == sizeof(uint8_t), "RoutingFields size is not 1 bytes"); +static_assert((RoutingFields::START_DISTANCE_FIELD_BIT_WIDTH + 
RoutingFields::RANGE_HOPS_FIELD_BIT_WIDTH) <= sizeof(RoutingFields) * 8, "START_DISTANCE_FIELD_BIT_WIDTH + RANGE_HOPS_FIELD_BIT_WIDTH must equal 8"); + struct MulticastRoutingCommandHeader { - uint8_t start_distance_in_hops: 4; - uint8_t range_hops: 4; // 0 implies unicast -}; -static_assert(sizeof(MulticastRoutingCommandHeader) == 1, "MulticastRoutingCommandHeader size is not 1 byte"); -union RoutingFields { - UnicastRoutingCommandHeader chip_unicast; - MulticastRoutingCommandHeader chip_mcast; + uint8_t start_distance_in_hops: RoutingFields::START_DISTANCE_FIELD_BIT_WIDTH; + uint8_t range_hops: RoutingFields::RANGE_HOPS_FIELD_BIT_WIDTH; // 0 implies unicast }; -static_assert(sizeof(RoutingFields) == sizeof(UnicastRoutingCommandHeader), "RoutingFields size is not 1 bytes"); +static_assert(sizeof(MulticastRoutingCommandHeader) <= sizeof(RoutingFields), "MulticastRoutingCommandHeader size is not 1 byte"); struct NocUnicastCommandHeader { uint64_t noc_address; @@ -136,14 +143,14 @@ struct PacketHeader { return get_payload_size_excluding_header() + sizeof(PacketHeader); } - inline PacketHeader &to_chip_unicast(UnicastRoutingCommandHeader const &chip_unicast_command_header) { + inline PacketHeader &to_chip_unicast(uint8_t distance_in_hops) { this->chip_send_type = CHIP_UNICAST; - this->routing_fields.chip_unicast = chip_unicast_command_header; + this->routing_fields.value = RoutingFields::LAST_CHIP_IN_MCAST_VAL | distance_in_hops; return *this; } inline PacketHeader &to_chip_multicast(MulticastRoutingCommandHeader const &chip_multicast_command_header) { this->chip_send_type = CHIP_MULTICAST; - this->routing_fields.chip_mcast = chip_multicast_command_header; + this->routing_fields.value = ((static_cast(chip_multicast_command_header.range_hops) << RoutingFields::START_DISTANCE_FIELD_BIT_WIDTH)) | static_cast(chip_multicast_command_header.start_distance_in_hops); return *this; } @@ -180,15 +187,14 @@ struct PacketHeader { return *this; } - inline volatile PacketHeader *to_chip_unicast(UnicastRoutingCommandHeader const &chip_unicast_command_header) volatile { + inline volatile PacketHeader *to_chip_unicast(uint8_t distance_in_hops) volatile { this->chip_send_type = CHIP_UNICAST; - this->routing_fields.chip_unicast.distance_in_hops = chip_unicast_command_header.distance_in_hops; + this->routing_fields.value = RoutingFields::LAST_CHIP_IN_MCAST_VAL | distance_in_hops; return this; } inline volatile PacketHeader *to_chip_multicast(MulticastRoutingCommandHeader const &chip_multicast_command_header) volatile { this->chip_send_type = CHIP_MULTICAST; - this->routing_fields.chip_mcast.range_hops = chip_multicast_command_header.range_hops; - this->routing_fields.chip_mcast.start_distance_in_hops = chip_multicast_command_header.start_distance_in_hops; + this->routing_fields.value = (static_cast(chip_multicast_command_header.range_hops) << RoutingFields::START_DISTANCE_FIELD_BIT_WIDTH) | chip_multicast_command_header.start_distance_in_hops; return this; } inline volatile PacketHeader *to_noc_unicast_write(NocUnicastCommandHeader const &noc_unicast_command_header, size_t payload_size_bytes) volatile { diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp index 355930b41ef..35533d4d26e 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp @@ -20,12 +20,12 @@ 
void print_pkt_hdr_routing_fields(volatile tt::fabric::PacketHeader *const packe #ifdef DEBUG_PRINT_ENABLED switch (packet_start->chip_send_type) { case tt::fabric::CHIP_UNICAST: { - DPRINT << "C_UNI: dist:" << (uint32_t) packet_start->routing_fields.chip_unicast.distance_in_hops << "\n"; + DPRINT << "C_UNI: dist:" << (uint32_t) (packet_start->routing_fields.value & tt::fabric::RoutingFields::HOP_DISTANCE_MASK) << "\n"; break; } case tt::fabric::CHIP_MULTICAST: { - DPRINT << "C_MCST: dist:" << (uint32_t) packet_start->routing_fields.chip_mcast.start_distance_in_hops << - ", rng:" << (uint32_t) packet_start->routing_fields.chip_mcast.range_hops << "\n"; + DPRINT << "C_MCST: dist:" << (uint32_t) (packet_start->routing_fields.value & tt::fabric::RoutingFields::HOP_DISTANCE_MASK) << + ", rng:" << (uint32_t)((packet_start->routing_fields.value & tt::fabric::RoutingFields::RANGE_MASK) >> tt::fabric::RoutingFields::START_DISTANCE_FIELD_BIT_WIDTH) << "\n"; break; } }; @@ -64,7 +64,7 @@ void print_pkt_header(volatile tt::fabric::PacketHeader *const packet_start) { // Since we unicast to local, we must omit the packet header -void execute_chip_unicast_to_local_chip(volatile tt::fabric::PacketHeader *const packet_start, uint32_t transaction_id) { +FORCE_INLINE void execute_chip_unicast_to_local_chip(volatile tt::fabric::PacketHeader *const packet_start, uint32_t transaction_id) { auto const& header = *packet_start; uint32_t payload_start_address = reinterpret_cast(packet_start) + sizeof(tt::fabric::PacketHeader); @@ -108,24 +108,12 @@ void execute_chip_unicast_to_local_chip(volatile tt::fabric::PacketHeader *const }; } - - -void update_packet_header_for_next_hop(volatile tt::fabric::PacketHeader * packet_header) { - switch (packet_header->chip_send_type) { - case tt::fabric::CHIP_UNICAST: { - ASSERT(packet_header->routing_fields.chip_unicast.distance_in_hops > 0); - packet_header->routing_fields.chip_unicast.distance_in_hops--; - } break; - case tt::fabric::CHIP_MULTICAST: { - if (packet_header->routing_fields.chip_mcast.start_distance_in_hops == DESTINATION_HOP_COUNT) { - ASSERT(packet_header->routing_fields.chip_mcast.range_hops > 0); - packet_header->routing_fields.chip_mcast.range_hops--; - } else { - ASSERT(packet_header->routing_fields.chip_mcast.start_distance_in_hops > 0); - packet_header->routing_fields.chip_mcast.start_distance_in_hops--; - } - } break; - } +FORCE_INLINE void update_packet_header_for_next_hop(volatile tt::fabric::PacketHeader * packet_header, tt::fabric::RoutingFields cached_routing_fields) { + // if the distance field is one, it means the range field decrements, else the start distance field decrements + // TODO [optimization]: If we can make the terminal value 0, then we can save an instruction on the eq insn + bool decrement_range = (cached_routing_fields.value & tt::fabric::RoutingFields::HOP_DISTANCE_MASK) == tt::fabric::RoutingFields::LAST_HOP_DISTANCE_VAL; + uint8_t decrement_val = static_cast(1) << (decrement_range * tt::fabric::RoutingFields::RANGE_HOPS_FIELD_BIT_WIDTH); + packet_header->routing_fields.value = cached_routing_fields.value - decrement_val; } // This function forwards a packet to the downstream EDM channel for eventual sending @@ -137,8 +125,9 @@ void update_packet_header_for_next_hop(volatile tt::fabric::PacketHeader * packe // !!!WARNING!!! * do NOT call before determining if the packet should be consumed locally or forwarded // !!!WARNING!!! * ENSURE DOWNSTREAM EDM HAS SPACE FOR PACKET BEFORE CALLING // !!!WARNING!!! 
-void forward_payload_to_downstream_edm( +FORCE_INLINE void forward_payload_to_downstream_edm( volatile tt::fabric::PacketHeader *packet_header, + tt::fabric::RoutingFields cached_routing_fields, tt::fabric::WorkerToFabricEdmSender &downstream_edm_interface, uint8_t transaction_id ) { @@ -148,40 +137,9 @@ void forward_payload_to_downstream_edm( // This is a good place to print the packet header for debug if you are trying to inspect packets // because it is before we start manipulating the header for forwarding - update_packet_header_for_next_hop(packet_header); + update_packet_header_for_next_hop(packet_header, cached_routing_fields); downstream_edm_interface.send_payload_non_blocking_from_address_with_trid( reinterpret_cast(packet_header), packet_header->get_payload_size_including_header(), transaction_id); } - - -bool packet_must_be_consumed_locally(volatile tt::fabric::PacketHeader const& packet_header) { - switch (packet_header.chip_send_type) { - case tt::fabric::ChipSendType::CHIP_UNICAST: { - return packet_header.routing_fields.chip_unicast.distance_in_hops == DESTINATION_HOP_COUNT; - } - case tt::fabric::ChipSendType::CHIP_MULTICAST: { - return packet_header.routing_fields.chip_mcast.start_distance_in_hops == DESTINATION_HOP_COUNT; - } - default: { - ASSERT(false); - return false; - } - } -} - - -bool packet_must_be_forwarded_to_next_chip(volatile tt::fabric::PacketHeader const& packet_header) { - switch (packet_header.chip_send_type) { - case tt::fabric::ChipSendType::CHIP_UNICAST: - return packet_header.routing_fields.chip_unicast.distance_in_hops != DESTINATION_HOP_COUNT; - - case tt::fabric::ChipSendType::CHIP_MULTICAST: - return packet_header.routing_fields.chip_mcast.range_hops != LAST_MCAST_DESTINATION; - - default: - ASSERT(false); - return false; - } -} diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp index ee7bf23295c..c954564109b 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp @@ -584,54 +584,32 @@ FORCE_INLINE void receiver_send_completion_ack( } -PacketLocalForwardType get_packet_local_forward_type(const volatile tt::fabric::PacketHeader &packet_header) { - const bool local_chip_is_packet_destination = packet_must_be_consumed_locally(packet_header); - const bool packet_needs_forwarding = packet_must_be_forwarded_to_next_chip(packet_header); - PacketLocalForwardType forward_type = - static_cast(packet_needs_forwarding << 1 | local_chip_is_packet_destination); - return forward_type; -} - FORCE_INLINE bool can_forward_packet_completely( - const volatile tt::fabric::PacketHeader *packet_header, tt::fabric::WorkerToFabricEdmSender &downstream_edm_interface) { - auto forward_status = get_packet_local_forward_type(*packet_header); - - switch (forward_status) { - case PACKET_FORWARD_INVALID: return false; - case PACKET_FORWARD_LOCAL_ONLY: return true; - - case PACKET_FORWARD_REMOTE_ONLY: - case PACKET_FORWARD_LOCAL_AND_REMOTE: return downstream_edm_interface.edm_has_space_for_packet(); - default: ASSERT(false); return false; - }; + const volatile tt::fabric::PacketHeader* packet_header, + tt::fabric::RoutingFields cached_routing_fields, + tt::fabric::WorkerToFabricEdmSender& downstream_edm_interface) { + // We always check if it is the terminal mcast packet value. 
We can do this because all unicast packets have the + // mcast terminal value masked into the routing field. This simplifies the check here to a single compare. + bool deliver_locally_only = cached_routing_fields.value == tt::fabric::RoutingFields::LAST_MCAST_VAL; + return deliver_locally_only || downstream_edm_interface.edm_has_space_for_packet(); } // !!!WARNING!!! - MAKE SURE CONSUMER HAS SPACE BEFORE CALLING void receiver_forward_packet( - volatile tt::fabric::PacketHeader *packet_start, tt::fabric::WorkerToFabricEdmSender &downstream_edm_interface, uint8_t transaction_id) { - // Just cache the packet_header - we don't really expect (or care) if contents change during this function. - volatile tt::fabric::PacketHeader const &packet_header = *packet_start; - ASSERT(tt::fabric::is_valid(const_cast(packet_header))); - auto forward_status = get_packet_local_forward_type(packet_header); - switch (forward_status) { - case PACKET_FORWARD_LOCAL_ONLY: { - execute_chip_unicast_to_local_chip(packet_start, transaction_id); - } break; - - case PACKET_FORWARD_REMOTE_ONLY: { - forward_payload_to_downstream_edm(packet_start, downstream_edm_interface, transaction_id); - } break; - - case PACKET_FORWARD_LOCAL_AND_REMOTE: { - ASSERT(packet_header.chip_send_type == tt::fabric::ChipSendType::CHIP_MULTICAST); - // TODO: make local chip write non-blocking - execute_chip_unicast_to_local_chip(packet_start, transaction_id); - forward_payload_to_downstream_edm(packet_start, downstream_edm_interface, transaction_id); - } break; - - case PACKET_FORWARD_INVALID: - default: ASSERT(false); - }; + // TODO: have a separate cached copy of the packet header to save some additional L1 loads + volatile tt::fabric::PacketHeader *packet_start, + tt::fabric::RoutingFields cached_routing_fields, + tt::fabric::WorkerToFabricEdmSender &downstream_edm_interface, + uint8_t transaction_id) { + + bool start_distance_is_terminal_value = (cached_routing_fields.value & tt::fabric::RoutingFields::HOP_DISTANCE_MASK) == tt::fabric::RoutingFields::LAST_HOP_DISTANCE_VAL; + if (start_distance_is_terminal_value) { + execute_chip_unicast_to_local_chip(packet_start, transaction_id); + } + bool not_last_destination_device = cached_routing_fields.value != tt::fabric::RoutingFields::LAST_MCAST_VAL; + if (not_last_destination_device) { + forward_payload_to_downstream_edm(packet_start, cached_routing_fields, downstream_edm_interface, transaction_id); + } } //////////////////////////////////// @@ -765,13 +743,15 @@ void run_receiver_channel_step( if (unwritten_packets) { auto receiver_buffer_index = wr_sent_ptr.get_buffer_index(); volatile auto packet_header = local_receiver_channel.get_packet_header(receiver_buffer_index); + + tt::fabric::RoutingFields cached_routing_fields = const_cast<tt::fabric::PacketHeader*>(packet_header)->routing_fields; print_pkt_header(packet_header); bool can_send_to_all_local_chip_receivers = - can_forward_packet_completely(packet_header, downstream_edm_interface); + can_forward_packet_completely(packet_header, cached_routing_fields, downstream_edm_interface); bool trid_flushed = receiver_channel_trid_tracker.transaction_flushed(receiver_buffer_index); if (can_send_to_all_local_chip_receivers && trid_flushed) { uint8_t trid = receiver_channel_trid_tracker.update_buffer_slot_to_next_trid_and_advance_trid_counter(receiver_buffer_index); - receiver_forward_packet(packet_header, downstream_edm_interface, trid); + receiver_forward_packet(packet_header, cached_routing_fields, downstream_edm_interface, trid); wr_sent_ptr.increment(); } } From
8f653f59962ccc52d59429038088c3e23a92fd21 Mon Sep 17 00:00:00 2001 From: Sean Nijjar Date: Fri, 7 Feb 2025 22:46:57 +0000 Subject: [PATCH 6/7] FORCE_INLINE small edm functions --- .../edm_fabric/fabric_erisc_datamover.cpp | 6 ++- .../fabric_erisc_datamover_channels.hpp | 45 ++++++++++--------- 2 files changed, 28 insertions(+), 23 deletions(-) diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp index c954564109b..f8089a768ac 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp @@ -618,7 +618,7 @@ void receiver_forward_packet( //////////////////////////////////// //////////////////////////////////// template -bool run_sender_channel_step( +FORCE_INLINE bool run_sender_channel_step( tt::fabric::EthChannelBuffer &local_sender_channel, tt::fabric::EdmChannelWorkerInterface &local_sender_channel_worker_interface, OutboundReceiverChannelPointers &outbound_to_receiver_channel_pointers, @@ -711,7 +711,7 @@ bool run_sender_channel_step( }; template -void run_receiver_channel_step( +FORCE_INLINE void run_receiver_channel_step( tt::fabric::EthChannelBuffer &local_receiver_channel, std::array, NUM_SENDER_CHANNELS> &remote_sender_channnels, tt::fabric::WorkerToFabricEdmSender &downstream_edm_interface, @@ -741,6 +741,7 @@ void run_receiver_channel_step( auto &wr_sent_ptr = receiver_channel_pointers.wr_sent_ptr; bool unwritten_packets = !wr_sent_ptr.is_caught_up_to(ack_ptr); if (unwritten_packets) { + DeviceZoneScopedN("EDMR-Send-Chk"); auto receiver_buffer_index = wr_sent_ptr.get_buffer_index(); volatile auto packet_header = local_receiver_channel.get_packet_header(receiver_buffer_index); @@ -750,6 +751,7 @@ void run_receiver_channel_step( can_forward_packet_completely(packet_header, cached_routing_fields, downstream_edm_interface); bool trid_flushed = receiver_channel_trid_tracker.transaction_flushed(receiver_buffer_index); if (can_send_to_all_local_chip_receivers && trid_flushed) { + DeviceZoneScopedN("EDMR-Send-Impl"); uint8_t trid = receiver_channel_trid_tracker.update_buffer_slot_to_next_trid_and_advance_trid_counter(receiver_buffer_index); receiver_forward_packet(packet_header, cached_routing_fields, downstream_edm_interface, trid); wr_sent_ptr.increment(); diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp index a5d8298bbff..9c6ae1924bc 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp @@ -24,13 +24,13 @@ template class NamedType { public: - explicit NamedType(T const& value) : value_(value) {} - explicit NamedType(T&& value) : value_(std::move(value)) {} - NamedType &operator=(NamedType const& rhs) = default; - T& get() { return value_; } - T const& get() const {return value_; } - operator T() const { return value_; } - operator T&() { return value_; } + FORCE_INLINE explicit NamedType(T const& value) : value_(value) {} + FORCE_INLINE explicit NamedType(T&& value) : value_(std::move(value)) {} + FORCE_INLINE NamedType &operator=(NamedType const& rhs) = default; + FORCE_INLINE T& get() { return value_; } + FORCE_INLINE T const& get() const {return value_; } + FORCE_INLINE operator T() const { return value_; 
} + FORCE_INLINE operator T&() { return value_; } private: T value_; }; @@ -41,6 +41,7 @@ using BufferPtr = NamedType; // Increments val and wraps to 0 if it reaches limit template +FORCE_INLINE auto wrap_increment(T val) -> T { static_assert(LIMIT != 0, "wrap_increment called with limit of 0; it must be greater than 0"); constexpr bool is_pow2 = is_power_of_2(LIMIT); @@ -55,6 +56,7 @@ auto wrap_increment(T val) -> T { } } template +FORCE_INLINE auto wrap_increment_n(T val, uint8_t increment) -> T { static_assert(LIMIT != 0, "wrap_increment called with limit of 0; it must be greater than 0"); constexpr bool is_pow2 = is_power_of_2(LIMIT); @@ -72,6 +74,7 @@ auto wrap_increment_n(T val, uint8_t increment) -> T { } template +FORCE_INLINE auto normalize_ptr(BufferPtr ptr) -> BufferIndex { static_assert(NUM_BUFFERS != 0, "normalize_ptr called with NUM_BUFFERS of 0; it must be greater than 0"); constexpr bool is_size_pow2 = (NUM_BUFFERS & (NUM_BUFFERS - 1)) == 0; @@ -112,38 +115,38 @@ class ChannelBufferPointer { /* * Returns the "raw" pointer - not usable to index the buffer channel */ - BufferPtr get_ptr() const { + FORCE_INLINE BufferPtr get_ptr() const { return this->ptr; } - bool is_caught_up_to(ChannelBufferPointer const& leading_ptr) const { + FORCE_INLINE bool is_caught_up_to(ChannelBufferPointer const& leading_ptr) const { return this->is_caught_up_to(leading_ptr.get_ptr()); } - uint8_t distance_behind(ChannelBufferPointer const& leading_ptr) const { + FORCE_INLINE uint8_t distance_behind(ChannelBufferPointer const& leading_ptr) const { return this->distance_behind(leading_ptr.get_ptr()); } /* * Returns the buffer index pointer which is usable to index into the buffer memory */ - BufferIndex get_buffer_index() const { + FORCE_INLINE BufferIndex get_buffer_index() const { return BufferIndex{normalize_ptr(this->ptr)}; } - void increment_n(uint8_t n) { + FORCE_INLINE void increment_n(uint8_t n) { this->ptr = BufferPtr{wrap_increment_n<2*NUM_BUFFERS>(this->ptr.get(), n)}; } - void increment() { + FORCE_INLINE void increment() { this->ptr = wrap_increment<2*NUM_BUFFERS>(this->ptr); } private: // Make these private to make sure caller doesn't accidentally mix two pointers pointing to // different sized channels - bool is_caught_up_to(BufferPtr const& leading_ptr) const { + FORCE_INLINE bool is_caught_up_to(BufferPtr const& leading_ptr) const { return this->get_ptr() == leading_ptr; } - uint8_t distance_behind(BufferPtr const& leading_ptr) const { + FORCE_INLINE uint8_t distance_behind(BufferPtr const& leading_ptr) const { bool leading_gte_trailing_ptr = leading_ptr >= this->ptr; if constexpr (is_size_pow2) { return (leading_ptr - this->ptr) & ptr_wrap_mask; @@ -281,15 +284,15 @@ class EthChannelBuffer final { return drained; } - bool needs_to_send_channel_sync() const { + FORCE_INLINE bool needs_to_send_channel_sync() const { return this->need_to_send_channel_sync; } - void set_need_to_send_channel_sync(bool need_to_send_channel_sync) { + FORCE_INLINE void set_need_to_send_channel_sync(bool need_to_send_channel_sync) { this->need_to_send_channel_sync = need_to_send_channel_sync; } - void clear_need_to_send_channel_sync() { + FORCE_INLINE void clear_need_to_send_channel_sync() { this->need_to_send_channel_sync = false; } @@ -376,15 +379,15 @@ struct EdmChannelWorkerInterface { noc_semaphore_inc(worker_semaphore_address, 1); } - bool all_eth_packets_acked() const { + FORCE_INLINE bool all_eth_packets_acked() const { return this->local_ackptr.is_caught_up_to(this->local_wrptr); } - bool 
all_eth_packets_completed() const { + FORCE_INLINE bool all_eth_packets_completed() const { return this->local_rdptr.is_caught_up_to(this->local_wrptr); } // Call to keep the connection flow control info fresh with worker. - void propagate_ackptr_to_connection_info() { + FORCE_INLINE void propagate_ackptr_to_connection_info() { worker_location_info_ptr->edm_rdptr = local_ackptr.get_ptr(); } From 78f8965260a29f79ee5483af85caef9eaf9e07e7 Mon Sep 17 00:00:00 2001 From: Sean Nijjar Date: Sun, 9 Feb 2025 18:05:45 +0000 Subject: [PATCH 7/7] Unit test regression fix and dead code removal --- ...c_erisc_datamover_sender_worker_sender.cpp | 13 +- ...erisc_data_mover_loopback_with_workers.cpp | 14 +- .../edm_fabric/fabric_erisc_datamover.cpp | 176 ++++++++++-------- .../fabric_erisc_datamover_channels.hpp | 68 +------ 4 files changed, 122 insertions(+), 149 deletions(-) diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp index b1ff19c7bec..b210f32efb5 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp @@ -122,17 +122,18 @@ void kernel_main() { // bit of a hack to extract X/Y const auto dest_noc_address = get_noc_addr(p, dest_addr_gen, 0, NORMALIZED_NOC_INDEX); - const size_t packet_size = page_size; + const size_t packet_size = page_size + sizeof(tt::fabric::PacketHeader); auto packet_addr = get_read_ptr(cb_id_in0); - auto& packet_header = *reinterpret_cast(packet_addr); + auto* packet_header = reinterpret_cast(packet_addr); if constexpr (mcast_mode) { packet_header - .to_chip_multicast(tt::fabric::MulticastRoutingCommandHeader{config.mcast.distance, config.mcast.range}) - .to_noc_unicast_write( + ->to_chip_multicast( + tt::fabric::MulticastRoutingCommandHeader{config.mcast.distance, config.mcast.range}) + ->to_noc_unicast_write( tt::fabric::NocUnicastCommandHeader{dest_noc_address}, (pages_to_send * page_size)); } else { - packet_header.to_chip_unicast(config.unicast.distance) - .to_noc_unicast_write( + packet_header->to_chip_unicast(config.unicast.distance) + ->to_noc_unicast_write( tt::fabric::NocUnicastCommandHeader{dest_noc_address}, (pages_to_send * page_size)); } diff --git a/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp b/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp index ee3a644e06e..4f9eadf730c 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp @@ -3266,7 +3266,6 @@ TEST(EdmFabric, DISABLED_BasicMcastThroughputTest_SenderFullNoWrap_ReceiverNoWra RunWriteThroughputStabilityTestWithPersistentFabric( num_mcasts, num_unicasts, num_links, num_op_invocations, params); } -// hangs with DPRINT TEST(EdmFabric, BasicMcastThroughputTest_SenderFullNoWrap_ReceiverNoWrap_2Device) { const size_t num_mcasts = 9; const size_t num_unicasts = 0; @@ -3294,7 +3293,6 @@ TEST(EdmFabric, DISABLED_BasicMcastThroughputTest_SenderFullNoWrap_ReceiverNoWra RunWriteThroughputStabilityTestWithPersistentFabric( num_mcasts, num_unicasts, num_links, num_op_invocations, params); } -// First to hang - maybe somethign to do with merging traffic TEST(EdmFabric, 
DISABLED_BasicMcastThroughputTest_SenderFullNoWrap_ReceiverNoWrap_TwoWorkers_4Device) { const size_t num_mcasts = 9; const size_t num_unicasts = 0; @@ -3603,6 +3601,18 @@ TEST(EdmFabric, BasicMcastThroughputTest_3) { RunWriteThroughputStabilityTestWithPersistentFabric( num_mcasts, num_unicasts, num_links, num_op_invocations, params); } +TEST(EdmFabric, BasicMcastThroughputTest_3_onehop) { + const size_t num_mcasts = 200000; + const size_t num_unicasts = 2; + const size_t num_links = 1; + const size_t num_op_invocations = 1; + const bool line_sync = true; + WriteThroughputStabilityTestWithPersistentFabricParams params; + params.line_sync = line_sync; + params.line_size = 2; + RunWriteThroughputStabilityTestWithPersistentFabric( + num_mcasts, num_unicasts, num_links, num_op_invocations, params); +} TEST(EdmFabric, BasicMcastThroughputTest_4) { const size_t num_mcasts = 800000; const size_t num_unicasts = 2; diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp index f8089a768ac..b0c732ee00b 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp @@ -23,6 +23,9 @@ using ttnn::ccl::WorkerXY; +static constexpr bool enable_first_level_ack = true; +static constexpr bool fuse_receiver_flush_and_completion_ptr = true; + /* The fabric Erisc Data Mover (EDM) is a component that can be used to build *very* simple linear topology fabrics. @@ -247,11 +250,11 @@ constexpr uint8_t NUM_TRANSACTION_IDS = 4; template struct TransactionIdCounter { - void increment() { + FORCE_INLINE void increment() { this->next_trid = tt::fabric::wrap_increment(this->next_trid); } - uint8_t get() const { + FORCE_INLINE uint8_t get() const { return this->next_trid; } @@ -314,15 +317,11 @@ constexpr uint32_t to_sender_1_pkts_completed_id = 4; // This will be an atomic register read to the register template -int32_t get_ptr_val() { +FORCE_INLINE int32_t get_ptr_val() { return NOC_STREAM_READ_REG(stream_id, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_REG_INDEX); - constexpr uint32_t addr = STREAM_REG_ADDR(stream_id, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_REG_INDEX); - return *reinterpret_cast(addr); } -int32_t get_ptr_val(uint8_t stream_id) { +FORCE_INLINE int32_t get_ptr_val(uint8_t stream_id) { return NOC_STREAM_READ_REG(stream_id, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_REG_INDEX); - const uint32_t addr = STREAM_REG_ADDR(stream_id, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_REG_INDEX); - return *reinterpret_cast(addr); } // Writing to this register will leverage the built-in stream hardware which will automatically perform an atomic increment @@ -330,25 +329,25 @@ int32_t get_ptr_val(uint8_t stream_id) { // Additionally, these registers are accessible via eth_reg_write calls which can be used to write a value, // inline the eth command (without requiring source L1) template -void increment_local_update_ptr_val(int32_t val) { +FORCE_INLINE void increment_local_update_ptr_val(int32_t val) { NOC_STREAM_WRITE_REG_FIELD(stream_id, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_REG_INDEX, REMOTE_DEST_BUF_WORDS_FREE_INC, val); } -void increment_local_update_ptr_val(uint8_t stream_id, int32_t val) { +FORCE_INLINE void increment_local_update_ptr_val(uint8_t stream_id, int32_t val) { NOC_STREAM_WRITE_REG_FIELD(stream_id, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_REG_INDEX, REMOTE_DEST_BUF_WORDS_FREE_INC, val); } template 
-void remote_update_ptr_val(int32_t val) { +FORCE_INLINE void remote_update_ptr_val(int32_t val) { constexpr uint32_t addr = STREAM_REG_ADDR(stream_id, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_REG_INDEX); internal_::eth_write_remote_reg_no_txq_check(DEFAULT_ETH_TXQ, addr, val << REMOTE_DEST_BUF_WORDS_FREE_INC); } -void remote_update_ptr_val(uint32_t stream_id, int32_t val) { +FORCE_INLINE void remote_update_ptr_val(uint32_t stream_id, int32_t val) { const uint32_t addr = STREAM_REG_ADDR(stream_id, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_REG_INDEX); internal_::eth_write_remote_reg_no_txq_check(DEFAULT_ETH_TXQ, addr, val << REMOTE_DEST_BUF_WORDS_FREE_INC); } template -void init_ptr_val(int32_t val) { +FORCE_INLINE void init_ptr_val(int32_t val) { NOC_STREAM_WRITE_REG(stream_id, STREAM_REMOTE_DEST_BUF_SIZE_REG_INDEX, val); } @@ -371,19 +370,19 @@ struct OutboundReceiverChannelPointers { tt::fabric::ChannelBufferPointer ack_ptr; tt::fabric::ChannelBufferPointer completion_ptr; - bool has_space_for_packet() const { + FORCE_INLINE bool has_space_for_packet() const { return completion_ptr.distance_behind(wrptr) < RECEIVER_NUM_BUFFERS; } - bool has_unacknowledged_eth_packets() const { + FORCE_INLINE bool has_unacknowledged_eth_packets() const { return ack_ptr.get_ptr() != wrptr.get_ptr(); } - bool has_incomplete_eth_packets() const { + FORCE_INLINE bool has_incomplete_eth_packets() const { return completion_ptr.get_ptr() != wrptr.get_ptr(); } - bool has_unacknowledged_or_incomplete_eth_packets() const { + FORCE_INLINE bool has_unacknowledged_or_incomplete_eth_packets() const { return has_incomplete_eth_packets() || has_unacknowledged_eth_packets(); } }; @@ -486,20 +485,9 @@ static constexpr size_t worker_info_offset_past_connection_semaphore = 32; // SENDER SIDE HELPERS ///////////////////////////////////////////// -template -void send_channel_sync( - tt::fabric::EthChannelBuffer &sender_buffer_channel, - tt::fabric::ChannelBufferPointer &sender_wrptr, - tt::fabric::EthChannelBuffer &receiver_buffer_channel, - tt::fabric::ChannelBufferPointer &remote_receiver_wrptr - ) { - auto src_addr = sender_buffer_channel.get_bytes_sent_address(sender_wrptr.get_buffer_index()); - auto dest_addr = receiver_buffer_channel.get_bytes_sent_address(remote_receiver_wrptr.get_buffer_index()); - internal_::eth_send_packet_bytes_unsafe(DEFAULT_ETH_TXQ, src_addr, dest_addr, sizeof(eth_channel_sync_t)); -} template -void send_next_data( +FORCE_INLINE void send_next_data( tt::fabric::EthChannelBuffer &sender_buffer_channel, tt::fabric::EdmChannelWorkerInterface &sender_worker_interface, OutboundReceiverChannelPointers &outbound_to_receiver_channel_pointers, @@ -550,7 +538,7 @@ void send_next_data( * MUST CHECK !is_eth_txq_busy() before calling */ template -void receiver_send_received_ack( +FORCE_INLINE void receiver_send_received_ack( std::array, NUM_SENDER_CHANNELS> &remote_eth_sender_ackptrs, std::array, NUM_SENDER_CHANNELS> &remote_sender_channels, // currently the pointer is working multiple jobs (ack, completion, read) because we haven't implemented the @@ -595,7 +583,7 @@ FORCE_INLINE bool can_forward_packet_completely( } // !!!WARNING!!! 
- MAKE SURE CONSUMER HAS SPACE BEFORE CALLING -void receiver_forward_packet( +FORCE_INLINE void receiver_forward_packet( // TODO: have a separate cached copy of the packet header to save some additional L1 loads volatile tt::fabric::PacketHeader *packet_start, tt::fabric::RoutingFields cached_routing_fields, @@ -663,22 +651,30 @@ FORCE_INLINE bool run_sender_channel_step( outbound_to_receiver_channel_pointers.completion_ptr.increment_n(completions_since_last_check); sender_rdptr.increment_n(completions_since_last_check); increment_local_update_ptr_val(to_sender_packets_completed_streams[sender_channel_index], -completions_since_last_check); + if constexpr (!enable_first_level_ack) { + if (channel_connection_established) { + local_sender_channel_worker_interface.update_worker_copy_of_read_ptr(sender_rdptr.get_ptr()); + } + } } // Process ACKs from receiver // ACKs are processed second to avoid any sort of races. If we process acks second, // we are guaranteed to see equal to or greater the number of acks than completions - auto acks_since_last_check = get_ptr_val(to_sender_packets_acked_streams[sender_channel_index]); - - auto& sender_ackptr = local_sender_channel_worker_interface.local_ackptr; - if (acks_since_last_check > 0) { - sender_ackptr.increment_n(acks_since_last_check); - if (channel_connection_established) { - local_sender_channel_worker_interface.update_worker_copy_of_read_ptr(); + if constexpr (enable_first_level_ack) { + auto acks_since_last_check = get_ptr_val(to_sender_packets_acked_streams[sender_channel_index]); + auto& sender_ackptr = local_sender_channel_worker_interface.local_ackptr; + if (acks_since_last_check > 0) { + sender_ackptr.increment_n(acks_since_last_check); + if (channel_connection_established) { + local_sender_channel_worker_interface.update_worker_copy_of_read_ptr(sender_ackptr.get_ptr()); + } + increment_local_update_ptr_val(to_sender_packets_acked_streams[sender_channel_index], -acks_since_last_check); } - increment_local_update_ptr_val(to_sender_packets_acked_streams[sender_channel_index], -acks_since_last_check); + did_something = did_something || (completions_since_last_check + acks_since_last_check) > 0; + } else { + did_something = did_something || (completions_since_last_check > 0); } - did_something = did_something || (completions_since_last_check + acks_since_last_check) > 0; if (!channel_connection_established) { @@ -698,7 +694,11 @@ FORCE_INLINE bool run_sender_channel_step( } did_something = true; channel_connection_established = true; - local_sender_channel_worker_interface.update_worker_copy_of_read_ptr(); + if constexpr (enable_first_level_ack) { + local_sender_channel_worker_interface.update_worker_copy_of_read_ptr(local_sender_channel_worker_interface.local_ackptr.get_ptr()); + } else { + local_sender_channel_worker_interface.update_worker_copy_of_read_ptr(local_sender_channel_worker_interface.local_rdptr.get_ptr()); + } } } else if (local_sender_channel_worker_interface.has_worker_teardown_request()) { did_something = true; @@ -725,23 +725,27 @@ FORCE_INLINE void run_receiver_channel_step( auto &ack_ptr = receiver_channel_pointers.ack_ptr; auto pkts_received_since_last_check = get_ptr_val(); bool pkts_received = pkts_received_since_last_check > 0; - bool can_send_over_eth = !internal_::eth_txq_is_busy(DEFAULT_ETH_TXQ); - ASSERT(receiver_channel_pointers.completion_ptr.distance_behind(ack_ptr) < RECEIVER_NUM_BUFFERS); - if (pkts_received && can_send_over_eth) { - // currently only support processing one packet at a time, so we only 
decrement by 1 - increment_local_update_ptr_val(-1); - receiver_send_received_ack( - remote_eth_sender_wrptrs, - remote_sender_channnels, - ack_ptr, - local_receiver_channel); - ack_ptr.increment(); + if constexpr (enable_first_level_ack) { + bool can_send_over_eth = !internal_::eth_txq_is_busy(DEFAULT_ETH_TXQ); + ASSERT(receiver_channel_pointers.completion_ptr.distance_behind(ack_ptr) < RECEIVER_NUM_BUFFERS); + if (pkts_received && can_send_over_eth) { + // currently only support processing one packet at a time, so we only decrement by 1 + increment_local_update_ptr_val(-1); + receiver_send_received_ack( + remote_eth_sender_wrptrs, + remote_sender_channnels, + ack_ptr, + local_receiver_channel); + ack_ptr.increment(); + } + } else { + increment_local_update_ptr_val(-pkts_received_since_last_check); + ack_ptr.increment_n(pkts_received_since_last_check); } auto &wr_sent_ptr = receiver_channel_pointers.wr_sent_ptr; bool unwritten_packets = !wr_sent_ptr.is_caught_up_to(ack_ptr); if (unwritten_packets) { - DeviceZoneScopedN("EDMR-Send-Chk"); auto receiver_buffer_index = wr_sent_ptr.get_buffer_index(); volatile auto packet_header = local_receiver_channel.get_packet_header(receiver_buffer_index); @@ -751,37 +755,57 @@ FORCE_INLINE void run_receiver_channel_step( can_forward_packet_completely(packet_header, cached_routing_fields, downstream_edm_interface); bool trid_flushed = receiver_channel_trid_tracker.transaction_flushed(receiver_buffer_index); if (can_send_to_all_local_chip_receivers && trid_flushed) { - DeviceZoneScopedN("EDMR-Send-Impl"); + // DeviceZoneScopedN("EDMR-Send-Impl"); uint8_t trid = receiver_channel_trid_tracker.update_buffer_slot_to_next_trid_and_advance_trid_counter(receiver_buffer_index); receiver_forward_packet(packet_header, cached_routing_fields, downstream_edm_interface, trid); wr_sent_ptr.increment(); } } - auto &wr_flush_ptr = receiver_channel_pointers.wr_flush_ptr; - bool unflushed_writes = !wr_flush_ptr.is_caught_up_to(wr_sent_ptr); - if (unflushed_writes) { - auto receiver_buffer_index = wr_flush_ptr.get_buffer_index(); - bool next_trid_flushed = receiver_channel_trid_tracker.transaction_flushed(receiver_buffer_index); - if (next_trid_flushed) { - local_receiver_channel.eth_clear_sender_channel_ack(receiver_buffer_index); - wr_flush_ptr.increment(); - receiver_channel_trid_tracker.clear_trid_at_buffer_slot(receiver_buffer_index); + if constexpr (!fuse_receiver_flush_and_completion_ptr) { + auto &wr_flush_ptr = receiver_channel_pointers.wr_flush_ptr; + bool unflushed_writes = !wr_flush_ptr.is_caught_up_to(wr_sent_ptr); + if (unflushed_writes) { + auto receiver_buffer_index = wr_flush_ptr.get_buffer_index(); + bool next_trid_flushed = receiver_channel_trid_tracker.transaction_flushed(receiver_buffer_index); + if (next_trid_flushed) { + wr_flush_ptr.increment(); + receiver_channel_trid_tracker.clear_trid_at_buffer_slot(receiver_buffer_index); + } } - } - auto &completion_ptr = receiver_channel_pointers.completion_ptr; - bool unsent_completions = !completion_ptr.is_caught_up_to(wr_flush_ptr); - if (unsent_completions) { - bool can_send_without_blocking = !internal_::eth_txq_is_busy(DEFAULT_ETH_TXQ); - if (can_send_without_blocking) { - // completion ptr incremented in callee - receiver_send_completion_ack( - remote_eth_sender_wrptrs, - remote_sender_channnels, - completion_ptr, - local_receiver_channel); + auto &completion_ptr = receiver_channel_pointers.completion_ptr; + bool unsent_completions = !completion_ptr.is_caught_up_to(wr_flush_ptr); + if (unsent_completions) 
{ + bool can_send_without_blocking = !internal_::eth_txq_is_busy(DEFAULT_ETH_TXQ); + if (can_send_without_blocking) { + // completion ptr incremented in callee + receiver_send_completion_ack( + remote_eth_sender_wrptrs, + remote_sender_channnels, + completion_ptr, + local_receiver_channel); + } + } + } else { + auto &wr_flush_ptr = receiver_channel_pointers.wr_flush_ptr; + // Currently unclear if it's better to loop here or not... Also unclear if merging these + // two pointers is better or not... Seems to be maybe 5-10% better merged but need more data + if (!wr_flush_ptr.is_caught_up_to(wr_sent_ptr) && !internal_::eth_txq_is_busy(DEFAULT_ETH_TXQ)) { + auto receiver_buffer_index = wr_flush_ptr.get_buffer_index(); + bool next_trid_flushed = receiver_channel_trid_tracker.transaction_flushed(receiver_buffer_index); + if (next_trid_flushed) { + auto &completion_ptr = receiver_channel_pointers.completion_ptr; + wr_flush_ptr.increment(); + receiver_channel_trid_tracker.clear_trid_at_buffer_slot(receiver_buffer_index); + receiver_send_completion_ack( + remote_eth_sender_wrptrs, + remote_sender_channnels, + completion_ptr, + local_receiver_channel); + } } + } }; @@ -976,7 +1000,7 @@ void kernel_main() { static constexpr size_t sender_channel_0_counters_address = get_compile_time_arg_val(18); static constexpr size_t sender_channel_1_counters_address = get_compile_time_arg_val(19); - static constexpr bool enable_packet_header_recording = get_compile_time_arg_val(20) != 0; + static constexpr bool enable_packet_header_recording = false; //get_compile_time_arg_val(20) != 0; static constexpr size_t receiver_completed_packet_header_cb_address = get_compile_time_arg_val(21); static constexpr size_t receiver_completed_packet_header_cb_size_headers = get_compile_time_arg_val(22); static constexpr size_t sender_0_completed_packet_header_cb_address = get_compile_time_arg_val(23); diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp index 9c6ae1924bc..2285a6c42cb 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp @@ -178,7 +178,7 @@ class EthChannelBuffer final { // &channel_sync-> |----------------| // | channel_sync | // ------------------ - EthChannelBuffer() : buffer_size_in_bytes(0), eth_transaction_ack_word_addr(0), max_eth_payload_size_in_bytes(0) {} + EthChannelBuffer() : buffer_size_in_bytes(0), max_eth_payload_size_in_bytes(0) {} /* * Expected that *buffer_index_ptr is initialized outside of this object @@ -191,30 +191,11 @@ class EthChannelBuffer final { // that can fit 2 eth_channel_syncs cfor ack uint8_t channel_id) : buffer_size_in_bytes(buffer_size_bytes), - eth_transaction_ack_word_addr(eth_transaction_ack_word_addr), max_eth_payload_size_in_bytes(buffer_size_in_bytes + sizeof(eth_channel_sync_t)), channel_id(channel_id) { for (uint8_t i = 0; i < NUM_BUFFERS; i++) { this->buffer_addresses[i] = channel_base_address + i * this->max_eth_payload_size_in_bytes; - - uint32_t channel_sync_addr = this->buffer_addresses[i] + buffer_size_in_bytes; - auto channel_sync_ptr = reinterpret_cast(channel_sync_addr); - - channel_bytes_sent_addresses[i] = - reinterpret_cast(&(channel_sync_ptr->bytes_sent)); - channel_bytes_acked_addresses[i] = - reinterpret_cast(&(channel_sync_ptr->receiver_ack)); - channel_src_id_addresses[i] = 
reinterpret_cast(&(channel_sync_ptr->src_id)); - - ASSERT((uint32_t)channel_bytes_acked_addresses[i] != (uint32_t)(channel_bytes_sent_addresses[i])); - *(channel_bytes_sent_addresses[i]) = 0; - *(channel_bytes_acked_addresses[i]) = 0; - *(channel_src_id_addresses[i]) = 0x1c0ffee1; - (channel_src_id_addresses[i])[1] = 0x1c0ffee2; - - // Note we don't need to overwrite the `channel_src_id_addresses` except for perhapse - // debug purposes where we may wish to tag this with a special value } } @@ -229,22 +210,6 @@ class EthChannelBuffer final { [[nodiscard]] FORCE_INLINE size_t get_payload_size(BufferIndex const& buffer_index) const { return get_packet_header(buffer_index)->get_payload_size_including_header(); } - [[nodiscard]] FORCE_INLINE size_t get_payload_plus_channel_sync_size(BufferIndex const& buffer_index) const { - return get_packet_header(buffer_index)->get_payload_size_including_header() + sizeof(eth_channel_sync_t); - } - - [[nodiscard]] FORCE_INLINE volatile tt_l1_ptr size_t *get_bytes_sent_address(BufferIndex const& buffer_index) const { - return this->channel_bytes_sent_addresses[buffer_index]; - } - - [[nodiscard]] FORCE_INLINE volatile tt_l1_ptr size_t *get_bytes_acked_address(BufferIndex const& buffer_index) const { - return this->channel_bytes_acked_addresses[buffer_index]; - } - - [[nodiscard]] FORCE_INLINE volatile tt_l1_ptr size_t *get_src_id_address(BufferIndex const& buffer_index) const { - return this->channel_src_id_addresses[buffer_index]; - } - [[nodiscard]] FORCE_INLINE size_t get_channel_buffer_max_size_in_bytes(BufferIndex const& buffer_index) const { return this->buffer_size_in_bytes; } @@ -256,33 +221,10 @@ class EthChannelBuffer final { [[nodiscard]] FORCE_INLINE size_t get_id() const { return this->channel_id; } - [[nodiscard]] FORCE_INLINE bool eth_is_receiver_channel_send_done(BufferIndex const& buffer_index) const { - return *(this->get_bytes_sent_address(buffer_index)) == 0; - } - [[nodiscard]] FORCE_INLINE bool eth_bytes_are_available_on_channel(BufferIndex const& buffer_index) const { - return *(this->get_bytes_sent_address(buffer_index)) != 0; - } - [[nodiscard]] FORCE_INLINE bool eth_is_receiver_channel_send_acked(BufferIndex const& buffer_index) const { - return *(this->get_bytes_acked_address(buffer_index)) != 0; - } - FORCE_INLINE void eth_clear_sender_channel_ack(BufferIndex const& buffer_index) const { - *(this->channel_bytes_acked_addresses[buffer_index]) = 0; - } [[nodiscard]] FORCE_INLINE bool eth_is_acked_or_completed(BufferIndex const& buffer_index) const { return eth_is_receiver_channel_send_acked(buffer_index) || eth_is_receiver_channel_send_done(buffer_index); } - [[nodiscard]] FORCE_INLINE size_t get_eth_transaction_ack_word_addr() const { - return this->eth_transaction_ack_word_addr; - } - - [[nodiscard]] FORCE_INLINE bool all_buffers_drained() const { - bool drained = true; - for (size_t i = 0; i < NUM_BUFFERS && drained; i++) { - drained &= *(channel_bytes_sent_addresses[i]) == 0; - } - return drained; - } FORCE_INLINE bool needs_to_send_channel_sync() const { return this->need_to_send_channel_sync; @@ -299,14 +241,10 @@ class EthChannelBuffer final { private: std::array buffer_addresses; - std::array channel_bytes_sent_addresses; - std::array channel_bytes_acked_addresses; - std::array channel_src_id_addresses; // header + payload regions only const std::size_t buffer_size_in_bytes; // Includes header + payload + channel_sync - const std::size_t eth_transaction_ack_word_addr; const std::size_t max_eth_payload_size_in_bytes; uint8_t 
channel_id; }; @@ -357,11 +295,11 @@ struct EdmChannelWorkerInterface { return worker_location_info_ptr->worker_semaphore_address; } - FORCE_INLINE void update_worker_copy_of_read_ptr() { + FORCE_INLINE void update_worker_copy_of_read_ptr(BufferPtr new_ptr_val) { auto const &worker_info = *worker_location_info_ptr; uint64_t worker_semaphore_address = get_noc_addr( (uint32_t)worker_info.worker_xy.x, (uint32_t)worker_info.worker_xy.y, worker_info.worker_semaphore_address); - noc_inline_dw_write(worker_semaphore_address, local_ackptr.get_ptr()); + noc_inline_dw_write(worker_semaphore_address, new_ptr_val); } // Connection management methods
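One detail worth keeping in mind when reading update_worker_copy_of_read_ptr and the pointer updates around it: a BufferPtr counts modulo 2*NUM_BUFFERS, so a worker can distinguish a full channel from an empty one using only the pointer the EDM publishes and its own write pointer. The following is a minimal standalone sketch of that arithmetic under an assumed power-of-two depth; the depth of 8 and the variable names are illustrative, not the kernel's actual configuration.

#include <cstdint>
#include <cstdio>

// Pointers advance modulo 2*NUM_BUFFERS so that "full" (distance == NUM_BUFFERS)
// and "empty" (distance == 0) remain distinguishable without an extra flag.
constexpr uint32_t NUM_BUFFERS = 8; // illustrative channel depth (power of two)
constexpr uint32_t PTR_WRAP = 2 * NUM_BUFFERS;
constexpr uint32_t PTR_WRAP_MASK = PTR_WRAP - 1;

// Distance of `trailing` behind `leading`; the power-of-two path of
// distance_behind() reduces to a masked subtract, with no branch.
uint32_t distance_behind(uint32_t trailing, uint32_t leading) {
    return (leading - trailing) & PTR_WRAP_MASK;
}

int main() {
    uint32_t worker_wrptr = 11;   // worker's next slot to claim
    uint32_t published_rdptr = 5; // value the EDM pushed via noc_inline_dw_write
    uint32_t in_flight = distance_behind(published_rdptr, worker_wrptr);
    std::printf("in flight: %u, has space: %d\n", in_flight, in_flight < NUM_BUFFERS);
    // prints "in flight: 6, has space: 1"
    return 0;
}

Whether the EDM publishes local_ackptr (with enable_first_level_ack) or local_rdptr (completions only) then determines how early the worker sees slots freed, which is the trade-off the new new_ptr_val parameter exposes.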