diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp index cd142bef8fdf..2a41846b929f 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp @@ -140,8 +140,8 @@ void kernel_main() { noc_async_write(source_l1_buffer_address, dest_addr, packet_payload_size_bytes); if (fabric_connection.has_forward_connection()) { DeviceZoneScopedN("WR-FWD"); - mcast_fwd_packet_header->to_noc_unicast_write(NocUnicastCommandHeader{ - noc0_dest_addr, packet_payload_size_bytes + sizeof(tt::fabric::PacketHeader)}); + mcast_fwd_packet_header->to_noc_unicast_write( + NocUnicastCommandHeader{noc0_dest_addr}, packet_payload_size_bytes); { DeviceZoneScopedN("WR-FWD-WAIT"); fabric_connection.get_forward_connection().wait_for_empty_write_slot(); @@ -155,8 +155,8 @@ void kernel_main() { if (fabric_connection.has_backward_connection()) { DeviceZoneScopedN("WR-BWD"); - mcast_bwd_packet_header->to_noc_unicast_write(NocUnicastCommandHeader{ - noc0_dest_addr, packet_payload_size_bytes + sizeof(tt::fabric::PacketHeader)}); + mcast_bwd_packet_header->to_noc_unicast_write( + NocUnicastCommandHeader{noc0_dest_addr}, packet_payload_size_bytes); { DeviceZoneScopedN("WR-BWD-WAIT"); fabric_connection.get_backward_connection().wait_for_empty_write_slot(); @@ -179,7 +179,7 @@ void kernel_main() { DeviceZoneScopedN("UNICAST-WRITE"); auto& fabric_conn = unicast_is_fwd ? fabric_connection.get_forward_connection() : fabric_connection.get_backward_connection(); - unicast_packet_header->to_noc_unicast_write(NocUnicastCommandHeader{noc0_dest_addr, packet_payload_size_bytes}); + unicast_packet_header->to_noc_unicast_write(NocUnicastCommandHeader{noc0_dest_addr}, packet_payload_size_bytes); fabric_conn.wait_for_empty_write_slot(); fabric_conn.send_payload_without_header_non_blocking_from_address( source_l1_buffer_address, packet_payload_size_bytes); diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp index d0b384fc55f7..98895a2c1de1 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp @@ -122,20 +122,18 @@ void kernel_main() { // bit of a hack to extract X/Y const auto dest_noc_address = get_noc_addr(p, dest_addr_gen, 0, NORMALIZED_NOC_INDEX); - const size_t packet_size = page_size + sizeof(tt::fabric::PacketHeader); + const size_t packet_size = page_size; auto packet_addr = get_read_ptr(cb_id_in0); auto& packet_header = *reinterpret_cast(packet_addr); if constexpr (mcast_mode) { packet_header .to_chip_multicast(tt::fabric::MulticastRoutingCommandHeader{config.mcast.distance, config.mcast.range}) - .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{ - dest_noc_address, (pages_to_send * page_size) + sizeof(tt::fabric::PacketHeader)}); - packet_header.reserved2 = 0x1111; // debug only + .to_noc_unicast_write( + tt::fabric::NocUnicastCommandHeader{dest_noc_address}, (pages_to_send * page_size)); } else { packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{config.unicast.distance}) - .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{ - dest_noc_address, (pages_to_send * page_size) + sizeof(tt::fabric::PacketHeader)}); - packet_header.reserved2 = 0x1111; // debug only + .to_noc_unicast_write( + tt::fabric::NocUnicastCommandHeader{dest_noc_address}, (pages_to_send * page_size)); } sender.send_payload_blocking_from_address(packet_addr, packet_size); diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp index 98a607669229..704b516c48c2 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp @@ -59,12 +59,10 @@ auto forward_to_fabric_from_cb( if constexpr (mcast_mode) { packet_header .to_chip_multicast(tt::fabric::MulticastRoutingCommandHeader{config.mcast.distance, config.mcast.range}) - .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{ - noc0_dest_address, (pages_to_send * page_size) + sizeof(tt::fabric::PacketHeader)}); + .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_address}, (pages_to_send * page_size)); } else { packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{config.unicast.distance}) - .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{ - noc0_dest_address, (pages_to_send * page_size) + sizeof(tt::fabric::PacketHeader)}); + .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_address}, (pages_to_send * page_size)); } uint64_t buffer_address = sender.edm_buffer_addr + (*sender.buffer_index_ptr * (sender.buffer_size_bytes + sizeof(eth_channel_sync_t))); diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/test_kernels.common.hpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/test_kernels.common.hpp index cae2798e8933..4ef80a5f68f9 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/test_kernels.common.hpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/test_kernels.common.hpp @@ -33,8 +33,9 @@ bool terminate_fabric_endpoints_farthest_to_nearest ( reinterpret_cast(a_packet_header_addr)[sizeof(tt::fabric::PacketHeader) >> 2] = tt::fabric::TerminationSignal::GRACEFULLY_TERMINATE; sender.wait_for_empty_write_slot(); packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{static_cast(distance)}) - .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{ - termination_sig_noc_addr, sizeof(tt::fabric::PacketHeader) + sizeof(uint32_t)}); + .to_noc_unicast_write( + tt::fabric::NocUnicastCommandHeader{termination_sig_noc_addr}, + sizeof(tt::fabric::PacketHeader) + sizeof(uint32_t)); sender.send_payload_blocking_from_address(a_packet_header_addr, packet_header.get_payload_size_including_header()); noc_async_writes_flushed(); } diff --git a/tests/ttnn/unit_tests/operations/ccl/test_new_all_gather.py b/tests/ttnn/unit_tests/operations/ccl/test_new_all_gather.py index 08d359325c2d..a9816aefdcb9 100644 --- a/tests/ttnn/unit_tests/operations/ccl/test_new_all_gather.py +++ b/tests/ttnn/unit_tests/operations/ccl/test_new_all_gather.py @@ -464,6 +464,17 @@ def test_all_gather( None, ttnn.TensorMemoryLayout.HEIGHT_SHARDED, ), + ( + 4, + [1, 4, 32, 1280], + 3, + ttnn.TILE_LAYOUT, + (32, 128), + ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(1, 4))}), + None, + None, + ttnn.TensorMemoryLayout.HEIGHT_SHARDED, + ), ], ) @pytest.mark.parametrize("num_links", [1]) diff --git a/tt_metal/hw/inc/ethernet/tunneling.h b/tt_metal/hw/inc/ethernet/tunneling.h index 37d1422d2f62..a4070cbb24b7 100644 --- a/tt_metal/hw/inc/ethernet/tunneling.h +++ b/tt_metal/hw/inc/ethernet/tunneling.h @@ -96,6 +96,12 @@ void eth_write_remote_reg(uint32_t q_num, uint32_t reg_addr, uint32_t val) { eth_txq_reg_write(q_num, ETH_TXQ_REMOTE_REG_DATA, val); eth_txq_reg_write(q_num, ETH_TXQ_CMD, ETH_TXQ_CMD_START_REG); } +FORCE_INLINE +void eth_write_remote_reg_no_txq_check(uint32_t q_num, uint32_t reg_addr, uint32_t val) { + eth_txq_reg_write(q_num, ETH_TXQ_DEST_ADDR, reg_addr); + eth_txq_reg_write(q_num, ETH_TXQ_REMOTE_REG_DATA, val); + eth_txq_reg_write(q_num, ETH_TXQ_CMD, ETH_TXQ_CMD_START_REG); +} void check_and_context_switch() { uint32_t start_time = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); diff --git a/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/kernel_writers.hpp b/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/kernel_writers.hpp index b69b5caaad21..0df726fd3327 100644 --- a/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/kernel_writers.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/kernel_writers.hpp @@ -33,8 +33,7 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write( pkt_hdr->reserved2 = my_chip_id; #endif - size_t packet_send_size_bytes = payload_size_bytes + sizeof(tt::fabric::PacketHeader); - pkt_hdr->to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_noc_addr, packet_send_size_bytes}); + pkt_hdr->to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_noc_addr}, payload_size_bytes); switch (current_cmd_header.dest_type) { case ttnn::ccl::cmd::CclCommandDestType::CHIP_UNICAST: { diff --git a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp index 4225247db41b..50c565ca9f6b 100644 --- a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp @@ -438,16 +438,14 @@ void try_advance_inline_write_or_atomic_inc(command_context_t& cmd_ctx) ASSERT(cmd_ctx.packet_header_buffer_addr != 0); auto* pkt_hdr = reinterpret_cast(cmd_ctx.packet_header_buffer_addr); -#ifdef DEBUG_PRINT_ENABLED - pkt_hdr->reserved2 = my_chip_id; -#endif + uint64_t dest_noc_addr_for_pkt = safe_get_noc_addr(dest_noc0_x, dest_noc0_y, dest_bank_addr, 0); if (cmd_ctx.current_cmd_header.code == ttnn::ccl::cmd::CclCommandCode::ATOMIC_INC) { pkt_hdr->to_noc_unicast_atomic_inc( tt::fabric::NocUnicastAtomicIncCommandHeader{dest_noc_addr_for_pkt, static_cast(value), 32}); } else { - pkt_hdr->to_noc_unicast_write( - tt::fabric::NocUnicastCommandHeader{dest_noc_addr_for_pkt, static_cast(value)}); + pkt_hdr->to_noc_unicast_inline_write( + tt::fabric::NocUnicastInlineWriteCommandHeader{dest_noc_addr_for_pkt, static_cast(value)}); } switch (cmd_ctx.current_cmd_header.dest_type) { @@ -563,13 +561,8 @@ void write_and_advance_local_read_address_for_fabric_write( const size_t payload_l1_address = l1_read_addr; auto pkt_hdr = reinterpret_cast(packet_header_buffer_addr); -#ifdef DEBUG_PRINT_ENABLED - pkt_hdr->reserved2 = my_chip_id; -#endif - size_t packet_send_size_bytes = payload_size_bytes + sizeof(tt::fabric::PacketHeader); - pkt_hdr->to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{ - noc0_dest_noc_addr, packet_send_size_bytes}); + pkt_hdr->to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_noc_addr}, payload_size_bytes); switch (current_cmd_header.dest_type) { case ttnn::ccl::cmd::CclCommandDestType::CHIP_UNICAST: { diff --git a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_utils.hpp b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_utils.hpp index 0f662c4bfd4b..904cd775a9a0 100644 --- a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_utils.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_utils.hpp @@ -118,9 +118,7 @@ void mcast_contig_pages_to_noc_address( pkt_hdr .to_chip_multicast( tt::fabric::MulticastRoutingCommandHeader{1, static_cast(forward_direction_num_hops)}) - .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{ - noc0_dest_addr, - packet_send_size_bytes}); + .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_addr}, packet_send_size_bytes); forward_fabric_sender.wait_for_empty_write_slot(); forward_fabric_sender.send_payload_flush_blocking_from_address(l1_read_addr, packet_send_size_bytes); } @@ -131,9 +129,7 @@ void mcast_contig_pages_to_noc_address( pkt_hdr .to_chip_multicast( tt::fabric::MulticastRoutingCommandHeader{1, static_cast(backward_direction_num_hops)}) - .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{ - noc0_dest_addr, - packet_send_size_bytes}); + .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_addr}, packet_send_size_bytes); backward_fabric_sender.wait_for_empty_write_slot(); backward_fabric_sender.send_payload_non_blocking_from_address(l1_read_addr, packet_send_size_bytes); } diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp index a4670cb781c9..3042d318fccb 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp @@ -19,13 +19,13 @@ enum TerminationSignal : uint32_t { IMMEDIATELY_TERMINATE = 2 }; - -// 2 bits +// 3 bits enum NocSendType : uint8_t { NOC_UNICAST_WRITE = 0, - NOC_MULTICAST_WRITE = 1, - NOC_UNICAST_ATOMIC_INC = 2, - NOC_MULTICAST_ATOMIC_INC = 3 + NOC_UNICAST_INLINE_WRITE = 1, + NOC_MULTICAST_WRITE = 2, + NOC_UNICAST_ATOMIC_INC = 3, + NOC_MULTICAST_ATOMIC_INC = 4 }; // How to send the payload across the cluster // 1 bit @@ -34,7 +34,6 @@ enum ChipSendType : uint8_t { CHIP_MULTICAST = 1, }; - struct UnicastRoutingCommandHeader { uint8_t distance_in_hops; }; @@ -52,11 +51,10 @@ static_assert(sizeof(RoutingFields) == sizeof(UnicastRoutingCommandHeader), "Rou struct NocUnicastCommandHeader { uint64_t noc_address; - uint32_t size; - // ignores header size - inline uint32_t get_payload_only_size() const { - return size; - } +}; +struct NocUnicastInlineWriteCommandHeader { + uint64_t noc_address; + uint32_t value; }; struct NocUnicastAtomicIncCommandHeader { NocUnicastAtomicIncCommandHeader(uint64_t noc_address, uint16_t val, uint16_t wrap) @@ -68,16 +66,10 @@ struct NocUnicastAtomicIncCommandHeader { }; struct NocMulticastCommandHeader { uint32_t address; - uint32_t size; uint8_t noc_x_start; uint8_t noc_y_start; uint8_t mcast_rect_size_x; uint8_t mcast_rect_size_y; - - // ignores header size - inline uint32_t get_payload_only_size() const { - return size; - } }; struct NocMulticastAtomicIncCommandHeader { uint32_t address; @@ -88,12 +80,14 @@ struct NocMulticastAtomicIncCommandHeader { uint8_t size_x; uint8_t size_y; }; -static_assert(sizeof(NocUnicastCommandHeader) == 16, "NocUnicastCommandHeader size is not 1 byte"); -static_assert(sizeof(NocMulticastCommandHeader) == 12, "NocMulticastCommandHeader size is not 1 byte"); +static_assert(sizeof(NocUnicastCommandHeader) == 8, "NocUnicastCommandHeader size is not 1 byte"); +static_assert(sizeof(NocMulticastCommandHeader) == 8, "NocMulticastCommandHeader size is not 1 byte"); +static_assert(sizeof(NocUnicastInlineWriteCommandHeader) == 16, "NocMulticastCommandHeader size is not 1 byte"); static_assert(sizeof(NocUnicastAtomicIncCommandHeader) == 16, "NocUnicastCommandHeader size is not 1 byte"); static_assert(sizeof(NocMulticastAtomicIncCommandHeader) == 12, "NocAtomicIncCommandHeader size is not 1 byte"); union NocCommandFields{ NocUnicastCommandHeader unicast_write; + NocUnicastInlineWriteCommandHeader unicast_inline_write; NocMulticastCommandHeader mcast_write; NocUnicastAtomicIncCommandHeader unicast_seminc; NocMulticastAtomicIncCommandHeader mcast_seminc; @@ -106,13 +100,12 @@ struct PacketHeader { // -> unicast_write, mcast_write, unicast_seminc, mcast_seminc // For now, kept it separate so I could do reads which would be handled differently // but for our purposes we shouldn't need read so we should be able to omit the support - NocSendType noc_send_type : 2; + NocSendType noc_send_type : 3; ChipSendType chip_send_type : 1; - uint8_t reserved : 1; uint8_t src_ch_id : 4; RoutingFields routing_fields; - uint16_t reserved2; // can be tagged with src device for debug + uint16_t payload_size_bytes; // excludes header size NocCommandFields command_fields; // size = 16B due to uint64_t alignment // Sort of hack to work-around DRAM read alignment issues that must be 32B aligned @@ -131,23 +124,9 @@ struct PacketHeader { inline void set_routing_fields(RoutingFields &fields) { this->routing_fields = fields; } inline void set_command_fields(NocCommandFields &fields) { this->command_fields = fields; } + // Returns size of payload in bytes - TODO: convert to words (4B) size_t get_payload_size_excluding_header() volatile const { - switch(this->noc_send_type) { - case NOC_UNICAST_WRITE: { - return this->command_fields.unicast_write.size - sizeof(PacketHeader); - } break; - case NOC_MULTICAST_WRITE: { - return this->command_fields.mcast_write.size - sizeof(PacketHeader); - } break; - case NOC_UNICAST_ATOMIC_INC: - case NOC_MULTICAST_ATOMIC_INC: - return 0; - default: - #if defined(KERNEL_BUILD) || defined(FW_BUILD) - ASSERT(false); - #endif - return 0; - }; + return this->payload_size_bytes; } inline size_t get_payload_size_including_header() volatile const { return get_payload_size_excluding_header() + sizeof(PacketHeader); @@ -164,26 +143,36 @@ struct PacketHeader { return *this; } - inline PacketHeader &to_noc_unicast_write(NocUnicastCommandHeader const &noc_unicast_command_header) { + inline PacketHeader &to_noc_unicast_write(NocUnicastCommandHeader const &noc_unicast_command_header, size_t payload_size_bytes) { this->noc_send_type = NOC_UNICAST_WRITE; this->command_fields.unicast_write = noc_unicast_command_header; + this->payload_size_bytes = payload_size_bytes; + return *this; + } + inline PacketHeader &to_noc_unicast_inline_write(NocUnicastInlineWriteCommandHeader const &noc_unicast_command_header) { + this->noc_send_type = NOC_UNICAST_INLINE_WRITE; + this->command_fields.unicast_inline_write = noc_unicast_command_header; + this->payload_size_bytes = 0; return *this; } - inline PacketHeader &to_noc_multicast_write(NocMulticastCommandHeader const &noc_multicast_command_header) { + inline PacketHeader &to_noc_multicast_write(NocMulticastCommandHeader const &noc_multicast_command_header, size_t payload_size_bytes) { this->noc_send_type = NOC_MULTICAST_WRITE; this->command_fields.mcast_write = noc_multicast_command_header; + this->payload_size_bytes = payload_size_bytes; return *this; } inline PacketHeader &to_noc_unicast_atomic_inc(NocUnicastAtomicIncCommandHeader const &noc_unicast_atomic_inc_command_header) { this->noc_send_type = NOC_UNICAST_ATOMIC_INC; this->command_fields.unicast_seminc = noc_unicast_atomic_inc_command_header; + this->payload_size_bytes = 0; return *this; } - inline PacketHeader &to_noc_multicast_atomic_inc(NocMulticastAtomicIncCommandHeader const &noc_multicast_command_header) { + inline PacketHeader &to_noc_multicast_atomic_inc(NocMulticastAtomicIncCommandHeader const &noc_multicast_command_header, size_t payload_size_bytes) { #if defined(KERNEL_BUILD) || defined(FW_BUILD) ASSERT(false); while (1) {}; #endif + this->payload_size_bytes = payload_size_bytes; return *this; } @@ -198,20 +187,27 @@ struct PacketHeader { this->routing_fields.chip_mcast.start_distance_in_hops = chip_multicast_command_header.start_distance_in_hops; return this; } - inline volatile PacketHeader *to_noc_unicast_write(NocUnicastCommandHeader const &noc_unicast_command_header) volatile { + inline volatile PacketHeader *to_noc_unicast_write(NocUnicastCommandHeader const &noc_unicast_command_header, size_t payload_size_bytes) volatile { this->noc_send_type = NOC_UNICAST_WRITE; this->command_fields.unicast_write.noc_address = noc_unicast_command_header.noc_address; - this->command_fields.unicast_write.size = noc_unicast_command_header.size; + this->payload_size_bytes = payload_size_bytes; return this; } - inline volatile PacketHeader *to_noc_multicast(NocMulticastCommandHeader const &noc_multicast_command_header) volatile { + inline volatile PacketHeader &to_noc_unicast_inline_write(NocUnicastInlineWriteCommandHeader const &noc_unicast_command_header) volatile { + this->noc_send_type = NOC_UNICAST_INLINE_WRITE; + this->command_fields.unicast_inline_write.noc_address = noc_unicast_command_header.noc_address; + this->command_fields.unicast_inline_write.value = noc_unicast_command_header.value; + this->payload_size_bytes = 0; + return *this; + } + inline volatile PacketHeader *to_noc_multicast(NocMulticastCommandHeader const &noc_multicast_command_header, size_t payload_size_bytes) volatile { this->noc_send_type = NOC_MULTICAST_WRITE; this->command_fields.mcast_write.mcast_rect_size_x = noc_multicast_command_header.mcast_rect_size_x; this->command_fields.mcast_write.mcast_rect_size_y = noc_multicast_command_header.mcast_rect_size_y; this->command_fields.mcast_write.noc_x_start = noc_multicast_command_header.noc_x_start; this->command_fields.mcast_write.noc_y_start = noc_multicast_command_header.noc_y_start; - this->command_fields.mcast_write.size = noc_multicast_command_header.size; + this->payload_size_bytes = payload_size_bytes; this->command_fields.mcast_write.address = noc_multicast_command_header.address; return this; @@ -222,11 +218,12 @@ struct PacketHeader { this->command_fields.unicast_seminc.noc_address = noc_unicast_atomic_inc_command_header.noc_address; this->command_fields.unicast_seminc.val = noc_unicast_atomic_inc_command_header.val; this->command_fields.unicast_seminc.wrap = noc_unicast_atomic_inc_command_header.wrap; + this->payload_size_bytes = 0; return this; } inline volatile PacketHeader *to_noc_multicast_atomic_inc( - NocMulticastAtomicIncCommandHeader const &noc_multicast_atomic_inc_command_header) volatile { + NocMulticastAtomicIncCommandHeader const &noc_multicast_atomic_inc_command_header, size_t payload_size_bytes) volatile { this->noc_send_type = NOC_MULTICAST_ATOMIC_INC; this->command_fields.mcast_seminc.address = noc_multicast_atomic_inc_command_header.address; this->command_fields.mcast_seminc.noc_x_start = noc_multicast_atomic_inc_command_header.noc_x_start; @@ -235,6 +232,7 @@ struct PacketHeader { this->command_fields.mcast_seminc.size_y = noc_multicast_atomic_inc_command_header.size_y; this->command_fields.mcast_seminc.val = noc_multicast_atomic_inc_command_header.val; this->command_fields.mcast_seminc.wrap = noc_multicast_atomic_inc_command_header.wrap; + this->payload_size_bytes = payload_size_bytes; return this; } diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp index ca21b5a4f9a9..598af8de1c58 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp @@ -25,6 +25,7 @@ void write_unicast_blocking(uint32_t local_address, uint64_t dest_address, uint3 } void print_pkt_hdr_routing_fields(volatile tt::fabric::PacketHeader *const packet_start) { +#ifdef DEBUG_PRINT_ENABLED switch (packet_start->chip_send_type) { case tt::fabric::CHIP_UNICAST: { DPRINT << "C_UNI: dist:" << (uint32_t) packet_start->routing_fields.chip_unicast.distance_in_hops << "\n"; @@ -36,13 +37,14 @@ void print_pkt_hdr_routing_fields(volatile tt::fabric::PacketHeader *const packe break; } }; +#endif } void print_pkt_header_noc_fields(volatile tt::fabric::PacketHeader *const packet_start) { +#ifdef DEBUG_PRINT_ENABLED switch (packet_start->noc_send_type) { case tt::fabric::NocSendType::NOC_UNICAST_WRITE: { - DPRINT << "N_WR addr:"<<(uint64_t)packet_start->command_fields.unicast_write.noc_address << - ", size:" << (uint32_t) packet_start->command_fields.unicast_write.size << "\n"; + DPRINT << "N_WR addr:"<<(uint64_t)packet_start->command_fields.unicast_write.noc_address << "\n"; } break; case tt::fabric::NocSendType::NOC_UNICAST_ATOMIC_INC: { DPRINT << "N_WR addr:"<<(uint64_t)packet_start->command_fields.unicast_seminc.noc_address << @@ -53,15 +55,19 @@ void print_pkt_header_noc_fields(volatile tt::fabric::PacketHeader *const packet ASSERT(false); // unimplemented break; }; +#endif } void print_pkt_header(volatile tt::fabric::PacketHeader *const packet_start) { +#ifdef DEBUG_PRINT_ENABLED auto const& header = *packet_start; DPRINT << "PKT: nsnd_t:" << (uint32_t) packet_start->noc_send_type << ", csnd_t:" << (uint32_t) packet_start->chip_send_type << - ", src_chip:" << (uint32_t) packet_start->reserved2 << "\n"; + ", src_chip:" << (uint32_t) packet_start->src_ch_id << + ", payload_size_bytes:" << (uint32_t) packet_start->payload_size_bytes << "\n"; print_pkt_hdr_routing_fields(packet_start); print_pkt_header_noc_fields(packet_start); +#endif } @@ -71,11 +77,11 @@ void execute_chip_unicast_to_local_chip(volatile tt::fabric::PacketHeader *const uint32_t payload_start_address = reinterpret_cast(packet_start) + sizeof(tt::fabric::PacketHeader); tt::fabric::NocSendType noc_send_type = packet_start->noc_send_type; + auto const payload_size_bytes = header.payload_size_bytes; switch (noc_send_type) { case tt::fabric::NocSendType::NOC_UNICAST_WRITE: { auto const dest_address = header.command_fields.unicast_write.noc_address; - auto const size = header.command_fields.unicast_write.size - sizeof(tt::fabric::PacketHeader); - write_unicast_blocking(payload_start_address, dest_address, size); + write_unicast_blocking(payload_start_address, dest_address, payload_size_bytes); } break; @@ -88,8 +94,7 @@ void execute_chip_unicast_to_local_chip(volatile tt::fabric::PacketHeader *const header.command_fields.mcast_write.noc_y_start + header.command_fields.mcast_write.mcast_rect_size_y, header.command_fields.mcast_write.address); auto const num_dests = header.command_fields.mcast_write.mcast_rect_size_x * header.command_fields.mcast_write.mcast_rect_size_y; - auto const size = header.command_fields.mcast_write.size - sizeof(tt::fabric::PacketHeader); - noc_async_write_multicast_one_packet(payload_start_address, mcast_dest_address, size, num_dests); + noc_async_write_multicast_one_packet(payload_start_address, mcast_dest_address, payload_size_bytes, num_dests); noc_async_write_barrier(); } break; @@ -100,6 +105,12 @@ void execute_chip_unicast_to_local_chip(volatile tt::fabric::PacketHeader *const noc_semaphore_inc(dest_address, increment); } break; + case tt::fabric::NocSendType::NOC_UNICAST_INLINE_WRITE: { + auto const dest_address = header.command_fields.unicast_inline_write.noc_address; + auto const value = header.command_fields.unicast_inline_write.value; + noc_inline_dw_write(dest_address, value); + } break; + case tt::fabric::NocSendType::NOC_MULTICAST_ATOMIC_INC: default: { ASSERT(false); diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp index 4718c513e0c3..bd526ae8e9df 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp @@ -7,7 +7,7 @@ #include #include "dataflow_api.h" -#include "tt_metal/hw/inc/ethernet/dataflow_api.h" +#include "tt_metal/hw/inc/ethernet/tunneling.h" #include "cpp/ttnn/operations/ccl/kernels/edm/edm_handshake.hpp" #include "cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_worker_adapters.hpp" #include "cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp" @@ -241,7 +241,7 @@ write to the same receiver channel. //////////////////////////////////////////////// // Data structures, types, enums, and constants //////////////////////////////////////////////// - +static constexpr uint32_t DEFAULT_ETH_TXQ = 0; // senders update this stream static constexpr uint32_t to_receiver_pkts_sent_id = 0; @@ -282,11 +282,11 @@ void increment_local_update_ptr_val(uint8_t stream_id, int32_t val) { template void remote_update_ptr_val(int32_t val) { constexpr uint32_t addr = STREAM_REG_ADDR(stream_id, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_REG_INDEX); - eth_write_remote_reg(addr, val << REMOTE_DEST_BUF_WORDS_FREE_INC); + internal_::eth_write_remote_reg_no_txq_check(DEFAULT_ETH_TXQ, addr, val << REMOTE_DEST_BUF_WORDS_FREE_INC); } void remote_update_ptr_val(uint32_t stream_id, int32_t val) { const uint32_t addr = STREAM_REG_ADDR(stream_id, STREAM_REMOTE_DEST_BUF_SPACE_AVAILABLE_UPDATE_REG_INDEX); - eth_write_remote_reg(addr, val << REMOTE_DEST_BUF_WORDS_FREE_INC); + internal_::eth_write_remote_reg_no_txq_check(DEFAULT_ETH_TXQ, addr, val << REMOTE_DEST_BUF_WORDS_FREE_INC); } template @@ -437,12 +437,7 @@ void send_channel_sync( ) { auto src_addr = sender_buffer_channel.get_bytes_sent_address(sender_wrptr.get_buffer_index()); auto dest_addr = receiver_buffer_channel.get_bytes_sent_address(remote_receiver_wrptr.get_buffer_index()); - eth_send_bytes_over_channel_payload_only_unsafe( - reinterpret_cast(src_addr), - reinterpret_cast(dest_addr), - sizeof(eth_channel_sync_t), - sizeof(eth_channel_sync_t), - sizeof(eth_channel_sync_t) >> ETH_BYTES_TO_WORDS_SHIFT); + internal_::eth_send_packet_bytes_unsafe(DEFAULT_ETH_TXQ, src_addr, dest_addr, sizeof(eth_channel_sync_t)); } template @@ -457,7 +452,7 @@ void send_next_data( auto &local_sender_wrptr = sender_worker_interface.local_wrptr; auto local_sender_wrptr_buffer_index = local_sender_wrptr.get_buffer_index(); - ASSERT(!eth_txq_is_busy()); + ASSERT(!internal_::eth_txq_is_busy(DEFAULT_ETH_TXQ)); // TODO: TUNING - experiment with only conditionally breaking the transfer up into multiple packets if we are // a certain threshold less than full packet @@ -468,25 +463,19 @@ void send_next_data( auto volatile *pkt_header = reinterpret_cast(sender_buffer_channel.get_buffer_address(local_sender_wrptr_buffer_index)); ASSERT(tt::fabric::is_valid(*const_cast(pkt_header))); - size_t payload_size = 0; - payload_size = pkt_header->get_payload_size_including_header(); + size_t payload_size_bytes = pkt_header->get_payload_size_including_header(); pkt_header->src_ch_id = sender_channel_index; auto src_addr = sender_buffer_channel.get_buffer_address(local_sender_wrptr_buffer_index); auto dest_addr = receiver_buffer_channel.get_buffer_address(remote_receiver_wrptr.get_buffer_index()); - eth_send_bytes_over_channel_payload_only_unsafe( - src_addr, - dest_addr, - payload_size, - payload_size, - payload_size >> ETH_BYTES_TO_WORDS_SHIFT); - + internal_::eth_send_packet_bytes_unsafe(DEFAULT_ETH_TXQ, src_addr, dest_addr, payload_size_bytes); // Note: We can only advance to the next buffer index if we have fully completed the send (both the payload and sync // messages) local_sender_wrptr.increment(); // update the remote reg static constexpr uint32_t words_to_forward = 1; + while (internal_::eth_txq_is_busy(DEFAULT_ETH_TXQ)) {}; remote_update_ptr_val(words_to_forward); remote_receiver_wrptr.increment(); } @@ -609,7 +598,7 @@ bool run_sender_channel_step( // when moving to stream regs to manage rd/wr ptrs // TODO: update to be stream reg based. Initialize to space available and simply check for non-zero bool receiver_has_space_for_packet = outbound_to_receiver_channel_pointers.has_space_for_packet(); - if (receiver_has_space_for_packet && !eth_txq_is_busy()) { + if (receiver_has_space_for_packet && !internal_::eth_txq_is_busy(DEFAULT_ETH_TXQ)) { bool has_unsent_packet = local_sender_channel_worker_interface.has_unsent_payload(); if (has_unsent_packet) { bool sender_backpressured_from_sender_side = !(local_sender_channel_worker_interface.local_rdptr.distance_behind(local_sender_channel_worker_interface.local_wrptr) < SENDER_NUM_BUFFERS); @@ -699,7 +688,7 @@ void run_receiver_channel_step( auto &ack_ptr = receiver_channel_pointers.ack_ptr; auto pkts_received_since_last_check = get_ptr_val(); bool pkts_received = pkts_received_since_last_check > 0; - bool can_send_over_eth = !eth_txq_is_busy(); + bool can_send_over_eth = !internal_::eth_txq_is_busy(DEFAULT_ETH_TXQ); ASSERT(receiver_channel_pointers.completion_ptr.distance_behind(ack_ptr) < RECEIVER_NUM_BUFFERS); if (pkts_received && can_send_over_eth) { // currently only support processing one packet at a time, so we only decrement by 1 @@ -741,7 +730,7 @@ void run_receiver_channel_step( auto &completion_ptr = receiver_channel_pointers.completion_ptr; bool unsent_completions = !completion_ptr.is_caught_up_to(wr_flush_ptr); if (unsent_completions) { - bool can_send_without_blocking = !eth_txq_is_busy(); + bool can_send_without_blocking = !internal_::eth_txq_is_busy(DEFAULT_ETH_TXQ); if (can_send_without_blocking) { // completion ptr incremented in callee receiver_send_completion_ack( diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/minimal_ccl_common.hpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/minimal_ccl_common.hpp index a281806cafcd..641e6cee244c 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/minimal_ccl_common.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/minimal_ccl_common.hpp @@ -20,11 +20,8 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write( const auto [dest_noc_xy, dest_addr] = get_noc_address_components(noc0_dest_noc_addr); const size_t payload_l1_address = l1_read_addr; - size_t packet_send_size_bytes = payload_size_bytes + sizeof(tt::fabric::PacketHeader); - pkt_hdr_forward->to_noc_unicast_write( - tt::fabric::NocUnicastCommandHeader{noc0_dest_noc_addr, packet_send_size_bytes}); - pkt_hdr_backward->to_noc_unicast_write( - tt::fabric::NocUnicastCommandHeader{noc0_dest_noc_addr, packet_send_size_bytes}); + pkt_hdr_forward->to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_noc_addr}, payload_size_bytes); + pkt_hdr_backward->to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_noc_addr}, payload_size_bytes); noc_async_write(payload_l1_address, safe_get_noc_addr(dest_noc_xy.x, dest_noc_xy.y, dest_addr), payload_size_bytes); if (fabric_connection.has_forward_connection()) {