From 5a14c57f56503f5b06e13b1d6850f98f7b67a3b9 Mon Sep 17 00:00:00 2001
From: Sean Nijjar
Date: Wed, 19 Feb 2025 02:37:54 +0000
Subject: [PATCH] WIP (hanging) - enable larger channel buffer slot

---
 .../fabric_worker_sender_multi_input.cpp | 2 +-
 ...erisc_data_mover_loopback_with_workers.cpp | 15 +-
 .../ccl/erisc_datamover_builder.cpp | 154 ++++++++++++++----
 .../ccl/erisc_datamover_builder.hpp | 4 +-
 .../edm_fabric/edm_fabric_worker_adapters.hpp | 4 +-
 .../edm_fabric/fabric_erisc_datamover.cpp | 46 ++++--
 .../fabric_erisc_datamover_channels.hpp | 2 +-
 7 files changed, 172 insertions(+), 55 deletions(-)

diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp
index eaa14a0e40f..19573ae87c3 100644
--- a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp
+++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp
@@ -65,7 +65,7 @@ auto forward_to_fabric_from_cb(
             .to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_address}, (pages_to_send * page_size));
     }
 
-    uint64_t buffer_address = sender.edm_buffer_addr + (*sender.buffer_index_ptr * (sender.buffer_size_bytes + sizeof(eth_channel_sync_t)));
+    uint64_t buffer_address = sender.edm_buffer_addr + (*sender.buffer_index_ptr * sender.buffer_size_bytes);
     sender.send_payload_blocking_from_address(packet_addr, packet_size);
     noc_async_writes_flushed();
     // }
diff --git a/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp b/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp
index 1ab121ffec7..4bd49c6fc86 100644
--- a/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp
+++ b/tests/ttnn/unit_tests/gtests/ccl/test_fabric_erisc_data_mover_loopback_with_workers.cpp
@@ -427,7 +427,8 @@ bool RunLoopbackTest(
     // EDM Builder Setup
     ////////////////////////////////////////////////////////////////////////////
 
-    static constexpr std::size_t edm_buffer_size = 4096 + PACKET_HEADER_SIZE_BYTES;
+    static constexpr std::size_t edm_buffer_size =
+        ttnn::ccl::FabricEriscDatamoverBuilder::default_packet_payload_size_bytes + PACKET_HEADER_SIZE_BYTES;
 
     auto chip0_worker_fabric_connection = chip_0_edm_builder.build_connection_to_worker_channel();
     ////////////////////////////////////////////////////////////////////////////
@@ -910,7 +911,8 @@ bool RunLineFabricTest(
     std::size_t page_plus_header_size = page_size + sizeof(tt::fabric::PacketHeader);
     std::size_t tensor_size_bytes = num_pages_total * page_size;
 
-    static constexpr std::size_t edm_buffer_size = 4096 + PACKET_HEADER_SIZE_BYTES;
+    static constexpr std::size_t edm_buffer_size =
+        ttnn::ccl::FabricEriscDatamoverBuilder::default_packet_payload_size_bytes + PACKET_HEADER_SIZE_BYTES;
     const size_t local_chip_id = 0;
     const size_t remote_chip_id = 1;
     auto program_ptrs = std::vector(devices.size());
@@ -1237,7 +1239,8 @@ int TestLoopbackEntrypoint(
     IDevice* sender_device = device_0;
     IDevice* receiver_device = device_1;
 
-    static constexpr std::size_t edm_buffer_size = 4096 + PACKET_HEADER_SIZE_BYTES;
+    static constexpr std::size_t edm_buffer_size =
+        ttnn::ccl::FabricEriscDatamoverBuilder::default_packet_payload_size_bytes + PACKET_HEADER_SIZE_BYTES;
     const chip_id_t local_chip_id = 0;
     const chip_id_t remote_chip_id = 1;
     auto const& edm_config = ttnn::ccl::FabricEriscDatamoverConfig(edm_buffer_size, 1, 2);
@@ -2988,7 +2991,8 @@ void 
RunWriteThroughputStabilityTestWithPersistentFabric( static constexpr uint32_t source_payload_cb_index = tt::CB::c_in1; static constexpr size_t packet_header_cb_size_in_headers = 4; static constexpr bool enable_persistent_fabric_mode = true; - static constexpr size_t packet_payload_size_bytes = 4096; + static constexpr size_t packet_payload_size_bytes = + ttnn::ccl::FabricEriscDatamoverBuilder::default_packet_payload_size_bytes; static constexpr size_t dest_buffer_size = packet_payload_size_bytes * 4; static constexpr tt::DataFormat cb_df = tt::DataFormat::Bfp8; @@ -3114,7 +3118,8 @@ void RunWriteThroughputStabilityTestWithPersistentFabric( TT_FATAL( local_device_fabric_handle.get_num_links() == num_links, - "Error in test setup. Expected two links between devices but got {} links for device {}", + "Error in test setup. Expected {} links between devices but got {} links for device {}", + num_links, local_device_fabric_handle.get_num_links(), device->id()); diff --git a/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.cpp b/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.cpp index 2f505f41586..56063965ffd 100644 --- a/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.cpp @@ -43,7 +43,7 @@ namespace ttnn::ccl { // FabricEriscDatamoverConfig::FabricEriscDatamoverConfig( - std::size_t channel_buffer_size_bytes, std::size_t sender_ratio_size, std::size_t receiver_ratio_size) { + std::size_t preferred_channel_buffer_size_bytes, std::size_t sender_ratio_size, std::size_t receiver_ratio_size) { TT_FATAL( (receiver_completed_packet_header_cb_address % eth_word_l1_alignment == 0), "receiver_completed_packet_header_cb_address must be aligned to 16 bytes"); @@ -73,44 +73,103 @@ FabricEriscDatamoverConfig::FabricEriscDatamoverConfig( "receiver_completed_packet_header_cb_address must be aligned to 16 bytes"); TT_FATAL(sender_channel_1_buffer_index_address != sender_channel_0_buffer_index_address, "FabricEriscDatamoverConfig was constructed with illegal buffer index address"); - const size_t min_buffer_size = sizeof(tt::fabric::PacketHeader) + 2 * FabricEriscDatamoverConfig::eth_channel_sync_size; - TT_FATAL(channel_buffer_size_bytes >= min_buffer_size, "FabricEriscDatamoverConfig was constructed with `channel_buffer_size_bytes` argument set smaller than minimum size of {}", min_buffer_size); - - constexpr size_t default_pow2_num_sender_buffer_slots = 8; - constexpr size_t default_pow2_num_receiver_buffer_slots = 16; - - const std::size_t channel_buffer_size_with_channel_sync = - channel_buffer_size_bytes + sizeof(tt::fabric::PacketHeader); // + 16 // sizeof(tt::fabric::PacketHeader); + const size_t min_buffer_size = sizeof(tt::fabric::PacketHeader) + FabricEriscDatamoverConfig::eth_channel_sync_size; + TT_FATAL( + preferred_channel_buffer_size_bytes >= min_buffer_size, + "FabricEriscDatamoverConfig was constructed with `preferred_channel_buffer_size_bytes` argument set smaller " + "than minimum size of {}", + min_buffer_size); - const size_t next_lowest_power_of_2_buffer_slot_count = + // constexpr size_t default_pow2_num_sender_buffer_slots = 8; + // constexpr size_t default_pow2_num_receiver_buffer_slots = 16; + // See if we can thread in the constant from HAL - this->channel_buffer_size_bytes = channel_buffer_size_bytes; - this->channel_buffer_size_bytes_with_channel_sync = channel_buffer_size_with_channel_sync; const std::size_t total_ratio_count = 2 * sender_ratio_size + receiver_ratio_size; - 
this->sender_0_channel_size_bytes = tt::round_down( - (available_channel_buffering_space / total_ratio_count) * sender_ratio_size, - channel_buffer_size_with_channel_sync); - if constexpr (FabricEriscDatamoverConfig::constrain_to_power_of_2_buffer_slot_counts) { - this->sender_0_num_buffers = default_pow2_num_sender_buffer_slots; - } else { - this->sender_0_num_buffers = this->sender_0_channel_size_bytes / channel_buffer_size_with_channel_sync; - } - this->sender_1_channel_size_bytes = tt::round_down( - (available_channel_buffering_space / total_ratio_count) * sender_ratio_size, - channel_buffer_size_with_channel_sync); - if constexpr (FabricEriscDatamoverConfig::constrain_to_power_of_2_buffer_slot_counts) { - this->sender_1_num_buffers = default_pow2_num_sender_buffer_slots; - } else { - this->sender_1_num_buffers = this->sender_1_channel_size_bytes / channel_buffer_size_with_channel_sync; - } - this->receiver_channel_size_bytes = tt::round_down( - (available_channel_buffering_space / total_ratio_count) * receiver_ratio_size, - channel_buffer_size_with_channel_sync); if constexpr (FabricEriscDatamoverConfig::constrain_to_power_of_2_buffer_slot_counts) { - this->receiver_num_buffers = default_pow2_num_receiver_buffer_slots; + constexpr size_t min_desired_packet_payload_size_bytes = 1088 * 4; + constexpr size_t max_packet_payload_size_bytes = 8192; + constexpr size_t min_desired_packet_payload_size_bytes_with_header = + min_desired_packet_payload_size_bytes + sizeof(tt::fabric::PacketHeader); + constexpr size_t max_packet_payload_size_bytes_with_header = + max_packet_payload_size_bytes + sizeof(tt::fabric::PacketHeader); + + auto round_down_to_power_of_2 = [](size_t x) { + TT_FATAL( + x > 0, + "Cannot compute next lowest power of 2 for 0. Internal error when setting up " + "FabricEriscDatamoverConfig"); + size_t next_power_of_2 = 1; + while (x >= next_power_of_2) { + next_power_of_2 <<= 1; + } + return next_power_of_2 >> 1; + }; + + const size_t sender_channel_max_size_bytes = + (this->available_channel_buffering_space / total_ratio_count) * sender_ratio_size; + const size_t receiver_channel_max_size_bytes = + (this->available_channel_buffering_space / total_ratio_count) * receiver_ratio_size; + + const size_t sender_channel_num_buffer_slots_non_pow2 = + sender_channel_max_size_bytes / min_desired_packet_payload_size_bytes_with_header; + const size_t receiver_channel_num_buffer_slots_non_pow2 = + receiver_channel_max_size_bytes / min_desired_packet_payload_size_bytes_with_header; + const size_t sender_channel_num_buffer_slots_pow2 = + round_down_to_power_of_2(sender_channel_num_buffer_slots_non_pow2); + const size_t receiver_channel_num_buffer_slots_pow2 = + round_down_to_power_of_2(receiver_channel_num_buffer_slots_non_pow2); + + this->sender_0_num_buffers = sender_channel_num_buffer_slots_pow2; + this->sender_1_num_buffers = sender_channel_num_buffer_slots_pow2; + this->receiver_num_buffers = receiver_channel_num_buffer_slots_pow2; + + this->sender_0_channel_size_bytes = sender_channel_max_size_bytes; + this->sender_1_channel_size_bytes = sender_channel_max_size_bytes; + this->receiver_channel_size_bytes = receiver_channel_max_size_bytes; + + const size_t sender_0_buffer_slot_size_bytes = tt::round_down( + sender_channel_max_size_bytes / this->sender_0_num_buffers, sizeof(tt::fabric::PacketHeader)); + const size_t sender_1_buffer_slot_size_bytes = tt::round_down( + sender_channel_max_size_bytes / this->sender_1_num_buffers, sizeof(tt::fabric::PacketHeader)); + const size_t 
receiver_buffer_slot_size_bytes = tt::round_down( + receiver_channel_max_size_bytes / this->receiver_num_buffers, sizeof(tt::fabric::PacketHeader)); + + this->channel_buffer_size_bytes = std::min( + {sender_0_buffer_slot_size_bytes, sender_1_buffer_slot_size_bytes, receiver_buffer_slot_size_bytes}); + this->sender_0_channel_size_bytes = this->channel_buffer_size_bytes * this->sender_0_num_buffers; + this->sender_1_channel_size_bytes = this->channel_buffer_size_bytes * this->sender_1_num_buffers; + this->receiver_channel_size_bytes = this->channel_buffer_size_bytes * this->receiver_num_buffers; + + TT_FATAL( + this->sender_0_num_buffers == this->sender_1_num_buffers, + "Implementation expects sender_0_num_buffers and sender_1_num_buffers to be the same for now"); + TT_FATAL( + this->sender_0_channel_size_bytes + this->sender_1_channel_size_bytes + this->receiver_channel_size_bytes <= + this->available_channel_buffering_space, + "Internal error - channel sizes exceed available space"); + TT_FATAL( + this->channel_buffer_size_bytes >= min_desired_packet_payload_size_bytes_with_header, + "Error - couldn't produce a channel buffer slot of minimal size {} when setting up " + "FabricEriscDatamoverConfig. This indicates a bug in internal logic", + min_desired_packet_payload_size_bytes_with_header); } else { - this->receiver_num_buffers = this->receiver_channel_size_bytes / channel_buffer_size_with_channel_sync; + this->channel_buffer_size_bytes = preferred_channel_buffer_size_bytes; + this->sender_0_channel_size_bytes = tt::round_down( + (available_channel_buffering_space / total_ratio_count) * sender_ratio_size, + this->channel_buffer_size_bytes); + this->sender_0_num_buffers = this->sender_0_channel_size_bytes / this->channel_buffer_size_bytes; + + this->sender_1_channel_size_bytes = tt::round_down( + (available_channel_buffering_space / total_ratio_count) * sender_ratio_size, + this->channel_buffer_size_bytes); + + this->sender_1_num_buffers = this->sender_1_channel_size_bytes / this->channel_buffer_size_bytes; + + this->receiver_channel_size_bytes = tt::round_down( + (available_channel_buffering_space / total_ratio_count) * receiver_ratio_size, + this->channel_buffer_size_bytes); + this->receiver_num_buffers = this->receiver_channel_size_bytes / this->channel_buffer_size_bytes; } this->sender_0_channel_base_address = buffer_region_start; @@ -123,10 +182,37 @@ FabricEriscDatamoverConfig::FabricEriscDatamoverConfig( static constexpr size_t total_num_channels = 3; // sender0, sender1, receiver const size_t max_channel_buffer_size = (available_channel_buffering_space / total_num_channels) - FabricEriscDatamoverConfig::eth_channel_sync_size - sizeof(tt::fabric::PacketHeader); - TT_FATAL(channel_buffer_size_bytes <= max_channel_buffer_size, "Specified size of `channel_buffer_size_bytes` was too large. Maximum allowable size is {} B", max_channel_buffer_size); + TT_FATAL( + this->channel_buffer_size_bytes <= max_channel_buffer_size, + "Specified size of `channel_buffer_size_bytes` was too large. 
Maximum allowable size is {} B", + max_channel_buffer_size); TT_FATAL(this->sender_0_channel_size_bytes > 0, "Internal error when computing `sender_0_channel_size_bytes` which was computed to be size 0"); TT_FATAL(this->sender_1_channel_size_bytes > 0, "Internal error when computing `sender_1_channel_size_bytes` which was computed to be size 0"); TT_FATAL(this->receiver_channel_size_bytes > 0, "Internal error when computing `receiver_channel_size_bytes` which was computed to be size 0"); + TT_FATAL( + this->receiver_channel_size_bytes % sizeof(tt::fabric::PacketHeader) == 0, + "Internal error - receiver_channel_size_bytes was computed to be not a multiple of " + "sizeof(tt::fabric::PacketHeader)"); + TT_FATAL( + this->sender_0_channel_size_bytes % sizeof(tt::fabric::PacketHeader) == 0, + "Internal error - sender_0_channel_size_bytes was computed to be not a multiple of " + "sizeof(tt::fabric::PacketHeader)"); + TT_FATAL( + this->sender_1_channel_size_bytes % sizeof(tt::fabric::PacketHeader) == 0, + "Internal error - sender_1_channel_size_bytes was computed to be not a multiple of " + "sizeof(tt::fabric::PacketHeader)"); + TT_FATAL( + this->sender_0_channel_base_address % sizeof(tt::fabric::PacketHeader) == 0, + "Internal error - sender_0_channel_base_address was computed to be not a multiple of " + "sizeof(tt::fabric::PacketHeader)"); + TT_FATAL( + this->sender_1_channel_base_address % sizeof(tt::fabric::PacketHeader) == 0, + "Internal error - sender_1_channel_base_address was computed to be not a multiple of " + "sizeof(tt::fabric::PacketHeader)"); + TT_FATAL( + this->receiver_channel_base_address % sizeof(tt::fabric::PacketHeader) == 0, + "Internal error - receiver_channel_base_address was computed to be not a multiple of " + "sizeof(tt::fabric::PacketHeader)"); TT_FATAL( this->sender_0_channel_size_bytes + this->sender_1_channel_size_bytes + this->receiver_channel_size_bytes <= this->available_channel_buffering_space, "Internal error when computing channel sizes. 
Total channel size exceeds available space"); diff --git a/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.hpp b/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.hpp index a9d1a076ba6..b23e25d4175 100644 --- a/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/erisc_datamover_builder.hpp @@ -136,7 +136,6 @@ struct FabricEriscDatamoverConfig { std::size_t channel_buffer_size_bytes, std::size_t sender_ratio_size, std::size_t receiver_ratio_size); std::size_t channel_buffer_size_bytes = 0; - std::size_t channel_buffer_size_bytes_with_channel_sync = 0; std::size_t sender_0_channel_size_bytes = 0; std::size_t sender_0_num_buffers = 0; std::size_t sender_1_channel_size_bytes = 0; @@ -183,7 +182,8 @@ class FabricEriscDatamoverBuilder { public: static constexpr size_t default_firmware_context_switch_interval = 200000; // payload only, no header - static constexpr size_t default_packet_payload_size_bytes = 4096; + static constexpr size_t default_packet_payload_size_bytes = + 1088 * 4; // 4352 bytes to fit up to 4 bfp8 tiles per packet FabricEriscDatamoverBuilder( const CoreCoord& my_eth_core_logical, diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_worker_adapters.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_worker_adapters.hpp index 4864cea0b29..b5cad5a6eda 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_worker_adapters.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_worker_adapters.hpp @@ -227,7 +227,7 @@ struct WorkerToFabricEdmSenderImpl { noc_inline_dw_write(edm_connection_handshake_noc_addr, open_connection_value); noc_async_read_barrier(); - this->edm_buffer_addr = this->edm_buffer_base_addr + (this->get_buffer_slot_index() * (this->buffer_size_bytes + sizeof(eth_channel_sync_t))); + this->edm_buffer_addr = this->edm_buffer_base_addr + (this->get_buffer_slot_index() * (this->buffer_size_bytes)); ASSERT(*this->buffer_slot_wrptr_ptr < 20); } @@ -301,7 +301,7 @@ struct WorkerToFabricEdmSenderImpl { *this->buffer_slot_wrptr_ptr = !(wrptr == ((this->num_buffers_per_channel * 2) - 1)) ? wrptr + 1 : 0; } - this->edm_buffer_addr = this->edm_buffer_base_addr + (this->get_buffer_slot_index() * (this->buffer_size_bytes + sizeof(eth_channel_sync_t))); + this->edm_buffer_addr = this->edm_buffer_base_addr + (this->get_buffer_slot_index() * this->buffer_size_bytes); } FORCE_INLINE uint64_t compute_dest_buffer_slot_noc_addr() const { diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp index 4f7b82b5ce7..b92f5a7a304 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp @@ -649,10 +649,23 @@ FORCE_INLINE bool run_sender_channel_step( // TODO: convert to loop to send multiple packets back to back (or support sending multiple packets in one shot) // when moving to stream regs to manage rd/wr ptrs // TODO: update to be stream reg based. 
Initialize to space available and simply check for non-zero + // + // 4 insns - commonize across channels could save some cycles bool receiver_has_space_for_packet = outbound_to_receiver_channel_pointers.has_space_for_packet(); - if (receiver_has_space_for_packet && !internal_::eth_txq_is_busy(DEFAULT_ETH_TXQ)) { + // Remove a branch if evaluating eth_txq_is_busy outside of loop - + // - usually it should pass though at that point + // Should structure loop from most likely to least likely to fail + if (receiver_has_space_for_packet && + // 3 insns + !internal_::eth_txq_is_busy(DEFAULT_ETH_TXQ) + + ) { + // 4 insns bool has_unsent_packet = local_sender_channel_worker_interface.has_unsent_payload(); + if (has_unsent_packet) { + // Changing to simple counter (outstanding packets) should let us simplify this + // 4 insns bool sender_backpressured_from_sender_side = !(local_sender_channel_worker_interface.local_rdptr.distance_behind(local_sender_channel_worker_interface.local_wrptr) < SENDER_NUM_BUFFERS); if (!sender_backpressured_from_sender_side) { did_something = true; @@ -670,8 +683,10 @@ FORCE_INLINE bool run_sender_channel_step( } } } + // -> Total of up to ~15 insns to decide if we can send packet (eats into budget) // Process COMPLETIONs from receiver + // 2 insns load, 1 for branch int32_t completions_since_last_check = get_ptr_val(to_sender_packets_completed_streams[sender_channel_index]); if (completions_since_last_check > 0) { auto& sender_rdptr = local_sender_channel_worker_interface.local_rdptr; @@ -683,7 +698,10 @@ FORCE_INLINE bool run_sender_channel_step( local_sender_channel_worker_interface.update_worker_copy_of_read_ptr(sender_rdptr.get_ptr()); } } - } + if constexpr (!enable_first_level_ack) { + did_something = true; + } + } // 2 insns // Process ACKs from receiver // ACKs are processed second to avoid any sort of races. 
If we process acks second, @@ -699,11 +717,9 @@ FORCE_INLINE bool run_sender_channel_step( increment_local_update_ptr_val(to_sender_packets_acked_streams[sender_channel_index], -acks_since_last_check); } did_something = did_something || (completions_since_last_check + acks_since_last_check) > 0; - } else { - did_something = did_something || (completions_since_last_check > 0); } - + // 1 insn if (!channel_connection_established) { // Can get rid of one of these two checks if we duplicate the logic above here in the function // and depending on which of the two versions we are in (the connected version or disconnected version) @@ -727,13 +743,16 @@ FORCE_INLINE bool run_sender_channel_step( local_sender_channel_worker_interface.update_worker_copy_of_read_ptr(local_sender_channel_worker_interface.local_rdptr.get_ptr()); } } - } else if (local_sender_channel_worker_interface.has_worker_teardown_request()) { + } else if ( + // 4 insns + local_sender_channel_worker_interface.has_worker_teardown_request()) { did_something = true; channel_connection_established = false; local_sender_channel_worker_interface.teardown_connection( local_sender_channel_worker_interface.local_rdptr.get_ptr()); - } + } // 5 insns total + // Total insns spent checking ~18 insns return did_something; }; @@ -765,10 +784,13 @@ FORCE_INLINE void run_receiver_channel_step( ack_ptr.increment(); } } else { + // TODO: optimize away and use `pkts_received_since_last_check = get_ptr_val()` + // instead to check for unwritten_packets increment_local_update_ptr_val(-pkts_received_since_last_check); ack_ptr.increment_n(pkts_received_since_last_check); - } + } // 8 insns up to here for !enable_first_level_ack + // 3 insns incl branch auto &wr_sent_ptr = receiver_channel_pointers.wr_sent_ptr; bool unwritten_packets = !wr_sent_ptr.is_caught_up_to(ack_ptr); if (unwritten_packets) { @@ -816,9 +838,12 @@ FORCE_INLINE void run_receiver_channel_step( auto &wr_flush_ptr = receiver_channel_pointers.wr_flush_ptr; // Currently unclear if it's better to loop here or not... Also unclear if merging these // two pointers is better or not... 
Seems to be maybe 5-10% better merged but need more data + + // 4 insns for the branch condition if (!wr_flush_ptr.is_caught_up_to(wr_sent_ptr) && !internal_::eth_txq_is_busy(DEFAULT_ETH_TXQ)) { auto receiver_buffer_index = wr_flush_ptr.get_buffer_index(); bool next_trid_flushed = receiver_channel_trid_tracker.transaction_flushed(receiver_buffer_index); + // 7 insns if (next_trid_flushed) { auto &completion_ptr = receiver_channel_pointers.completion_ptr; wr_flush_ptr.increment(); @@ -829,9 +854,10 @@ FORCE_INLINE void run_receiver_channel_step( completion_ptr, local_receiver_channel); } - } - + } // 11 insns } + + // TOTAL of ~22 insns }; diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp index 369c4f57f33..be90750d89a 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp @@ -52,7 +52,7 @@ class EthChannelBuffer final { // that can fit 2 eth_channel_syncs cfor ack uint8_t channel_id) : buffer_size_in_bytes(buffer_size_bytes), - max_eth_payload_size_in_bytes(buffer_size_in_bytes + sizeof(eth_channel_sync_t)), + max_eth_payload_size_in_bytes(buffer_size_in_bytes), channel_id(channel_id) { for (uint8_t i = 0; i < NUM_BUFFERS; i++) { this->buffer_addresses[i] =
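
Note (not part of the patch): below is a standalone sketch of the slot-sizing scheme the constrain_to_power_of_2_buffer_slot_counts path introduces, for anyone reviewing the arithmetic. The per-channel budget is split by the 2*sender + receiver ratio, the slot count is the number of minimum-sized packets (1088 * 4 payload plus header) that fit, rounded down to a power of two (presumably to keep wrap-around index math cheap on the erisc), and the slot size is the per-slot budget rounded down to a multiple of the packet header size, with the smallest result shared across all three channels. The constants (header size, available buffering space) are illustrative placeholders, not device values; the 1/2 ratio mirrors the FabricEriscDatamoverConfig(edm_buffer_size, 1, 2) call in the loopback test.

// Illustrative sketch only -- NOT part of the patch. Mirrors the arithmetic of the
// constrain_to_power_of_2_buffer_slot_counts path in FabricEriscDatamoverConfig.
// All constants are placeholders chosen for the example, not real device values.
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdio>

constexpr std::size_t kPacketHeaderSize = 32;              // stand-in for sizeof(tt::fabric::PacketHeader)
constexpr std::size_t kAvailableChannelSpace = 64 * 1024;  // stand-in for available_channel_buffering_space
constexpr std::size_t kMinPayloadWithHeader = 1088 * 4 + kPacketHeaderSize;

// Same semantics as the lambda in the patch: largest power of two <= x.
std::size_t round_down_to_power_of_2(std::size_t x) {
    assert(x > 0);
    std::size_t p = 1;
    while (x >= p) {
        p <<= 1;
    }
    return p >> 1;
}

// Stand-in for tt::round_down.
std::size_t round_down(std::size_t value, std::size_t multiple) { return (value / multiple) * multiple; }

int main() {
    // Ratio split used by the loopback test: two sender channels weighted 1 each,
    // one receiver channel weighted 2.
    constexpr std::size_t sender_ratio = 1;
    constexpr std::size_t receiver_ratio = 2;
    constexpr std::size_t total_ratio = 2 * sender_ratio + receiver_ratio;

    const std::size_t sender_budget = (kAvailableChannelSpace / total_ratio) * sender_ratio;
    const std::size_t receiver_budget = (kAvailableChannelSpace / total_ratio) * receiver_ratio;

    // Slot count: how many minimum-sized packets fit, rounded down to a power of two.
    const std::size_t sender_slots = round_down_to_power_of_2(sender_budget / kMinPayloadWithHeader);
    const std::size_t receiver_slots = round_down_to_power_of_2(receiver_budget / kMinPayloadWithHeader);

    // Slot size: per-slot budget rounded down to a multiple of the header size; the
    // smallest of the channels' results is shared by all of them.
    const std::size_t slot_size = std::min(
        round_down(sender_budget / sender_slots, kPacketHeaderSize),
        round_down(receiver_budget / receiver_slots, kPacketHeaderSize));

    std::printf("sender slots: %zu  receiver slots: %zu  slot size: %zu B\n",
                sender_slots, receiver_slots, slot_size);
    std::printf("sender channel: %zu B  receiver channel: %zu B\n",
                slot_size * sender_slots, slot_size * receiver_slots);
    return 0;
}

Compiled with any C++17 compiler, this prints the derived slot counts and sizes for the placeholder budget; with the real channel buffering space the same arithmetic is what the new TT_FATAL checks in erisc_datamover_builder.cpp guard (slot size at least 1088 * 4 plus a header, channel sizes and base addresses kept multiples of the header size).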