Apply Various 1D Fabric Optimizations - Improve Performance by ~500 MB/s for 4k packet size (#18186)

Apply various small optimizations. The transformations and their performance deltas are listed below. Note that the measurements were taken with -O3 enabled for the fabric kernel build, even though main currently builds with -Os. -O3 will be enabled later this week (currently blocked by some dependencies), so these numbers are the most representative performance deltas; baselining and measuring at -Os would not be representative.

```
Baseline:
unicast 112816548 -> 15.43 GB/s
mcast   274540294 -> 12.68 GB/s

# Cache noc addr:
unicast 110155221 -> 15.8 GB/s
mcast   276839301 -> 12.57 GB/s

# Flatten main loop sender, 1st branch nest:
unicast 107584162 -> 16.18 GB/s
mcast   269844156 -> 12.9 GB/s

# Flatten receiver last branch nest:
unicast 106827158 -> 16.3 GB/s
mcast   267551029 -> 13.0 GB/s

# Swap fwd vs local noc write order to do forwarding write first:
unicast 104042988 -> 16.7 GB/s
mcast   258379905 -> 13.47 GB/s
```
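
The two "flatten" steps above collapse nested early-out conditionals into a single precomputed predicate, which is what the sender and receiver hunks in the diff below do. A minimal sketch of the pattern, using illustrative stand-in state and helpers rather than the actual kernel symbols:

```cpp
#include <cstdint>

// Illustrative stand-ins for the real channel state; these names are
// hypothetical, not the actual kernel symbols.
constexpr uint32_t SENDER_NUM_BUFFERS = 8;

struct SenderChannelState {
    uint32_t rdptr = 0;
    uint32_t wrptr = 0;
    bool receiver_has_space = false;
    bool txq_busy = false;
    bool has_unsent_packet = false;
};

inline void send_next_data(SenderChannelState& ch) { ++ch.rdptr; }  // stand-in

// Before: three nested conditionals, i.e. three data-dependent branches on
// the hot path, each a separate misprediction opportunity.
inline bool step_nested(SenderChannelState& ch) {
    if (ch.receiver_has_space && !ch.txq_busy) {
        if (ch.has_unsent_packet) {
            bool backpressured = !((ch.wrptr - ch.rdptr) < SENDER_NUM_BUFFERS);
            if (!backpressured) {
                send_next_data(ch);
                return true;
            }
        }
    }
    return false;
}

// After: compute every predicate unconditionally, then take a single branch.
// At -O3 the boolean logic over already-materialized flags can compile to
// branchless AND sequences, leaving one well-predicted branch.
inline bool step_flat(SenderChannelState& ch) {
    bool backpressured = !((ch.wrptr - ch.rdptr) < SENDER_NUM_BUFFERS);
    bool can_send = ch.receiver_has_space && !ch.txq_busy &&
                    ch.has_unsent_packet && !backpressured;
    if (can_send) {
        send_next_data(ch);
    }
    return can_send;
}
```

This trades a few always-executed loads and compares for fewer data-dependent branches, which is usually a win on simple in-order cores.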
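
The last step reorders the mcast delivery path so the longer-latency forwarding write is issued before the local write, letting the two transfers overlap. A sketch with simplified signatures and stub bodies (the real functions also take routing fields and a transaction id, as in the first hunk below):

```cpp
#include <cstdint>

// Stub bodies standing in for the real async NOC/ethernet writes.
inline void forward_payload_to_downstream_edm(const void* pkt, uint16_t len) {
    (void)pkt; (void)len;  // fire-and-forget write toward the next hop
}
inline void execute_chip_unicast_to_local_chip(const void* pkt, uint16_t len) {
    (void)pkt; (void)len;  // fire-and-forget write into local chip memory
}

// Before this change the local delivery was issued first, delaying the start
// of the forwarding hop. Issuing the forward first lets it make progress over
// ethernet while the local NOC write is being posted; both writes are
// asynchronous, so only the overlap changes, not the semantics.
inline void receiver_forward_packet(
    const void* pkt, uint16_t len, bool is_terminal, bool has_downstream) {
    if (has_downstream) {
        forward_payload_to_downstream_edm(pkt, len);
    }
    if (is_terminal) {
        execute_chip_unicast_to_local_chip(pkt, len);
    }
}
```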

Note that the cached noc addr commit showed a minor perf degradation for mcast, although there is no reason it should cause a slowdown. I tried dropping that commit while keeping the rest of the change sequence and saw a net perf degradation of 1-3%, so the cached_noc_addr change was probably perturbing other code indirectly and causing the degradation. When applied as the last commit, it is an improvement.
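
For reference, the cached noc addr change hoists the producer NOC address computation out of the per-update hot path and into connection establishment. A simplified sketch of that pattern; the field names and address packing below are illustrative, not the actual hardware encoding:

```cpp
#include <cstdint>

// Simplified mirror of the EdmChannelWorkerInterface change.
struct WorkerLocationInfo {
    uint8_t worker_x;
    uint8_t worker_y;
    uint32_t worker_semaphore_address;
};

inline uint64_t get_noc_addr(uint32_t x, uint32_t y, uint32_t l1_addr) {
    // Stand-in for the device helper: pack XY routing bits above the L1 offset.
    return (uint64_t(x) << 40) | (uint64_t(y) << 32) | uint64_t(l1_addr);
}

struct WorkerInterfaceSketch {
    volatile WorkerLocationInfo* worker_location_info_ptr = nullptr;
    uint64_t cached_worker_semaphore_address = 0;

    // Run once when the worker connection is established, off the hot path:
    // read the worker's XY and semaphore address from L1 and precompute the
    // full 64-bit NOC address.
    void cache_producer_noc_addr() {
        auto const& info = *worker_location_info_ptr;
        cached_worker_semaphore_address =
            get_noc_addr(info.worker_x, info.worker_y, info.worker_semaphore_address);
    }

    // Hot path: one plain 64-bit load instead of volatile L1 reads plus
    // address re-encoding on every read-pointer update.
    uint64_t producer_noc_addr() const { return cached_worker_semaphore_address; }
};
```

On the hot path this replaces volatile L1 reads of the worker's coordinates plus address re-encoding with a single cached value, as the `cache_producer_noc_addr()` hunks below show.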

Update after rebasing on top of @tt-aho's latest changes to the routing fields in the packet header; the new numbers are:
mcast -> 13.81 GB/s, up from 13.3 GB/s
SeanNijjar authored Feb 24, 2025
1 parent 4fb909f commit 42adc10
Showing 2 changed files with 46 additions and 44 deletions.
```diff
@@ -633,13 +633,13 @@ FORCE_INLINE void receiver_forward_packet(
     // If the packet is a terminal packet, then we can just deliver it locally
     bool start_distance_is_terminal_value = (cached_routing_fields.value & tt::fabric::RoutingFields::HOP_DISTANCE_MASK) == tt::fabric::RoutingFields::LAST_HOP_DISTANCE_VAL;
     uint16_t payload_size_bytes = packet_start->payload_size_bytes;
-    if (start_distance_is_terminal_value) {
-        execute_chip_unicast_to_local_chip(packet_start, payload_size_bytes, transaction_id);
-    }
     bool not_last_destination_device = cached_routing_fields.value != tt::fabric::RoutingFields::LAST_MCAST_VAL;
     if (not_last_destination_device) {
         forward_payload_to_downstream_edm(packet_start, payload_size_bytes, cached_routing_fields, downstream_edm_interface, transaction_id);
     }
+    if (start_distance_is_terminal_value) {
+        execute_chip_unicast_to_local_chip(packet_start, payload_size_bytes, transaction_id);
+    }
 } else if constexpr (std::is_same_v<ROUTING_FIELDS_TYPE, tt::fabric::LowLatencyRoutingFields>) {
     uint32_t routing = cached_routing_fields.value & tt::fabric::LowLatencyRoutingFields::FIELD_MASK;
     uint16_t payload_size_bytes = packet_start->payload_size_bytes;
@@ -682,25 +682,22 @@ FORCE_INLINE bool run_sender_channel_step(
     // when moving to stream regs to manage rd/wr ptrs
     // TODO: update to be stream reg based. Initialize to space available and simply check for non-zero
     bool receiver_has_space_for_packet = outbound_to_receiver_channel_pointers.has_space_for_packet();
-    if (receiver_has_space_for_packet && !internal_::eth_txq_is_busy(DEFAULT_ETH_TXQ)) {
-        bool has_unsent_packet = local_sender_channel_worker_interface.has_unsent_payload();
-        if (has_unsent_packet) {
-            bool sender_backpressured_from_sender_side = !(local_sender_channel_worker_interface.local_rdptr.distance_behind(local_sender_channel_worker_interface.local_wrptr) < SENDER_NUM_BUFFERS);
-            if (!sender_backpressured_from_sender_side) {
-                did_something = true;
-                auto packet_header = reinterpret_cast<PACKET_HEADER_TYPE*>(local_sender_channel.get_buffer_address(local_sender_channel_worker_interface.local_wrptr.get_buffer_index()));
-                if constexpr (enable_packet_header_recording) {
-                    tt::fabric::validate(*packet_header);
-                    packet_header_recorder.record_packet_header(reinterpret_cast<volatile uint32_t*>(packet_header));
-                }
-                send_next_data(
-                    local_sender_channel,
-                    local_sender_channel_worker_interface,
-                    outbound_to_receiver_channel_pointers,
-                    remote_receiver_channel,
-                    sender_channel_index);
-            }
-        }
-    }
+    bool has_unsent_packet = local_sender_channel_worker_interface.has_unsent_payload();
+    bool sender_backpressured_from_sender_side = !(local_sender_channel_worker_interface.local_rdptr.distance_behind(local_sender_channel_worker_interface.local_wrptr) < SENDER_NUM_BUFFERS);
+    bool can_send = receiver_has_space_for_packet && !internal_::eth_txq_is_busy(DEFAULT_ETH_TXQ) && has_unsent_packet && !sender_backpressured_from_sender_side;
+    if (can_send) {
+        did_something = true;
+        auto packet_header = reinterpret_cast<PACKET_HEADER_TYPE*>(local_sender_channel.get_buffer_address(local_sender_channel_worker_interface.local_wrptr.get_buffer_index()));
+        if constexpr (enable_packet_header_recording) {
+            tt::fabric::validate(*packet_header);
+            packet_header_recorder.record_packet_header(reinterpret_cast<volatile uint32_t*>(packet_header));
+        }
+        send_next_data(
+            local_sender_channel,
+            local_sender_channel_worker_interface,
+            outbound_to_receiver_channel_pointers,
+            remote_receiver_channel,
+            sender_channel_index);
+    }
 
     // Process COMPLETIONs from receiver
@@ -753,6 +750,7 @@ FORCE_INLINE bool run_sender_channel_step(
         }
         did_something = true;
         channel_connection_established = true;
+        local_sender_channel_worker_interface.cache_producer_noc_addr();
         if constexpr (enable_first_level_ack) {
             local_sender_channel_worker_interface.update_worker_copy_of_read_ptr(local_sender_channel_worker_interface.local_ackptr.get_ptr());
         } else {
@@ -848,19 +846,19 @@ FORCE_INLINE void run_receiver_channel_step(
     auto &wr_flush_ptr = receiver_channel_pointers.wr_flush_ptr;
     // Currently unclear if it's better to loop here or not... Also unclear if merging these
     // two pointers is better or not... Seems to be maybe 5-10% better merged but need more data
-    if (!wr_flush_ptr.is_caught_up_to(wr_sent_ptr) && !internal_::eth_txq_is_busy(DEFAULT_ETH_TXQ)) {
-        auto receiver_buffer_index = wr_flush_ptr.get_buffer_index();
-        bool next_trid_flushed = receiver_channel_trid_tracker.transaction_flushed(receiver_buffer_index);
-        if (next_trid_flushed) {
-            auto &completion_ptr = receiver_channel_pointers.completion_ptr;
-            wr_flush_ptr.increment();
-            receiver_channel_trid_tracker.clear_trid_at_buffer_slot(receiver_buffer_index);
-            receiver_send_completion_ack(
-                remote_eth_sender_wrptrs,
-                remote_sender_channnels,
-                completion_ptr,
-                local_receiver_channel);
-        }
-    }
+    bool unflushed_writes_and_eth_txq_not_busy = !wr_flush_ptr.is_caught_up_to(wr_sent_ptr) && !internal_::eth_txq_is_busy(DEFAULT_ETH_TXQ);
+    auto receiver_buffer_index = wr_flush_ptr.get_buffer_index();
+    bool next_trid_flushed = receiver_channel_trid_tracker.transaction_flushed(receiver_buffer_index);
+    bool can_send_completion = unflushed_writes_and_eth_txq_not_busy && next_trid_flushed;
+    if (can_send_completion) {
+        auto &completion_ptr = receiver_channel_pointers.completion_ptr;
+        wr_flush_ptr.increment();
+        receiver_channel_trid_tracker.clear_trid_at_buffer_slot(receiver_buffer_index);
+        receiver_send_completion_ack(
+            remote_eth_sender_wrptrs,
+            remote_sender_channnels,
+            completion_ptr,
+            local_receiver_channel);
+    }
 
     }
```

```diff
@@ -117,6 +117,7 @@ template <uint8_t NUM_BUFFERS>
 struct EdmChannelWorkerInterface {
     EdmChannelWorkerInterface() :
         worker_location_info_ptr(nullptr),
+        cached_worker_semaphore_address(0),
         remote_producer_wrptr(nullptr),
         connection_live_semaphore(nullptr),
         local_wrptr(),
@@ -134,6 +135,7 @@ struct EdmChannelWorkerInterface {
         volatile tt_l1_ptr uint32_t *const remote_producer_wrptr,
         volatile tt_l1_ptr uint32_t *const connection_live_semaphore) :
         worker_location_info_ptr(worker_location_info_ptr),
+        cached_worker_semaphore_address(0),
         remote_producer_wrptr(remote_producer_wrptr),
         connection_live_semaphore(connection_live_semaphore),
         local_wrptr(),
@@ -155,14 +157,11 @@ struct EdmChannelWorkerInterface {
     }
 
     [[nodiscard]] FORCE_INLINE uint32_t get_worker_semaphore_address() const {
-        return worker_location_info_ptr->worker_semaphore_address;
+        return cached_worker_semaphore_address & 0xFFFFFFFF;
     }
 
     FORCE_INLINE void update_worker_copy_of_read_ptr(BufferPtr new_ptr_val) {
-        auto const &worker_info = *worker_location_info_ptr;
-        uint64_t worker_semaphore_address = get_noc_addr(
-            (uint32_t)worker_info.worker_xy.x, (uint32_t)worker_info.worker_xy.y, worker_info.worker_semaphore_address);
-        noc_inline_dw_write(worker_semaphore_address, new_ptr_val);
+        noc_inline_dw_write(this->cached_worker_semaphore_address, new_ptr_val);
     }
 
     // Connection management methods
@@ -180,22 +179,27 @@ struct EdmChannelWorkerInterface {
         noc_semaphore_inc(worker_semaphore_address, 1);
     }
 
+    FORCE_INLINE void cache_producer_noc_addr() {
+        auto const &worker_info = *worker_location_info_ptr;
+        uint64_t worker_semaphore_address = get_noc_addr(
+            (uint32_t)worker_info.worker_xy.x,
+            (uint32_t)worker_info.worker_xy.y,
+            worker_info.worker_semaphore_address);
+        this->cached_worker_semaphore_address = worker_semaphore_address;
+    }
+
     FORCE_INLINE bool all_eth_packets_acked() const {
         return this->local_ackptr.is_caught_up_to(this->local_wrptr);
     }
     FORCE_INLINE bool all_eth_packets_completed() const {
         return this->local_rdptr.is_caught_up_to(this->local_wrptr);
     }
 
     // Call to keep the connection flow control info fresh with worker.
     FORCE_INLINE void propagate_ackptr_to_connection_info() {
         worker_location_info_ptr->edm_rdptr = local_ackptr.get_ptr();
     }
 
     [[nodiscard]] FORCE_INLINE bool has_worker_teardown_request() const { return *connection_live_semaphore == tt::fabric::EdmToEdmSender<0>::close_connection_request_value; }
     [[nodiscard]] FORCE_INLINE bool connection_is_live() const { return *connection_live_semaphore == tt::fabric::EdmToEdmSender<0>::open_connection_value; }
 
     volatile EDMChannelWorkerLocationInfo *worker_location_info_ptr;
+    uint64_t cached_worker_semaphore_address = 0;
     volatile tt_l1_ptr uint32_t *const remote_producer_wrptr;
     volatile tt_l1_ptr uint32_t *const connection_live_semaphore;
```