Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Additional EDM fabric optimizations (mix of low level and experimental flow control protocol trimming) #17749

Merged
merged 7 commits into from
Feb 10, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ void kernel_main() {

mcast_fwd_packet_header->to_chip_multicast(MulticastRoutingCommandHeader{1, static_cast<uint8_t>(mcast_fwd_hops)});
mcast_bwd_packet_header->to_chip_multicast(MulticastRoutingCommandHeader{1, static_cast<uint8_t>(mcast_bwd_hops)});
unicast_packet_header->to_chip_unicast(UnicastRoutingCommandHeader{static_cast<uint8_t>(unicast_hops)});
unicast_packet_header->to_chip_unicast(static_cast<uint8_t>(unicast_hops));

{
DeviceZoneScopedN("MAIN-WRITE-ZONE");
Expand All @@ -140,8 +140,8 @@ void kernel_main() {
noc_async_write(source_l1_buffer_address, dest_addr, packet_payload_size_bytes);
if (fabric_connection.has_forward_connection()) {
DeviceZoneScopedN("WR-FWD");
mcast_fwd_packet_header->to_noc_unicast_write(NocUnicastCommandHeader{
noc0_dest_addr, packet_payload_size_bytes + sizeof(tt::fabric::PacketHeader)});
mcast_fwd_packet_header->to_noc_unicast_write(
NocUnicastCommandHeader{noc0_dest_addr}, packet_payload_size_bytes);
{
DeviceZoneScopedN("WR-FWD-WAIT");
fabric_connection.get_forward_connection().wait_for_empty_write_slot();
Expand All @@ -155,8 +155,8 @@ void kernel_main() {

if (fabric_connection.has_backward_connection()) {
DeviceZoneScopedN("WR-BWD");
mcast_bwd_packet_header->to_noc_unicast_write(NocUnicastCommandHeader{
noc0_dest_addr, packet_payload_size_bytes + sizeof(tt::fabric::PacketHeader)});
mcast_bwd_packet_header->to_noc_unicast_write(
NocUnicastCommandHeader{noc0_dest_addr}, packet_payload_size_bytes);
{
DeviceZoneScopedN("WR-BWD-WAIT");
fabric_connection.get_backward_connection().wait_for_empty_write_slot();
Expand All @@ -179,7 +179,7 @@ void kernel_main() {
DeviceZoneScopedN("UNICAST-WRITE");
auto& fabric_conn =
unicast_is_fwd ? fabric_connection.get_forward_connection() : fabric_connection.get_backward_connection();
unicast_packet_header->to_noc_unicast_write(NocUnicastCommandHeader{noc0_dest_addr, packet_payload_size_bytes});
unicast_packet_header->to_noc_unicast_write(NocUnicastCommandHeader{noc0_dest_addr}, packet_payload_size_bytes);
fabric_conn.wait_for_empty_write_slot();
fabric_conn.send_payload_without_header_non_blocking_from_address(
source_l1_buffer_address, packet_payload_size_bytes);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,18 +124,17 @@ void kernel_main() {
const auto dest_noc_address = get_noc_addr(p, dest_addr_gen, 0, NORMALIZED_NOC_INDEX);
const size_t packet_size = page_size + sizeof(tt::fabric::PacketHeader);
auto packet_addr = get_read_ptr(cb_id_in0);
auto& packet_header = *reinterpret_cast<tt::fabric::PacketHeader*>(packet_addr);
auto* packet_header = reinterpret_cast<volatile tt::fabric::PacketHeader*>(packet_addr);
if constexpr (mcast_mode) {
packet_header
.to_chip_multicast(tt::fabric::MulticastRoutingCommandHeader{config.mcast.distance, config.mcast.range})
.to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{
dest_noc_address, (pages_to_send * page_size) + sizeof(tt::fabric::PacketHeader)});
packet_header.reserved2 = 0x1111; // debug only
->to_chip_multicast(
tt::fabric::MulticastRoutingCommandHeader{config.mcast.distance, config.mcast.range})
->to_noc_unicast_write(
tt::fabric::NocUnicastCommandHeader{dest_noc_address}, (pages_to_send * page_size));
} else {
packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{config.unicast.distance})
.to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{
dest_noc_address, (pages_to_send * page_size) + sizeof(tt::fabric::PacketHeader)});
packet_header.reserved2 = 0x1111; // debug only
packet_header->to_chip_unicast(config.unicast.distance)
->to_noc_unicast_write(
tt::fabric::NocUnicastCommandHeader{dest_noc_address}, (pages_to_send * page_size));
}

sender.send_payload_blocking_from_address(packet_addr, packet_size);
Expand All @@ -150,7 +149,7 @@ void kernel_main() {
ASSERT(*last_message_semaphore_address == 0);
uint64_t last_message_semaphore_noc0_addr =
safe_get_noc_addr(my_x[0], my_y[0], (uint32_t)last_message_semaphore_address, 0);
packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{2});
packet_header.to_chip_unicast(2);
packet_header.to_noc_unicast_atomic_inc(
tt::fabric::NocUnicastAtomicIncCommandHeader(last_message_semaphore_noc0_addr, 1, 32));

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,12 +59,10 @@ auto forward_to_fabric_from_cb(
if constexpr (mcast_mode) {
packet_header
.to_chip_multicast(tt::fabric::MulticastRoutingCommandHeader{config.mcast.distance, config.mcast.range})
.to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{
noc0_dest_address, (pages_to_send * page_size) + sizeof(tt::fabric::PacketHeader)});
.to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_address}, (pages_to_send * page_size));
} else {
packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{config.unicast.distance})
.to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{
noc0_dest_address, (pages_to_send * page_size) + sizeof(tt::fabric::PacketHeader)});
packet_header.to_chip_unicast(config.unicast.distance)
.to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_address}, (pages_to_send * page_size));
}

uint64_t buffer_address = sender.edm_buffer_addr + (*sender.buffer_index_ptr * (sender.buffer_size_bytes + sizeof(eth_channel_sync_t)));
Expand Down Expand Up @@ -189,7 +187,7 @@ void kernel_main() {
packet_header.reserved = 0xE;
packet_header.reserved2 = 0xFFFF;
uint64_t last_message_sem_noc_addr = get_noc_addr(my_x[0], my_y[0], last_message_semaphore_address);
packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{kLoopbackNumHopsToMyChip});
packet_header.to_chip_unicast(kLoopbackNumHopsToMyChip);
packet_header.to_noc_unicast_atomic_inc(
tt::fabric::NocUnicastAtomicIncCommandHeader(last_message_sem_noc_addr, 1, 32));

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,10 @@ bool terminate_fabric_endpoints_farthest_to_nearest (
auto &packet_header = *reinterpret_cast<tt::fabric::PacketHeader*>(a_packet_header_addr);
reinterpret_cast<volatile uint32_t*>(a_packet_header_addr)[sizeof(tt::fabric::PacketHeader) >> 2] = tt::fabric::TerminationSignal::GRACEFULLY_TERMINATE;
sender.wait_for_empty_write_slot();
packet_header.to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{static_cast<uint8_t>(distance)})
.to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{
termination_sig_noc_addr, sizeof(tt::fabric::PacketHeader) + sizeof(uint32_t)});
packet_header.to_chip_unicast(static_cast<uint8_t>(distance))
.to_noc_unicast_write(
tt::fabric::NocUnicastCommandHeader{termination_sig_noc_addr},
sizeof(tt::fabric::PacketHeader) + sizeof(uint32_t));
sender.send_payload_blocking_from_address(a_packet_header_addr, packet_header.get_payload_size_including_header());
noc_async_writes_flushed();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3266,7 +3266,6 @@ TEST(EdmFabric, DISABLED_BasicMcastThroughputTest_SenderFullNoWrap_ReceiverNoWra
RunWriteThroughputStabilityTestWithPersistentFabric(
num_mcasts, num_unicasts, num_links, num_op_invocations, params);
}
// hangs with DPRINT
TEST(EdmFabric, BasicMcastThroughputTest_SenderFullNoWrap_ReceiverNoWrap_2Device) {
const size_t num_mcasts = 9;
const size_t num_unicasts = 0;
Expand Down Expand Up @@ -3294,7 +3293,6 @@ TEST(EdmFabric, DISABLED_BasicMcastThroughputTest_SenderFullNoWrap_ReceiverNoWra
RunWriteThroughputStabilityTestWithPersistentFabric(
num_mcasts, num_unicasts, num_links, num_op_invocations, params);
}
// First to hang - maybe something to do with merging traffic
TEST(EdmFabric, DISABLED_BasicMcastThroughputTest_SenderFullNoWrap_ReceiverNoWrap_TwoWorkers_4Device) {
const size_t num_mcasts = 9;
const size_t num_unicasts = 0;
Expand Down Expand Up @@ -3603,6 +3601,18 @@ TEST(EdmFabric, BasicMcastThroughputTest_3) {
RunWriteThroughputStabilityTestWithPersistentFabric(
num_mcasts, num_unicasts, num_links, num_op_invocations, params);
}
// Runs the persistent-fabric write-throughput stability test with line_sync
// enabled and line_size set to 2 (the "onehop" suffix presumably refers to the
// resulting single-hop line -- TODO confirm against the runner).
TEST(EdmFabric, BasicMcastThroughputTest_3_onehop) {
    // Fabric/line configuration handed to the runner.
    WriteThroughputStabilityTestWithPersistentFabricParams params;
    params.line_sync = true;
    params.line_size = 2;

    // Traffic shape: counts of multicast and unicast writes, links, and op invocations.
    constexpr size_t mcast_count = 200000;
    constexpr size_t unicast_count = 2;
    constexpr size_t link_count = 1;
    constexpr size_t op_invocation_count = 1;

    RunWriteThroughputStabilityTestWithPersistentFabric(
        mcast_count, unicast_count, link_count, op_invocation_count, params);
}
TEST(EdmFabric, BasicMcastThroughputTest_4) {
const size_t num_mcasts = 800000;
const size_t num_unicasts = 2;
Expand Down
11 changes: 11 additions & 0 deletions tests/ttnn/unit_tests/operations/ccl/test_new_all_gather.py
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,17 @@ def test_all_gather(
None,
ttnn.TensorMemoryLayout.HEIGHT_SHARDED,
),
(
4,
[1, 4, 32, 1280],
3,
ttnn.TILE_LAYOUT,
(32, 320),
ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(1, 4))}),
None,
None,
ttnn.TensorMemoryLayout.HEIGHT_SHARDED,
),
],
)
@pytest.mark.parametrize("num_links", [1])
Expand Down
6 changes: 6 additions & 0 deletions tt_metal/hw/inc/ethernet/tunneling.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,12 @@ void eth_write_remote_reg(uint32_t q_num, uint32_t reg_addr, uint32_t val) {
eth_txq_reg_write(q_num, ETH_TXQ_REMOTE_REG_DATA, val);
eth_txq_reg_write(q_num, ETH_TXQ_CMD, ETH_TXQ_CMD_START_REG);
}
// Issues a remote register write on ethernet TX queue `q_num` by programming
// three TXQ registers in sequence: destination address, data, then the
// start-register-write command.
//
//   q_num    - index of the TX queue whose registers are written.
//   reg_addr - value written to ETH_TXQ_DEST_ADDR (the remote register address).
//   val      - value written to ETH_TXQ_REMOTE_REG_DATA (the data to store remotely).
//
// NOTE(review): per the name, this variant performs no TX-queue status check
// before issuing the writes; presumably the caller must already know the queue
// can accept a command -- confirm against eth_write_remote_reg.
FORCE_INLINE
void eth_write_remote_reg_no_txq_check(uint32_t q_num, uint32_t reg_addr, uint32_t val) {
// Program the target address and payload first ...
eth_txq_reg_write(q_num, ETH_TXQ_DEST_ADDR, reg_addr);
eth_txq_reg_write(q_num, ETH_TXQ_REMOTE_REG_DATA, val);
// ... then write the command register last (ETH_TXQ_CMD_START_REG).
eth_txq_reg_write(q_num, ETH_TXQ_CMD, ETH_TXQ_CMD_START_REG);
}

void check_and_context_switch() {
uint32_t start_time = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L);
Expand Down
1 change: 1 addition & 0 deletions ttnn/cpp/pybind11/global_semaphore.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <tt-metalium/global_semaphore.hpp>
#include "cpp/ttnn/global_semaphore.hpp"
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"

namespace ttnn::global_semaphore {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,16 +33,15 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write(
pkt_hdr->reserved2 = my_chip_id;
#endif

size_t packet_send_size_bytes = payload_size_bytes + sizeof(tt::fabric::PacketHeader);
pkt_hdr->to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_noc_addr, packet_send_size_bytes});
pkt_hdr->to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_noc_addr}, payload_size_bytes);

switch (current_cmd_header.dest_type) {
case ttnn::ccl::cmd::CclCommandDestType::CHIP_UNICAST: {
const auto& unicast_args = current_cmd_header.get_unicast_dest_args();
auto& fabric_conn = unicast_args.is_forward_direction ? fabric_connection.get_forward_connection()
: fabric_connection.get_backward_connection();

pkt_hdr->to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{unicast_args.distance_in_hops});
pkt_hdr->to_chip_unicast(unicast_args.distance_in_hops);
fabric_conn.wait_for_empty_write_slot();
fabric_conn.send_payload_without_header_non_blocking_from_address(l1_read_addr, payload_size_bytes);
fabric_conn.send_payload_flush_blocking_from_address((uint32_t)pkt_hdr, sizeof(tt::fabric::PacketHeader));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -438,22 +438,19 @@ void try_advance_inline_write_or_atomic_inc(command_context_t<Addrgen>& cmd_ctx)

ASSERT(cmd_ctx.packet_header_buffer_addr != 0);
auto* pkt_hdr = reinterpret_cast<tt::fabric::PacketHeader*>(cmd_ctx.packet_header_buffer_addr);
#ifdef DEBUG_PRINT_ENABLED
pkt_hdr->reserved2 = my_chip_id;
#endif

uint64_t dest_noc_addr_for_pkt = safe_get_noc_addr(dest_noc0_x, dest_noc0_y, dest_bank_addr, 0);
if (cmd_ctx.current_cmd_header.code == ttnn::ccl::cmd::CclCommandCode::ATOMIC_INC) {
pkt_hdr->to_noc_unicast_atomic_inc(
tt::fabric::NocUnicastAtomicIncCommandHeader{dest_noc_addr_for_pkt, static_cast<uint16_t>(value), 32});
} else {
pkt_hdr->to_noc_unicast_write(
tt::fabric::NocUnicastCommandHeader{dest_noc_addr_for_pkt, static_cast<uint16_t>(value)});
pkt_hdr->to_noc_unicast_inline_write(
tt::fabric::NocUnicastInlineWriteCommandHeader{dest_noc_addr_for_pkt, static_cast<uint16_t>(value)});
}

switch (cmd_ctx.current_cmd_header.dest_type) {
case ttnn::ccl::cmd::CclCommandDestType::CHIP_UNICAST: {
pkt_hdr->to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{
cmd_ctx.current_cmd_header.get_unicast_dest_args().distance_in_hops});
pkt_hdr->to_chip_unicast(cmd_ctx.current_cmd_header.get_unicast_dest_args().distance_in_hops);

auto& fabric_connection = cmd_ctx.current_cmd_header.get_unicast_dest_args().is_forward_direction
? cmd_ctx.fabric_connection.get_forward_connection()
Expand Down Expand Up @@ -563,21 +560,16 @@ void write_and_advance_local_read_address_for_fabric_write(
const size_t payload_l1_address = l1_read_addr;

auto pkt_hdr = reinterpret_cast<volatile tt::fabric::PacketHeader*>(packet_header_buffer_addr);
#ifdef DEBUG_PRINT_ENABLED
pkt_hdr->reserved2 = my_chip_id;
#endif

size_t packet_send_size_bytes = payload_size_bytes + sizeof(tt::fabric::PacketHeader);
pkt_hdr->to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{
noc0_dest_noc_addr, packet_send_size_bytes});
pkt_hdr->to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_noc_addr}, payload_size_bytes);

switch (current_cmd_header.dest_type) {
case ttnn::ccl::cmd::CclCommandDestType::CHIP_UNICAST: {
const auto& unicast_args = current_cmd_header.get_unicast_dest_args();
auto& fabric_conn = unicast_args.is_forward_direction ? fabric_connection.get_forward_connection()
: fabric_connection.get_backward_connection();

pkt_hdr->to_chip_unicast(tt::fabric::UnicastRoutingCommandHeader{unicast_args.distance_in_hops});
pkt_hdr->to_chip_unicast(unicast_args.distance_in_hops);

fabric_conn.wait_for_empty_write_slot();
fabric_conn.send_payload_without_header_non_blocking_from_address(l1_read_addr, payload_size_bytes);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,9 +118,7 @@ void mcast_contig_pages_to_noc_address(
pkt_hdr
.to_chip_multicast(
tt::fabric::MulticastRoutingCommandHeader{1, static_cast<uint8_t>(forward_direction_num_hops)})
.to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{
noc0_dest_addr,
packet_send_size_bytes});
.to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_addr}, packet_send_size_bytes);
forward_fabric_sender.wait_for_empty_write_slot();
forward_fabric_sender.send_payload_flush_blocking_from_address(l1_read_addr, packet_send_size_bytes);
}
Expand All @@ -131,9 +129,7 @@ void mcast_contig_pages_to_noc_address(
pkt_hdr
.to_chip_multicast(
tt::fabric::MulticastRoutingCommandHeader{1, static_cast<uint8_t>(backward_direction_num_hops)})
.to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{
noc0_dest_addr,
packet_send_size_bytes});
.to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_addr}, packet_send_size_bytes);
backward_fabric_sender.wait_for_empty_write_slot();
backward_fabric_sender.send_payload_non_blocking_from_address(l1_read_addr, packet_send_size_bytes);
}
Expand Down
Loading
Loading