From 857c1bb12f0e7fe2e88eadb0877c244c22f6e2f1 Mon Sep 17 00:00:00 2001 From: John Bauman Date: Thu, 20 Feb 2025 17:15:48 +0000 Subject: [PATCH] #18058: Move init_sync_registers to TRISC0 Initializing all the sync registers takes around 120 cycles on wormhole. By moving the work to TRISC0 (when it's already idle) we can cut the cost to BRISC by around 100 cycles. --- tt_metal/api/tt-metalium/dev_msgs.h | 1 + tt_metal/hw/firmware/src/brisc.cc | 22 +++++++++---------- tt_metal/hw/firmware/src/trisc.cc | 18 +++++++++++++++ tt_metal/impl/debug/watcher_device_reader.cpp | 2 ++ 4 files changed, 31 insertions(+), 12 deletions(-) diff --git a/tt_metal/api/tt-metalium/dev_msgs.h b/tt_metal/api/tt-metalium/dev_msgs.h index 9d8502eb03e..717c9bdc5bb 100644 --- a/tt_metal/api/tt-metalium/dev_msgs.h +++ b/tt_metal/api/tt-metalium/dev_msgs.h @@ -45,6 +45,7 @@ constexpr uint32_t RUN_SYNC_MSG_GO = 0x80; // Trigger loading CBs (and IRAM) before actually running the kernel. constexpr uint32_t RUN_SYNC_MSG_LOAD = 0x1; constexpr uint32_t RUN_SYNC_MSG_WAITING_FOR_RESET = 0x2; +constexpr uint32_t RUN_SYNC_MSG_INIT_SYNC_REGISTERS = 0x3; constexpr uint32_t RUN_SYNC_MSG_DONE = 0; constexpr uint32_t RUN_SYNC_MSG_ALL_GO = 0x80808080; constexpr uint32_t RUN_SYNC_MSG_ALL_SLAVES_DONE = 0; diff --git a/tt_metal/hw/firmware/src/brisc.cc b/tt_metal/hw/firmware/src/brisc.cc index 45c87c78e63..731679ca77f 100644 --- a/tt_metal/hw/firmware/src/brisc.cc +++ b/tt_metal/hw/firmware/src/brisc.cc @@ -269,17 +269,6 @@ void device_setup() { // core.ex_sem_init(semaphore::CFG_STATE_BUSY, MAX_CONFIG_STATES, 0, instrn_buf[0]); } -void init_sync_registers() { - volatile tt_reg_ptr uint* tiles_received_ptr; - volatile tt_reg_ptr uint* tiles_acked_ptr; - for (uint32_t operand = 0; operand < NUM_CIRCULAR_BUFFERS; operand++) { - tiles_received_ptr = get_cb_tiles_received_ptr(operand); - tiles_received_ptr[0] = 0; - tiles_acked_ptr = get_cb_tiles_acked_ptr(operand); - tiles_acked_ptr[0] = 0; - } -} - inline void init_ncrisc_iram() { #if NCRISC_FIRMWARE_IN_IRAM uint16_t fw_size16 = mailboxes->launch[mailboxes->launch_msg_rd_ptr].kernel_config.ncrisc_kernel_size16; @@ -326,6 +315,11 @@ inline void set_ncrisc_kernel_resume_deassert_address() { } inline void run_triscs(dispatch_core_processor_masks enables) { + // Wait for init_sync_registers to complete. Should always be done by the time we get here. + while (mailboxes->slave_sync.trisc0 != RUN_SYNC_MSG_DONE) { + invalidate_l1_cache(); + } + if (enables & DISPATCH_CLASS_MASK_TENSIX_ENABLE_COMPUTE) { mailboxes->slave_sync.trisc0 = RUN_SYNC_MSG_GO; mailboxes->slave_sync.trisc1 = RUN_SYNC_MSG_GO; @@ -371,6 +365,8 @@ inline void wait_ncrisc_trisc() { WAYPOINT("NTD"); } +inline void trigger_sync_register_init() { mailboxes->slave_sync.trisc0 = RUN_SYNC_MSG_INIT_SYNC_REGISTERS; } + int main() { configure_l1_data_cache(); DIRTY_STACK_MEMORY(); @@ -405,6 +401,7 @@ int main() { noc_init(MEM_NOC_ATOMIC_RET_VAL_ADDR); noc_local_state_init(noc_index); uint8_t prev_noc_mode = DM_DEDICATED_NOC; + trigger_sync_register_init(); #if defined(ARCH_BLACKHOLE) @@ -416,7 +413,6 @@ int main() { #endif while (1) { - init_sync_registers(); reset_ncrisc_with_iram(); WAYPOINT("GW"); @@ -550,6 +546,8 @@ int main() { wait_ncrisc_trisc(); + trigger_sync_register_init(); + if (noc_mode == DM_DYNAMIC_NOC) { // barrier to make sure all writes are finished while (!ncrisc_dynamic_noc_nonposted_writes_flushed(noc_index)); diff --git a/tt_metal/hw/firmware/src/trisc.cc b/tt_metal/hw/firmware/src/trisc.cc index 9a268bf096e..79e6b6eb4e2 100644 --- a/tt_metal/hw/firmware/src/trisc.cc +++ b/tt_metal/hw/firmware/src/trisc.cc @@ -19,6 +19,7 @@ #include "circular_buffer.h" #include "circular_buffer_init.h" #endif +#include "circular_buffer_constants.h" // clang-format on #if defined(PROFILE_KERNEL) @@ -75,6 +76,17 @@ constexpr bool cb_init_write = false; using namespace ckernel; +void init_sync_registers() { + volatile tt_reg_ptr uint* tiles_received_ptr; + volatile tt_reg_ptr uint* tiles_acked_ptr; + for (uint32_t operand = 0; operand < NUM_CIRCULAR_BUFFERS; operand++) { + tiles_received_ptr = get_cb_tiles_received_ptr(operand); + tiles_received_ptr[0] = 0; + tiles_acked_ptr = get_cb_tiles_acked_ptr(operand); + tiles_acked_ptr[0] = 0; + } +} + int main(int argc, char *argv[]) { configure_l1_data_cache(); DIRTY_STACK_MEMORY(); @@ -92,6 +104,12 @@ int main(int argc, char *argv[]) { while (1) { WAYPOINT("W"); while (*trisc_run != RUN_SYNC_MSG_GO) { + if constexpr (COMPILE_FOR_TRISC == 0) { + if (*trisc_run == RUN_SYNC_MSG_INIT_SYNC_REGISTERS) { + init_sync_registers(); + *trisc_run = RUN_SYNC_MSG_DONE; + } + } invalidate_l1_cache(); } DeviceZoneScopedMainN("TRISC-FW"); diff --git a/tt_metal/impl/debug/watcher_device_reader.cpp b/tt_metal/impl/debug/watcher_device_reader.cpp index c6227c990f3..54875b822ad 100644 --- a/tt_metal/impl/debug/watcher_device_reader.cpp +++ b/tt_metal/impl/debug/watcher_device_reader.cpp @@ -638,6 +638,8 @@ void WatcherDeviceReader::DumpRunState(CoreDescriptor& core, const launch_msg_t* code = 'L'; } else if (state == RUN_SYNC_MSG_WAITING_FOR_RESET) { code = 'W'; + } else if (state == RUN_SYNC_MSG_INIT_SYNC_REGISTERS) { + code = 'S'; } if (code == 'U') { LogRunningKernels(core, launch_msg);