From 45aed6fb6284fbce8b1d4de68430826c89e275f3 Mon Sep 17 00:00:00 2001 From: John Bauman Date: Mon, 27 Jan 2025 20:53:01 +0000 Subject: [PATCH] #0: Use posted writes for profiler. By posting writes and only waiting for the flush, we can save around .2 us between ops. The only reason there's a flush here is to ensure that later writes to the buffer don't clobber data that's in the process of being written. --- tt_metal/hw/inc/dataflow_api.h | 12 ++++++++++++ tt_metal/tools/profiler/kernel_profiler.hpp | 20 ++++++++++++++++---- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/tt_metal/hw/inc/dataflow_api.h b/tt_metal/hw/inc/dataflow_api.h index 5fe53ab5eb0..26b8b14cf6b 100644 --- a/tt_metal/hw/inc/dataflow_api.h +++ b/tt_metal/hw/inc/dataflow_api.h @@ -1732,6 +1732,18 @@ void noc_async_writes_flushed(uint8_t noc = noc_index) { WAYPOINT("NWFD"); } +/** + * This blocking call waits for all outstanding enqueued posted *noc_async_write* + * calls issued on the current Tensix core to depart, but will not wait + * for them to complete + */ +FORCE_INLINE +void noc_async_posted_writes_flushed(uint8_t noc = noc_index) { + WAYPOINT("NPWW"); + while (!ncrisc_noc_posted_writes_sent(noc)); + WAYPOINT("NPWD"); +} + /** * This blocking call waits for all the outstanding enqueued *noc_async_write* * calls issued on the current Tensix core to complete. 
After returning from diff --git a/tt_metal/tools/profiler/kernel_profiler.hpp b/tt_metal/tools/profiler/kernel_profiler.hpp index 91d87db28f5..95307ad140d 100644 --- a/tt_metal/tools/profiler/kernel_profiler.hpp +++ b/tt_metal/tools/profiler/kernel_profiler.hpp @@ -182,6 +182,18 @@ inline __attribute__((always_inline)) void risc_finished_profiling() { profiler_control_buffer[kernel_profiler::DEVICE_BUFFER_END_INDEX_BR_ER + myRiscID] = wIndex; } +#if defined(COMPILE_FOR_NCRISC) || defined(COMPILE_FOR_BRISC) || defined(COMPILE_FOR_ERISC) || \ + defined(COMPILE_FOR_IDLE_ERISC) +inline void noc_async_write_posted( + std::uint32_t src_local_l1_addr, std::uint64_t dst_noc_addr, std::uint32_t size, uint8_t noc = noc_index) { + WAYPOINT("NAWW"); + DEBUG_SANITIZE_NOC_WRITE_TRANSACTION(noc, dst_noc_addr, src_local_l1_addr, size); + ncrisc_noc_fast_write_any_len( + noc, write_cmd_buf, src_local_l1_addr, dst_noc_addr, size, NOC_UNICAST_WRITE_VC, false, false, 1, true, true); + WAYPOINT("NAWD"); +} +#endif + __attribute__((noinline)) void finish_profiler() { risc_finished_profiling(); #if defined(COMPILE_FOR_ERISC) || defined(COMPILE_FOR_IDLE_ERISC) || defined(COMPILE_FOR_BRISC) @@ -234,14 +246,14 @@ __attribute__((noinline)) void finish_profiler() { uint64_t dram_bank_dst_noc_addr = s.get_noc_addr(core_flat_id / profiler_core_count_per_dram, dram_offset); - noc_async_write( + noc_async_write_posted( reinterpret_cast<uint32_t>(profiler_data_buffer[hostIndex]), dram_bank_dst_noc_addr, send_size); } profiler_control_buffer[deviceIndex] = 0; } } - noc_async_write_barrier(); + noc_async_posted_writes_flushed(); profiler_control_buffer[RUN_COUNTER]++; profiler_control_buffer[PROFILER_DONE] = 1; #endif @@ -281,12 +293,12 @@ __attribute__((noinline)) void quick_push() { uint32_t currEndIndex = profiler_control_buffer[HOST_BUFFER_END_INDEX_BR_ER + myRiscID] + wIndex; if (currEndIndex <= PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC) { - noc_async_write( + noc_async_write_posted( 
reinterpret_cast<uint32_t>(profiler_data_buffer[myRiscID]), dram_bank_dst_noc_addr, wIndex * sizeof(uint32_t)); - noc_async_write_barrier(); + noc_async_posted_writes_flushed(); profiler_control_buffer[HOST_BUFFER_END_INDEX_BR_ER + myRiscID] = currEndIndex; } else {