Skip to content

Commit

Permalink
#0: Use posted writes for profiler.
Browse files Browse the repository at this point in the history
By posting writes and waiting only for the flush, we can save around 0.2 µs
between ops. The only reason there is a flush here is to ensure that later
writes to the buffer don't clobber data that is still in the process of being written.
  • Loading branch information
jbaumanTT committed Jan 27, 2025
1 parent 19f51f9 commit 45aed6f
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 4 deletions.
12 changes: 12 additions & 0 deletions tt_metal/hw/inc/dataflow_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -1732,6 +1732,18 @@ void noc_async_writes_flushed(uint8_t noc = noc_index) {
WAYPOINT("NWFD");
}

/**
 * Blocking call: spins until every outstanding enqueued posted
 * *noc_async_write* issued on the current Tensix core has departed.
 * It does not wait for those writes to complete.
 *
 * The NOC to poll is selected by `noc`, which defaults to `noc_index`.
 */
FORCE_INLINE
void noc_async_posted_writes_flushed(uint8_t noc = noc_index) {
    WAYPOINT("NPWW");
    // Busy-wait: keep polling until the posted writes are reported as sent.
    bool all_sent = false;
    do {
        all_sent = ncrisc_noc_posted_writes_sent(noc);
    } while (!all_sent);
    WAYPOINT("NPWD");
}

/**
* This blocking call waits for all the outstanding enqueued *noc_async_write*
* calls issued on the current Tensix core to complete. After returning from
Expand Down
20 changes: 16 additions & 4 deletions tt_metal/tools/profiler/kernel_profiler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,18 @@ inline __attribute__((always_inline)) void risc_finished_profiling() {
profiler_control_buffer[kernel_profiler::DEVICE_BUFFER_END_INDEX_BR_ER + myRiscID] = wIndex;
}

#if defined(COMPILE_FOR_NCRISC) || defined(COMPILE_FOR_BRISC) || defined(COMPILE_FOR_ERISC) || \
defined(COMPILE_FOR_IDLE_ERISC)
/**
 * Issues a write of `size` bytes from the local L1 address
 * `src_local_l1_addr` to the NOC address `dst_noc_addr`, using the write
 * command buffer and the unicast write virtual channel on `noc`
 * (defaults to `noc_index`).
 *
 * The transaction is passed through DEBUG_SANITIZE_NOC_WRITE_TRANSACTION
 * before being handed to ncrisc_noc_fast_write_any_len.
 */
inline void noc_async_write_posted(
    std::uint32_t src_local_l1_addr, std::uint64_t dst_noc_addr, std::uint32_t size, uint8_t noc = noc_index) {
    // NOTE(review): the names below are presumed from the expected trailing
    // parameter order of ncrisc_noc_fast_write_any_len (mcast, linked,
    // num_dests, multicast_path_reserve, posted) -- confirm against its
    // declaration. The values are unchanged from the positional literals.
    constexpr bool kMcast = false;
    constexpr bool kLinked = false;
    constexpr int kNumDests = 1;
    constexpr bool kMulticastPathReserve = true;
    constexpr bool kPosted = true;

    WAYPOINT("NAWW");
    DEBUG_SANITIZE_NOC_WRITE_TRANSACTION(noc, dst_noc_addr, src_local_l1_addr, size);
    ncrisc_noc_fast_write_any_len<proc_type, noc_mode>(
        noc,
        write_cmd_buf,
        src_local_l1_addr,
        dst_noc_addr,
        size,
        NOC_UNICAST_WRITE_VC,
        kMcast,
        kLinked,
        kNumDests,
        kMulticastPathReserve,
        kPosted);
    WAYPOINT("NAWD");
}
#endif

__attribute__((noinline)) void finish_profiler() {
risc_finished_profiling();
#if defined(COMPILE_FOR_ERISC) || defined(COMPILE_FOR_IDLE_ERISC) || defined(COMPILE_FOR_BRISC)
Expand Down Expand Up @@ -234,14 +246,14 @@ __attribute__((noinline)) void finish_profiler() {
uint64_t dram_bank_dst_noc_addr =
s.get_noc_addr(core_flat_id / profiler_core_count_per_dram, dram_offset);

noc_async_write(
noc_async_write_posted(
reinterpret_cast<uint32_t>(profiler_data_buffer[hostIndex]), dram_bank_dst_noc_addr, send_size);
}
profiler_control_buffer[deviceIndex] = 0;
}
}

noc_async_write_barrier();
noc_async_posted_writes_flushed();
profiler_control_buffer[RUN_COUNTER]++;
profiler_control_buffer[PROFILER_DONE] = 1;
#endif
Expand Down Expand Up @@ -281,12 +293,12 @@ __attribute__((noinline)) void quick_push() {
uint32_t currEndIndex = profiler_control_buffer[HOST_BUFFER_END_INDEX_BR_ER + myRiscID] + wIndex;

if (currEndIndex <= PROFILER_FULL_HOST_VECTOR_SIZE_PER_RISC) {
noc_async_write(
noc_async_write_posted(
reinterpret_cast<uint32_t>(profiler_data_buffer[myRiscID]),
dram_bank_dst_noc_addr,
wIndex * sizeof(uint32_t));

noc_async_write_barrier();
noc_async_posted_writes_flushed();
profiler_control_buffer[HOST_BUFFER_END_INDEX_BR_ER + myRiscID] = currEndIndex;

} else {
Expand Down

0 comments on commit 45aed6f

Please sign in to comment.