From 26b75a4fd2ad84f28af06b467934eb68bfc74cda Mon Sep 17 00:00:00 2001
From: John Bauman <jbauman@tenstorrent.com>
Date: Thu, 19 Dec 2024 17:04:31 +0000
Subject: [PATCH] #15605: Only force-stall ethernet programs on earlier
 ethernet programs

Keep track of when the last program using active ethernet cores was dispatched,
so we can wait on that program before sending out binaries. This is better than
always waiting on the immediately preceding program, since in most cases we
don't run programs on the ethernet cores back-to-back.
---
 tt_metal/impl/dispatch/command_queue.cpp | 14 +++++++-------
 tt_metal/impl/program/program.cpp        |  7 ++++---
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp
index 284b45886052..096ef323df1b 100644
--- a/tt_metal/impl/dispatch/command_queue.cpp
+++ b/tt_metal/impl/dispatch/command_queue.cpp
@@ -1566,11 +1566,7 @@ void EnqueueProgramCommand::process() {
     uint32_t sync_count = 0;
     bool stall_first = reservation.first.need_sync;
     bool stall_before_program = false;
-    if (!program.kernel_binary_always_stored_in_ringbuffer()) {
-        // Wait for all existing commands to run before writing out the kernel binary.
-        sync_count = this->expected_num_workers_completed;
-        stall_before_program = !stall_first;
-    } else if (reservation.first.need_sync) {
+    if (reservation.first.need_sync) {
         // TODO: attempt to send RTA only without stalling.
         sync_count = reservation.first.sync_count;
         // Check if the launch message is the only thing preventing us from
@@ -1578,6 +1574,7 @@ void EnqueueProgramCommand::process() {
         // would also send the kernel binaries in this case, but the rest of the
         // code isn't set up for that.
         auto config_sizes = program.get_program_config_sizes();
+        config_sizes[config_sizes.size() - 2] = 0;
         config_sizes[config_sizes.size() - 1] = 0;
         const std::pair<ConfigBufferSync, std::vector<ConfigBufferEntry>&> memory_reservation =
             this->config_buffer_mgr.reserve(config_sizes);
@@ -1613,9 +1610,9 @@ void EnqueueProgramCommand::process() {
     this->config_buffer_mgr.alloc(this->expected_num_workers_completed + num_workers);
     std::vector<ConfigBufferEntry>& kernel_config_addrs_raw = reservation.second;
 
-    // Remove launch buffer from config addrs, since it's not a real core.
+    // Remove launch buffers from config addrs, since they're not real cores.
     const tt::stl::Span<ConfigBufferEntry> kernel_config_addrs{
-        kernel_config_addrs_raw.data(), kernel_config_addrs_raw.size() - 1};
+        kernel_config_addrs_raw.data(), kernel_config_addrs_raw.size() - 2};
 
     RecordProgramRun(program);
 
@@ -3057,6 +3054,9 @@ void HWCommandQueue::reset_config_buffer_mgr(const uint32_t num_entries) {
         // Subtract 1 from the number of entries, so the watcher can read information (e.g. fired asserts) from the
         // previous launch message.
         this->config_buffer_mgr[i].init_add_buffer(0, launch_msg_buffer_num_entries - 1);
+
+        // There's no ring buffer for active ethernet binaries, so keep track of them separately.
+        this->config_buffer_mgr[i].init_add_buffer(0, 1);
     }
 }
 
diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp
index b6b871e3be2c..5965bc6f207b 100644
--- a/tt_metal/impl/program/program.cpp
+++ b/tt_metal/impl/program/program.cpp
@@ -327,9 +327,7 @@ detail::Program_::Program_() :
     }
 
     program_configs_.resize(programmable_core_count);
-    program_config_sizes_.resize(programmable_core_count + 1);
-    // Always need one launch buffer msg for a program.
-    program_config_sizes_[programmable_core_count] = 1;
+    program_config_sizes_.resize(programmable_core_count + 2);
 }
 
 Program::Program() : pimpl_(std::make_unique<detail::Program_>()) {}
@@ -1491,6 +1489,9 @@ void detail::Program_::finalize(Device *device) {
                  offset, max_size, magic_enum::enum_name(programmable_core_type));
     }
 
+    this->get_program_config_size(hal.get_programmable_core_type_count()) = runs_on_noc_multicast_only_cores();
+    this->get_program_config_size(hal.get_programmable_core_type_count() + 1) = runs_on_noc_unicast_only_cores();
+
     // The sem offsets cross programmable_core_types so must be set after the loop above
     this->set_launch_msg_sem_offsets();