diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index 9d6db6cdbf4..4d5dec0ed96 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -1568,11 +1568,7 @@ void EnqueueProgramCommand::process() { uint32_t sync_count = 0; bool stall_first = reservation.first.need_sync; bool stall_before_program = false; - if (!program.kernel_binary_always_stored_in_ringbuffer()) { - // Wait for all existing commands to run before writing out the kernel binary. - sync_count = this->expected_num_workers_completed; - stall_before_program = !stall_first; - } else if (reservation.first.need_sync) { + if (reservation.first.need_sync) { // TODO: attempt to send RTA only without stalling. sync_count = reservation.first.sync_count; // Check if the launch message is the only thing preventing us from @@ -1580,6 +1576,7 @@ void EnqueueProgramCommand::process() { // would also send the kernel binaries in this case, but the rest of the // code isn't set up for that. auto config_sizes = program.get_program_config_sizes(); + config_sizes[config_sizes.size() - 2] = 0; config_sizes[config_sizes.size() - 1] = 0; const std::pair&> memory_reservation = this->config_buffer_mgr.reserve(config_sizes); @@ -1622,9 +1619,9 @@ void EnqueueProgramCommand::process() { this->config_buffer_mgr.alloc(this->expected_num_workers_completed + num_workers); std::vector& kernel_config_addrs_raw = reservation.second; - // Remove launch buffer from config addrs, since it's not a real core. + // Remove launch buffers from config addrs, since they're not real cores. const tt::stl::Span kernel_config_addrs{ - kernel_config_addrs_raw.data(), kernel_config_addrs_raw.size() - 1}; + kernel_config_addrs_raw.data(), kernel_config_addrs_raw.size() - 2}; RecordProgramRun(program); @@ -3077,6 +3074,9 @@ void HWCommandQueue::reset_config_buffer_mgr(const uint32_t num_entries) { // Subtract 1 from the number of entries, so the watcher can read information (e.g. fired asserts) from the // previous launch message. this->config_buffer_mgr[i].init_add_buffer(0, launch_msg_buffer_num_entries - 1); + + // There's no ring buffer for active ethernet binaries, so keep track of them separately. + this->config_buffer_mgr[i].init_add_buffer(0, 1); } } diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp index d3cf81833f1..6ff0a5c2a64 100644 --- a/tt_metal/impl/program/program.cpp +++ b/tt_metal/impl/program/program.cpp @@ -337,9 +337,7 @@ detail::Program_::Program_() : } program_configs_.resize(programmable_core_count); - program_config_sizes_.resize(programmable_core_count + 1); - // Always need one launch buffer msg for a program. - program_config_sizes_[programmable_core_count] = 1; + program_config_sizes_.resize(programmable_core_count + 2); } Program::Program() : pimpl_(std::make_unique()) {} @@ -1504,6 +1502,9 @@ void detail::Program_::finalize(Device *device) { offset, max_size, magic_enum::enum_name(programmable_core_type)); } + this->get_program_config_size(hal.get_programmable_core_type_count()) = runs_on_noc_multicast_only_cores(); + this->get_program_config_size(hal.get_programmable_core_type_count() + 1) = runs_on_noc_unicast_only_cores(); + // The sem offsets cross programmable_core_types so must be set after the loop above this->set_launch_msg_sem_offsets();