diff --git a/tests/tt_metal/tt_metal/noc/test_dynamic_noc.cpp b/tests/tt_metal/tt_metal/noc/test_dynamic_noc.cpp index 99679f937b3..cc7947dfcc3 100644 --- a/tests/tt_metal/tt_metal/noc/test_dynamic_noc.cpp +++ b/tests/tt_metal/tt_metal/noc/test_dynamic_noc.cpp @@ -25,11 +25,13 @@ using namespace tt; using namespace tt::test_utils; using namespace tt::test_utils::df; -TEST_F(DeviceSingleCardFastSlowDispatchFixture, TestDynamicNoCAsyncWriteProgram) { - uint32_t NUM_PROGRAMS = 3; - uint32_t MAX_LOOP = 123456789; - uint32_t page_size = 1024; - +void build_and_run_program( + tt::tt_metal::IDevice* device, + bool slow_dispatch, + uint32_t NUM_PROGRAMS, + uint32_t MAX_LOOP, + uint32_t page_size, + bool mix_noc_mode) { // Make random auto random_seed = 0; // (unsigned int)time(NULL); uint32_t seed = tt::parse_env("TT_METAL_SEED", random_seed); @@ -43,60 +45,137 @@ TEST_F(DeviceSingleCardFastSlowDispatchFixture, TestDynamicNoCAsyncWriteProgram) log_info(tt::LogTest, "Starting compile of {} programs now.", NUM_PROGRAMS); - vector programs; - for (uint32_t i = 0; i < NUM_PROGRAMS; i++) { - programs.push_back(Program()); - Program& program = programs.back(); - - if (i % 10 == 0) { - log_info(tt::LogTest, "Compiling program {} of {}", i + 1, NUM_PROGRAMS); + Program program1; + Program program2; + + CircularBufferConfig cb_config = + CircularBufferConfig(page_size, {{0, tt::DataFormat::Float16_b}}).set_page_size(0, page_size); + auto cb1 = CreateCircularBuffer(program1, cr_set, cb_config); + auto cb2 = CreateCircularBuffer(program2, cr_set, cb_config); + + vector compile_args = {MAX_LOOP, page_size}; + + auto brisc_kernel1 = CreateKernel( + program1, + "tests/tt_metal/tt_metal/test_kernels/dataflow/dynamic_noc_writer.cpp", + cr_set, + DataMovementConfig{ + .processor = DataMovementProcessor::RISCV_0, + .noc = NOC::RISCV_0_default, + .noc_mode = tt_metal::NOC_MODE::DM_DYNAMIC_NOC, + .compile_args = compile_args}); + + auto ncrisc_kernel1 = CreateKernel( + program1, + "tests/tt_metal/tt_metal/test_kernels/dataflow/dynamic_noc_writer.cpp", + cr_set, + DataMovementConfig{ + .processor = DataMovementProcessor::RISCV_1, + .noc = NOC::RISCV_1_default, + .noc_mode = tt_metal::NOC_MODE::DM_DYNAMIC_NOC, + .compile_args = compile_args}); + + auto brisc_kernel2 = CreateKernel( + program2, + mix_noc_mode ? "tests/tt_metal/tt_metal/test_kernels/dataflow/dedicated_noc_writer.cpp" + : "tests/tt_metal/tt_metal/test_kernels/dataflow/dynamic_noc_writer.cpp", + cr_set, + DataMovementConfig{ + .processor = DataMovementProcessor::RISCV_0, + .noc = NOC::RISCV_0_default, + .noc_mode = mix_noc_mode ? tt_metal::NOC_MODE::DM_DEDICATED_NOC : tt_metal::NOC_MODE::DM_DYNAMIC_NOC, + .compile_args = compile_args}); + + auto ncrisc_kernel2 = CreateKernel( + program2, + mix_noc_mode ? "tests/tt_metal/tt_metal/test_kernels/dataflow/dedicated_noc_writer.cpp" + : "tests/tt_metal/tt_metal/test_kernels/dataflow/dynamic_noc_writer.cpp", + cr_set, + DataMovementConfig{ + .processor = DataMovementProcessor::RISCV_1, + .noc = NOC::RISCV_1_default, + .noc_mode = mix_noc_mode ? tt_metal::NOC_MODE::DM_DEDICATED_NOC : tt_metal::NOC_MODE::DM_DYNAMIC_NOC, + .compile_args = compile_args}); + + for (int core_idx_y = 0; core_idx_y < worker_grid_size.y; core_idx_y++) { + for (int core_idx_x = 0; core_idx_x < worker_grid_size.x; core_idx_x++) { + CoreCoord core = {(std::size_t)core_idx_x, (std::size_t)core_idx_y}; + CoreCoord neighbour_core = {core_idx_x == worker_grid_size.x - 1 ? 0 : core_idx_x + 1, core_idx_y}; + CoreCoord neighbour_core_physical = device->worker_core_from_logical_core(neighbour_core); + // mcast + auto device_grid = device->compute_with_storage_grid_size(); + CoreCoord top_left_core = {0, 0}; + CoreCoord top_left_core_physical = device->worker_core_from_logical_core(top_left_core); + CoreCoord bottom_right_core = {device_grid.x - 1, device_grid.y - 1}; + CoreCoord bottom_right_core_physical = device->worker_core_from_logical_core(bottom_right_core); + std::vector rt_args = { + (std::uint32_t)neighbour_core_physical.x, + (std::uint32_t)neighbour_core_physical.y, + // mcast + (core_idx_x == 0 && core_idx_y == 0) ? true : false, + top_left_core_physical.x, + top_left_core_physical.y, + bottom_right_core_physical.x, + bottom_right_core_physical.y, + device_grid.x * device_grid.y}; + tt::tt_metal::SetRuntimeArgs(program1, brisc_kernel1, core, rt_args); + tt::tt_metal::SetRuntimeArgs(program1, ncrisc_kernel1, core, rt_args); + tt::tt_metal::SetRuntimeArgs(program2, brisc_kernel2, core, rt_args); + tt::tt_metal::SetRuntimeArgs(program2, ncrisc_kernel2, core, rt_args); } - - CircularBufferConfig cb_config = CircularBufferConfig(page_size, {{0, tt::DataFormat::Float16_b}}).set_page_size(0, page_size); - auto cb = CreateCircularBuffer(program, cr_set, cb_config); - - vector compile_args = {MAX_LOOP, page_size}; - - auto brisc_kernel = CreateKernel( - program, "tests/tt_metal/tt_metal/test_kernels/dataflow/dynamic_noc_writer.cpp", cr_set, DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default, .noc_mode = tt_metal::NOC_MODE::DM_DYNAMIC_NOC, .compile_args = compile_args}); - - auto ncrisc_kernel = CreateKernel( - program, "tests/tt_metal/tt_metal/test_kernels/dataflow/dynamic_noc_writer.cpp", cr_set, DataMovementConfig{ - .processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default, .noc_mode = tt_metal::NOC_MODE::DM_DYNAMIC_NOC, .compile_args = compile_args}); - - for(int core_idx_y = 0; core_idx_y < worker_grid_size.y; core_idx_y++) { - for(int core_idx_x = 0; core_idx_x < worker_grid_size.x; core_idx_x++) { - CoreCoord core = {(std::size_t) core_idx_x, (std::size_t) core_idx_y}; - CoreCoord neighbour_core = {core_idx_x == worker_grid_size.x - 1 ? 0 : core_idx_x + 1, core_idx_y}; - CoreCoord neighbour_core_physical = device_->worker_core_from_logical_core(neighbour_core); - std::vector rt_args = { - (std::uint32_t) neighbour_core_physical.x, - (std::uint32_t) neighbour_core_physical.y, - }; - tt::tt_metal::SetRuntimeArgs(program, brisc_kernel, core, rt_args); - tt::tt_metal::SetRuntimeArgs(program, ncrisc_kernel, core, rt_args); - } - } - - tt::tt_metal::detail::CompileProgram(this->device_, program); } - log_info(tt::LogTest, "Running {} programs for cache warmup.", programs.size()); - // This loop caches program and runs + tt::tt_metal::detail::CompileProgram(device, program1); + tt::tt_metal::detail::CompileProgram(device, program2); + + // This loop caches program1 and runs for (uint32_t i = 0; i < NUM_PROGRAMS; i++) { - Program& program = programs[i]; - log_info(tt::LogTest, "Running program {} of {}", i + 1, NUM_PROGRAMS); - if (this->slow_dispatch_) { - tt::tt_metal::detail::LaunchProgram(this->device_, program); + log_info(tt::LogTest, "Running program1 {} of {}", i + 1, NUM_PROGRAMS); + if (i % 2 == 0) { + if (slow_dispatch) { + tt::tt_metal::detail::LaunchProgram(device, program1); + } else { + EnqueueProgram(device->command_queue(), program1, false); + } } else { - EnqueueProgram(this->device_->command_queue(), program, false); + if (slow_dispatch) { + tt::tt_metal::detail::LaunchProgram(device, program2); + } else { + EnqueueProgram(device->command_queue(), program2, false); + } } } - if (!this->slow_dispatch_) { - Finish(this->device_->command_queue()); + if (!slow_dispatch) { + Finish(device->command_queue()); log_info(tt::LogTest, "Finish FD runs"); } else { log_info(tt::LogTest, "Finish SD runs"); } } + +TEST_F(DeviceSingleCardFastSlowDispatchFixture, TestDynamicNoCOneProgram) { + uint32_t NUM_PROGRAMS = 1; + uint32_t MAX_LOOP = 65536; + uint32_t page_size = 1024; + bool mix_noc_mode = false; + + build_and_run_program(this->device_, this->slow_dispatch_, NUM_PROGRAMS, MAX_LOOP, page_size, mix_noc_mode); +} + +TEST_F(DeviceSingleCardFastSlowDispatchFixture, TestDynamicNoCMutlipleProgram) { + uint32_t NUM_PROGRAMS = 3; + uint32_t MAX_LOOP = 65536; + uint32_t page_size = 1024; + bool mix_noc_mode = false; + + build_and_run_program(this->device_, this->slow_dispatch_, NUM_PROGRAMS, MAX_LOOP, page_size, mix_noc_mode); +} + +TEST_F(DeviceSingleCardFastSlowDispatchFixture, TestDynamicNoCMutlipleProgramMixedMode) { + uint32_t NUM_PROGRAMS = 5; + uint32_t MAX_LOOP = 65536; + uint32_t page_size = 1024; + bool mix_noc_mode = true; + + build_and_run_program(this->device_, this->slow_dispatch_, NUM_PROGRAMS, MAX_LOOP, page_size, mix_noc_mode); +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dedicated_noc_writer.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dedicated_noc_writer.cpp new file mode 100644 index 00000000000..d7dd5a10df5 --- /dev/null +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dedicated_noc_writer.cpp @@ -0,0 +1,114 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "dataflow_api.h" +#include "debug/dprint.h" + +void kernel_main() { + constexpr std::uint32_t iteration = get_compile_time_arg_val(0); + constexpr std::uint32_t page_size = get_compile_time_arg_val(1); + + std::uint32_t noc_x = get_arg_val(0); + std::uint32_t noc_y = get_arg_val(1); + bool mcast = get_arg_val(2); + std::uint32_t top_left_core_x = get_arg_val(3); + std::uint32_t top_left_core_y = get_arg_val(4); + std::uint32_t bottom_right_core_x = get_arg_val(5); + std::uint32_t bottom_right_core_y = get_arg_val(6); + std::uint32_t num_dests = get_arg_val(7); + + constexpr uint32_t cb_id = 0; + uint32_t l1_read_addr = get_read_ptr(cb_id); + + uint64_t mcast_addr_self_noc; + if (noc_index == 0) { + mcast_addr_self_noc = get_noc_multicast_addr( + top_left_core_x, top_left_core_y, bottom_right_core_x, bottom_right_core_y, l1_read_addr, noc_index); + } else { + mcast_addr_self_noc = get_noc_multicast_addr( + bottom_right_core_x, bottom_right_core_y, top_left_core_x, top_left_core_y, l1_read_addr, noc_index); + } + + uint64_t addr_self_noc = get_noc_addr(noc_x, noc_y, l1_read_addr, noc_index); + + DPRINT << "Start" << ENDL(); + + // Test stateful read API + noc_async_read_set_state(addr_self_noc, noc_index); + for (uint32_t i = 0; i < iteration; i++) { + noc_async_read_with_state(l1_read_addr, l1_read_addr, page_size, noc_index); + } + + // Test stateful read one packet API + noc_async_read_one_packet_set_state(addr_self_noc, page_size, noc_index); + for (uint32_t i = 0; i < iteration; i++) { + noc_async_read_one_packet_with_state(l1_read_addr, l1_read_addr, noc_index); + } + + // Test stateful write one packet API + noc_async_write_one_packet_set_state(addr_self_noc, page_size, noc_index); + for (uint32_t i = 0; i < iteration; i++) { + noc_async_write_one_packet_with_state(l1_read_addr, l1_read_addr, noc_index); + } + + // Test gen_fast + const InterleavedAddrGenFast s0 = { + .bank_base_address = l1_read_addr, .page_size = page_size, .data_format = DataFormat::Float16_b}; + + for (uint32_t i = 0; i < iteration; i++) { + uint32_t noc = noc_index; + + // uint32_t noc = noc_index; + uint64_t noc_addr = get_noc_addr(noc_x, noc_y, l1_read_addr, noc); + + // Test read + noc_async_read_one_packet(noc_addr, l1_read_addr, page_size, noc); + noc_async_read(noc_addr, l1_read_addr, page_size, noc); + // interleaved read + noc_async_read_tile(i % 1024, s0, l1_read_addr, 0, noc); + + // Test semaphore + noc_semaphore_inc(noc_addr, 1, noc); + noc_semaphore_set_remote(l1_read_addr, noc_addr, noc); + + // Test write + noc_async_write(l1_read_addr, noc_addr, page_size, noc); + noc_async_write_one_packet(l1_read_addr, noc_addr, page_size, noc); + // interleaved write + noc_async_write_tile(i % 1024, s0, l1_read_addr, noc); + + // Test mcast + if (mcast) { + // write mcast + noc_async_write_multicast_one_packet( + l1_read_addr, mcast_addr_self_noc, page_size, num_dests - 1, false, true, noc); + noc_async_write_multicast(l1_read_addr, mcast_addr_self_noc, page_size, num_dests - 1, false, true, noc); + noc_async_write_multicast_loopback_src( + l1_read_addr, mcast_addr_self_noc, page_size, num_dests, false, true, noc); + // semaphore mcast + noc_semaphore_set_multicast(l1_read_addr, mcast_addr_self_noc, num_dests - 1, false, true, noc); + noc_semaphore_set_multicast_loopback_src(l1_read_addr, mcast_addr_self_noc, num_dests, false, true, noc); + } + +// dw_write skip BH since there's HW issue +#ifndef ARCH_BLACKHOLE + noc_inline_dw_write(noc_addr, 1, 0xF, noc); +#endif + } + + DPRINT << "END" << ENDL(); + DPRINT << "noc_mode " << (uint)noc_mode << ENDL(); + + // Barrier test - test barrier itself working properly + for (int noc = 0; noc < NUM_NOCS; noc++) { + noc_async_read_barrier(); + noc_async_write_barrier(); + noc_async_writes_flushed(); + noc_async_posted_writes_flushed(); + noc_async_atomic_barrier(); + noc_async_full_barrier(); + } +} diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/dynamic_noc_writer.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/dynamic_noc_writer.cpp index 4e7d568aefc..c934245cffa 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/dynamic_noc_writer.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/dynamic_noc_writer.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. // // SPDX-License-Identifier: Apache-2.0 @@ -13,23 +13,181 @@ void kernel_main() { std::uint32_t noc_x = get_arg_val(0); std::uint32_t noc_y = get_arg_val(1); + bool mcast = get_arg_val(2); + std::uint32_t top_left_core_x = get_arg_val(3); + std::uint32_t top_left_core_y = get_arg_val(4); + std::uint32_t bottom_right_core_x = get_arg_val(5); + std::uint32_t bottom_right_core_y = get_arg_val(6); + std::uint32_t num_dests = get_arg_val(7); constexpr uint32_t cb_id = 0; uint32_t l1_read_addr = get_read_ptr(cb_id); + uint64_t mcast_addr_self_noc; + uint64_t mcast_addr_other_noc; + if (noc_index == 0) { + mcast_addr_self_noc = get_noc_multicast_addr( + top_left_core_x, top_left_core_y, bottom_right_core_x, bottom_right_core_y, l1_read_addr, noc_index); + mcast_addr_other_noc = get_noc_multicast_addr( + bottom_right_core_x, bottom_right_core_y, top_left_core_x, top_left_core_y, l1_read_addr, 1 - noc_index); + } else { + mcast_addr_self_noc = get_noc_multicast_addr( + bottom_right_core_x, bottom_right_core_y, top_left_core_x, top_left_core_y, l1_read_addr, noc_index); + mcast_addr_other_noc = get_noc_multicast_addr( + top_left_core_x, top_left_core_y, bottom_right_core_x, bottom_right_core_y, l1_read_addr, 1 - noc_index); + } + + uint64_t addr_self_noc = get_noc_addr(noc_x, noc_y, l1_read_addr, noc_index); + uint64_t addr_other_noc = get_noc_addr(noc_x, noc_y, l1_read_addr, 1 - noc_index); + DPRINT << "Start" < s0 = { + .bank_base_address = l1_read_addr, .page_size = page_size, .data_format = DataFormat::Float16_b}; + for (uint32_t i = 0; i < iteration; i ++) { uint32_t noc = (i % 2) == 0 ? noc_index : 1-noc_index; + + // uint32_t noc = noc_index; uint64_t noc_addr = get_noc_addr(noc_x, noc_y, l1_read_addr, noc); + + // Test read + noc_async_read_one_packet(noc_addr, l1_read_addr, page_size, noc); noc_async_read(noc_addr, l1_read_addr, page_size, noc); + // interleaved read + noc_async_read_tile(i % 1024, s0, l1_read_addr, 0, noc); + + // Test semaphore noc_semaphore_inc(noc_addr, 1, noc); + noc_semaphore_set_remote(l1_read_addr, noc_addr, noc); + + // Test write noc_async_write(l1_read_addr, noc_addr, page_size, noc); noc_async_write_one_packet(l1_read_addr, noc_addr, page_size, noc); + // interleaved write + noc_async_write_tile(i % 1024, s0, l1_read_addr, noc); + + // Test mcast + if (mcast) { + // write mcast + noc_async_write_multicast_one_packet( + l1_read_addr, + noc == noc_index ? mcast_addr_self_noc : mcast_addr_other_noc, + page_size, + num_dests - 1, + false, + true, + noc); + noc_async_write_multicast( + l1_read_addr, + noc == noc_index ? mcast_addr_self_noc : mcast_addr_other_noc, + page_size, + num_dests - 1, + false, + true, + noc); + noc_async_write_multicast_loopback_src( + l1_read_addr, + noc == noc_index ? mcast_addr_self_noc : mcast_addr_other_noc, + page_size, + num_dests, + false, + true, + noc); + // semaphore mcast + noc_semaphore_set_multicast( + l1_read_addr, + noc == noc_index ? mcast_addr_self_noc : mcast_addr_other_noc, + num_dests - 1, + false, + true, + noc); + noc_semaphore_set_multicast_loopback_src( + l1_read_addr, + noc == noc_index ? mcast_addr_self_noc : mcast_addr_other_noc, + num_dests, + false, + true, + noc); + } + +// dw_write skip BH since there's HW issue +#ifndef ARCH_BLACKHOLE + noc_inline_dw_write(noc_addr, 1, 0xF, noc); +#endif + } + + // DRAM sharded read API + noc_async_read_tile_dram_sharded_set_state(0x32, page_size, 0, 0, noc_index); + noc_async_read_tile_dram_sharded_set_state(0x32, page_size, 0, 0, 1 - noc_index); + for (uint32_t i = 0; i < iteration; i++) { + uint32_t trid = i % 16 + 1; + noc_async_read_tile_dram_sharded_with_state_with_trid(0, 0x32, l1_read_addr, trid, noc_index); + noc_async_read_tile_dram_sharded_with_state_with_trid(0, 0x32, l1_read_addr, trid, 1 - noc_index); + } + for (uint32_t i = 1; i < 15; i++) { + noc_async_read_barrier_with_trid(i, noc_index); + noc_async_read_barrier_with_trid(i, 1 - noc_index); } + +// Some mem corruption issue when using the noc_async_write_barrier_with_trid +#ifndef ARCH_BLACKHOLE + // DRAM sharded write API + for (uint32_t i = 0; i < iteration; i++) { + uint32_t trid = i % 16 + 1; + noc_async_write_one_packet_with_trid(l1_read_addr, addr_self_noc, page_size, trid, noc_index); + noc_async_write_one_packet_with_trid(l1_read_addr, addr_other_noc, page_size, trid, 1 - noc_index); + } + for (uint32_t i = 1; i < 15; i++) { + noc_async_write_barrier_with_trid(i, noc_index); + noc_async_write_barrier_with_trid(i, 1 - noc_index); + } +#endif + DPRINT << "END" <kernel_config.max_local_cb_end_index; setup_local_cb_read_write_interfaces( cb_l1_base, num_cbs_to_early_init, end_cb_index, true, true, false); - + start_ncrisc_kernel_run(enables); cb_l1_base = (uint32_t tt_l1_ptr*)(kernel_config_base + launch_msg_address->kernel_config.remote_cb_offset); end_cb_index = launch_msg_address->kernel_config.min_remote_cb_start_index; - experimental::setup_remote_cb_interfaces(cb_l1_base, end_cb_index); - start_ncrisc_kernel_run(enables); + experimental::setup_remote_cb_interfaces(cb_l1_base, end_cb_index, noc_index); int index = static_cast::type>(TensixProcessorTypes::DM0); void (*kernel_address)(uint32_t) = (void (*)(uint32_t)) (kernel_config_base + launch_msg_address->kernel_config.kernel_text_offset[index]); @@ -535,25 +540,34 @@ int main() { noc_local_state_init(noc_index); } #endif + start_ncrisc_kernel_run(enables); // Brisc is responsible for issuing any noc cmds needed when initializing remote cbs // So have brisc setup remote cb interfaces even when brisc is not in use if (launch_msg_address->kernel_config.enables) { cb_l1_base = (uint32_t tt_l1_ptr*)(kernel_config_base + launch_msg_address->kernel_config.remote_cb_offset); uint32_t end_cb_index = launch_msg_address->kernel_config.min_remote_cb_start_index; - experimental::setup_remote_cb_interfaces(cb_l1_base, end_cb_index); + experimental::setup_remote_cb_interfaces(cb_l1_base, end_cb_index, noc_index); } - start_ncrisc_kernel_run(enables); wait_for_go_message(); } WAYPOINT("D"); wait_ncrisc_trisc(); - - if (noc_mode == DM_DYNAMIC_NOC) { - // barrier to make sure all writes are finished - while (!ncrisc_dynamic_noc_nonposted_writes_flushed(noc_index)); - while (!ncrisc_dynamic_noc_nonposted_writes_flushed(1 - noc_index)); + if constexpr (WATCHER_ASSERT_ENABLED) { + if (noc_mode == DM_DYNAMIC_NOC) { + WAYPOINT("NKFW"); + // Assert that no noc transactions are outstanding, to ensure that all reads and writes have landed + // and the NOC interface is in a known idle state for the next kernel. + for (int noc = 0; noc < NUM_NOCS; noc++) { + ASSERT(ncrisc_dynamic_noc_reads_flushed(noc)); + ASSERT(ncrisc_dynamic_noc_nonposted_writes_sent(noc)); + ASSERT(ncrisc_dynamic_noc_nonposted_writes_flushed(noc)); + ASSERT(ncrisc_dynamic_noc_nonposted_atomics_flushed(noc)); + ASSERT(ncrisc_dynamic_noc_posted_writes_sent(noc)); + } + WAYPOINT("NKFD"); + } } #if defined(PROFILE_KERNEL) @@ -579,6 +593,7 @@ int main() { // messages in the ring buffer. Must be executed before the atomic increment, as after that the launch // message is no longer owned by us. CLEAR_PREVIOUS_LAUNCH_MESSAGE_ENTRY_FOR_WATCHER(); + noc_fast_atomic_increment( noc_index, NCRISC_AT_CMD_BUF, @@ -588,6 +603,21 @@ int main() { 31 /*wrap*/, false /*linked*/, post_atomic_increments /*posted*/); +#if defined(ARCH_BLACKHOLE) + if (noc_mode == DM_DYNAMIC_NOC) { + // inc dm noc counter for BH as this is non-posted + inc_noc_counter_val< + static_cast>(TensixProcessorTypes::DM0), + NocBarrierType::NONPOSTED_ATOMICS_ACKED>(noc_index, 1); + // barrier till the atomic response is back + while (!ncrisc_dynamic_noc_nonposted_atomics_flushed(noc_index)); + // reset local counters + noc_local_state_init(noc_index); + } else { + // flush for BH since this is non-posted, which could cause counter mismatch in the next iter + while (!ncrisc_noc_nonposted_atomics_flushed(noc_index)); + } +#endif mailboxes->launch_msg_rd_ptr = (launch_msg_rd_ptr + 1) & (launch_msg_buffer_num_entries - 1); } } diff --git a/tt_metal/hw/firmware/src/idle_erisck.cc b/tt_metal/hw/firmware/src/idle_erisck.cc index 3aa42a8af14..70429ba7286 100644 --- a/tt_metal/hw/firmware/src/idle_erisck.cc +++ b/tt_metal/hw/firmware/src/idle_erisck.cc @@ -42,6 +42,18 @@ void kernel_launch(uint32_t kernel_base_addr) { ASSERT(ncrisc_noc_nonposted_atomics_flushed(NOC_INDEX)); ASSERT(ncrisc_noc_posted_writes_sent(NOC_INDEX)); WAYPOINT("NKFD"); + } else { + WAYPOINT("NKFW"); + // Assert that no noc transactions are outstanding, to ensure that all reads and writes have landed and the + // NOC interface is in a known idle state for the next kernel. + for (int noc = 0; noc < NUM_NOCS; noc++) { + ASSERT(ncrisc_dynamic_noc_reads_flushed(noc)); + ASSERT(ncrisc_dynamic_noc_nonposted_writes_sent(noc)); + ASSERT(ncrisc_dynamic_noc_nonposted_writes_flushed(noc)); + ASSERT(ncrisc_dynamic_noc_nonposted_atomics_flushed(noc)); + ASSERT(ncrisc_dynamic_noc_posted_writes_sent(noc)); + } + WAYPOINT("NKFD"); } } } diff --git a/tt_metal/hw/firmware/src/ncrisc.cc b/tt_metal/hw/firmware/src/ncrisc.cc index 3105d84be3a..8b2c055bdf9 100644 --- a/tt_metal/hw/firmware/src/ncrisc.cc +++ b/tt_metal/hw/firmware/src/ncrisc.cc @@ -146,7 +146,8 @@ int main(int argc, char *argv[]) { cb_l1_base = (uint32_t tt_l1_ptr*)(kernel_config_base + launch_msg->kernel_config.remote_cb_offset); end_cb_index = launch_msg->kernel_config.min_remote_cb_start_index; - experimental::setup_remote_cb_interfaces(cb_l1_base, end_cb_index); + // NOC argument is unused + experimental::setup_remote_cb_interfaces(cb_l1_base, end_cb_index, 0); WAYPOINT("R"); void (*kernel_address)(uint32_t) = (void (*)(uint32_t)) diff --git a/tt_metal/hw/firmware/src/trisc.cc b/tt_metal/hw/firmware/src/trisc.cc index 9a268bf096e..9176399b8ef 100644 --- a/tt_metal/hw/firmware/src/trisc.cc +++ b/tt_metal/hw/firmware/src/trisc.cc @@ -109,7 +109,8 @@ int main(int argc, char *argv[]) { cb_l1_base = (uint32_t tt_l1_ptr*)(kernel_config_base + launch_msg->kernel_config.remote_cb_offset); end_cb_index = launch_msg->kernel_config.min_remote_cb_start_index; - experimental::setup_remote_cb_interfaces(cb_l1_base, end_cb_index); + // NOC argument is unused + experimental::setup_remote_cb_interfaces(cb_l1_base, end_cb_index, 0); #endif rta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + diff --git a/tt_metal/hw/inc/blackhole/core_config.h b/tt_metal/hw/inc/blackhole/core_config.h index beab0ab565c..a5438239726 100644 --- a/tt_metal/hw/inc/blackhole/core_config.h +++ b/tt_metal/hw/inc/blackhole/core_config.h @@ -20,6 +20,7 @@ enum class EthProcessorTypes : uint8_t { DM0 = 0, DM1 = 1, COUNT = 2 }; enum class DramProcessorTypes : uint8_t { DM0 = 0, COUNT = 1 }; constexpr uint8_t MaxProcessorsPerCoreType = 5; +constexpr uint8_t MaxDMProcessorsPerCoreType = 2; constexpr uint8_t NumTensixDispatchClasses = 3; constexpr uint8_t NumEthDispatchClasses = 2; constexpr uint8_t NumDramDispatchClasses = 1; diff --git a/tt_metal/hw/inc/blackhole/dev_mem_map.h b/tt_metal/hw/inc/blackhole/dev_mem_map.h index f0a87e1567c..d1aacbf10c3 100644 --- a/tt_metal/hw/inc/blackhole/dev_mem_map.h +++ b/tt_metal/hw/inc/blackhole/dev_mem_map.h @@ -81,7 +81,11 @@ // TODO: remove this w/ the ring buffer #define MEM_NCRISC_INIT_IRAM_L1_SIZE MEM_NCRISC_FIRMWARE_SIZE -#define MEM_MAP_END (MEM_TRISC2_FIRMWARE_BASE + MEM_TRISC2_FIRMWARE_SIZE) +#define MEM_NOC_COUNTER_SIZE 4 +#define MEM_NOC_COUNTER_L1_SIZE 5 * 4 * MEM_NOC_COUNTER_SIZE +#define MEM_NOC_COUNTER_BASE (MEM_TRISC2_FIRMWARE_BASE + MEM_TRISC2_FIRMWARE_SIZE) + +#define MEM_MAP_END (MEM_NOC_COUNTER_BASE + MEM_NOC_COUNTER_L1_SIZE) // Every address after MEM_MAP_END is a "scratch" address // These can be used by FW during init, but aren't usable once FW reaches "ready" diff --git a/tt_metal/hw/inc/blackhole/noc_nonblocking_api.h b/tt_metal/hw/inc/blackhole/noc_nonblocking_api.h index 9c57bf31dc6..b4766606cee 100644 --- a/tt_metal/hw/inc/blackhole/noc_nonblocking_api.h +++ b/tt_metal/hw/inc/blackhole/noc_nonblocking_api.h @@ -5,15 +5,22 @@ #pragma once #include - +#include #include "noc_parameters.h" #include #include "noc_overlay_parameters.h" +#if defined(COMPILE_FOR_BRISC) +constexpr std::underlying_type_t proc_type = + static_cast>(TensixProcessorTypes::DM0); +#else +constexpr std::underlying_type_t proc_type = + static_cast>(TensixProcessorTypes::DM1); +#endif + // Helper functions to convert NoC coordinates to NoC-0 coordinates, used in metal as "physical" coordinates. #define NOC_0_X(noc_index, noc_size_x, x) (noc_index == 0 ? (x) : (noc_size_x - 1 - (x))) #define NOC_0_Y(noc_index, noc_size_y, y) (noc_index == 0 ? (y) : (noc_size_y - 1 - (y))) - #define NOC_0_X_PHYS_COORD(noc_index, noc_size_x, x) NOC_0_X(noc_index, noc_size_x, x) #define NOC_0_Y_PHYS_COORD(noc_index, noc_size_y, y) NOC_0_Y(noc_index, noc_size_y, y) #define MY_NOC_ENCODING(noc_index) NOC_CMD_BUF_READ_REG(noc_index, 0, NOC_NODE_ID) @@ -55,38 +62,62 @@ extern uint32_t noc_nonposted_writes_acked[NUM_NOCS]; extern uint32_t noc_nonposted_atomics_acked[NUM_NOCS]; extern uint32_t noc_posted_writes_num_issued[NUM_NOCS]; -#define STREAM_NONPOSTED_WR_ACK_RECEIVED STREAM_SCRATCH32_REG_INDEX +enum class NocBarrierType : uint8_t { + READS_NUM_ISSUED, + NONPOSTED_WRITES_NUM_ISSUED, + NONPOSTED_WRITES_ACKED, + NONPOSTED_ATOMICS_ACKED, + POSTED_WRITES_NUM_ISSUED, + COUNT +}; + +static constexpr uint8_t NUM_BARRIER_TYPES = static_cast(NocBarrierType::COUNT); + +constexpr std::array, NUM_BARRIER_TYPES>, MaxDMProcessorsPerCoreType> +initialize_noc_counter_addresses() { + std::array, NUM_BARRIER_TYPES>, MaxDMProcessorsPerCoreType> arr = {}; + uint32_t addr = MEM_NOC_COUNTER_BASE; + for (uint8_t proc = 0; proc < MaxDMProcessorsPerCoreType; proc++) { + for (uint8_t barrier = 0; barrier < NUM_BARRIER_TYPES; barrier++) { + for (uint8_t noc = 0; noc < NUM_NOCS; noc++) { + arr[proc][barrier][noc] = addr; + addr += MEM_NOC_COUNTER_SIZE; + } + } + } + return arr; +} + +static constexpr std::array, NUM_BARRIER_TYPES>, MaxDMProcessorsPerCoreType> + noc_counter_addresses = initialize_noc_counter_addresses(); -#define OPERAND_BRISC_NOC0_STREAM 0 -#define OPERAND_BRISC_NOC1_STREAM 1 -#define OPERAND_NCRISC_NOC0_STREAM 2 -#define OPERAND_NCRISC_NOC1_STREAM 3 +static_assert( + noc_counter_addresses[MaxDMProcessorsPerCoreType - 1][NUM_BARRIER_TYPES - 1][NUM_NOCS - 1] + MEM_NOC_COUNTER_SIZE == + MEM_MAP_END); -template -inline __attribute__((always_inline)) uint32_t get_stream_index(uint32_t noc) { - if constexpr (proc_type == static_cast>(TensixProcessorTypes::DM0)) { - return noc == 0 ? OPERAND_BRISC_NOC0_STREAM : OPERAND_BRISC_NOC1_STREAM; - } else { - return noc == 0 ? OPERAND_NCRISC_NOC0_STREAM : OPERAND_NCRISC_NOC1_STREAM; - } +template +inline __attribute__((always_inline)) uint32_t get_noc_counter_address(uint32_t noc) { + return noc_counter_addresses[proc_t][static_cast>(barrier_type)][noc]; } // noc_nonposted_writes_acked -template -inline __attribute__((always_inline)) uint32_t get_noc_nonposted_writes_acked(uint32_t noc) { - return NOC_STREAM_READ_REG(get_stream_index(noc), STREAM_NONPOSTED_WR_ACK_RECEIVED); +template +inline __attribute__((always_inline)) uint32_t get_noc_counter_val(uint32_t noc) { + uint32_t counter_addr = get_noc_counter_address(noc); + return NOC_READ_REG(counter_addr); } -template -inline __attribute__((always_inline)) void inc_noc_nonposted_writes_acked(uint32_t noc, uint32_t inc = 1) { - uint32_t stream_id = get_stream_index(noc); - uint32_t val = NOC_STREAM_READ_REG(stream_id, STREAM_NONPOSTED_WR_ACK_RECEIVED) + inc; - NOC_STREAM_WRITE_REG(stream_id, STREAM_NONPOSTED_WR_ACK_RECEIVED, val); +template +inline __attribute__((always_inline)) void inc_noc_counter_val(uint32_t noc, uint32_t inc = 1) { + uint32_t counter_addr = get_noc_counter_address(noc); + uint32_t val = NOC_READ_REG(counter_addr) + inc; + NOC_WRITE_REG(counter_addr, val); } -template -inline __attribute__((always_inline)) void set_noc_nonposted_writes_acked(uint32_t noc, uint32_t val) { - NOC_STREAM_WRITE_REG(get_stream_index(noc), STREAM_NONPOSTED_WR_ACK_RECEIVED, val); +template +inline __attribute__((always_inline)) void set_noc_counter_val(uint32_t noc, uint32_t val) { + uint32_t counter_addr = get_noc_counter_address(noc); + NOC_WRITE_REG(counter_addr, val); } inline __attribute__((always_inline)) void NOC_CMD_BUF_WRITE_REG( @@ -118,6 +149,7 @@ inline __attribute__((always_inline)) bool noc_cmd_buf_ready(uint32_t noc, uint3 return (NOC_CMD_BUF_READ_REG(noc, cmd_buf, NOC_CMD_CTRL) == NOC_CTRL_STATUS_READY); } +template inline __attribute__((always_inline)) void ncrisc_noc_fast_read( uint32_t noc, uint32_t cmd_buf, uint64_t src_addr, uint32_t dest_addr, uint32_t len_bytes) { NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_RET_ADDR_LO, dest_addr); @@ -127,7 +159,17 @@ inline __attribute__((always_inline)) void ncrisc_noc_fast_read( noc, cmd_buf, NOC_TARG_ADDR_COORDINATE, (uint32_t)(src_addr >> NOC_ADDR_COORD_SHIFT) & NOC_COORDINATE_MASK); NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_AT_LEN_BE, len_bytes); NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); - noc_reads_num_issued[noc] += 1; + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + inc_noc_counter_val(noc, 1); + } else { + noc_reads_num_issued[noc] += 1; + } +} + +inline __attribute__((always_inline)) bool ncrisc_dynamic_noc_reads_flushed(uint32_t noc) { + uint32_t self_risc_acked = get_noc_counter_val(noc); + uint32_t other_risc_acked = get_noc_counter_val<1 - proc_type, NocBarrierType::READS_NUM_ISSUED>(noc); + return (NOC_STATUS_READ_REG(noc, NIU_MST_RD_RESP_RECEIVED) == (self_risc_acked + other_risc_acked)); } inline __attribute__((always_inline)) bool ncrisc_noc_reads_flushed(uint32_t noc) { @@ -139,7 +181,7 @@ inline __attribute__((always_inline)) bool ncrisc_noc_read_with_transaction_id_f return (NOC_STATUS_READ_REG(noc, NIU_MST_REQS_OUTSTANDING_ID(transcation_id)) == 0); } -template +template inline __attribute__((always_inline)) void ncrisc_noc_fast_write( uint32_t noc, uint32_t cmd_buf, @@ -171,13 +213,17 @@ inline __attribute__((always_inline)) void ncrisc_noc_fast_write( NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_BRCST_EXCLUDE, 0); NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_AT_LEN_BE, len_bytes); NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); - if constexpr (noc_mode == DM_DYNAMIC_NOC) { - if (!posted) { - inc_noc_nonposted_writes_acked(noc, num_dests); + + if (posted) { + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + inc_noc_counter_val(noc, 1); + } else { + noc_posted_writes_num_issued[noc] += 1; } } else { - if (posted) { - noc_posted_writes_num_issued[noc] += 1; + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + inc_noc_counter_val(noc, 1); + inc_noc_counter_val(noc, num_dests); } else { noc_nonposted_writes_num_issued[noc] += 1; noc_nonposted_writes_acked[noc] += num_dests; @@ -185,7 +231,7 @@ inline __attribute__((always_inline)) void ncrisc_noc_fast_write( } } -template +template inline __attribute__((always_inline)) void ncrisc_noc_fast_write_loopback_src( uint32_t noc, uint32_t cmd_buf, @@ -212,14 +258,15 @@ inline __attribute__((always_inline)) void ncrisc_noc_fast_write_loopback_src( NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_AT_LEN_BE, len_bytes); NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); if constexpr (noc_mode == DM_DYNAMIC_NOC) { - inc_noc_nonposted_writes_acked(noc, num_dests); + inc_noc_counter_val(noc, 1); + inc_noc_counter_val(noc, num_dests); } else { noc_nonposted_writes_num_issued[noc] += 1; noc_nonposted_writes_acked[noc] += num_dests; } } -template +template inline __attribute__((always_inline)) void ncrisc_noc_fast_write_exclude_region( uint32_t noc, uint32_t cmd_buf, @@ -247,14 +294,15 @@ inline __attribute__((always_inline)) void ncrisc_noc_fast_write_exclude_region( NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_AT_LEN_BE, len_bytes); NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); if constexpr (noc_mode == DM_DYNAMIC_NOC) { - inc_noc_nonposted_writes_acked(noc, num_dests); + inc_noc_counter_val(noc, 1); + inc_noc_counter_val(noc, num_dests); } else { noc_nonposted_writes_num_issued[noc] += 1; noc_nonposted_writes_acked[noc] += num_dests; } } -template +template inline __attribute__((always_inline)) void ncrisc_noc_blitz_write_setup( uint32_t noc, uint32_t cmd_buf, uint64_t dest_addr, uint32_t len_bytes, uint32_t vc, uint32_t num_times_to_write) { uint32_t noc_cmd_field = NOC_CMD_CPY | NOC_CMD_WR | NOC_CMD_VC_STATIC | NOC_CMD_STATIC_VC(vc) | NOC_CMD_RESP_MARKED; @@ -266,25 +314,37 @@ inline __attribute__((always_inline)) void ncrisc_noc_blitz_write_setup( NOC_CMD_BUF_WRITE_REG( noc, cmd_buf, NOC_RET_ADDR_COORDINATE, (uint32_t)(dest_addr >> NOC_ADDR_COORD_SHIFT) & NOC_COORDINATE_MASK); if constexpr (noc_mode == DM_DYNAMIC_NOC) { - inc_noc_nonposted_writes_acked(noc, num_times_to_write); + inc_noc_counter_val(noc, num_times_to_write); + inc_noc_counter_val(noc, num_times_to_write); } else { noc_nonposted_writes_num_issued[noc] += num_times_to_write; noc_nonposted_writes_acked[noc] += num_times_to_write; } } +inline __attribute__((always_inline)) bool ncrisc_dynamic_noc_nonposted_writes_sent(uint32_t noc) { + uint32_t self_risc_acked = get_noc_counter_val(noc); + uint32_t other_risc_acked = get_noc_counter_val<1 - proc_type, NocBarrierType::NONPOSTED_WRITES_NUM_ISSUED>(noc); + return (NOC_STATUS_READ_REG(noc, NIU_MST_NONPOSTED_WR_REQ_SENT) == (self_risc_acked + other_risc_acked)); +} + inline __attribute__((always_inline)) bool ncrisc_noc_nonposted_writes_sent(uint32_t noc) { return (NOC_STATUS_READ_REG(noc, NIU_MST_NONPOSTED_WR_REQ_SENT) == noc_nonposted_writes_num_issued[noc]); } +inline __attribute__((always_inline)) bool ncrisc_dynamic_noc_posted_writes_sent(uint32_t noc) { + uint32_t self_risc_acked = get_noc_counter_val(noc); + uint32_t other_risc_acked = get_noc_counter_val<1 - proc_type, NocBarrierType::POSTED_WRITES_NUM_ISSUED>(noc); + return (NOC_STATUS_READ_REG(noc, NIU_MST_POSTED_WR_REQ_SENT) == (self_risc_acked + other_risc_acked)); +} + inline __attribute__((always_inline)) bool ncrisc_noc_posted_writes_sent(uint32_t noc) { return (NOC_STATUS_READ_REG(noc, NIU_MST_POSTED_WR_REQ_SENT) == noc_posted_writes_num_issued[noc]); } -template inline __attribute__((always_inline)) bool ncrisc_dynamic_noc_nonposted_writes_flushed(uint32_t noc) { - uint32_t self_risc_acked = get_noc_nonposted_writes_acked(noc); - uint32_t other_risc_acked = get_noc_nonposted_writes_acked<1 - proc_type>(noc); + uint32_t self_risc_acked = get_noc_counter_val(noc); + uint32_t other_risc_acked = get_noc_counter_val<1 - proc_type, NocBarrierType::NONPOSTED_WRITES_ACKED>(noc); return (NOC_STATUS_READ_REG(noc, NIU_MST_WR_ACK_RECEIVED) == (self_risc_acked + other_risc_acked)); } @@ -302,6 +362,12 @@ inline __attribute__((always_inline)) bool ncrisc_noc_nonposted_write_with_trans return (NOC_STATUS_READ_REG(noc, NIU_MST_REQS_OUTSTANDING_ID(transcation_id)) == 0); } +inline __attribute__((always_inline)) bool ncrisc_dynamic_noc_nonposted_atomics_flushed(uint32_t noc) { + uint32_t self_risc_acked = get_noc_counter_val(noc); + uint32_t other_risc_acked = get_noc_counter_val<1 - proc_type, NocBarrierType::NONPOSTED_ATOMICS_ACKED>(noc); + return (NOC_STATUS_READ_REG(noc, NIU_MST_ATOMIC_RESP_RECEIVED) == (self_risc_acked + other_risc_acked)); +} + inline __attribute__((always_inline)) bool ncrisc_noc_nonposted_atomics_flushed(uint32_t noc) { return (NOC_STATUS_READ_REG(noc, NIU_MST_ATOMIC_RESP_RECEIVED) == noc_nonposted_atomics_acked[noc]); } @@ -356,11 +422,7 @@ inline __attribute__((always_inline)) void dynamic_noc_init() { uint32_t my_y = (noc_id_reg >> NOC_ADDR_NODE_ID_BITS) & NOC_NODE_ID_MASK; uint64_t xy_local_addr = NOC_XY_ADDR(my_x, my_y, 0); - uint32_t noc_rd_cmd_field = - NOC_CMD_CPY | NOC_CMD_RD | NOC_CMD_RESP_MARKED | NOC_CMD_VC_STATIC | NOC_CMD_STATIC_VC(1); - // program brisc cmd_buf 0 - NOC_CMD_BUF_WRITE_REG(noc, DYNAMIC_NOC_BRISC_RD_CMD_BUF, NOC_CTRL, noc_rd_cmd_field); NOC_CMD_BUF_WRITE_REG( noc, DYNAMIC_NOC_BRISC_RD_CMD_BUF, @@ -375,7 +437,6 @@ inline __attribute__((always_inline)) void dynamic_noc_init() { (uint32_t)(xy_local_addr >> NOC_ADDR_COORD_SHIFT)); // program ncrisc cmd_buf 2 - NOC_CMD_BUF_WRITE_REG(noc, DYNAMIC_NOC_NCRISC_RD_CMD_BUF, NOC_CTRL, noc_rd_cmd_field); NOC_CMD_BUF_WRITE_REG( noc, DYNAMIC_NOC_NCRISC_RD_CMD_BUF, @@ -408,10 +469,45 @@ inline __attribute__((always_inline)) void noc_local_state_init(int noc) { } inline __attribute__((always_inline)) void dynamic_noc_local_state_init() { - set_noc_nonposted_writes_acked>(TensixProcessorTypes::DM0)>(NOC_0, NOC_STATUS_READ_REG(NOC_0, NIU_MST_WR_ACK_RECEIVED)); - set_noc_nonposted_writes_acked>(TensixProcessorTypes::DM0)>(NOC_1, 0); - set_noc_nonposted_writes_acked>(TensixProcessorTypes::DM1)>(NOC_0, 0); - set_noc_nonposted_writes_acked>(TensixProcessorTypes::DM1)>(NOC_1, NOC_STATUS_READ_REG(NOC_1, NIU_MST_WR_ACK_RECEIVED)); + using underlying_tensix_processor_types_t = std::underlying_type_t; + constexpr underlying_tensix_processor_types_t dm0 = + static_cast(TensixProcessorTypes::DM0); + constexpr underlying_tensix_processor_types_t dm1 = + static_cast(TensixProcessorTypes::DM1); + set_noc_counter_val( + NOC_0, NOC_STATUS_READ_REG(NOC_0, NIU_MST_RD_RESP_RECEIVED)); + set_noc_counter_val(NOC_1, 0); + set_noc_counter_val(NOC_0, 0); + set_noc_counter_val( + NOC_1, NOC_STATUS_READ_REG(NOC_1, NIU_MST_RD_RESP_RECEIVED)); + + set_noc_counter_val( + NOC_0, NOC_STATUS_READ_REG(NOC_0, NIU_MST_NONPOSTED_WR_REQ_SENT)); + set_noc_counter_val(NOC_1, 0); + set_noc_counter_val(NOC_0, 0); + set_noc_counter_val( + NOC_1, NOC_STATUS_READ_REG(NOC_1, NIU_MST_NONPOSTED_WR_REQ_SENT)); + + set_noc_counter_val( + NOC_0, NOC_STATUS_READ_REG(NOC_0, NIU_MST_WR_ACK_RECEIVED)); + set_noc_counter_val(NOC_1, 0); + set_noc_counter_val(NOC_0, 0); + set_noc_counter_val( + NOC_1, NOC_STATUS_READ_REG(NOC_1, NIU_MST_WR_ACK_RECEIVED)); + + set_noc_counter_val( + NOC_0, NOC_STATUS_READ_REG(NOC_0, NIU_MST_ATOMIC_RESP_RECEIVED)); + set_noc_counter_val(NOC_1, 0); + set_noc_counter_val(NOC_0, 0); + set_noc_counter_val( + NOC_1, NOC_STATUS_READ_REG(NOC_1, NIU_MST_ATOMIC_RESP_RECEIVED)); + + set_noc_counter_val( + NOC_0, NOC_STATUS_READ_REG(NOC_0, NIU_MST_POSTED_WR_REQ_SENT)); + set_noc_counter_val(NOC_1, 0); + set_noc_counter_val(NOC_0, 0); + set_noc_counter_val( + NOC_1, NOC_STATUS_READ_REG(NOC_1, NIU_MST_POSTED_WR_REQ_SENT)); } inline __attribute__((always_inline)) void ncrisc_noc_counters_init() { @@ -441,20 +537,21 @@ inline __attribute__((always_inline)) void ncrisc_noc_full_sync() { } } +template inline __attribute__((always_inline)) void ncrisc_noc_fast_read_any_len( uint32_t noc, uint32_t cmd_buf, uint64_t src_addr, uint32_t dest_addr, uint32_t len_bytes) { while (len_bytes > NOC_MAX_BURST_SIZE) { while (!noc_cmd_buf_ready(noc, cmd_buf)); - ncrisc_noc_fast_read(noc, cmd_buf, src_addr, dest_addr, NOC_MAX_BURST_SIZE); + ncrisc_noc_fast_read(noc, cmd_buf, src_addr, dest_addr, NOC_MAX_BURST_SIZE); src_addr += NOC_MAX_BURST_SIZE; dest_addr += NOC_MAX_BURST_SIZE; len_bytes -= NOC_MAX_BURST_SIZE; } while (!noc_cmd_buf_ready(noc, cmd_buf)); - ncrisc_noc_fast_read(noc, cmd_buf, src_addr, dest_addr, len_bytes); + ncrisc_noc_fast_read(noc, cmd_buf, src_addr, dest_addr, len_bytes); } -template +template inline __attribute__((always_inline)) void ncrisc_noc_fast_write_any_len( uint32_t noc, uint32_t cmd_buf, @@ -471,7 +568,7 @@ inline __attribute__((always_inline)) void ncrisc_noc_fast_write_any_len( if constexpr (!one_packet) { while (len_bytes > NOC_MAX_BURST_SIZE) { while (!noc_cmd_buf_ready(noc, cmd_buf)); - ncrisc_noc_fast_write( + ncrisc_noc_fast_write( noc, cmd_buf, src_addr, @@ -490,7 +587,7 @@ inline __attribute__((always_inline)) void ncrisc_noc_fast_write_any_len( } } while (!noc_cmd_buf_ready(noc, cmd_buf)); - ncrisc_noc_fast_write( + ncrisc_noc_fast_write( noc, cmd_buf, src_addr, @@ -505,7 +602,7 @@ inline __attribute__((always_inline)) void ncrisc_noc_fast_write_any_len( trid); } -template +template inline __attribute__((always_inline)) void ncrisc_noc_fast_write_any_len_loopback_src( uint32_t noc, uint32_t cmd_buf, @@ -519,7 +616,7 @@ inline __attribute__((always_inline)) void ncrisc_noc_fast_write_any_len_loopbac bool multicast_path_reserve) { while (len_bytes > NOC_MAX_BURST_SIZE) { while (!noc_cmd_buf_ready(noc, cmd_buf)); - ncrisc_noc_fast_write_loopback_src( + ncrisc_noc_fast_write_loopback_src( noc, cmd_buf, src_addr, @@ -535,11 +632,11 @@ inline __attribute__((always_inline)) void ncrisc_noc_fast_write_any_len_loopbac len_bytes -= NOC_MAX_BURST_SIZE; } while (!noc_cmd_buf_ready(noc, cmd_buf)); - ncrisc_noc_fast_write_loopback_src( + ncrisc_noc_fast_write_loopback_src( noc, cmd_buf, src_addr, dest_addr, len_bytes, vc, mcast, linked, num_dests, multicast_path_reserve); } -template +template inline __attribute__((always_inline)) void ncrisc_noc_fast_write_any_len_exclude_region( uint32_t noc, uint32_t cmd_buf, @@ -554,7 +651,7 @@ inline __attribute__((always_inline)) void ncrisc_noc_fast_write_any_len_exclude uint32_t exclude_region = 0) { while (len_bytes > NOC_MAX_BURST_SIZE) { while (!noc_cmd_buf_ready(noc, cmd_buf)); - ncrisc_noc_fast_write_exclude_region( + ncrisc_noc_fast_write_exclude_region( noc, cmd_buf, src_addr, @@ -571,7 +668,7 @@ inline __attribute__((always_inline)) void ncrisc_noc_fast_write_any_len_exclude len_bytes -= NOC_MAX_BURST_SIZE; } while (!noc_cmd_buf_ready(noc, cmd_buf)); - ncrisc_noc_fast_write_exclude_region( + ncrisc_noc_fast_write_exclude_region( noc, cmd_buf, src_addr, @@ -585,7 +682,7 @@ inline __attribute__((always_inline)) void ncrisc_noc_fast_write_any_len_exclude exclude_region); } -template +template inline __attribute__((always_inline)) void noc_fast_write_dw_inline( uint32_t noc, uint32_t cmd_buf, @@ -615,13 +712,17 @@ inline __attribute__((always_inline)) void noc_fast_write_dw_inline( noc, cmd_buf, NOC_TARG_ADDR_COORDINATE, (uint32_t)(dest_addr >> NOC_ADDR_COORD_SHIFT) & NOC_COORDINATE_MASK); NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_AT_LEN_BE, be32); NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); - if constexpr (noc_mode == DM_DYNAMIC_NOC) { - if (!posted) { - inc_noc_nonposted_writes_acked(noc); + + if (posted) { + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + inc_noc_counter_val(noc, 1); + } else { + noc_posted_writes_num_issued[noc] += 1; } } else { - if (posted) { - noc_posted_writes_num_issued[noc] += 1; + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + inc_noc_counter_val(noc, 1); + inc_noc_counter_val(noc, 1); } else { noc_nonposted_writes_num_issued[noc] += 1; noc_nonposted_writes_acked[noc] += 1; @@ -666,11 +767,16 @@ inline __attribute__((always_inline)) void noc_fast_atomic_increment( NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_AT_DATA, incr); NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_CMD_CTRL, 0x1); if (!posted) { - noc_nonposted_atomics_acked[noc] += 1; + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + inc_noc_counter_val(noc, 1); + } else { + noc_nonposted_atomics_acked[noc] += 1; + } } } // issue noc reads while wait for outstanding transactions done +template inline __attribute__((always_inline)) void ncrisc_noc_fast_read_with_transaction_id( uint32_t noc, uint32_t cmd_buf, uint32_t src_base_addr, uint32_t src_addr, uint32_t dest_addr, uint32_t trid) { uint32_t src_addr_; @@ -682,7 +788,11 @@ inline __attribute__((always_inline)) void ncrisc_noc_fast_read_with_transaction NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_RET_ADDR_LO, dest_addr); NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_TARG_ADDR_LO, src_addr_); // (uint32_t)src_addr NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); - noc_reads_num_issued[noc] += 1; + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + inc_noc_counter_val(noc, 1); + } else { + noc_reads_num_issued[noc] += 1; + } } // set transaction id for a noc read diff --git a/tt_metal/hw/inc/circular_buffer_init.h b/tt_metal/hw/inc/circular_buffer_init.h index 09983f7be87..2eea6a0417d 100644 --- a/tt_metal/hw/inc/circular_buffer_init.h +++ b/tt_metal/hw/inc/circular_buffer_init.h @@ -57,12 +57,7 @@ FORCE_INLINE void setup_local_cb_read_write_interfaces( namespace experimental { template -inline void setup_remote_cb_interfaces(uint32_t tt_l1_ptr* cb_l1_base, uint32_t start_cb_index) { -#ifdef COMPILE_FOR_TRISC - uint8_t noc = 0; -#else - uint8_t noc = noc_index; -#endif +inline void setup_remote_cb_interfaces(uint32_t tt_l1_ptr* cb_l1_base, uint32_t start_cb_index, uint8_t noc) { volatile tt_l1_ptr uint32_t* circular_buffer_config_addr = cb_l1_base; for (uint32_t cb_id = NUM_CIRCULAR_BUFFERS - 1, end_id = start_cb_index - 1; cb_id != end_id; cb_id--) { @@ -85,6 +80,7 @@ inline void setup_remote_cb_interfaces(uint32_t tt_l1_ptr* cb_l1_base, uint32_t sender_cb_interface.receiver_noc_xy_ptr = remote_noc_xy_addr; sender_cb_interface.aligned_pages_sent_ptr = aligned_pages_sent_addr; sender_cb_interface.num_receivers = num_receivers; + // Using posted semaphore inc resize_remote_sender_cb_interface(cb_id, page_size, noc); } else { uint32_t aligned_pages_acked_addr = aligned_pages_sent_addr + L1_ALIGNMENT; @@ -97,6 +93,7 @@ inline void setup_remote_cb_interfaces(uint32_t tt_l1_ptr* cb_l1_base, uint32_t receiver_cb_interface.sender_noc_x = sender_noc_x; receiver_cb_interface.sender_noc_y = sender_noc_y; receiver_cb_interface.aligned_pages_acked_ptr = aligned_pages_acked_addr; + // Using posted semaphore inc resize_remote_receiver_cb_interface(cb_id, page_size, noc); } circular_buffer_config_addr += UINT32_WORDS_PER_REMOTE_CIRCULAR_BUFFER_CONFIG; diff --git a/tt_metal/hw/inc/dataflow_api.h b/tt_metal/hw/inc/dataflow_api.h index 69a8c523515..cd1e33c417c 100644 --- a/tt_metal/hw/inc/dataflow_api.h +++ b/tt_metal/hw/inc/dataflow_api.h @@ -448,6 +448,11 @@ void noc_async_read_one_packet( WAYPOINT("NAOW"); DEBUG_SANITIZE_NOC_READ_TRANSACTION(noc, src_noc_addr, dst_local_l1_addr, size); + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + uint32_t noc_rd_cmd_field = + NOC_CMD_CPY | NOC_CMD_RD | NOC_CMD_RESP_MARKED | NOC_CMD_VC_STATIC | NOC_CMD_STATIC_VC(1); + NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_CTRL, noc_rd_cmd_field); + } NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_RET_ADDR_LO, dst_local_l1_addr); NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_TARG_ADDR_LO, (uint32_t)src_noc_addr); #ifdef ARCH_BLACKHOLE @@ -461,7 +466,11 @@ void noc_async_read_one_packet( (uint32_t)(src_noc_addr >> NOC_ADDR_COORD_SHIFT) & NOC_COORDINATE_MASK); NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_AT_LEN_BE, size); NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); - noc_reads_num_issued[noc] += 1; + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + inc_noc_counter_val(noc, 1); + } else { + noc_reads_num_issued[noc] += 1; + } WAYPOINT("NAOD"); } @@ -496,7 +505,7 @@ inline void noc_async_read( } else { WAYPOINT("NARW"); DEBUG_SANITIZE_NOC_READ_TRANSACTION(noc, src_noc_addr, dst_local_l1_addr, size); - ncrisc_noc_fast_read_any_len(noc, read_cmd_buf, src_noc_addr, dst_local_l1_addr, size); + ncrisc_noc_fast_read_any_len(noc, read_cmd_buf, src_noc_addr, dst_local_l1_addr, size); WAYPOINT("NARD"); } } @@ -516,6 +525,11 @@ void noc_async_read_one_packet_set_state(std::uint64_t src_noc_addr, std::uint32 WAYPOINT("NASW"); + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + uint32_t noc_rd_cmd_field = + NOC_CMD_CPY | NOC_CMD_RD | NOC_CMD_RESP_MARKED | NOC_CMD_VC_STATIC | NOC_CMD_STATIC_VC(1); + NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_CTRL, noc_rd_cmd_field); + } #ifdef ARCH_BLACKHOLE // Handles reading from PCIe NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_TARG_ADDR_MID, (uint32_t)(src_noc_addr >> 32) & 0x1000000F); @@ -554,7 +568,11 @@ FORCE_INLINE void noc_async_read_one_packet_with_state( NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); if constexpr (inc_num_issued) { - noc_reads_num_issued[noc] += 1; + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + inc_noc_counter_val(noc, 1); + } else { + noc_reads_num_issued[noc] += 1; + } } WAYPOINT("NATD"); @@ -574,6 +592,11 @@ void noc_async_read_set_state(std::uint64_t src_noc_addr, uint8_t noc = noc_inde WAYPOINT("NAUW"); + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + uint32_t noc_rd_cmd_field = + NOC_CMD_CPY | NOC_CMD_RD | NOC_CMD_RESP_MARKED | NOC_CMD_VC_STATIC | NOC_CMD_STATIC_VC(1); + NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_CTRL, noc_rd_cmd_field); + } #ifdef ARCH_BLACKHOLE // Handles reading from PCIe NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_TARG_ADDR_MID, (uint32_t)(src_noc_addr >> 32) & 0x1000000F); @@ -613,7 +636,11 @@ FORCE_INLINE void noc_async_read_with_state( src_noc_addr += NOC_MAX_BURST_SIZE; dst_local_l1_addr += NOC_MAX_BURST_SIZE; if constexpr (inc_num_issued) { - noc_reads_num_issued[noc] += 1; + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + inc_noc_counter_val(noc, 1); + } else { + noc_reads_num_issued[noc] += 1; + } } } @@ -627,7 +654,11 @@ FORCE_INLINE void noc_async_read_with_state( NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_AT_LEN_BE, size); NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); if constexpr (inc_num_issued) { - noc_reads_num_issued[noc] += 1; + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + inc_noc_counter_val(noc, 1); + } else { + noc_reads_num_issued[noc] += 1; + } } WAYPOINT("NAVD"); @@ -635,7 +666,11 @@ FORCE_INLINE void noc_async_read_with_state( FORCE_INLINE void noc_async_read_inc_num_issued(std::uint32_t num_issued_reads_inc, uint8_t noc = noc_index) { - noc_reads_num_issued[noc] += num_issued_reads_inc; + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + inc_noc_counter_val(noc, num_issued_reads_inc); + } else { + noc_reads_num_issued[noc] += num_issued_reads_inc; + } } // TODO: write docs @@ -668,7 +703,8 @@ void noc_async_write_one_packet( NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_AT_LEN_BE, size); NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); if constexpr (noc_mode == DM_DYNAMIC_NOC) { - inc_noc_nonposted_writes_acked(noc); + inc_noc_counter_val(noc, 1); + inc_noc_counter_val(noc, 1); } else { noc_nonposted_writes_num_issued[noc] += 1; noc_nonposted_writes_acked[noc] += 1; // num_dests @@ -711,7 +747,8 @@ void noc_async_write_multicast_one_packet( NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_AT_LEN_BE, size); NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); if constexpr (noc_mode == DM_DYNAMIC_NOC) { - inc_noc_nonposted_writes_acked(noc, num_dests); + inc_noc_counter_val(noc, 1); + inc_noc_counter_val(noc, num_dests); } else { noc_nonposted_writes_num_issued[noc] += 1; noc_nonposted_writes_acked[noc] += num_dests; @@ -763,7 +800,8 @@ FORCE_INLINE void noc_async_write_one_packet_with_state( if constexpr (non_posted) { if constexpr (noc_mode == DM_DYNAMIC_NOC) { - inc_noc_nonposted_writes_acked(noc); + inc_noc_counter_val(noc, 1); + inc_noc_counter_val(noc, 1); } else { noc_nonposted_writes_num_issued[noc] += 1; noc_nonposted_writes_acked[noc] += 1; // num_dests @@ -827,7 +865,7 @@ inline void noc_async_write( } else { WAYPOINT("NAWW"); DEBUG_SANITIZE_NOC_WRITE_TRANSACTION(noc, dst_noc_addr, src_local_l1_addr, size); - ncrisc_noc_fast_write_any_len( + ncrisc_noc_fast_write_any_len( noc, write_cmd_buf, src_local_l1_addr, dst_noc_addr, size, NOC_UNICAST_WRITE_VC, false, false, 1, true); WAYPOINT("NAWD"); } @@ -851,7 +889,7 @@ inline void noc_semaphore_set_remote( std::uint32_t src_local_l1_addr, std::uint64_t dst_noc_addr, uint8_t noc = noc_index) { WAYPOINT("NSSW"); DEBUG_SANITIZE_NOC_WRITE_TRANSACTION(noc, dst_noc_addr, src_local_l1_addr, 4); - ncrisc_noc_fast_write_any_len( + ncrisc_noc_fast_write_any_len( noc, write_reg_cmd_buf, src_local_l1_addr, @@ -914,7 +952,7 @@ inline void noc_async_write_multicast( } else { WAYPOINT("NMWW"); DEBUG_SANITIZE_NOC_MULTI_WRITE_TRANSACTION(noc, dst_noc_addr_multicast, src_local_l1_addr, size); - ncrisc_noc_fast_write_any_len( + ncrisc_noc_fast_write_any_len( noc, write_cmd_buf, src_local_l1_addr, @@ -964,7 +1002,7 @@ inline void noc_semaphore_set_multicast( uint8_t noc = noc_index) { WAYPOINT("NSNW"); DEBUG_SANITIZE_NOC_MULTI_WRITE_TRANSACTION(noc, dst_noc_addr_multicast, src_local_l1_addr, 4); - ncrisc_noc_fast_write_any_len( + ncrisc_noc_fast_write_any_len( noc, write_reg_cmd_buf, src_local_l1_addr, @@ -1012,7 +1050,7 @@ inline void noc_semaphore_set_multicast_loopback_src( uint8_t noc = noc_index) { WAYPOINT("NSLW"); DEBUG_SANITIZE_NOC_MULTI_WRITE_TRANSACTION(noc, dst_noc_addr_multicast, src_local_l1_addr, 4); - ncrisc_noc_fast_write_any_len_loopback_src( + ncrisc_noc_fast_write_any_len_loopback_src( noc, write_reg_cmd_buf, src_local_l1_addr, @@ -1036,7 +1074,7 @@ inline void noc_async_write_multicast_loopback_src( uint8_t noc = noc_index) { WAYPOINT("NMLW"); DEBUG_SANITIZE_NOC_MULTI_WRITE_TRANSACTION(noc, dst_noc_addr_multicast, src_local_l1_addr, size); - ncrisc_noc_fast_write_any_len_loopback_src( + ncrisc_noc_fast_write_any_len_loopback_src( noc, write_cmd_buf, src_local_l1_addr, @@ -1101,7 +1139,7 @@ inline void noc_async_write_multicast_exclude_region( uint8_t noc = noc_index) { WAYPOINT("NMEW"); DEBUG_SANITIZE_NOC_MULTI_WRITE_TRANSACTION(noc, dst_noc_addr_multicast, src_local_l1_addr, size); - ncrisc_noc_fast_write_any_len_exclude_region( + ncrisc_noc_fast_write_any_len_exclude_region( noc, write_cmd_buf, src_local_l1_addr, @@ -1127,7 +1165,13 @@ inline void noc_async_write_multicast_exclude_region( */ void noc_async_read_barrier(uint8_t noc = noc_index) { WAYPOINT("NRBW"); - while (!ncrisc_noc_reads_flushed(noc)); + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + while (!ncrisc_dynamic_noc_reads_flushed(noc)) { + invalidate_l1_cache(); + } + } else { + while (!ncrisc_noc_reads_flushed(noc)); + } invalidate_l1_cache(); WAYPOINT("NRBD"); } @@ -1144,10 +1188,13 @@ FORCE_INLINE void noc_async_write_barrier(uint8_t noc = noc_index) { WAYPOINT("NWBW"); if constexpr (noc_mode == DM_DYNAMIC_NOC) { - while (!ncrisc_dynamic_noc_nonposted_writes_flushed(noc)); + while (!ncrisc_dynamic_noc_nonposted_writes_flushed(noc)) { + invalidate_l1_cache(); + } } else { while (!ncrisc_noc_nonposted_writes_flushed(noc)); } + invalidate_l1_cache(); WAYPOINT("NWBD"); } @@ -1159,7 +1206,14 @@ void noc_async_write_barrier(uint8_t noc = noc_index) { FORCE_INLINE void noc_async_writes_flushed(uint8_t noc = noc_index) { WAYPOINT("NWFW"); - while (!ncrisc_noc_nonposted_writes_sent(noc)); + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + while (!ncrisc_dynamic_noc_nonposted_writes_sent(noc)) { + invalidate_l1_cache(); + } + } else { + while (!ncrisc_noc_nonposted_writes_sent(noc)); + } + invalidate_l1_cache(); WAYPOINT("NWFD"); } @@ -1171,7 +1225,14 @@ void noc_async_writes_flushed(uint8_t noc = noc_index) { FORCE_INLINE void noc_async_posted_writes_flushed(uint8_t noc = noc_index) { WAYPOINT("NPWW"); - while (!ncrisc_noc_posted_writes_sent(noc)); + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + while (!ncrisc_dynamic_noc_posted_writes_sent(noc)) { + invalidate_l1_cache(); + } + } else { + while (!ncrisc_noc_posted_writes_sent(noc)); + } + invalidate_l1_cache(); WAYPOINT("NPWD"); } @@ -1186,7 +1247,14 @@ void noc_async_posted_writes_flushed(uint8_t noc = noc_index) { FORCE_INLINE void noc_async_atomic_barrier(uint8_t noc_idx = noc_index) { WAYPOINT("NABW"); - while (!ncrisc_noc_nonposted_atomics_flushed(noc_idx)); + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + while (!ncrisc_dynamic_noc_nonposted_atomics_flushed(noc_idx)) { + invalidate_l1_cache(); + } + } else { + while (!ncrisc_noc_nonposted_atomics_flushed(noc_idx)); + } + invalidate_l1_cache(); WAYPOINT("NABD"); } @@ -1201,17 +1269,31 @@ void noc_async_atomic_barrier(uint8_t noc_idx = noc_index) { FORCE_INLINE void noc_async_full_barrier(uint8_t noc_idx = noc_index) { invalidate_l1_cache(); - WAYPOINT("NFBW"); - while (!ncrisc_noc_reads_flushed(noc_idx)); - WAYPOINT("NFCW"); - while (!ncrisc_noc_nonposted_writes_sent(noc_idx)); - WAYPOINT("NFDW"); - while (!ncrisc_noc_nonposted_writes_flushed(noc_idx)); - WAYPOINT("NFEW"); - while (!ncrisc_noc_nonposted_atomics_flushed(noc_idx)); - WAYPOINT("NFFW"); - while (!ncrisc_noc_posted_writes_sent(noc_idx)); - WAYPOINT("NFBD"); + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + WAYPOINT("NFBW"); + while (!ncrisc_dynamic_noc_reads_flushed(noc_idx)); + WAYPOINT("NFCW"); + while (!ncrisc_dynamic_noc_nonposted_writes_sent(noc_idx)); + WAYPOINT("NFDW"); + while (!ncrisc_dynamic_noc_nonposted_writes_flushed(noc_idx)); + WAYPOINT("NFEW"); + while (!ncrisc_dynamic_noc_nonposted_atomics_flushed(noc_idx)); + WAYPOINT("NFFW"); + while (!ncrisc_dynamic_noc_posted_writes_sent(noc_idx)); + WAYPOINT("NFBD"); + } else { + WAYPOINT("NFBW"); + while (!ncrisc_noc_reads_flushed(noc_idx)); + WAYPOINT("NFCW"); + while (!ncrisc_noc_nonposted_writes_sent(noc_idx)); + WAYPOINT("NFDW"); + while (!ncrisc_noc_nonposted_writes_flushed(noc_idx)); + WAYPOINT("NFEW"); + while (!ncrisc_noc_nonposted_atomics_flushed(noc_idx)); + WAYPOINT("NFFW"); + while (!ncrisc_noc_posted_writes_sent(noc_idx)); + WAYPOINT("NFBD"); + } } // clang-format off @@ -1309,7 +1391,7 @@ FORCE_INLINE void noc_inline_dw_write(uint64_t addr, uint32_t val, uint8_t be = 0xF, uint8_t noc = noc_index) { WAYPOINT("NWIW"); DEBUG_SANITIZE_NOC_ADDR(noc, addr, 4); - noc_fast_write_dw_inline( + noc_fast_write_dw_inline( noc, write_at_cmd_buf, val, @@ -1412,7 +1494,11 @@ void noc_async_read_tile_dram_sharded_with_state( NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_RET_ADDR_LO, dest_addr); NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_TARG_ADDR_LO, src_addr_); // (uint32_t)src_addr NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); - noc_reads_num_issued[noc] += 1; + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + inc_noc_counter_val(noc, 1); + } else { + noc_reads_num_issued[noc] += 1; + } } FORCE_INLINE @@ -1420,7 +1506,7 @@ void noc_async_read_tile_dram_sharded_with_state_with_trid( uint32_t src_base_addr, uint32_t src_addr, uint32_t dest_addr, uint32_t trid = 0, uint8_t noc = noc_index) { WAYPOINT("NRDW"); #ifndef ARCH_GRAYSKULL - ncrisc_noc_fast_read_with_transaction_id(noc, read_cmd_buf, src_base_addr, src_addr, dest_addr, trid); + ncrisc_noc_fast_read_with_transaction_id(noc, read_cmd_buf, src_base_addr, src_addr, dest_addr, trid); #endif WAYPOINT("NRDD"); } @@ -1496,7 +1582,7 @@ FORCE_INLINE void noc_async_write_one_packet_with_trid( WAYPOINT("NAWW"); DEBUG_SANITIZE_NOC_WRITE_TRANSACTION(noc, dst_noc_addr, src_local_l1_addr, size); #ifndef ARCH_GRAYSKULL - ncrisc_noc_fast_write_any_len( + ncrisc_noc_fast_write_any_len( noc, write_cmd_buf, src_local_l1_addr, diff --git a/tt_metal/hw/inc/dataflow_api_addrgen.h b/tt_metal/hw/inc/dataflow_api_addrgen.h index 39ade2ce9f3..afec1f14fb1 100644 --- a/tt_metal/hw/inc/dataflow_api_addrgen.h +++ b/tt_metal/hw/inc/dataflow_api_addrgen.h @@ -347,12 +347,21 @@ struct InterleavedAddrGenFast { while (!noc_cmd_buf_ready(noc, read_cmd_buf)); WAYPOINT("NRTD"); + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + uint32_t noc_rd_cmd_field = + NOC_CMD_CPY | NOC_CMD_RD | NOC_CMD_RESP_MARKED | NOC_CMD_VC_STATIC | NOC_CMD_STATIC_VC(1); + NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_CTRL, noc_rd_cmd_field); + } NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_RET_ADDR_LO, dest_addr); NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_TARG_ADDR_LO, src_addr); // (uint32_t)src_addr NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_TARG_ADDR_COORDINATE, src_noc_xy); // src_addr >> 32 NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_AT_LEN_BE, this->page_size); // len_bytes NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); - noc_reads_num_issued[noc] += 1; + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + inc_noc_counter_val(noc, 1); + } else { + noc_reads_num_issued[noc] += 1; + } } FORCE_INLINE @@ -381,7 +390,8 @@ struct InterleavedAddrGenFast { NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_AT_LEN_BE, this->page_size); // len_bytes NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); if constexpr (noc_mode == DM_DYNAMIC_NOC) { - inc_noc_nonposted_writes_acked(noc); + inc_noc_counter_val(noc, 1); + inc_noc_counter_val(noc, 1); } else { noc_nonposted_writes_num_issued[noc] += 1; noc_nonposted_writes_acked[noc] += 1; // num_dests @@ -436,13 +446,22 @@ struct InterleavedPow2AddrGenFast { while (!noc_cmd_buf_ready(noc, read_cmd_buf)); WAYPOINT("NRPD"); + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + uint32_t noc_rd_cmd_field = + NOC_CMD_CPY | NOC_CMD_RD | NOC_CMD_RESP_MARKED | NOC_CMD_VC_STATIC | NOC_CMD_STATIC_VC(1); + NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_CTRL, noc_rd_cmd_field); + } NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_RET_ADDR_LO, dest_addr); NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_TARG_ADDR_LO, src_addr); // (uint32_t)src_addr NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_TARG_ADDR_COORDINATE, src_noc_xy); // src_addr >> 32 NOC_CMD_BUF_WRITE_REG( noc, read_cmd_buf, NOC_AT_LEN_BE, 1 << this->aligned_log_base_2_of_page_size); // len_bytes NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); - noc_reads_num_issued[noc] += 1; + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + inc_noc_counter_val(noc, 1); + } else { + noc_reads_num_issued[noc] += 1; + } } FORCE_INLINE @@ -462,12 +481,21 @@ struct InterleavedPow2AddrGenFast { WAYPOINT("RP1D"); DEBUG_SANITIZE_NOC_READ_TRANSACTION(noc, get_noc_addr_helper(src_noc_xy, src_addr), dest_addr, size); + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + uint32_t noc_rd_cmd_field = + NOC_CMD_CPY | NOC_CMD_RD | NOC_CMD_RESP_MARKED | NOC_CMD_VC_STATIC | NOC_CMD_STATIC_VC(1); + NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_CTRL, noc_rd_cmd_field); + } NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_RET_ADDR_LO, dest_addr); NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_TARG_ADDR_LO, src_addr); // (uint32_t)src_addr NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_TARG_ADDR_COORDINATE, src_noc_xy); // src_addr >> 32 NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_AT_LEN_BE, size); // len_bytes NOC_CMD_BUF_WRITE_REG(noc, read_cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); - noc_reads_num_issued[noc] += 1; + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + inc_noc_counter_val(noc, 1); + } else { + noc_reads_num_issued[noc] += 1; + } } FORCE_INLINE @@ -501,7 +529,8 @@ struct InterleavedPow2AddrGenFast { NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_AT_LEN_BE, write_size_bytes); // len_bytes NOC_CMD_BUF_WRITE_REG(noc, write_cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); if constexpr (noc_mode == DM_DYNAMIC_NOC) { - inc_noc_nonposted_writes_acked(noc); + inc_noc_counter_val(noc, 1); + inc_noc_counter_val(noc, 1); } else { noc_nonposted_writes_num_issued[noc] += 1; noc_nonposted_writes_acked[noc] += 1; // num_dests diff --git a/tt_metal/hw/inc/dataflow_api_common.h b/tt_metal/hw/inc/dataflow_api_common.h index 0308c679ec1..336ed1dff6b 100644 --- a/tt_metal/hw/inc/dataflow_api_common.h +++ b/tt_metal/hw/inc/dataflow_api_common.h @@ -6,11 +6,6 @@ #include -#if defined(COMPILE_FOR_BRISC) -constexpr uint8_t proc_type = static_cast>(TensixProcessorTypes::DM0); -#else -constexpr uint8_t proc_type = static_cast>(TensixProcessorTypes::DM1); -#endif #if defined(KERNEL_BUILD) constexpr uint8_t noc_index = NOC_INDEX; constexpr uint8_t noc_mode = NOC_MODE; diff --git a/tt_metal/hw/inc/debug/assert.h b/tt_metal/hw/inc/debug/assert.h index f9da574761d..4f25bd8eb09 100644 --- a/tt_metal/hw/inc/debug/assert.h +++ b/tt_metal/hw/inc/debug/assert.h @@ -42,8 +42,12 @@ void assert_and_hang(uint32_t line_num) { assert_and_hang(__LINE__); \ } while (0) +#define WATCHER_ASSERT_ENABLED 1 + #else // !WATCHER_ENABLED #define ASSERT(condition) +#define WATCHER_ASSERT_ENABLED 0 + #endif // WATCHER_ENABLED diff --git a/tt_metal/hw/inc/remote_circular_buffer_api.h b/tt_metal/hw/inc/remote_circular_buffer_api.h index fd804c517b5..7968a742784 100644 --- a/tt_metal/hw/inc/remote_circular_buffer_api.h +++ b/tt_metal/hw/inc/remote_circular_buffer_api.h @@ -31,7 +31,16 @@ FORCE_INLINE void update_pages_sent( NOC_XY_ENCODING(DYNAMIC_NOC_X(noc, remote_noc_xy_ptr[0]), DYNAMIC_NOC_Y(noc, remote_noc_xy_ptr[1]))); *pages_sent_ptr += aligned_page_adjustment; uint64_t remote_ack_ptr_addr = get_noc_addr_helper(remote_noc_xy, (uint32_t)pages_sent_ptr); - noc_semaphore_inc(remote_ack_ptr_addr, aligned_page_adjustment, noc); + noc_fast_atomic_increment( + noc, + write_at_cmd_buf, + remote_ack_ptr_addr, + NOC_UNICAST_WRITE_VC, + aligned_page_adjustment, + 31 /*wrap*/, + false /*linked*/, + true /*posted*/, + MEM_NOC_ATOMIC_RET_VAL_ADDR); pages_sent_ptr += 2 * L1_ALIGNMENT / sizeof(uint32_t); remote_noc_xy_ptr += 2; } @@ -48,11 +57,23 @@ FORCE_INLINE void update_pages_acked( reinterpret_cast(aligned_pages_acked_addr); *pages_acked_ptr += aligned_page_adjustment; uint64_t remote_ack_ptr_addr = get_noc_addr(sender_noc_x, sender_noc_y, (uint32_t)pages_acked_ptr, noc); - noc_semaphore_inc(remote_ack_ptr_addr, aligned_page_adjustment, noc); + noc_fast_atomic_increment( + noc, + write_at_cmd_buf, + remote_ack_ptr_addr, + NOC_UNICAST_WRITE_VC, + aligned_page_adjustment, + 31 /*wrap*/, + false /*linked*/, + true /*posted*/, + MEM_NOC_ATOMIC_RET_VAL_ADDR); } #else +static constexpr uint8_t default_noc_mode = 0; +template FORCE_INLINE void update_pages_sent( const RemoteSenderCBInterface& sender_cb_interface, uint32_t aligned_page_adjustment, uint8_t noc) {} +template FORCE_INLINE void update_pages_acked( const RemoteReceiverCBInterface& receiver_cb_interface, uint32_t aligned_page_adjustment, uint8_t noc) {} #endif diff --git a/tt_metal/hw/inc/wormhole/core_config.h b/tt_metal/hw/inc/wormhole/core_config.h index e1d0c168036..5e5603962bc 100644 --- a/tt_metal/hw/inc/wormhole/core_config.h +++ b/tt_metal/hw/inc/wormhole/core_config.h @@ -18,6 +18,7 @@ enum class TensixProcessorTypes : uint8_t { DM0 = 0, DM1 = 1, MATH0 = 2, MATH1 = enum class EthProcessorTypes : uint8_t { DM0 = 0, COUNT = 1 }; constexpr uint8_t MaxProcessorsPerCoreType = 5; +constexpr uint8_t MaxDMProcessorsPerCoreType = 2; constexpr uint8_t NumTensixDispatchClasses = 3; constexpr uint8_t NumEthDispatchClasses = 1; constexpr uint8_t noc_size_x = 10; diff --git a/tt_metal/hw/inc/wormhole/dev_mem_map.h b/tt_metal/hw/inc/wormhole/dev_mem_map.h index affb5aeef4f..98e0427fab3 100644 --- a/tt_metal/hw/inc/wormhole/dev_mem_map.h +++ b/tt_metal/hw/inc/wormhole/dev_mem_map.h @@ -93,7 +93,7 @@ #define MEM_NCRISC_KERNEL_BASE (MEM_NCRISC_IRAM_BASE) #define MEM_NOC_COUNTER_SIZE 4 -#define MEM_NOC_COUNTER_L1_SIZE 16 +#define MEM_NOC_COUNTER_L1_SIZE 5 * 4 * MEM_NOC_COUNTER_SIZE #define MEM_NOC_COUNTER_BASE (MEM_NCRISC_INIT_IRAM_L1_BASE + MEM_NCRISC_INIT_IRAM_L1_SIZE) #define MEM_MAP_END (MEM_NOC_COUNTER_BASE + MEM_NOC_COUNTER_L1_SIZE) diff --git a/tt_metal/hw/inc/wormhole/noc_nonblocking_api.h b/tt_metal/hw/inc/wormhole/noc_nonblocking_api.h index 9bc12dbfff3..c0cfd1c39ef 100644 --- a/tt_metal/hw/inc/wormhole/noc_nonblocking_api.h +++ b/tt_metal/hw/inc/wormhole/noc_nonblocking_api.h @@ -5,11 +5,19 @@ #pragma once #include - +#include #include "noc_parameters.h" #include #include "noc_overlay_parameters.h" +#if defined(COMPILE_FOR_BRISC) +constexpr std::underlying_type_t proc_type = + static_cast>(TensixProcessorTypes::DM0); +#else +constexpr std::underlying_type_t proc_type = + static_cast>(TensixProcessorTypes::DM1); +#endif + // Helper functions to convert NoC coordinates to NoC-0 coordinates, used in metal as "physical" coordinates. #define NOC_0_X(noc_index, noc_size_x, x) x #define NOC_0_Y(noc_index, noc_size_y, y) y @@ -55,38 +63,61 @@ extern uint32_t noc_nonposted_writes_acked[NUM_NOCS]; extern uint32_t noc_nonposted_atomics_acked[NUM_NOCS]; extern uint32_t noc_posted_writes_num_issued[NUM_NOCS]; -#define BRISC_NOC0_NONPOSTED_WR_ACK_RECEIVED MEM_NOC_COUNTER_BASE -#define BRISC_NOC1_NONPOSTED_WR_ACK_RECEIVED (BRISC_NOC0_NONPOSTED_WR_ACK_RECEIVED + MEM_NOC_COUNTER_SIZE) -#define NCRISC_NOC0_NONPOSTED_WR_ACK_RECEIVED (BRISC_NOC1_NONPOSTED_WR_ACK_RECEIVED + MEM_NOC_COUNTER_SIZE) -#define NCRISC_NOC1_NONPOSTED_WR_ACK_RECEIVED (NCRISC_NOC0_NONPOSTED_WR_ACK_RECEIVED + MEM_NOC_COUNTER_SIZE) +enum class NocBarrierType : uint8_t { + READS_NUM_ISSUED, + NONPOSTED_WRITES_NUM_ISSUED, + NONPOSTED_WRITES_ACKED, + NONPOSTED_ATOMICS_ACKED, + POSTED_WRITES_NUM_ISSUED, + COUNT +}; + +static constexpr uint8_t NUM_BARRIER_TYPES = static_cast(NocBarrierType::COUNT); + +constexpr std::array, NUM_BARRIER_TYPES>, MaxDMProcessorsPerCoreType> +initialize_noc_counter_addresses() { + std::array, NUM_BARRIER_TYPES>, MaxDMProcessorsPerCoreType> arr = {}; + uint32_t addr = MEM_NOC_COUNTER_BASE; + for (uint8_t proc = 0; proc < MaxDMProcessorsPerCoreType; proc++) { + for (uint8_t barrier = 0; barrier < NUM_BARRIER_TYPES; barrier++) { + for (uint8_t noc = 0; noc < NUM_NOCS; noc++) { + arr[proc][barrier][noc] = addr; + addr += MEM_NOC_COUNTER_SIZE; + } + } + } + return arr; +} + +static constexpr std::array, NUM_BARRIER_TYPES>, MaxDMProcessorsPerCoreType> + noc_counter_addresses = initialize_noc_counter_addresses(); + +static_assert( + noc_counter_addresses[MaxDMProcessorsPerCoreType - 1][NUM_BARRIER_TYPES - 1][NUM_NOCS - 1] + MEM_NOC_COUNTER_SIZE == + MEM_MAP_END); -template +template inline __attribute__((always_inline)) uint32_t get_noc_counter_address(uint32_t noc) { - if constexpr ( - proc_type == static_cast>(TensixProcessorTypes::DM0)) { // BRISC - return noc == 0 ? BRISC_NOC0_NONPOSTED_WR_ACK_RECEIVED : BRISC_NOC1_NONPOSTED_WR_ACK_RECEIVED; - } else { - return noc == 0 ? NCRISC_NOC0_NONPOSTED_WR_ACK_RECEIVED : NCRISC_NOC1_NONPOSTED_WR_ACK_RECEIVED; - } + return noc_counter_addresses[proc_t][static_cast>(barrier_type)][noc]; } // noc_nonposted_writes_acked -template -inline __attribute__((always_inline)) uint32_t get_noc_nonposted_writes_acked(uint32_t noc) { - uint32_t counter_addr = get_noc_counter_address(noc); +template +inline __attribute__((always_inline)) uint32_t get_noc_counter_val(uint32_t noc) { + uint32_t counter_addr = get_noc_counter_address(noc); return NOC_READ_REG(counter_addr); } -template -inline __attribute__((always_inline)) void inc_noc_nonposted_writes_acked(uint32_t noc, uint32_t inc = 1) { - uint32_t counter_addr = get_noc_counter_address(noc); +template +inline __attribute__((always_inline)) void inc_noc_counter_val(uint32_t noc, uint32_t inc = 1) { + uint32_t counter_addr = get_noc_counter_address(noc); uint32_t val = NOC_READ_REG(counter_addr) + inc; NOC_WRITE_REG(counter_addr, val); } -template -inline __attribute__((always_inline)) void set_noc_nonposted_writes_acked(uint32_t noc, uint32_t val) { - uint32_t counter_addr = get_noc_counter_address(noc); +template +inline __attribute__((always_inline)) void set_noc_counter_val(uint32_t noc, uint32_t val) { + uint32_t counter_addr = get_noc_counter_address(noc); NOC_WRITE_REG(counter_addr, val); } @@ -119,14 +150,30 @@ inline __attribute__((always_inline)) bool noc_cmd_buf_ready(uint32_t noc, uint3 return (NOC_CMD_BUF_READ_REG(noc, cmd_buf, NOC_CMD_CTRL) == NOC_CTRL_STATUS_READY); } +template inline __attribute__((always_inline)) void ncrisc_noc_fast_read( uint32_t noc, uint32_t cmd_buf, uint64_t src_addr, uint32_t dest_addr, uint32_t len_bytes) { + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + uint32_t noc_rd_cmd_field = + NOC_CMD_CPY | NOC_CMD_RD | NOC_CMD_RESP_MARKED | NOC_CMD_VC_STATIC | NOC_CMD_STATIC_VC(1); + NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_CTRL, noc_rd_cmd_field); + } NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_RET_ADDR_LO, dest_addr); NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_TARG_ADDR_LO, (uint32_t)src_addr); NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_TARG_ADDR_COORDINATE, (uint32_t)(src_addr >> NOC_ADDR_COORD_SHIFT)); NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_AT_LEN_BE, len_bytes); NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); - noc_reads_num_issued[noc] += 1; + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + inc_noc_counter_val(noc, 1); + } else { + noc_reads_num_issued[noc] += 1; + } +} + +inline __attribute__((always_inline)) bool ncrisc_dynamic_noc_reads_flushed(uint32_t noc) { + uint32_t self_risc_acked = get_noc_counter_val(noc); + uint32_t other_risc_acked = get_noc_counter_val<1 - proc_type, NocBarrierType::READS_NUM_ISSUED>(noc); + return (NOC_STATUS_READ_REG(noc, NIU_MST_RD_RESP_RECEIVED) == (self_risc_acked + other_risc_acked)); } inline __attribute__((always_inline)) bool ncrisc_noc_reads_flushed(uint32_t noc) { @@ -138,7 +185,7 @@ inline __attribute__((always_inline)) bool ncrisc_noc_read_with_transaction_id_f return (NOC_STATUS_READ_REG(noc, NIU_MST_REQS_OUTSTANDING_ID(transcation_id)) == 0); } -template +template inline __attribute__((always_inline)) void ncrisc_noc_fast_write( uint32_t noc, uint32_t cmd_buf, @@ -167,13 +214,17 @@ inline __attribute__((always_inline)) void ncrisc_noc_fast_write( NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_RET_ADDR_COORDINATE, (uint32_t)(dest_addr >> NOC_ADDR_COORD_SHIFT)); NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_AT_LEN_BE, len_bytes); NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); - if constexpr (noc_mode == DM_DYNAMIC_NOC) { - if (!posted) { - inc_noc_nonposted_writes_acked(noc, num_dests); + + if (posted) { + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + inc_noc_counter_val(noc, 1); + } else { + noc_posted_writes_num_issued[noc] += 1; } } else { - if (posted) { - noc_posted_writes_num_issued[noc] += 1; + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + inc_noc_counter_val(noc, 1); + inc_noc_counter_val(noc, num_dests); } else { noc_nonposted_writes_num_issued[noc] += 1; noc_nonposted_writes_acked[noc] += num_dests; @@ -181,7 +232,7 @@ inline __attribute__((always_inline)) void ncrisc_noc_fast_write( } } -template +template inline __attribute__((always_inline)) void ncrisc_noc_fast_write_loopback_src( uint32_t noc, uint32_t cmd_buf, @@ -205,14 +256,15 @@ inline __attribute__((always_inline)) void ncrisc_noc_fast_write_loopback_src( NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_AT_LEN_BE, len_bytes); NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); if constexpr (noc_mode == DM_DYNAMIC_NOC) { - inc_noc_nonposted_writes_acked(noc, num_dests); + inc_noc_counter_val(noc, 1); + inc_noc_counter_val(noc, num_dests); } else { noc_nonposted_writes_num_issued[noc] += 1; noc_nonposted_writes_acked[noc] += num_dests; } } -template +template inline __attribute__((always_inline)) void ncrisc_noc_blitz_write_setup( uint32_t noc, uint32_t cmd_buf, uint64_t dest_addr, uint32_t len_bytes, uint32_t vc, uint32_t num_times_to_write) { uint32_t noc_cmd_field = NOC_CMD_CPY | NOC_CMD_WR | NOC_CMD_VC_STATIC | NOC_CMD_STATIC_VC(vc) | NOC_CMD_RESP_MARKED; @@ -222,25 +274,37 @@ inline __attribute__((always_inline)) void ncrisc_noc_blitz_write_setup( NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_AT_LEN_BE, len_bytes); NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_RET_ADDR_COORDINATE, (uint32_t)(dest_addr >> NOC_ADDR_COORD_SHIFT)); if constexpr (noc_mode == DM_DYNAMIC_NOC) { - inc_noc_nonposted_writes_acked(noc, num_times_to_write); + inc_noc_counter_val(noc, num_times_to_write); + inc_noc_counter_val(noc, num_times_to_write); } else { noc_nonposted_writes_num_issued[noc] += num_times_to_write; noc_nonposted_writes_acked[noc] += num_times_to_write; } } +inline __attribute__((always_inline)) bool ncrisc_dynamic_noc_nonposted_writes_sent(uint32_t noc) { + uint32_t self_risc_acked = get_noc_counter_val(noc); + uint32_t other_risc_acked = get_noc_counter_val<1 - proc_type, NocBarrierType::NONPOSTED_WRITES_NUM_ISSUED>(noc); + return (NOC_STATUS_READ_REG(noc, NIU_MST_NONPOSTED_WR_REQ_SENT) == (self_risc_acked + other_risc_acked)); +} + inline __attribute__((always_inline)) bool ncrisc_noc_nonposted_writes_sent(uint32_t noc) { return (NOC_STATUS_READ_REG(noc, NIU_MST_NONPOSTED_WR_REQ_SENT) == noc_nonposted_writes_num_issued[noc]); } +inline __attribute__((always_inline)) bool ncrisc_dynamic_noc_posted_writes_sent(uint32_t noc) { + uint32_t self_risc_acked = get_noc_counter_val(noc); + uint32_t other_risc_acked = get_noc_counter_val<1 - proc_type, NocBarrierType::POSTED_WRITES_NUM_ISSUED>(noc); + return (NOC_STATUS_READ_REG(noc, NIU_MST_POSTED_WR_REQ_SENT) == (self_risc_acked + other_risc_acked)); +} + inline __attribute__((always_inline)) bool ncrisc_noc_posted_writes_sent(uint32_t noc) { return (NOC_STATUS_READ_REG(noc, NIU_MST_POSTED_WR_REQ_SENT) == noc_posted_writes_num_issued[noc]); } -template inline __attribute__((always_inline)) bool ncrisc_dynamic_noc_nonposted_writes_flushed(uint32_t noc) { - uint32_t self_risc_acked = get_noc_nonposted_writes_acked(noc); - uint32_t other_risc_acked = get_noc_nonposted_writes_acked<1 - proc_type>(noc); + uint32_t self_risc_acked = get_noc_counter_val(noc); + uint32_t other_risc_acked = get_noc_counter_val<1 - proc_type, NocBarrierType::NONPOSTED_WRITES_ACKED>(noc); return (NOC_STATUS_READ_REG(noc, NIU_MST_WR_ACK_RECEIVED) == (self_risc_acked + other_risc_acked)); } @@ -258,6 +322,12 @@ inline __attribute__((always_inline)) bool ncrisc_noc_nonposted_write_with_trans return (NOC_STATUS_READ_REG(noc, NIU_MST_REQS_OUTSTANDING_ID(transcation_id)) == 0); } +inline __attribute__((always_inline)) bool ncrisc_dynamic_noc_nonposted_atomics_flushed(uint32_t noc) { + uint32_t self_risc_acked = get_noc_counter_val(noc); + uint32_t other_risc_acked = get_noc_counter_val<1 - proc_type, NocBarrierType::NONPOSTED_ATOMICS_ACKED>(noc); + return (NOC_STATUS_READ_REG(noc, NIU_MST_ATOMIC_RESP_RECEIVED) == (self_risc_acked + other_risc_acked)); +} + inline __attribute__((always_inline)) bool ncrisc_noc_nonposted_atomics_flushed(uint32_t noc) { return (NOC_STATUS_READ_REG(noc, NIU_MST_ATOMIC_RESP_RECEIVED) == noc_nonposted_atomics_acked[noc]); } @@ -296,11 +366,7 @@ inline __attribute__((always_inline)) void dynamic_noc_init() { uint32_t my_y = (noc_id_reg >> NOC_ADDR_NODE_ID_BITS) & NOC_NODE_ID_MASK; uint64_t xy_local_addr = NOC_XY_ADDR(my_x, my_y, 0); - uint32_t noc_rd_cmd_field = - NOC_CMD_CPY | NOC_CMD_RD | NOC_CMD_RESP_MARKED | NOC_CMD_VC_STATIC | NOC_CMD_STATIC_VC(1); - // program brisc cmd_buf 0 - NOC_CMD_BUF_WRITE_REG(noc, DYNAMIC_NOC_BRISC_RD_CMD_BUF, NOC_CTRL, noc_rd_cmd_field); NOC_CMD_BUF_WRITE_REG( noc, DYNAMIC_NOC_BRISC_RD_CMD_BUF, @@ -315,7 +381,6 @@ inline __attribute__((always_inline)) void dynamic_noc_init() { (uint32_t)(xy_local_addr >> NOC_ADDR_COORD_SHIFT)); // program ncrisc cmd_buf 2 - NOC_CMD_BUF_WRITE_REG(noc, DYNAMIC_NOC_NCRISC_RD_CMD_BUF, NOC_CTRL, noc_rd_cmd_field); NOC_CMD_BUF_WRITE_REG( noc, DYNAMIC_NOC_NCRISC_RD_CMD_BUF, @@ -348,10 +413,45 @@ inline __attribute__((always_inline)) void noc_local_state_init(int noc) { } inline __attribute__((always_inline)) void dynamic_noc_local_state_init() { - set_noc_nonposted_writes_acked>(TensixProcessorTypes::DM0)>(NOC_0, NOC_STATUS_READ_REG(NOC_0, NIU_MST_WR_ACK_RECEIVED)); - set_noc_nonposted_writes_acked>(TensixProcessorTypes::DM0)>(NOC_1, 0); - set_noc_nonposted_writes_acked>(TensixProcessorTypes::DM1)>(NOC_0, 0); - set_noc_nonposted_writes_acked>(TensixProcessorTypes::DM1)>(NOC_1, NOC_STATUS_READ_REG(NOC_1, NIU_MST_WR_ACK_RECEIVED)); + using underlying_tensix_processor_types_t = std::underlying_type_t; + constexpr underlying_tensix_processor_types_t dm0 = + static_cast(TensixProcessorTypes::DM0); + constexpr underlying_tensix_processor_types_t dm1 = + static_cast(TensixProcessorTypes::DM1); + set_noc_counter_val( + NOC_0, NOC_STATUS_READ_REG(NOC_0, NIU_MST_RD_RESP_RECEIVED)); + set_noc_counter_val(NOC_1, 0); + set_noc_counter_val(NOC_0, 0); + set_noc_counter_val( + NOC_1, NOC_STATUS_READ_REG(NOC_1, NIU_MST_RD_RESP_RECEIVED)); + + set_noc_counter_val( + NOC_0, NOC_STATUS_READ_REG(NOC_0, NIU_MST_NONPOSTED_WR_REQ_SENT)); + set_noc_counter_val(NOC_1, 0); + set_noc_counter_val(NOC_0, 0); + set_noc_counter_val( + NOC_1, NOC_STATUS_READ_REG(NOC_1, NIU_MST_NONPOSTED_WR_REQ_SENT)); + + set_noc_counter_val( + NOC_0, NOC_STATUS_READ_REG(NOC_0, NIU_MST_WR_ACK_RECEIVED)); + set_noc_counter_val(NOC_1, 0); + set_noc_counter_val(NOC_0, 0); + set_noc_counter_val( + NOC_1, NOC_STATUS_READ_REG(NOC_1, NIU_MST_WR_ACK_RECEIVED)); + + set_noc_counter_val( + NOC_0, NOC_STATUS_READ_REG(NOC_0, NIU_MST_ATOMIC_RESP_RECEIVED)); + set_noc_counter_val(NOC_1, 0); + set_noc_counter_val(NOC_0, 0); + set_noc_counter_val( + NOC_1, NOC_STATUS_READ_REG(NOC_1, NIU_MST_ATOMIC_RESP_RECEIVED)); + + set_noc_counter_val( + NOC_0, NOC_STATUS_READ_REG(NOC_0, NIU_MST_POSTED_WR_REQ_SENT)); + set_noc_counter_val(NOC_1, 0); + set_noc_counter_val(NOC_0, 0); + set_noc_counter_val( + NOC_1, NOC_STATUS_READ_REG(NOC_1, NIU_MST_POSTED_WR_REQ_SENT)); } inline __attribute__((always_inline)) void ncrisc_noc_counters_init() { @@ -381,20 +481,21 @@ inline __attribute__((always_inline)) void ncrisc_noc_full_sync() { } } +template inline __attribute__((always_inline)) void ncrisc_noc_fast_read_any_len( uint32_t noc, uint32_t cmd_buf, uint64_t src_addr, uint32_t dest_addr, uint32_t len_bytes) { while (len_bytes > NOC_MAX_BURST_SIZE) { while (!noc_cmd_buf_ready(noc, cmd_buf)); - ncrisc_noc_fast_read(noc, cmd_buf, src_addr, dest_addr, NOC_MAX_BURST_SIZE); + ncrisc_noc_fast_read(noc, cmd_buf, src_addr, dest_addr, NOC_MAX_BURST_SIZE); src_addr += NOC_MAX_BURST_SIZE; dest_addr += NOC_MAX_BURST_SIZE; len_bytes -= NOC_MAX_BURST_SIZE; } while (!noc_cmd_buf_ready(noc, cmd_buf)); - ncrisc_noc_fast_read(noc, cmd_buf, src_addr, dest_addr, len_bytes); + ncrisc_noc_fast_read(noc, cmd_buf, src_addr, dest_addr, len_bytes); } -template +template inline __attribute__((always_inline)) void ncrisc_noc_fast_write_any_len( uint32_t noc, uint32_t cmd_buf, @@ -411,7 +512,7 @@ inline __attribute__((always_inline)) void ncrisc_noc_fast_write_any_len( if constexpr (!one_packet) { while (len_bytes > NOC_MAX_BURST_SIZE) { while (!noc_cmd_buf_ready(noc, cmd_buf)); - ncrisc_noc_fast_write( + ncrisc_noc_fast_write( noc, cmd_buf, src_addr, @@ -430,7 +531,7 @@ inline __attribute__((always_inline)) void ncrisc_noc_fast_write_any_len( } } while (!noc_cmd_buf_ready(noc, cmd_buf)); - ncrisc_noc_fast_write( + ncrisc_noc_fast_write( noc, cmd_buf, src_addr, @@ -445,7 +546,7 @@ inline __attribute__((always_inline)) void ncrisc_noc_fast_write_any_len( trid); } -template +template inline __attribute__((always_inline)) void ncrisc_noc_fast_write_any_len_loopback_src( uint32_t noc, uint32_t cmd_buf, @@ -459,7 +560,7 @@ inline __attribute__((always_inline)) void ncrisc_noc_fast_write_any_len_loopbac bool multicast_path_reserve) { while (len_bytes > NOC_MAX_BURST_SIZE) { while (!noc_cmd_buf_ready(noc, cmd_buf)); - ncrisc_noc_fast_write_loopback_src( + ncrisc_noc_fast_write_loopback_src( noc, cmd_buf, src_addr, @@ -475,11 +576,11 @@ inline __attribute__((always_inline)) void ncrisc_noc_fast_write_any_len_loopbac len_bytes -= NOC_MAX_BURST_SIZE; } while (!noc_cmd_buf_ready(noc, cmd_buf)); - ncrisc_noc_fast_write_loopback_src( + ncrisc_noc_fast_write_loopback_src( noc, cmd_buf, src_addr, dest_addr, len_bytes, vc, mcast, linked, num_dests, multicast_path_reserve); } -template +template inline __attribute__((always_inline)) void noc_fast_write_dw_inline( uint32_t noc, uint32_t cmd_buf, @@ -507,13 +608,17 @@ inline __attribute__((always_inline)) void noc_fast_write_dw_inline( NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_TARG_ADDR_COORDINATE, (uint32_t)(dest_addr >> NOC_ADDR_COORD_SHIFT)); NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_AT_LEN_BE, be32); NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); - if constexpr (noc_mode == DM_DYNAMIC_NOC) { - if (!posted) { - inc_noc_nonposted_writes_acked(noc); + + if (posted) { + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + inc_noc_counter_val(noc, 1); + } else { + noc_posted_writes_num_issued[noc] += 1; } } else { - if (posted) { - noc_posted_writes_num_issued[noc] += 1; + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + inc_noc_counter_val(noc, 1); + inc_noc_counter_val(noc, 1); } else { noc_nonposted_writes_num_issued[noc] += 1; noc_nonposted_writes_acked[noc] += 1; @@ -556,11 +661,16 @@ inline __attribute__((always_inline)) void noc_fast_atomic_increment( NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_AT_DATA, incr); NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_CMD_CTRL, 0x1); if (!posted) { - noc_nonposted_atomics_acked[noc] += 1; + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + inc_noc_counter_val(noc, 1); + } else { + noc_nonposted_atomics_acked[noc] += 1; + } } } // issue noc reads while wait for outstanding transactions done +template inline __attribute__((always_inline)) void ncrisc_noc_fast_read_with_transaction_id( uint32_t noc, uint32_t cmd_buf, uint32_t src_base_addr, uint32_t src_addr, uint32_t dest_addr, uint32_t trid) { uint32_t src_addr_; @@ -572,7 +682,11 @@ inline __attribute__((always_inline)) void ncrisc_noc_fast_read_with_transaction NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_RET_ADDR_LO, dest_addr); NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_TARG_ADDR_LO, src_addr_); // (uint32_t)src_addr NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_CMD_CTRL, NOC_CTRL_SEND_REQ); - noc_reads_num_issued[noc] += 1; + if constexpr (noc_mode == DM_DYNAMIC_NOC) { + inc_noc_counter_val(noc, 1); + } else { + noc_reads_num_issued[noc] += 1; + } } // set transaction id for a noc read diff --git a/tt_metal/impl/dispatch/kernels/cq_dispatch_slave.cpp b/tt_metal/impl/dispatch/kernels/cq_dispatch_slave.cpp index 3b27f9cd4a1..e153613e19b 100644 --- a/tt_metal/impl/dispatch/kernels/cq_dispatch_slave.cpp +++ b/tt_metal/impl/dispatch/kernels/cq_dispatch_slave.cpp @@ -106,7 +106,7 @@ FORCE_INLINE void dispatch_s_noc_inline_dw_write(uint64_t addr, uint32_t val, uint8_t noc_id, uint8_t be = 0xF) { WAYPOINT("NWIW"); DEBUG_SANITIZE_NOC_ADDR(noc_id, addr, 4); - noc_fast_write_dw_inline( + noc_fast_write_dw_inline( noc_id, DISPATCH_S_WR_REG_CMD_BUF, val, diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp index 73ce063b8dd..fce4b34b0ff 100644 --- a/tt_metal/impl/program/program.cpp +++ b/tt_metal/impl/program/program.cpp @@ -388,6 +388,7 @@ KernelGroup::KernelGroup( } uint32_t processor_classes = hal.get_processor_classes_count(programmable_core_type_index); + std::set noc_modes; for (int class_id = 0; class_id < processor_classes; class_id++) { auto& optional_id = kernel_ids[class_id]; if (optional_id) { @@ -399,6 +400,7 @@ KernelGroup::KernelGroup( // The code below sets the brisc_noc_id for use by the device firmware // Use 0 if neither brisc nor ncrisc specify a noc if (class_id == utils::underlying_type(DataMovementProcessor::RISCV_0)) { + noc_modes.insert(std::get(kernel->config()).noc_mode); // Use brisc's noc if brisc specifies a noc this->launch_msg.kernel_config.brisc_noc_id = std::get(kernel->config()).noc; // if noc mode is already set to DM_DYNAMIC_NOC then we can't change back to DM_DEDICATED_NOC @@ -406,6 +408,7 @@ KernelGroup::KernelGroup( this->launch_msg.kernel_config.brisc_noc_mode = NOC_MODE::DM_DYNAMIC_NOC; } } else if (class_id == utils::underlying_type(DataMovementProcessor::RISCV_1)) { + noc_modes.insert(std::get(kernel->config()).noc_mode); // Use 1-ncrisc's noc (the other noc) if ncrisc specifies a noc // If both brisc and ncrisc set the noc, then this is safe due to prior correctness validation this->launch_msg.kernel_config.brisc_noc_id = 1 - std::get(kernel->config()).noc; @@ -417,6 +420,7 @@ KernelGroup::KernelGroup( } } } + TT_FATAL(noc_modes.size() <= 1, "KernelGroup must have the same noc mode for all kernels"); for (uint32_t index = 0; index < NUM_PROCESSORS_PER_CORE_TYPE; index ++) { this->kernel_bin_sizes[index] = 0; diff --git a/tt_metal/tools/profiler/kernel_profiler.hpp b/tt_metal/tools/profiler/kernel_profiler.hpp index a304454b82c..817f5e1847d 100644 --- a/tt_metal/tools/profiler/kernel_profiler.hpp +++ b/tt_metal/tools/profiler/kernel_profiler.hpp @@ -188,7 +188,7 @@ inline void __attribute__((always_inline)) profiler_noc_async_write_posted( std::uint32_t src_local_l1_addr, std::uint64_t dst_noc_addr, std::uint32_t size, uint8_t noc = noc_index) { WAYPOINT("NAWW"); DEBUG_SANITIZE_NOC_WRITE_TRANSACTION(noc, dst_noc_addr, src_local_l1_addr, size); - ncrisc_noc_fast_write_any_len( + ncrisc_noc_fast_write_any_len( noc, write_cmd_buf, src_local_l1_addr, dst_noc_addr, size, NOC_UNICAST_WRITE_VC, false, false, 1, true, true); WAYPOINT("NAWD"); } diff --git a/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_ring_all_gather.cpp b/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_ring_all_gather.cpp index 4a651612e1e..e814047bf1a 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_ring_all_gather.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_ring_all_gather.cpp @@ -78,4 +78,5 @@ void kernel_main() { } } } + noc_async_atomic_barrier(); } diff --git a/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_ring_all_gather.cpp b/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_ring_all_gather.cpp index 69af13c8d54..3eafb65e5aa 100644 --- a/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_ring_all_gather.cpp +++ b/ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_ring_all_gather.cpp @@ -96,5 +96,6 @@ void kernel_main() { #ifdef ENABLE_GLOBAL_CB experimental::update_remote_cb_config_in_l1(remote_cb_id); + noc_async_atomic_barrier(); #endif }