Skip to content

Commit

Permalink
#0: Add support for all dynamic noc barriers, fix fw logic
Browse files Browse the repository at this point in the history
  • Loading branch information
tt-aho authored and yugaoTT committed Feb 28, 2025
1 parent 6d48b82 commit 016f839
Show file tree
Hide file tree
Showing 25 changed files with 1,027 additions and 252 deletions.
179 changes: 129 additions & 50 deletions tests/tt_metal/tt_metal/noc/test_dynamic_noc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,13 @@ using namespace tt;
using namespace tt::test_utils;
using namespace tt::test_utils::df;

TEST_F(DeviceSingleCardFastSlowDispatchFixture, TestDynamicNoCAsyncWriteProgram) {
uint32_t NUM_PROGRAMS = 3;
uint32_t MAX_LOOP = 123456789;
uint32_t page_size = 1024;

void build_and_run_program(
tt::tt_metal::IDevice* device,
bool slow_dispatch,
uint32_t NUM_PROGRAMS,
uint32_t MAX_LOOP,
uint32_t page_size,
bool mix_noc_mode) {
// Make random
auto random_seed = 0; // (unsigned int)time(NULL);
uint32_t seed = tt::parse_env("TT_METAL_SEED", random_seed);
Expand All @@ -43,60 +45,137 @@ TEST_F(DeviceSingleCardFastSlowDispatchFixture, TestDynamicNoCAsyncWriteProgram)

log_info(tt::LogTest, "Starting compile of {} programs now.", NUM_PROGRAMS);

vector<Program> programs;
for (uint32_t i = 0; i < NUM_PROGRAMS; i++) {
programs.push_back(Program());
Program& program = programs.back();

if (i % 10 == 0) {
log_info(tt::LogTest, "Compiling program {} of {}", i + 1, NUM_PROGRAMS);
Program program1;
Program program2;

CircularBufferConfig cb_config =
CircularBufferConfig(page_size, {{0, tt::DataFormat::Float16_b}}).set_page_size(0, page_size);
auto cb1 = CreateCircularBuffer(program1, cr_set, cb_config);
auto cb2 = CreateCircularBuffer(program2, cr_set, cb_config);

vector<uint32_t> compile_args = {MAX_LOOP, page_size};

auto brisc_kernel1 = CreateKernel(
program1,
"tests/tt_metal/tt_metal/test_kernels/dataflow/dynamic_noc_writer.cpp",
cr_set,
DataMovementConfig{
.processor = DataMovementProcessor::RISCV_0,
.noc = NOC::RISCV_0_default,
.noc_mode = tt_metal::NOC_MODE::DM_DYNAMIC_NOC,
.compile_args = compile_args});

auto ncrisc_kernel1 = CreateKernel(
program1,
"tests/tt_metal/tt_metal/test_kernels/dataflow/dynamic_noc_writer.cpp",
cr_set,
DataMovementConfig{
.processor = DataMovementProcessor::RISCV_1,
.noc = NOC::RISCV_1_default,
.noc_mode = tt_metal::NOC_MODE::DM_DYNAMIC_NOC,
.compile_args = compile_args});

auto brisc_kernel2 = CreateKernel(
program2,
mix_noc_mode ? "tests/tt_metal/tt_metal/test_kernels/dataflow/dedicated_noc_writer.cpp"
: "tests/tt_metal/tt_metal/test_kernels/dataflow/dynamic_noc_writer.cpp",
cr_set,
DataMovementConfig{
.processor = DataMovementProcessor::RISCV_0,
.noc = NOC::RISCV_0_default,
.noc_mode = mix_noc_mode ? tt_metal::NOC_MODE::DM_DEDICATED_NOC : tt_metal::NOC_MODE::DM_DYNAMIC_NOC,
.compile_args = compile_args});

auto ncrisc_kernel2 = CreateKernel(
program2,
mix_noc_mode ? "tests/tt_metal/tt_metal/test_kernels/dataflow/dedicated_noc_writer.cpp"
: "tests/tt_metal/tt_metal/test_kernels/dataflow/dynamic_noc_writer.cpp",
cr_set,
DataMovementConfig{
.processor = DataMovementProcessor::RISCV_1,
.noc = NOC::RISCV_1_default,
.noc_mode = mix_noc_mode ? tt_metal::NOC_MODE::DM_DEDICATED_NOC : tt_metal::NOC_MODE::DM_DYNAMIC_NOC,
.compile_args = compile_args});

for (int core_idx_y = 0; core_idx_y < worker_grid_size.y; core_idx_y++) {
for (int core_idx_x = 0; core_idx_x < worker_grid_size.x; core_idx_x++) {
CoreCoord core = {(std::size_t)core_idx_x, (std::size_t)core_idx_y};
CoreCoord neighbour_core = {core_idx_x == worker_grid_size.x - 1 ? 0 : core_idx_x + 1, core_idx_y};
CoreCoord neighbour_core_physical = device->worker_core_from_logical_core(neighbour_core);
// mcast
auto device_grid = device->compute_with_storage_grid_size();
CoreCoord top_left_core = {0, 0};
CoreCoord top_left_core_physical = device->worker_core_from_logical_core(top_left_core);
CoreCoord bottom_right_core = {device_grid.x - 1, device_grid.y - 1};
CoreCoord bottom_right_core_physical = device->worker_core_from_logical_core(bottom_right_core);
std::vector<uint32_t> rt_args = {
(std::uint32_t)neighbour_core_physical.x,
(std::uint32_t)neighbour_core_physical.y,
// mcast
(core_idx_x == 0 && core_idx_y == 0) ? true : false,
top_left_core_physical.x,
top_left_core_physical.y,
bottom_right_core_physical.x,
bottom_right_core_physical.y,
device_grid.x * device_grid.y};
tt::tt_metal::SetRuntimeArgs(program1, brisc_kernel1, core, rt_args);
tt::tt_metal::SetRuntimeArgs(program1, ncrisc_kernel1, core, rt_args);
tt::tt_metal::SetRuntimeArgs(program2, brisc_kernel2, core, rt_args);
tt::tt_metal::SetRuntimeArgs(program2, ncrisc_kernel2, core, rt_args);
}

CircularBufferConfig cb_config = CircularBufferConfig(page_size, {{0, tt::DataFormat::Float16_b}}).set_page_size(0, page_size);
auto cb = CreateCircularBuffer(program, cr_set, cb_config);

vector<uint32_t> compile_args = {MAX_LOOP, page_size};

auto brisc_kernel = CreateKernel(
program, "tests/tt_metal/tt_metal/test_kernels/dataflow/dynamic_noc_writer.cpp", cr_set, DataMovementConfig{
.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default, .noc_mode = tt_metal::NOC_MODE::DM_DYNAMIC_NOC, .compile_args = compile_args});

auto ncrisc_kernel = CreateKernel(
program, "tests/tt_metal/tt_metal/test_kernels/dataflow/dynamic_noc_writer.cpp", cr_set, DataMovementConfig{
.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default, .noc_mode = tt_metal::NOC_MODE::DM_DYNAMIC_NOC, .compile_args = compile_args});

for(int core_idx_y = 0; core_idx_y < worker_grid_size.y; core_idx_y++) {
for(int core_idx_x = 0; core_idx_x < worker_grid_size.x; core_idx_x++) {
CoreCoord core = {(std::size_t) core_idx_x, (std::size_t) core_idx_y};
CoreCoord neighbour_core = {core_idx_x == worker_grid_size.x - 1 ? 0 : core_idx_x + 1, core_idx_y};
CoreCoord neighbour_core_physical = device_->worker_core_from_logical_core(neighbour_core);
std::vector<uint32_t> rt_args = {
(std::uint32_t) neighbour_core_physical.x,
(std::uint32_t) neighbour_core_physical.y,
};
tt::tt_metal::SetRuntimeArgs(program, brisc_kernel, core, rt_args);
tt::tt_metal::SetRuntimeArgs(program, ncrisc_kernel, core, rt_args);
}
}

tt::tt_metal::detail::CompileProgram(this->device_, program);
}

log_info(tt::LogTest, "Running {} programs for cache warmup.", programs.size());
// This loop caches program and runs
tt::tt_metal::detail::CompileProgram(device, program1);
tt::tt_metal::detail::CompileProgram(device, program2);

// This loop caches program1 and runs
for (uint32_t i = 0; i < NUM_PROGRAMS; i++) {
Program& program = programs[i];
log_info(tt::LogTest, "Running program {} of {}", i + 1, NUM_PROGRAMS);
if (this->slow_dispatch_) {
tt::tt_metal::detail::LaunchProgram(this->device_, program);
log_info(tt::LogTest, "Running program1 {} of {}", i + 1, NUM_PROGRAMS);
if (i % 2 == 0) {
if (slow_dispatch) {
tt::tt_metal::detail::LaunchProgram(device, program1);
} else {
EnqueueProgram(device->command_queue(), program1, false);
}
} else {
EnqueueProgram(this->device_->command_queue(), program, false);
if (slow_dispatch) {
tt::tt_metal::detail::LaunchProgram(device, program2);
} else {
EnqueueProgram(device->command_queue(), program2, false);
}
}
}
if (!this->slow_dispatch_) {
Finish(this->device_->command_queue());
if (!slow_dispatch) {
Finish(device->command_queue());
log_info(tt::LogTest, "Finish FD runs");
} else {
log_info(tt::LogTest, "Finish SD runs");
}
}

TEST_F(DeviceSingleCardFastSlowDispatchFixture, TestDynamicNoCOneProgram) {
uint32_t NUM_PROGRAMS = 1;
uint32_t MAX_LOOP = 65536;
uint32_t page_size = 1024;
bool mix_noc_mode = false;

build_and_run_program(this->device_, this->slow_dispatch_, NUM_PROGRAMS, MAX_LOOP, page_size, mix_noc_mode);
}

TEST_F(DeviceSingleCardFastSlowDispatchFixture, TestDynamicNoCMutlipleProgram) {
uint32_t NUM_PROGRAMS = 3;
uint32_t MAX_LOOP = 65536;
uint32_t page_size = 1024;
bool mix_noc_mode = false;

build_and_run_program(this->device_, this->slow_dispatch_, NUM_PROGRAMS, MAX_LOOP, page_size, mix_noc_mode);
}

TEST_F(DeviceSingleCardFastSlowDispatchFixture, TestDynamicNoCMutlipleProgramMixedMode) {
uint32_t NUM_PROGRAMS = 5;
uint32_t MAX_LOOP = 65536;
uint32_t page_size = 1024;
bool mix_noc_mode = true;

build_and_run_program(this->device_, this->slow_dispatch_, NUM_PROGRAMS, MAX_LOOP, page_size, mix_noc_mode);
}
114 changes: 114 additions & 0 deletions tests/tt_metal/tt_metal/test_kernels/dataflow/dedicated_noc_writer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <cstdint>

#include "dataflow_api.h"
#include "debug/dprint.h"

void kernel_main() {
constexpr std::uint32_t iteration = get_compile_time_arg_val(0);
constexpr std::uint32_t page_size = get_compile_time_arg_val(1);

std::uint32_t noc_x = get_arg_val<uint32_t>(0);
std::uint32_t noc_y = get_arg_val<uint32_t>(1);
bool mcast = get_arg_val<uint32_t>(2);
std::uint32_t top_left_core_x = get_arg_val<uint32_t>(3);
std::uint32_t top_left_core_y = get_arg_val<uint32_t>(4);
std::uint32_t bottom_right_core_x = get_arg_val<uint32_t>(5);
std::uint32_t bottom_right_core_y = get_arg_val<uint32_t>(6);
std::uint32_t num_dests = get_arg_val<uint32_t>(7);

constexpr uint32_t cb_id = 0;
uint32_t l1_read_addr = get_read_ptr(cb_id);

uint64_t mcast_addr_self_noc;
if (noc_index == 0) {
mcast_addr_self_noc = get_noc_multicast_addr(
top_left_core_x, top_left_core_y, bottom_right_core_x, bottom_right_core_y, l1_read_addr, noc_index);
} else {
mcast_addr_self_noc = get_noc_multicast_addr(
bottom_right_core_x, bottom_right_core_y, top_left_core_x, top_left_core_y, l1_read_addr, noc_index);
}

uint64_t addr_self_noc = get_noc_addr(noc_x, noc_y, l1_read_addr, noc_index);

DPRINT << "Start" << ENDL();

// Test stateful read API
noc_async_read_set_state(addr_self_noc, noc_index);
for (uint32_t i = 0; i < iteration; i++) {
noc_async_read_with_state(l1_read_addr, l1_read_addr, page_size, noc_index);
}

// Test stateful read one packet API
noc_async_read_one_packet_set_state(addr_self_noc, page_size, noc_index);
for (uint32_t i = 0; i < iteration; i++) {
noc_async_read_one_packet_with_state(l1_read_addr, l1_read_addr, noc_index);
}

// Test stateful write one packet API
noc_async_write_one_packet_set_state(addr_self_noc, page_size, noc_index);
for (uint32_t i = 0; i < iteration; i++) {
noc_async_write_one_packet_with_state(l1_read_addr, l1_read_addr, noc_index);
}

// Test gen_fast
const InterleavedAddrGenFast<false> s0 = {
.bank_base_address = l1_read_addr, .page_size = page_size, .data_format = DataFormat::Float16_b};

for (uint32_t i = 0; i < iteration; i++) {
uint32_t noc = noc_index;

// uint32_t noc = noc_index;
uint64_t noc_addr = get_noc_addr(noc_x, noc_y, l1_read_addr, noc);

// Test read
noc_async_read_one_packet(noc_addr, l1_read_addr, page_size, noc);
noc_async_read(noc_addr, l1_read_addr, page_size, noc);
// interleaved read
noc_async_read_tile(i % 1024, s0, l1_read_addr, 0, noc);

// Test semaphore
noc_semaphore_inc(noc_addr, 1, noc);
noc_semaphore_set_remote(l1_read_addr, noc_addr, noc);

// Test write
noc_async_write(l1_read_addr, noc_addr, page_size, noc);
noc_async_write_one_packet(l1_read_addr, noc_addr, page_size, noc);
// interleaved write
noc_async_write_tile(i % 1024, s0, l1_read_addr, noc);

// Test mcast
if (mcast) {
// write mcast
noc_async_write_multicast_one_packet(
l1_read_addr, mcast_addr_self_noc, page_size, num_dests - 1, false, true, noc);
noc_async_write_multicast(l1_read_addr, mcast_addr_self_noc, page_size, num_dests - 1, false, true, noc);
noc_async_write_multicast_loopback_src(
l1_read_addr, mcast_addr_self_noc, page_size, num_dests, false, true, noc);
// semaphore mcast
noc_semaphore_set_multicast(l1_read_addr, mcast_addr_self_noc, num_dests - 1, false, true, noc);
noc_semaphore_set_multicast_loopback_src(l1_read_addr, mcast_addr_self_noc, num_dests, false, true, noc);
}

// dw_write skip BH since there's HW issue
#ifndef ARCH_BLACKHOLE
noc_inline_dw_write(noc_addr, 1, 0xF, noc);
#endif
}

DPRINT << "END" << ENDL();
DPRINT << "noc_mode " << (uint)noc_mode << ENDL();

// Barrier test - test barrier itself working properly
for (int noc = 0; noc < NUM_NOCS; noc++) {
noc_async_read_barrier();
noc_async_write_barrier();
noc_async_writes_flushed();
noc_async_posted_writes_flushed();
noc_async_atomic_barrier();
noc_async_full_barrier();
}
}
Loading

0 comments on commit 016f839

Please sign in to comment.