Commit

#0: add tests fix for BH
yugaoTT committed Feb 28, 2025
1 parent a245aed commit a4422ec
Showing 8 changed files with 273 additions and 85 deletions.
165 changes: 92 additions & 73 deletions tests/tt_metal/tt_metal/noc/test_dynamic_noc.cpp
@@ -45,85 +45,104 @@ void build_and_run_program(

log_info(tt::LogTest, "Starting compile of {} programs now.", NUM_PROGRAMS);

vector<Program> programs;
for (uint32_t i = 0; i < NUM_PROGRAMS; i++) {
programs.push_back(Program());
Program& program = programs.back();

tt_metal::NOC_MODE noc_mode;
if (mix_noc_mode) {
noc_mode = i % 2 == 0 ? tt_metal::NOC_MODE::DM_DYNAMIC_NOC : tt_metal::NOC_MODE::DM_DEDICATED_NOC;
} else {
noc_mode = tt_metal::NOC_MODE::DM_DYNAMIC_NOC;
}

if (i % 10 == 0) {
log_info(tt::LogTest, "Compiling program {} of {}", i + 1, NUM_PROGRAMS);
}

CircularBufferConfig cb_config = CircularBufferConfig(page_size, {{0, tt::DataFormat::Float16_b}}).set_page_size(0, page_size);
auto cb = CreateCircularBuffer(program, cr_set, cb_config);

vector<uint32_t> compile_args = {MAX_LOOP, page_size};

auto brisc_kernel = CreateKernel(
program,
"tests/tt_metal/tt_metal/test_kernels/dataflow/dynamic_noc_writer.cpp",
cr_set,
DataMovementConfig{
.processor = DataMovementProcessor::RISCV_0,
.noc = NOC::RISCV_0_default,
.noc_mode = noc_mode,
.compile_args = compile_args});

auto ncrisc_kernel = CreateKernel(
program,
"tests/tt_metal/tt_metal/test_kernels/dataflow/dynamic_noc_writer.cpp",
cr_set,
DataMovementConfig{
.processor = DataMovementProcessor::RISCV_1,
.noc = NOC::RISCV_1_default,
.noc_mode = noc_mode,
.compile_args = compile_args});

for(int core_idx_y = 0; core_idx_y < worker_grid_size.y; core_idx_y++) {
for(int core_idx_x = 0; core_idx_x < worker_grid_size.x; core_idx_x++) {
CoreCoord core = {(std::size_t) core_idx_x, (std::size_t) core_idx_y};
CoreCoord neighbour_core = {core_idx_x == worker_grid_size.x - 1 ? 0 : core_idx_x + 1, core_idx_y};
CoreCoord neighbour_core_physical = device->worker_core_from_logical_core(neighbour_core);
Program program1;
Program program2;

CircularBufferConfig cb_config =
CircularBufferConfig(page_size, {{0, tt::DataFormat::Float16_b}}).set_page_size(0, page_size);
auto cb1 = CreateCircularBuffer(program1, cr_set, cb_config);
auto cb2 = CreateCircularBuffer(program2, cr_set, cb_config);

vector<uint32_t> compile_args = {MAX_LOOP, page_size};

auto brisc_kernel1 = CreateKernel(
program1,
"tests/tt_metal/tt_metal/test_kernels/dataflow/dynamic_noc_writer.cpp",
cr_set,
DataMovementConfig{
.processor = DataMovementProcessor::RISCV_0,
.noc = NOC::RISCV_0_default,
.noc_mode = tt_metal::NOC_MODE::DM_DYNAMIC_NOC,
.compile_args = compile_args});

auto ncrisc_kernel1 = CreateKernel(
program1,
"tests/tt_metal/tt_metal/test_kernels/dataflow/dynamic_noc_writer.cpp",
cr_set,
DataMovementConfig{
.processor = DataMovementProcessor::RISCV_1,
.noc = NOC::RISCV_1_default,
.noc_mode = tt_metal::NOC_MODE::DM_DYNAMIC_NOC,
.compile_args = compile_args});

auto brisc_kernel2 = CreateKernel(
program2,
mix_noc_mode ? "tests/tt_metal/tt_metal/test_kernels/dataflow/dedicated_noc_writer.cpp"
: "tests/tt_metal/tt_metal/test_kernels/dataflow/dynamic_noc_writer.cpp",
cr_set,
DataMovementConfig{
.processor = DataMovementProcessor::RISCV_0,
.noc = NOC::RISCV_0_default,
.noc_mode = tt_metal::NOC_MODE::DM_DEDICATED_NOC,
.compile_args = compile_args});

auto ncrisc_kernel2 = CreateKernel(
program2,
mix_noc_mode ? "tests/tt_metal/tt_metal/test_kernels/dataflow/dedicated_noc_writer.cpp"
: "tests/tt_metal/tt_metal/test_kernels/dataflow/dynamic_noc_writer.cpp",
cr_set,
DataMovementConfig{
.processor = DataMovementProcessor::RISCV_1,
.noc = NOC::RISCV_1_default,
.noc_mode = tt_metal::NOC_MODE::DM_DEDICATED_NOC,
.compile_args = compile_args});
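// Context for the two programs above: DM_DYNAMIC_NOC lets each data-movement
// RISC issue transactions on either NOC and is exercised by program1, while
// DM_DEDICATED_NOC pins BRISC/NCRISC to their default NOCs in program2, which
// swaps in the dedicated_noc_writer kernel only when mix_noc_mode is set.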

for (int core_idx_y = 0; core_idx_y < worker_grid_size.y; core_idx_y++) {
for (int core_idx_x = 0; core_idx_x < worker_grid_size.x; core_idx_x++) {
CoreCoord core = {(std::size_t)core_idx_x, (std::size_t)core_idx_y};
CoreCoord neighbour_core = {core_idx_x == worker_grid_size.x - 1 ? 0 : core_idx_x + 1, core_idx_y};
CoreCoord neighbour_core_physical = device->worker_core_from_logical_core(neighbour_core);
// mcast
auto device_grid = device->compute_with_storage_grid_size();
CoreCoord top_left_core = {0, 0};
CoreCoord top_left_core_physical = device->worker_core_from_logical_core(top_left_core);
CoreCoord bottom_right_core = {device_grid.x - 1, device_grid.y - 1};
CoreCoord bottom_right_core_physical = device->worker_core_from_logical_core(bottom_right_core);
std::vector<uint32_t> rt_args = {
(std::uint32_t)neighbour_core_physical.x,
(std::uint32_t)neighbour_core_physical.y,
// mcast
auto device_grid = device->compute_with_storage_grid_size();
CoreCoord top_left_core = {0, 0};
CoreCoord top_left_core_physical = device->worker_core_from_logical_core(top_left_core);
CoreCoord bottom_right_core = {device_grid.x - 1, device_grid.y - 1};
CoreCoord bottom_right_core_physical = device->worker_core_from_logical_core(bottom_right_core);
std::vector<uint32_t> rt_args = {
(std::uint32_t)neighbour_core_physical.x,
(std::uint32_t)neighbour_core_physical.y,
// mcast
(core_idx_x == 0 && core_idx_y == 0) ? true : false,
top_left_core_physical.x,
top_left_core_physical.y,
bottom_right_core_physical.x,
bottom_right_core_physical.y,
device_grid.x * device_grid.y};
tt::tt_metal::SetRuntimeArgs(program, brisc_kernel, core, rt_args);
tt::tt_metal::SetRuntimeArgs(program, ncrisc_kernel, core, rt_args);
}
(core_idx_x == 0 && core_idx_y == 0) ? true : false,
top_left_core_physical.x,
top_left_core_physical.y,
bottom_right_core_physical.x,
bottom_right_core_physical.y,
device_grid.x * device_grid.y};
tt::tt_metal::SetRuntimeArgs(program1, brisc_kernel1, core, rt_args);
tt::tt_metal::SetRuntimeArgs(program1, ncrisc_kernel1, core, rt_args);
tt::tt_metal::SetRuntimeArgs(program2, brisc_kernel2, core, rt_args);
tt::tt_metal::SetRuntimeArgs(program2, ncrisc_kernel2, core, rt_args);
}

tt::tt_metal::detail::CompileProgram(device, program);
}

log_info(tt::LogTest, "Running {} programs for cache warmup.", programs.size());
// This loop caches program and runs
tt::tt_metal::detail::CompileProgram(device, program1);
tt::tt_metal::detail::CompileProgram(device, program2);

// This loop caches program1/program2 and runs them alternately
for (uint32_t i = 0; i < NUM_PROGRAMS; i++) {
Program& program = programs[i];
log_info(tt::LogTest, "Running program {} of {}", i + 1, NUM_PROGRAMS);
if (slow_dispatch) {
tt::tt_metal::detail::LaunchProgram(device, program);
log_info(tt::LogTest, "Running program1 {} of {}", i + 1, NUM_PROGRAMS);
if (i % 2 == 0) {
if (slow_dispatch) {
tt::tt_metal::detail::LaunchProgram(device, program1);
} else {
EnqueueProgram(device->command_queue(), program1, false);
}
} else {
EnqueueProgram(device->command_queue(), program, false);
if (slow_dispatch) {
tt::tt_metal::detail::LaunchProgram(device, program2);
} else {
EnqueueProgram(device->command_queue(), program2, false);
}
}
}
if (!slow_dispatch) {
114 changes: 114 additions & 0 deletions tests/tt_metal/tt_metal/test_kernels/dataflow/dedicated_noc_writer.cpp
@@ -0,0 +1,114 @@
// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <cstdint>

#include "dataflow_api.h"
#include "debug/dprint.h"

void kernel_main() {
constexpr std::uint32_t iteration = get_compile_time_arg_val(0);
constexpr std::uint32_t page_size = get_compile_time_arg_val(1);

std::uint32_t noc_x = get_arg_val<uint32_t>(0);
std::uint32_t noc_y = get_arg_val<uint32_t>(1);
bool mcast = get_arg_val<uint32_t>(2);
std::uint32_t top_left_core_x = get_arg_val<uint32_t>(3);
std::uint32_t top_left_core_y = get_arg_val<uint32_t>(4);
std::uint32_t bottom_right_core_x = get_arg_val<uint32_t>(5);
std::uint32_t bottom_right_core_y = get_arg_val<uint32_t>(6);
std::uint32_t num_dests = get_arg_val<uint32_t>(7);

constexpr uint32_t cb_id = 0;
uint32_t l1_read_addr = get_read_ptr(cb_id);

uint64_t mcast_addr_self_noc;
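// NOC 0 and NOC 1 route across the grid in opposite directions, so the
// multicast rectangle is given top-left -> bottom-right on NOC 0 and
// bottom-right -> top-left on NOC 1 (the two branches below).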
if (noc_index == 0) {
mcast_addr_self_noc = get_noc_multicast_addr(
top_left_core_x, top_left_core_y, bottom_right_core_x, bottom_right_core_y, l1_read_addr, noc_index);
} else {
mcast_addr_self_noc = get_noc_multicast_addr(
bottom_right_core_x, bottom_right_core_y, top_left_core_x, top_left_core_y, l1_read_addr, noc_index);
}

uint64_t addr_self_noc = get_noc_addr(noc_x, noc_y, l1_read_addr, noc_index);

DPRINT << "Start" << ENDL();

// Test stateful read API
noc_async_read_set_state(addr_self_noc, noc_index);
for (uint32_t i = 0; i < iteration; i++) {
noc_async_read_with_state(l1_read_addr, l1_read_addr, page_size, noc_index);
}

// Test stateful read one packet API
noc_async_read_one_packet_set_state(addr_self_noc, page_size, noc_index);
for (uint32_t i = 0; i < iteration; i++) {
noc_async_read_one_packet_with_state(l1_read_addr, l1_read_addr, noc_index);
}

// Test stateful write one packet API
noc_async_write_one_packet_set_state(addr_self_noc, page_size, noc_index);
for (uint32_t i = 0; i < iteration; i++) {
noc_async_write_one_packet_with_state(l1_read_addr, l1_read_addr, noc_index);
}

// Test gen_fast
const InterleavedAddrGenFast<false> s0 = {
.bank_base_address = l1_read_addr, .page_size = page_size, .data_format = DataFormat::Float16_b};
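// InterleavedAddrGenFast<false> (DRAM = false, i.e. L1-interleaved banking)
// maps a page index to a bank and offset, so the *_tile calls below can
// address pages by index alone.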

for (uint32_t i = 0; i < iteration; i++) {
uint32_t noc = noc_index;

uint64_t noc_addr = get_noc_addr(noc_x, noc_y, l1_read_addr, noc);

// Test read
noc_async_read_one_packet(noc_addr, l1_read_addr, page_size, noc);
noc_async_read(noc_addr, l1_read_addr, page_size, noc);
// interleaved read
noc_async_read_tile(i % 1024, s0, l1_read_addr, 0, noc);

// Test semaphore
noc_semaphore_inc(noc_addr, 1, noc);
noc_semaphore_set_remote(l1_read_addr, noc_addr, noc);

// Test write
noc_async_write(l1_read_addr, noc_addr, page_size, noc);
noc_async_write_one_packet(l1_read_addr, noc_addr, page_size, noc);
// interleaved write
noc_async_write_tile(i % 1024, s0, l1_read_addr, noc);

// Test mcast
if (mcast) {
// write mcast
noc_async_write_multicast_one_packet(
l1_read_addr, mcast_addr_self_noc, page_size, num_dests - 1, false, true, noc);
noc_async_write_multicast(l1_read_addr, mcast_addr_self_noc, page_size, num_dests - 1, false, true, noc);
noc_async_write_multicast_loopback_src(
l1_read_addr, mcast_addr_self_noc, page_size, num_dests, false, true, noc);
// semaphore mcast
noc_semaphore_set_multicast(l1_read_addr, mcast_addr_self_noc, num_dests - 1, false, true, noc);
noc_semaphore_set_multicast_loopback_src(l1_read_addr, mcast_addr_self_noc, num_dests, false, true, noc);
}

// dw_write: skipped on BH due to a HW issue
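// (noc_inline_dw_write carries its 32-bit payload inside the NOC request
// itself, with no read from L1; 0xF is the byte-enable mask selecting all
// four bytes.)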
#ifndef ARCH_BLACKHOLE
noc_inline_dw_write(noc_addr, 1, 0xF, noc);
#endif
}

DPRINT << "END" << ENDL();
DPRINT << "noc_mode " << (uint)noc_mode << ENDL();

// Barrier test - check that the barriers themselves work properly
for (int noc = 0; noc < NUM_NOCS; noc++) {
noc_async_read_barrier();
noc_async_write_barrier();
noc_async_writes_flushed();
noc_async_posted_writes_flushed();
noc_async_atomic_barrier();
noc_async_full_barrier();
}
}
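
The stateful APIs exercised above split a transfer into a one-time command
setup plus a cheap per-iteration issue. A minimal sketch of that pattern,
assuming the same dataflow_api.h environment and arguments as the kernel above
(noc_x, noc_y, l1_read_addr, page_size, iteration); this is an illustration,
not part of the commit:

#include "dataflow_api.h"

void stateful_read_pattern(
    uint32_t noc_x, uint32_t noc_y, uint32_t l1_read_addr,
    uint32_t page_size, uint32_t iteration) {
    uint64_t src_noc_addr = get_noc_addr(noc_x, noc_y, l1_read_addr, noc_index);
    // Program the read command buffer once...
    noc_async_read_one_packet_set_state(src_noc_addr, page_size, noc_index);
    for (uint32_t i = 0; i < iteration; i++) {
        // ...then each issue only patches the source/destination offsets.
        noc_async_read_one_packet_with_state(l1_read_addr, l1_read_addr, noc_index);
    }
    noc_async_read_barrier();  // drain outstanding reads before reusing the buffer
}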
tests/tt_metal/tt_metal/test_kernels/dataflow/dynamic_noc_writer.cpp
@@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.
// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

@@ -136,8 +136,10 @@ void kernel_main() {
noc);
}

// dw_write
// dw_write: skipped on BH due to a HW issue
#ifndef ARCH_BLACKHOLE
noc_inline_dw_write(noc_addr, 1, 0xF, noc);
#endif
}

// DRAM sharded read API
@@ -153,6 +155,8 @@ void kernel_main() {
noc_async_read_barrier_with_trid(i, 1 - noc_index);
}

// Skipped on BH: memory corruption observed with noc_async_write_barrier_with_trid
#ifndef ARCH_BLACKHOLE
// DRAM sharded write API
for (uint32_t i = 0; i < iteration; i++) {
uint32_t trid = i % 16 + 1;
@@ -163,8 +167,10 @@
noc_async_write_barrier_with_trid(i, noc_index);
noc_async_write_barrier_with_trid(i, 1 - noc_index);
}
#endif

DPRINT << "END" <<ENDL();
DPRINT << "noc_mode " << (uint)noc_mode << ENDL();

// barrier on all txns
for (int noc = 0; noc < NUM_NOCS; noc++) {
29 changes: 29 additions & 0 deletions tt_metal/hw/firmware/src/brisc.cc
@@ -29,6 +29,8 @@
#include "debug/waypoint.h"
#include "debug/dprint.h"
#include "debug/stack_usage.h"

#include "debug/ring_buffer.h"
// clang-format on

uint8_t noc_index;
@@ -587,6 +589,32 @@ int main() {
// messages in the ring buffer. Must be executed before the atomic increment, as after that the launch
// message is no longer owned by us.
CLEAR_PREVIOUS_LAUNCH_MESSAGE_ENTRY_FOR_WATCHER();

#if defined(ARCH_BLACKHOLE)
if (noc_mode == DM_DYNAMIC_NOC) {
noc_fast_atomic_increment<DM_DYNAMIC_NOC>(
noc_index,
NCRISC_AT_CMD_BUF,
dispatch_addr,
NOC_UNICAST_WRITE_VC,
1,
31 /*wrap*/,
false /*linked*/,
post_atomic_increments /*posted*/);
// barrier till the atomic response is back
while (!ncrisc_dynamic_noc_nonposted_atomics_flushed(noc_index));
} else {
noc_fast_atomic_increment(
noc_index,
NCRISC_AT_CMD_BUF,
dispatch_addr,
NOC_UNICAST_WRITE_VC,
1,
31 /*wrap*/,
false /*linked*/,
post_atomic_increments /*posted*/);
}
#else
noc_fast_atomic_increment(
noc_index,
NCRISC_AT_CMD_BUF,
@@ -596,6 +624,7 @@
31 /*wrap*/,
false /*linked*/,
post_atomic_increments /*posted*/);
#endif
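// On Blackhole the dynamic-NOC path above spins until all non-posted atomics
// are flushed before launch_msg_rd_ptr is advanced below and the launch
// message slot is recycled; the other paths issue the same increment without
// the wait.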
mailboxes->launch_msg_rd_ptr = (launch_msg_rd_ptr + 1) & (launch_msg_buffer_num_entries - 1);
}
}
1 change: 1 addition & 0 deletions tt_metal/hw/inc/blackhole/core_config.h
@@ -20,6 +20,7 @@ enum class EthProcessorTypes : uint8_t { DM0 = 0, DM1 = 1, COUNT = 2 };
enum class DramProcessorTypes : uint8_t { DM0 = 0, COUNT = 1 };

constexpr uint8_t MaxProcessorsPerCoreType = 5;
constexpr uint8_t MaxDMProcessorsPerCoreType = 2;
constexpr uint8_t NumTensixDispatchClasses = 3;
constexpr uint8_t NumEthDispatchClasses = 2;
constexpr uint8_t NumDramDispatchClasses = 1;
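
MaxDMProcessorsPerCoreType records the maximum number of data-movement RISCs
on any core type: Tensix and Eth cores expose DM0 and DM1, DRAM cores only
DM0 (see the enums above), so the maximum is 2. A hypothetical use, sizing
per-DM-processor state (names below are illustrative, not from this commit):

#include <cstdint>

constexpr uint8_t MaxDMProcessorsPerCoreType = 2;  // DM0 (BRISC), DM1 (NCRISC)

struct DynamicNocState {
    // one counter slot per data-movement processor (illustrative field)
    uint32_t nonposted_atomics_acked[MaxDMProcessorsPerCoreType];
};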