Skip to content

Commit

Permalink
#11830: Moving DRAM and L1 unreserved base into the HAL
Browse files Browse the repository at this point in the history
  • Loading branch information
abhullar-tt committed Oct 3, 2024
1 parent ae2c1d6 commit 65cc65c
Show file tree
Hide file tree
Showing 91 changed files with 647 additions and 437 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ double get_tt_npu_rpeak_tflops(tt::ARCH arch, CoreCoord grid_size, int tt_npu_cl
std::tuple<uint32_t, uint32_t, uint32_t> get_aligned_input_tile_num(uint32_t M, uint32_t N, uint32_t K);

uint32_t get_in0_block_w(
uint32_t per_core_Mt, uint32_t per_core_Nt, uint32_t Kt, uint32_t single_tile_size, uint32_t l1_size);
uint32_t per_core_Mt, uint32_t per_core_Nt, uint32_t Kt, uint32_t single_tile_size, uint32_t l1_size, uint32_t l1_unreserved_base);

CoreCoord get_core_range(
uint32_t num_blocks_rows, uint32_t num_blocks_cols, uint32_t max_num_rows, uint32_t max_num_cols);
Expand All @@ -93,7 +93,7 @@ std::tuple<MathFidelity, bool> get_compute_params(tt::ARCH arch);
std::tuple<uint32_t, uint32_t> get_out_subblock_params(uint32_t per_core_Mt, uint32_t per_core_Nt, uint32_t choice);

std::tuple<uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t> get_all_buffers_addresses(
uint32_t per_core_Mt, uint32_t per_core_Nt, uint32_t in0_block_w, uint32_t single_tile_size);
uint32_t per_core_Mt, uint32_t per_core_Nt, uint32_t in0_block_w, uint32_t single_tile_size, uint32_t l1_unreserved_base);

std::vector<float> generate_fp32_random(uint32_t num_elems, int32_t rand_max_val);

Expand Down Expand Up @@ -305,6 +305,7 @@ int main(int argc, char** argv) {

int pci_express_slot = 0;
tt_metal::Device* device = tt_metal::CreateDevice(pci_express_slot);
uint32_t l1_unreserved_base = device->get_base_allocator_addr(HalMemType::L1);
const tt::ARCH arch = device->arch();
////////////////////////////////////////////////////////////////////////////
// Check Input Args
Expand All @@ -330,7 +331,7 @@ int main(int argc, char** argv) {
uint32_t num_cores_x = grid_size.x;
uint32_t per_core_Mt = (Mt - 1) / num_cores_y + 1;
uint32_t per_core_Nt = (Nt - 1) / num_cores_x + 1;
uint32_t in0_block_w = get_in0_block_w(per_core_Mt, per_core_Nt, Kt, single_tile_size, l1_size);
uint32_t in0_block_w = get_in0_block_w(per_core_Mt, per_core_Nt, Kt, single_tile_size, l1_size, l1_unreserved_base);
if (in0_block_w == 0) {
log_error(
LogTest,
Expand Down Expand Up @@ -418,7 +419,7 @@ int main(int argc, char** argv) {
}
auto [out_subblock_h, out_subblock_w] = get_out_subblock_params(per_core_Mt, per_core_Nt, subblock_choice);
auto [in0_cb_addr, in1_cb_addr, in2_cb_addr, out_cb_addr, in0_addr, in1_addr, out_addr] =
get_all_buffers_addresses(per_core_Mt, per_core_Nt, in0_block_w, single_tile_size);
get_all_buffers_addresses(per_core_Mt, per_core_Nt, in0_block_w, single_tile_size, l1_unreserved_base);

if (fp32_dest_acc_en and (out_subblock_h * out_subblock_w > 4)) {
if (out_subblock_w >= 4) {
Expand Down Expand Up @@ -732,11 +733,11 @@ std::tuple<uint32_t, uint32_t, uint32_t> get_aligned_input_tile_num(uint32_t M,
}

uint32_t get_in0_block_w(
uint32_t per_core_Mt, uint32_t per_core_Nt, uint32_t Kt, uint32_t single_tile_size, uint32_t l1_size) {
uint32_t per_core_Mt, uint32_t per_core_Nt, uint32_t Kt, uint32_t single_tile_size, uint32_t l1_size, uint32_t l1_unreserved_base) {
std::vector<uint32_t> in0_block_w_choices = {4, 2, 1};
uint32_t num_buffer = 2; // double buffering
uint32_t in0_block_w = 0;
uint32_t base_addr = L1_UNRESERVED_BASE;
uint32_t base_addr = l1_unreserved_base;
for (auto choice : in0_block_w_choices) {
if (Kt % choice != 0)
continue;
Expand Down Expand Up @@ -811,9 +812,9 @@ std::tuple<uint32_t, uint32_t> get_out_subblock_params(uint32_t per_core_Mt, uin
}

std::tuple<uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t> get_all_buffers_addresses(
uint32_t per_core_Mt, uint32_t per_core_Nt, uint32_t in0_block_w, uint32_t single_tile_size) {
uint32_t per_core_Mt, uint32_t per_core_Nt, uint32_t in0_block_w, uint32_t single_tile_size, uint32_t l1_unreserved_base) {
uint32_t num_buffer = 2; // double buffering
uint32_t in0_cb_addr = L1_UNRESERVED_BASE;
uint32_t in0_cb_addr = l1_unreserved_base;
uint32_t in0_cb_size = per_core_Mt * in0_block_w * num_buffer * single_tile_size;
uint32_t in1_cb_addr = in0_cb_addr + in0_cb_size;
uint32_t in1_cb_size = per_core_Nt * in0_block_w * num_buffer * single_tile_size;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ int main(int argc, char** argv) {
uint32_t single_tile_size = 2 * 1024;

uint32_t cb_src0_index = 0;
uint32_t cb_src0_addr = L1_UNRESERVED_BASE;
uint32_t cb_src0_addr = device->get_base_allocator_addr(HalMemType::L1);
tt_metal::CircularBufferConfig cb_src0_config =
tt_metal::CircularBufferConfig(cb_tiles * single_tile_size, {{cb_src0_index, tt::DataFormat::Float16_b}})
.set_page_size(cb_src0_index, single_tile_size);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,16 @@ void kernel_main() {
constexpr uint32_t pcie_base = get_compile_time_arg_val(0);
constexpr uint32_t pcie_sizeB = get_compile_time_arg_val(1);
constexpr uint32_t read_sizeB = get_compile_time_arg_val(2);
constexpr uint32_t done_address = get_compile_time_arg_val(3);

uint32_t pcie_read_ptr = pcie_base;

volatile tt_l1_ptr uint32_t* done_address = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(L1_UNRESERVED_BASE);
volatile tt_l1_ptr uint32_t* done_address_ptr = reinterpret_cast<volatile tt_l1_ptr uint32_t*>(done_address);

uint64_t pcie_noc_xy_encoding = (uint64_t)NOC_XY_PCIE_ENCODING(PCIE_NOC_X, PCIE_NOC_Y, NOC_INDEX);
while (done_address[0] == 0) {
while (done_address_ptr[0] == 0) {
uint64_t host_src_addr = pcie_noc_xy_encoding | pcie_read_ptr;
noc_async_read(host_src_addr, L1_UNRESERVED_BASE, read_sizeB);
noc_async_read(host_src_addr, done_address, read_sizeB);
pcie_read_ptr += read_sizeB;
if (pcie_read_ptr > pcie_base + pcie_sizeB) {
pcie_read_ptr = pcie_base;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -222,12 +222,16 @@ int main(int argc, char **argv) {
uint32_t hugepage_size = tt::Cluster::instance().get_host_channel_size(mmio_device_id, channel);
uint32_t host_write_ptr = 0;

uint32_t reg_addr = dispatch_constants::PREFETCH_Q_BASE;
CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device_id);
uint32_t prefetch_q_base = dispatch_constants::get(dispatch_core_type).get_device_command_queue_addr(CommandQueueDeviceAddrType::UNRESERVED);

uint32_t reg_addr = prefetch_q_base;
uint32_t num_reg_entries = 128;

std::vector<uint32_t> go_signal = {0};
std::vector<uint32_t> done_signal = {1};
tt_metal::detail::WriteToDeviceL1(device, logical_core, L1_UNRESERVED_BASE, go_signal);
uint32_t l1_unreserved_base = device->get_base_allocator_addr(HalMemType::L1);
tt_metal::detail::WriteToDeviceL1(device, logical_core, l1_unreserved_base, go_signal);

// Application setup
tt_metal::Program program = tt_metal::Program();
Expand All @@ -241,7 +245,7 @@ int main(int argc, char **argv) {
tt_metal::DataMovementConfig{
.processor = tt_metal::DataMovementProcessor::RISCV_1,
.noc = tt_metal::NOC::NOC_0,
.compile_args = {host_write_ptr, hugepage_size, kernel_read_size}});
.compile_args = {host_write_ptr, hugepage_size, kernel_read_size, l1_unreserved_base}});

// Add 2 * alignment so that we have enough space when aligning the ptr
// First add is for aligning to next aligned addr
Expand Down Expand Up @@ -334,17 +338,17 @@ int main(int argc, char **argv) {
memcpy_to_device<true>(host_mem_ptr, (uint8_t *)(start_ptr + src_data_offset), write_size_bytes);
}

uint32_t num_reg_writes = (reg_addr - dispatch_constants::PREFETCH_Q_BASE) / sizeof(uint32_t);
uint32_t num_reg_writes = (reg_addr - prefetch_q_base) / sizeof(uint32_t);
uint32_t val_to_write = data_written_bytes;
if (simulate_write_ptr_update) {
uint32_t num_write_ptr_updates = write_size_bytes / (32 * 1024);
for (int i = 0; i < num_write_ptr_updates; i++) {
tt::Cluster::instance().write_reg(
&val_to_write, tt_cxy_pair(device->id(), physical_core), reg_addr);
reg_addr += sizeof(uint32_t);
num_reg_writes = (reg_addr - dispatch_constants::PREFETCH_Q_BASE) / sizeof(uint32_t);
num_reg_writes = (reg_addr - prefetch_q_base) / sizeof(uint32_t);
if (num_reg_writes == num_reg_entries) {
reg_addr = dispatch_constants::PREFETCH_Q_BASE;
reg_addr = prefetch_q_base;
}
}
}
Expand All @@ -360,7 +364,7 @@ int main(int argc, char **argv) {
}

auto t_end = std::chrono::steady_clock::now();
tt_metal::detail::WriteToDeviceL1(device, logical_core, L1_UNRESERVED_BASE, done_signal);
tt_metal::detail::WriteToDeviceL1(device, logical_core, l1_unreserved_base, done_signal);

t1.join();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -372,7 +372,7 @@ std::tuple<tt_metal::Program, tt_metal::KernelHandle, uint32_t> create_program(

uint32_t cb_index = 0;
uint32_t cb_tiles = num_reqs_at_a_time;
uint32_t cb_addr = L1_UNRESERVED_BASE;
uint32_t cb_addr = device->get_base_allocator_addr(HalMemType::L1);
tt_metal::CircularBufferConfig cb_config =
tt_metal::CircularBufferConfig(cb_tiles * single_tile_size, {{cb_index, tile_format}})
.set_page_size(cb_index, single_tile_size);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ std::tuple<tt_metal::Program, tt_metal::KernelHandle, uint32_t> create_program(
uint32_t page_size, num_pages;
get_max_page_size_and_num_pages(block_num_tiles, single_tile_size, page_size, num_pages);

uint32_t cb_addr = L1_UNRESERVED_BASE;
uint32_t cb_addr = device->get_base_allocator_addr(HalMemType::L1);
tt_metal::CircularBufferConfig cb_config =
tt_metal::CircularBufferConfig(cb_size, {{cb_index, tile_format}})
.set_page_size(cb_index, single_tile_size);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -283,9 +283,10 @@ int main(int argc, char **argv) {
uint32_t offset = 0;
uint32_t page = 0;
uint32_t * pcie_base = (uint32_t *)host_pcie_base + pcie_offset / sizeof(uint32_t);
uint32_t l1_unreserved_base = device->get_base_allocator_addr(HalMemType::L1);
while (!done) {
if (hammer_write_reg_g) {
tt::Cluster::instance().write_reg(&addr, tt_cxy_pair(device->id(), w), L1_UNRESERVED_BASE);
tt::Cluster::instance().write_reg(&addr, tt_cxy_pair(device->id(), w), l1_unreserved_base);
}
if (hammer_pcie_g) {
if (page == page_count_g) {
Expand Down Expand Up @@ -316,20 +317,22 @@ int main(int argc, char **argv) {
vector<std::uint32_t> vec;
vec.resize(page_size_g / sizeof(uint32_t));

CoreType core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id());
uint32_t dispatch_l1_unreserved_base = dispatch_constants::get(core_type).get_device_command_queue_addr(CommandQueueDeviceAddrType::UNRESERVED);
for (int i = 0; i < warmup_iterations_g; i++) {
if (source_mem_g == 4) {
tt::Cluster::instance().read_core(vec, sizeof(uint32_t), tt_cxy_pair(device->id(), w), DISPATCH_L1_UNRESERVED_BASE);
tt::Cluster::instance().read_core(vec, sizeof(uint32_t), tt_cxy_pair(device->id(), w), dispatch_l1_unreserved_base);
} else {
tt::Cluster::instance().write_core(vec.data(), vec.size() * sizeof(uint32_t), tt_cxy_pair(device->id(), w), DISPATCH_L1_UNRESERVED_BASE, vec.size() == 1);
tt::Cluster::instance().write_core(vec.data(), vec.size() * sizeof(uint32_t), tt_cxy_pair(device->id(), w), dispatch_l1_unreserved_base, vec.size() == 1);
}
}

auto start = std::chrono::system_clock::now();
for (int i = 0; i < iterations_g; i++) {
if (source_mem_g == 4) {
tt::Cluster::instance().read_core(vec, page_size_g, tt_cxy_pair(device->id(), w), DISPATCH_L1_UNRESERVED_BASE);
tt::Cluster::instance().read_core(vec, page_size_g, tt_cxy_pair(device->id(), w), dispatch_l1_unreserved_base);
} else {
tt::Cluster::instance().write_core(vec.data(), vec.size() * sizeof(uint32_t), tt_cxy_pair(device->id(), w), DISPATCH_L1_UNRESERVED_BASE, vec.size() == 1);
tt::Cluster::instance().write_core(vec.data(), vec.size() * sizeof(uint32_t), tt_cxy_pair(device->id(), w), dispatch_l1_unreserved_base, vec.size() == 1);
}
}
auto end = std::chrono::system_clock::now();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -385,7 +385,8 @@ int main(int argc, char **argv) {
uint32_t num_compute_cores = device->compute_with_storage_grid_size().x * device->compute_with_storage_grid_size().y;

// Want different buffers on each core, instead use big buffer and self-manage it
uint32_t l1_buf_base = align(DISPATCH_L1_UNRESERVED_BASE, dispatch_buffer_page_size_g);
uint32_t dispatch_l1_unreserved_base = dispatch_constants::get(CoreType::WORKER).get_device_command_queue_addr(CommandQueueDeviceAddrType::UNRESERVED);
uint32_t l1_buf_base = align(dispatch_l1_unreserved_base, dispatch_buffer_page_size_g);
TT_ASSERT((l1_buf_base & (dispatch_buffer_page_size_g - 1)) == 0);

// Make sure user doesn't exceed available L1 space with cmd line arguments.
Expand Down Expand Up @@ -449,6 +450,10 @@ int main(int argc, char **argv) {
const uint32_t spoof_prefetch_core_sem_1_id = tt_metal::CreateSemaphore(program, {spoof_prefetch_core}, 0);
const uint32_t prefetch_sync_sem = spoof_prefetch_core_sem_1_id;

const uint32_t host_completion_queue_wr_ptr = dispatch_constants::get(CoreType::WORKER).get_host_command_queue_addr(CommandQueueHostAddrType::COMPLETION_Q_WR);
const uint32_t dev_completion_queue_wr_ptr = dispatch_constants::get(CoreType::WORKER).get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_WR);
const uint32_t dev_completion_queue_rd_ptr = dispatch_constants::get(CoreType::WORKER).get_device_command_queue_addr(CommandQueueDeviceAddrType::COMPLETION_Q_RD);

std::vector<uint32_t> dispatch_compile_args =
{l1_buf_base,
log_dispatch_buffer_page_size_g,
Expand Down Expand Up @@ -476,6 +481,9 @@ int main(int argc, char **argv) {
0,
0,
0,
host_completion_queue_wr_ptr,
dev_completion_queue_wr_ptr,
dev_completion_queue_rd_ptr,
true, // is_dram_variant
true, // is_host_variant
};
Expand Down
Loading

0 comments on commit 65cc65c

Please sign in to comment.