From 20018c7f735b240d0f516595b6e08835a4c2768a Mon Sep 17 00:00:00 2001 From: Paul Keller Date: Sat, 17 Aug 2024 21:52:17 +0000 Subject: [PATCH] #4984: Move CB configs into kernel config ring buffer --- .../command_queue/random_program.cpp | 6 ++- .../test_CircularBuffer_allocation.cpp | 11 ++--- .../test_CircularBuffer_creation.cpp | 2 +- .../command_queue/test_EnqueueProgram.cpp | 15 ++++--- .../common_runtime_address_map.h | 13 ++---- tt_metal/hw/firmware/src/brisc.cc | 10 +++-- tt_metal/hw/firmware/src/idle_erisc.cc | 8 ++-- tt_metal/hw/firmware/src/ncrisc.cc | 8 +++- tt_metal/hw/firmware/src/trisc.cc | 8 +++- .../hw/inc/blackhole/eth_l1_address_map.h | 6 ++- tt_metal/hw/inc/circular_buffer.h | 9 ++-- tt_metal/hw/inc/grayskull/dev_mem_map.h | 1 + tt_metal/hw/inc/wormhole/eth_l1_address_map.h | 2 + tt_metal/impl/dispatch/command_queue.cpp | 7 ++- tt_metal/impl/program/program.cpp | 45 +++++++++++++++++++ tt_metal/impl/program/program.hpp | 8 +++- tt_metal/llrt/llrt.cpp | 26 ----------- tt_metal/llrt/llrt.hpp | 13 ------ tt_metal/tt_metal.cpp | 23 ++++++---- 19 files changed, 133 insertions(+), 88 deletions(-) diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp index 39ba2d1e119..473474462a0 100644 --- a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp +++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp @@ -39,7 +39,11 @@ void MAIN { constexpr volatile uint32_t page_size = get_compile_time_arg_val(7); for (uint32_t i = 0; i < num_cbs; i++) { - uint32_t cb_val = reinterpret_cast(CIRCULAR_BUFFER_CONFIG_BASE + i * 16)[3]; + tt_l1_ptr mailboxes_t* const mailboxes = (tt_l1_ptr mailboxes_t*)(MEM_MAILBOX_BASE); + uint32_t kernel_config_base = mailboxes->launch.kernel_config.kernel_config_base; + uint32_t tt_l1_ptr *cb_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + + mailboxes->launch.kernel_config.cb_offset); + uint32_t cb_val = reinterpret_cast(cb_l1_base + i * 4)[3]; uint32_t expected = ((i + 1) * page_size) >> 4; if (cb_val != expected) { DPRINT << "Problem with CB idx: " << i << " Expected: " << expected << " Got: " << cb_val << ENDL(); diff --git a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_allocation.cpp b/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_allocation.cpp index 3b3be3fa3d5..9f16aafd4c5 100644 --- a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_allocation.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_allocation.cpp @@ -25,7 +25,7 @@ void validate_cb_address(Program &program, Device *device, const CoreRangeSet &c for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) { CoreCoord core_coord(x, y); tt::tt_metal::detail::ReadFromDeviceL1( - device, core_coord, CIRCULAR_BUFFER_CONFIG_BASE, cb_config_buffer_size, cb_config_vector); + device, core_coord, program.get_cb_base_addr(device, core_coord, CoreType::WORKER), cb_config_buffer_size, cb_config_vector); std::map address_per_buffer_index = core_to_address_per_buffer_index.at(core_coord); @@ -284,6 +284,7 @@ TEST_F(DeviceFixture, TestUpdateCircularBufferAddress) { TEST_F(DeviceFixture, TestUpdateCircularBufferPageSize) { for (unsigned int id = 0; id < num_devices_; id++) { + Device *device = this->devices_.at(id); Program program; CBConfig cb_config; CoreCoord core0(0, 0); @@ -306,7 +307,7 @@ TEST_F(DeviceFixture, TestUpdateCircularBufferPageSize) { expected_cb_addr += cb_config.page_size; } - detail::LaunchProgram(this->devices_.at(id), program); + detail::LaunchProgram(device, program); vector cb_config_vector; uint32_t cb_config_buffer_size = NUM_CIRCULAR_BUFFERS * UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG * sizeof(uint32_t); @@ -316,7 +317,7 @@ TEST_F(DeviceFixture, TestUpdateCircularBufferPageSize) { for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) { CoreCoord core_coord(x, y); tt::tt_metal::detail::ReadFromDeviceL1( - this->devices_.at(id), core_coord, CIRCULAR_BUFFER_CONFIG_BASE, cb_config_buffer_size, cb_config_vector); + device, core_coord, program.get_cb_base_addr(device, core_coord, CoreType::WORKER), cb_config_buffer_size, cb_config_vector); std::map address_per_buffer_index = golden_addresses_per_core.at(core_coord); std::map num_pages_per_buffer_index = golden_num_pages_per_core.at(core_coord); @@ -333,7 +334,7 @@ TEST_F(DeviceFixture, TestUpdateCircularBufferPageSize) { UpdateCircularBufferPageSize(program, cb_ids[1], 1, cb_config.page_size / 2); golden_num_pages_per_core[core0][1] = 2; - detail::LaunchProgram(this->devices_.at(id), program); + detail::LaunchProgram(device, program); // addresses should not be changed for (const CoreRange &core_range : cr_set.ranges()) { @@ -341,7 +342,7 @@ TEST_F(DeviceFixture, TestUpdateCircularBufferPageSize) { for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) { CoreCoord core_coord(x, y); tt::tt_metal::detail::ReadFromDeviceL1( - this->devices_.at(id), core_coord, CIRCULAR_BUFFER_CONFIG_BASE, cb_config_buffer_size, cb_config_vector); + device, core_coord, program.get_cb_base_addr(device, core_coord, CoreType::WORKER), cb_config_buffer_size, cb_config_vector); std::map address_per_buffer_index = golden_addresses_per_core.at(core_coord); std::map num_pages_per_buffer_index = golden_num_pages_per_core.at(core_coord); diff --git a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_creation.cpp b/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_creation.cpp index 64f58c20fe4..57e2e33a858 100644 --- a/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_creation.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/circular_buffer/test_CircularBuffer_creation.cpp @@ -28,7 +28,7 @@ bool test_cb_config_written_to_core(Program &program, Device *device, const Core for (auto y = core_range.start_coord.y; y <= core_range.end_coord.y; y++) { CoreCoord core_coord(x, y); tt::tt_metal::detail::ReadFromDeviceL1( - device, core_coord, CIRCULAR_BUFFER_CONFIG_BASE, cb_config_buffer_size, cb_config_vector); + device, core_coord, program.get_sem_base_addr(device, core_coord, CoreType::WORKER), cb_config_buffer_size, cb_config_vector); for (const auto &[buffer_index, golden_cb_config] : cb_config_per_buffer_index) { auto base_index = UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG * buffer_index; diff --git a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp index 731e840ecc4..7754a6d5e17 100644 --- a/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp +++ b/tests/tt_metal/tt_metal/unit_tests_fast_dispatch/command_queue/test_EnqueueProgram.cpp @@ -76,7 +76,7 @@ std::vector initialize_dummy_circular_buffers(Program& program, const return cb_handles; } -bool cb_config_successful(Device* device, const DummyProgramMultiCBConfig & program_config){ +bool cb_config_successful(Device* device, Program &program, const DummyProgramMultiCBConfig & program_config){ bool pass = true; // Need to use old APIs to read since we cannot allocate a buffer in the reserved space we're trying @@ -86,7 +86,9 @@ bool cb_config_successful(Device* device, const DummyProgramMultiCBConfig & prog for (const CoreRange& core_range : program_config.cr_set.ranges()) { for (const CoreCoord& core_coord : core_range) { - tt::tt_metal::detail::ReadFromDeviceL1(device, core_coord, CIRCULAR_BUFFER_CONFIG_BASE, cb_config_buffer_size, cb_config_vector); + tt::tt_metal::detail::ReadFromDeviceL1(device, core_coord, + program.get_sem_base_addr(device, core_coord, CoreType::WORKER), + cb_config_buffer_size, cb_config_vector); uint32_t cb_addr = L1_UNRESERVED_BASE; for (uint32_t i = 0; i < program_config.cb_config_vector.size(); i++) { @@ -115,7 +117,7 @@ bool test_dummy_EnqueueProgram_with_cbs(Device* device, CommandQueue& cq, DummyP EnqueueProgram(cq, program, is_blocking_op); Finish(cq); - return cb_config_successful(device, program_config); + return cb_config_successful(device, program, program_config); } bool test_dummy_EnqueueProgram_with_cbs_update_size(Device* device, CommandQueue& cq, const DummyProgramMultiCBConfig& program_config) { @@ -126,7 +128,7 @@ bool test_dummy_EnqueueProgram_with_cbs_update_size(Device* device, CommandQueue EnqueueProgram(cq, program, false); Finish(cq); - const bool is_cb_config_before_update_successful = cb_config_successful(device, program_config); + const bool is_cb_config_before_update_successful = cb_config_successful(device, program, program_config); DummyProgramMultiCBConfig program_config_2 = program_config; for (uint32_t cb_id = 0; cb_id < program_config.cb_config_vector.size(); cb_id++) { @@ -139,7 +141,7 @@ bool test_dummy_EnqueueProgram_with_cbs_update_size(Device* device, CommandQueue EnqueueProgram(cq, program, false); Finish(cq); - const bool is_cb_config_after_update_successful = cb_config_successful(device, program_config_2); + const bool is_cb_config_after_update_successful = cb_config_successful(device, program, program_config_2); return is_cb_config_before_update_successful && is_cb_config_after_update_successful; } @@ -776,7 +778,8 @@ TEST_F(CommandQueueSingleCardFixture, TestMultiCBSharedAddressSpaceSentSingleCor vector cb_config_vector; tt::tt_metal::detail::ReadFromDeviceL1( - device, core_coord, CIRCULAR_BUFFER_CONFIG_BASE, cb_config_buffer_size, cb_config_vector); + device, core_coord, + program.get_cb_base_addr(device, core_coord, CoreType::WORKER), cb_config_buffer_size, cb_config_vector); uint32_t cb_addr = L1_UNRESERVED_BASE; uint32_t intermediate_index = intermediate_cb * sizeof(uint32_t); diff --git a/tt_metal/hostdevcommon/common_runtime_address_map.h b/tt_metal/hostdevcommon/common_runtime_address_map.h index 971c9e7ea3e..e43d1b1a5e8 100644 --- a/tt_metal/hostdevcommon/common_runtime_address_map.h +++ b/tt_metal/hostdevcommon/common_runtime_address_map.h @@ -51,26 +51,19 @@ constexpr static std::uint32_t PROFILER_RISC_COUNT = 5; static_assert (PROFILER_FULL_HOST_BUFFER_SIZE_PER_RISC > PROFILER_L1_BUFFER_SIZE); // Kernel config buffer is WIP -// Will eventually move CBs/Sems and likely kernel bins into this buffer -// Size is presently based on the old size of the RTAs (large enough to hold 1 set) +// Size is presently based on the old sizes of the RTAs + CB config + Sems // plus some extra space freed up in the mem map constexpr static std::uint32_t L1_KERNEL_CONFIG_BASE = PROFILER_L1_END_ADDRESS; -constexpr static std::uint32_t L1_KERNEL_CONFIG_SIZE = 4 * 1024 + 256 + 128; +constexpr static std::uint32_t L1_KERNEL_CONFIG_SIZE = 4 * 1024 + 256 + 128 + 512; constexpr static std::uint32_t IDLE_ERISC_L1_KERNEL_CONFIG_BASE = 32 * 1024; -// config for 32 L1 buffers is at addr BUFFER_CONFIG_BASE -// 12 bytes for each buffer: (addr, size, size_in_tiles) -// addr and size are in 16B words (byte address >> 4) -// this is a total of 32 * 3 * 4 = 384B -constexpr static std::uint32_t CIRCULAR_BUFFER_CONFIG_BASE = L1_KERNEL_CONFIG_BASE + L1_KERNEL_CONFIG_SIZE; constexpr static std::uint32_t NUM_CIRCULAR_BUFFERS = 32; constexpr static std::uint32_t UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG = 4; -constexpr static std::uint32_t CIRCULAR_BUFFER_CONFIG_SIZE = NUM_CIRCULAR_BUFFERS * UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG * sizeof(uint32_t); constexpr static std::uint32_t PROFILER_L1_CONTROL_VECTOR_SIZE = 32; constexpr static std::uint32_t PROFILER_L1_CONTROL_BUFFER_SIZE = PROFILER_L1_CONTROL_VECTOR_SIZE * sizeof(uint32_t); -constexpr static std::uint32_t PROFILER_L1_BUFFER_CONTROL = CIRCULAR_BUFFER_CONFIG_BASE + CIRCULAR_BUFFER_CONFIG_SIZE; +constexpr static std::uint32_t PROFILER_L1_BUFFER_CONTROL = L1_KERNEL_CONFIG_BASE + L1_KERNEL_CONFIG_SIZE; constexpr static std::uint32_t L1_UNRESERVED_BASE = ((PROFILER_L1_BUFFER_CONTROL + PROFILER_L1_CONTROL_BUFFER_SIZE - 1) | (DRAM_ALIGNMENT - 1)) + 1; diff --git a/tt_metal/hw/firmware/src/brisc.cc b/tt_metal/hw/firmware/src/brisc.cc index 47c897160f9..9c0c9b86b07 100644 --- a/tt_metal/hw/firmware/src/brisc.cc +++ b/tt_metal/hw/firmware/src/brisc.cc @@ -384,12 +384,16 @@ int main() { noc_index = mailboxes->launch.kernel_config.brisc_noc_id; - setup_cb_read_write_interfaces(0, num_cbs_to_early_init, true, true, false); + uint32_t kernel_config_base = mailboxes->launch.kernel_config.kernel_config_base; + + uint32_t tt_l1_ptr *cb_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + + mailboxes->launch.kernel_config.cb_offset); + setup_cb_read_write_interfaces(cb_l1_base, 0, num_cbs_to_early_init, true, true, false); + finish_ncrisc_copy_and_run(enables); // Run the BRISC kernel DEBUG_STATUS("R"); - uint32_t kernel_config_base = mailboxes->launch.kernel_config.kernel_config_base; rta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + mailboxes->launch.kernel_config.mem_map[DISPATCH_CLASS_TENSIX_DM0].rta_offset); crta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + @@ -398,7 +402,7 @@ int main() { mailboxes->launch.kernel_config.sem_offset); if (enables & DISPATCH_CLASS_MASK_TENSIX_ENABLE_DM0) { - setup_cb_read_write_interfaces(num_cbs_to_early_init, mailboxes->launch.kernel_config.max_cb_index, true, true, false); + setup_cb_read_write_interfaces(cb_l1_base, num_cbs_to_early_init, mailboxes->launch.kernel_config.max_cb_index, true, true, false); kernel_init(); RECORD_STACK_USAGE(); } else { diff --git a/tt_metal/hw/firmware/src/idle_erisc.cc b/tt_metal/hw/firmware/src/idle_erisc.cc index f35caeb236b..4bc8dcfa114 100644 --- a/tt_metal/hw/firmware/src/idle_erisc.cc +++ b/tt_metal/hw/firmware/src/idle_erisc.cc @@ -112,13 +112,15 @@ int main() { noc_index = mailboxes->launch.kernel_config.brisc_noc_id; - setup_cb_read_write_interfaces(0, mailboxes->launch.kernel_config.max_cb_index, true, true, false); + uint32_t kernel_config_base = mailboxes->launch.kernel_config.kernel_config_base; + + uint32_t tt_l1_ptr *cb_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + + mailboxes->launch.kernel_config.cb_offset); + setup_cb_read_write_interfaces(cb_l1_base, 0, mailboxes->launch.kernel_config.max_cb_index, true, true, false); // Run the ERISC kernel DEBUG_STATUS("R"); //if (mailboxes->launch.enable_brisc) { - //UC FIXME: do i need this? - uint32_t kernel_config_base = mailboxes->launch.kernel_config.kernel_config_base; rta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + mailboxes->launch.kernel_config.mem_map[DISPATCH_CLASS_ETH_DM0].rta_offset); crta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + diff --git a/tt_metal/hw/firmware/src/ncrisc.cc b/tt_metal/hw/firmware/src/ncrisc.cc index 7e27803ae7d..137f6e0edfd 100644 --- a/tt_metal/hw/firmware/src/ncrisc.cc +++ b/tt_metal/hw/firmware/src/ncrisc.cc @@ -96,9 +96,13 @@ int main(int argc, char *argv[]) { notify_brisc_and_wait(); DeviceZoneScopedMainN("NCRISC-FW"); - setup_cb_read_write_interfaces(0, mailboxes->launch.kernel_config.max_cb_index, true, true, false); - uint32_t kernel_config_base = mailboxes->launch.kernel_config.kernel_config_base; + + uint32_t tt_l1_ptr *cb_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + + mailboxes->launch.kernel_config.cb_offset); + + setup_cb_read_write_interfaces(cb_l1_base, 0, mailboxes->launch.kernel_config.max_cb_index, true, true, false); + rta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + mailboxes->launch.kernel_config.mem_map[DISPATCH_CLASS_TENSIX_DM1].rta_offset); crta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + diff --git a/tt_metal/hw/firmware/src/trisc.cc b/tt_metal/hw/firmware/src/trisc.cc index 5a01cda4087..6fc0fec26bb 100644 --- a/tt_metal/hw/firmware/src/trisc.cc +++ b/tt_metal/hw/firmware/src/trisc.cc @@ -58,6 +58,7 @@ volatile tt_l1_ptr uint32_t l1_buffer[16] __attribute__((section("l1_data"))) __ __attribute__((used)); #if !defined(UCK_CHLKC_MATH) +uint32_t tt_l1_ptr *cb_l1_base __attribute__((used)); CBInterface cb_interface[NUM_CIRCULAR_BUFFERS] __attribute__((used)); #endif @@ -107,15 +108,18 @@ int main(int argc, char *argv[]) { while (*trisc_run != RUN_SYNC_MSG_GO); DeviceZoneScopedMainN("TRISC-FW"); + uint32_t kernel_config_base = mailboxes->launch.kernel_config.kernel_config_base; + #if !defined(UCK_CHLKC_MATH) - setup_cb_read_write_interfaces(0, mailboxes->launch.kernel_config.max_cb_index, cb_init_read, cb_init_write, cb_init_write); + uint32_t tt_l1_ptr *cb_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + + mailboxes->launch.kernel_config.cb_offset); + setup_cb_read_write_interfaces(cb_l1_base, 0, mailboxes->launch.kernel_config.max_cb_index, cb_init_read, cb_init_write, cb_init_write); #if defined(UCK_CHLKC_UNPACK) // Hack workaround for issue #11591 for (volatile uint32_t xxx = 0; xxx < 100; xxx++); #endif #endif - uint32_t kernel_config_base = mailboxes->launch.kernel_config.kernel_config_base; rta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + mailboxes->launch.kernel_config.mem_map[DISPATCH_CLASS_TENSIX_COMPUTE].rta_offset); crta_l1_base = (uint32_t tt_l1_ptr *)(kernel_config_base + diff --git a/tt_metal/hw/inc/blackhole/eth_l1_address_map.h b/tt_metal/hw/inc/blackhole/eth_l1_address_map.h index 5a89295eb3a..8d9039c6acb 100644 --- a/tt_metal/hw/inc/blackhole/eth_l1_address_map.h +++ b/tt_metal/hw/inc/blackhole/eth_l1_address_map.h @@ -28,7 +28,9 @@ struct address_map { static constexpr std::int32_t DATA_BUFFER_SIZE_ETH = 4 * 1024; static constexpr std::int32_t DATA_BUFFER_SIZE_NOC = 16 * 1024; static constexpr std::int32_t DATA_BUFFER_SIZE = 24 * 1024; - static constexpr std::int32_t ERISC_L1_KERNEL_CONFIG_SIZE = 96 * 4; + // Kernel config buffer is WIP + // Size is presently based on the old sizes of the RTAs + CB config + Sems + static constexpr std::int32_t ERISC_L1_KERNEL_CONFIG_SIZE = 96 * 4 + 8 * 16; // Base addresses static constexpr std::int32_t FIRMWARE_BASE = 0x9040; @@ -58,6 +60,8 @@ struct address_map { // erisc early exit functionality re-uses mailboxes_t::ncrisc_halt_msg_t::stack_save memory static constexpr std::int32_t ERISC_MEM_MAILBOX_STACK_SAVE = ERISC_MEM_MAILBOX_BASE + 4; + // Kernel config buffer is WIP + // Size is presently based on the old sizes of the RTAs + CB config + Sems static constexpr std::uint32_t PROFILER_L1_BUFFER_ER = ERISC_MEM_MAILBOX_BASE + 288 + 256 + 16; static constexpr std::uint32_t PROFILER_L1_BUFFER_CONTROL = PROFILER_L1_BUFFER_ER + PROFILER_L1_BUFFER_SIZE; diff --git a/tt_metal/hw/inc/circular_buffer.h b/tt_metal/hw/inc/circular_buffer.h index 832228e8512..c4e5a8a9eef 100644 --- a/tt_metal/hw/inc/circular_buffer.h +++ b/tt_metal/hw/inc/circular_buffer.h @@ -51,8 +51,11 @@ extern CBInterface cb_interface[NUM_CIRCULAR_BUFFERS]; // NCRISC and BRISC setup read and write // TRISC sets up read or write -inline void setup_cb_read_write_interfaces(uint32_t start_cb_index, uint32_t max_cb_index, bool read, bool write, bool init_wr_tile_ptr) { - volatile tt_l1_ptr uint32_t* circular_buffer_config_addr = (volatile tt_l1_ptr uint32_t*)(CIRCULAR_BUFFER_CONFIG_BASE) + start_cb_index * UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG; +inline void setup_cb_read_write_interfaces(uint32_t tt_l1_ptr *cb_l1_base, uint32_t start_cb_index, uint32_t max_cb_index, bool read, bool write, bool init_wr_tile_ptr) { + + constexpr uint32_t WORDS_PER_CIRCULAR_BUFFER_CONFIG = 4; + + volatile tt_l1_ptr uint32_t* circular_buffer_config_addr = cb_l1_base + start_cb_index * WORDS_PER_CIRCULAR_BUFFER_CONFIG; for (uint32_t cb_id = start_cb_index; cb_id < max_cb_index; cb_id++) { @@ -81,6 +84,6 @@ inline void setup_cb_read_write_interfaces(uint32_t start_cb_index, uint32_t max cb_interface[cb_id].fifo_wr_tile_ptr = 0; } - circular_buffer_config_addr += UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG; + circular_buffer_config_addr += WORDS_PER_CIRCULAR_BUFFER_CONFIG; } } diff --git a/tt_metal/hw/inc/grayskull/dev_mem_map.h b/tt_metal/hw/inc/grayskull/dev_mem_map.h index 99d700a2afb..d8cee6ca051 100644 --- a/tt_metal/hw/inc/grayskull/dev_mem_map.h +++ b/tt_metal/hw/inc/grayskull/dev_mem_map.h @@ -54,6 +54,7 @@ #define MEM_BOOT_CODE_BASE 0 #define MEM_L1_BARRIER 12 #define MEM_MAILBOX_BASE 16 +// Magic size must be big enough to hold dev_msgs_t. static_asserts will fire if this is too small #define MEM_MAILBOX_END (MEM_MAILBOX_BASE + 1348) #define MEM_IERISC_MAILBOX_BASE 0 #define MEM_IERISC_MAILBOX_END 0 diff --git a/tt_metal/hw/inc/wormhole/eth_l1_address_map.h b/tt_metal/hw/inc/wormhole/eth_l1_address_map.h index 258a70e6839..59469ab1c4e 100644 --- a/tt_metal/hw/inc/wormhole/eth_l1_address_map.h +++ b/tt_metal/hw/inc/wormhole/eth_l1_address_map.h @@ -28,6 +28,8 @@ struct address_map { static constexpr std::int32_t DATA_BUFFER_SIZE_ETH = 4 * 1024; static constexpr std::int32_t DATA_BUFFER_SIZE_NOC = 16 * 1024; static constexpr std::int32_t DATA_BUFFER_SIZE = 24 * 1024; + // Kernel config buffer is WIP + // Size is presently based on the old sizes of the RTAs + CB config + Sems static constexpr std::int32_t ERISC_L1_KERNEL_CONFIG_SIZE = 96 * 4 + 8 * 16; // Base addresses diff --git a/tt_metal/impl/dispatch/command_queue.cpp b/tt_metal/impl/dispatch/command_queue.cpp index 5a99ad7ae6a..15674de6ea3 100644 --- a/tt_metal/impl/dispatch/command_queue.cpp +++ b/tt_metal/impl/dispatch/command_queue.cpp @@ -1097,6 +1097,7 @@ void EnqueueProgramCommand::assemble_device_commands( } // CB Configs commands + index = hal.get_programmable_core_type_index(HalProgrammableCoreType::TENSIX); if (num_multicast_cb_sub_cmds > 0) { uint32_t curr_sub_cmd_idx = 0; cached_program_command_sequence.cb_configs_payloads.reserve(num_multicast_cb_sub_cmds); @@ -1105,13 +1106,15 @@ void EnqueueProgramCommand::assemble_device_commands( uint32_t write_offset_bytes = program_command_sequence.write_offset_bytes(); program_command_sequence.add_dispatch_write_packed( num_sub_cmds_in_cmd, - CIRCULAR_BUFFER_CONFIG_BASE, + program.get_program_config(index).cb_offset, cb_config_size_bytes, mcast_cb_payload_sizeB, multicast_cb_config_sub_cmds, multicast_cb_config_data, this->packed_write_max_unicast_sub_cmds, - curr_sub_cmd_idx); + curr_sub_cmd_idx, + false, + DISPATCH_WRITE_OFFSET_TENSIX_L1_CONFIG_BASE); for (auto &data_and_size : multicast_cb_config_data) { RecordDispatchData(program, DISPATCH_DATA_CB_CONFIG, data_and_size.second); } diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp index 60e9613540c..cc0f651c2f2 100644 --- a/tt_metal/impl/program/program.cpp +++ b/tt_metal/impl/program/program.cpp @@ -889,6 +889,28 @@ uint32_t Program::finalize_sems(uint32_t programmable_core_type_index, uint32_t return base_offset + sem_size; } +uint32_t Program::finalize_cbs(uint32_t programmable_core_type_index, uint32_t base_offset) { + + int max_id = -1; + + // TODO: has to be better way to do this and don't read from volatile + for (auto& kg : this->get_kernel_groups(programmable_core_type_index)) { + int32_t id = kg.launch_msg.kernel_config.max_cb_index; + if (id > max_id) { + max_id = id; + } + + kg.launch_msg.kernel_config.cb_offset = base_offset; + } + + uint32_t cb_size = (max_id + 1) * UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG * sizeof(uint32_t); + + this->program_configs_[programmable_core_type_index].cb_offset = base_offset; + this->program_configs_[programmable_core_type_index].cb_size = cb_size; + + return base_offset + cb_size; +} + uint32_t& Program::get_program_config_size(uint32_t programmable_core_type_index) { return this->program_config_sizes_[programmable_core_type_index]; } @@ -911,6 +933,7 @@ void Program::finalize() { uint32_t offset = 0; offset = finalize_rt_args(index, offset); offset = finalize_sems(index, offset); + offset = finalize_cbs(index, offset); this->get_program_config_size(index) = offset; } @@ -1043,6 +1066,19 @@ uint32_t Program::get_sem_base_addr(Device *device, CoreCoord logical_core, Core return base_addr + this->program_configs_[index].sem_offset; } +uint32_t Program::get_cb_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) const { + + CoreCoord phys_core = device->physical_core_from_logical_core(logical_core, core_type); + HalProgrammableCoreType programmable_core_type = device->get_programmable_core_type(phys_core); + uint32_t index = hal.get_programmable_core_type_index(programmable_core_type); + + uint32_t base_addr = device->using_fast_dispatch ? + device->sysmem_manager().get_config_buffer_mgr().get_last_slot_addr(programmable_core_type) : + hal.get_dev_addr(programmable_core_type, HalMemAddrType::KERNEL_CONFIG); + + return base_addr + this->program_configs_[index].cb_offset; +} + uint32_t Program::get_sem_size(Device *device, CoreCoord logical_core, CoreType core_type) const { CoreCoord phys_core = device->physical_core_from_logical_core(logical_core, core_type); @@ -1052,5 +1088,14 @@ uint32_t Program::get_sem_size(Device *device, CoreCoord logical_core, CoreType return this->program_configs_[index].sem_size; } +uint32_t Program::get_cb_size(Device *device, CoreCoord logical_core, CoreType core_type) const { + + CoreCoord phys_core = device->physical_core_from_logical_core(logical_core, core_type); + HalProgrammableCoreType programmable_core_type = device->get_programmable_core_type(phys_core); + uint32_t index = hal.get_programmable_core_type_index(programmable_core_type); + + return this->program_configs_[index].cb_size; +} + Program::~Program() {} } // namespace tt::tt_metal diff --git a/tt_metal/impl/program/program.hpp b/tt_metal/impl/program/program.hpp index 0e4e0aa7201..3c92e6f5611 100644 --- a/tt_metal/impl/program/program.hpp +++ b/tt_metal/impl/program/program.hpp @@ -65,6 +65,8 @@ struct ProgramConfig { std::array crta_sizes; uint32_t sem_offset; uint32_t sem_size; + uint32_t cb_offset; + uint32_t cb_size; }; class Program { @@ -143,9 +145,13 @@ class Program { void capture_multi_device_dependencies() { capture_multi_device_dependencies_ = true; } bool has_multi_device_dependencies() { return capture_multi_device_dependencies_; } + ProgramConfig& get_program_config(uint32_t programmable_core_type_index); + // debug/test uint32_t get_sem_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) const; + uint32_t get_cb_base_addr(Device *device, CoreCoord logical_core, CoreType core_type) const; uint32_t get_sem_size(Device *device, CoreCoord logical_core, CoreType core_type) const; + uint32_t get_cb_size(Device *device, CoreCoord logical_core, CoreType core_type) const; private: void populate_dispatch_data(Device *device); @@ -239,11 +245,11 @@ class Program { void update_kernel_groups(uint32_t programmable_core_type_index); - ProgramConfig& get_program_config(uint32_t programmable_core_type_index); uint32_t& get_program_config_size(uint32_t programmable_core_type_index); uint32_t finalize_rt_args(uint32_t programmable_core_type_index, uint32_t base_offset); uint32_t finalize_sems(uint32_t programmable_core_type_index, uint32_t base_offset); + uint32_t finalize_cbs(uint32_t programmable_core_type_index, uint32_t base_offset); friend class HWCommandQueue; friend class EnqueueProgramCommand; diff --git a/tt_metal/llrt/llrt.cpp b/tt_metal/llrt/llrt.cpp index f53bf3c890a..3e346b25c80 100644 --- a/tt_metal/llrt/llrt.cpp +++ b/tt_metal/llrt/llrt.cpp @@ -174,32 +174,6 @@ void print_worker_cores(chip_id_t chip_id) { std::cout << std::endl << std::endl; } -CircularBufferConfigVec create_circular_buffer_config_vector() { - CircularBufferConfigVec circular_buffer_config_vec( - NUM_CIRCULAR_BUFFERS * UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG, 0); // init to 0's - return circular_buffer_config_vec; -} - -void set_config_for_circular_buffer( - CircularBufferConfigVec &circular_buffer_config_vec, - uint32_t circular_buffer_index, - uint32_t addr_in_bytes, - uint32_t size_in_bytes, - uint32_t num_pages) { - - uint32_t page_size = size_in_bytes / num_pages; - circular_buffer_config_vec.at(UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG * circular_buffer_index) = - addr_in_bytes >> 4; // convert to addr in 16B words - circular_buffer_config_vec.at(UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG * circular_buffer_index + 1) = - size_in_bytes >> 4; // convert to addr in 16B words - circular_buffer_config_vec.at(UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG * circular_buffer_index + 2) = num_pages; - circular_buffer_config_vec.at(UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG * circular_buffer_index + 3) = page_size >> 4; -} - -void write_circular_buffer_config_vector_to_core(chip_id_t chip, const CoreCoord &core, CircularBufferConfigVec circular_buffer_config_vec) { - write_hex_vec_to_core(chip, core, circular_buffer_config_vec, CIRCULAR_BUFFER_CONFIG_BASE); -} - ll_api::memory read_mem_from_core(chip_id_t chip, const CoreCoord &core, const ll_api::memory& mem, uint64_t local_init_addr) { ll_api::memory read_mem; diff --git a/tt_metal/llrt/llrt.hpp b/tt_metal/llrt/llrt.hpp index 56567fd966a..422d1777668 100644 --- a/tt_metal/llrt/llrt.hpp +++ b/tt_metal/llrt/llrt.hpp @@ -52,7 +52,6 @@ using NUM_REPETITIONS = std::uint32_t; using WorkerCore = tt_cxy_pair; using WorkerCores = std::vector; -using CircularBufferConfigVec = std::vector; ll_api::memory get_risc_binary(string path); uint16_t get_binary_code_size16(const ll_api::memory &mem, int riscv_id); @@ -91,18 +90,6 @@ inline bool is_ethernet_core(const CoreCoord &core, chip_id_t chip_id) { soc_desc.physical_ethernet_cores.end(); } -CircularBufferConfigVec create_circular_buffer_config_vector(); - -void set_config_for_circular_buffer( - CircularBufferConfigVec &circular_buffer_config_vec, - uint32_t circular_buffer_index, - uint32_t addr_in_bytes, - uint32_t size_in_bytes, - uint32_t num_pages); - -void write_circular_buffer_config_vector_to_core( - chip_id_t chip, const CoreCoord &core, CircularBufferConfigVec circular_buffer_config_vec); - uint32_t generate_risc_startup_addr(bool is_eth_core); void program_risc_startup_addr(chip_id_t chip_id, const CoreCoord &core); diff --git a/tt_metal/tt_metal.cpp b/tt_metal/tt_metal.cpp index 99c8434b8f3..a6f5c1b0ac6 100644 --- a/tt_metal/tt_metal.cpp +++ b/tt_metal/tt_metal.cpp @@ -716,23 +716,28 @@ bool ConfigureDeviceWithProgram(Device *device, Program &program, bool fd_bootlo // TODO: add support for CB for ethernet cores if (core_type == CoreType::WORKER) { // CircularBufferConfigVec -- common across all kernels, so written once to the core - llrt::CircularBufferConfigVec circular_buffer_config_vec = llrt::create_circular_buffer_config_vector(); + std::vector circular_buffer_config_vec(NUM_CIRCULAR_BUFFERS * UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG, 0); auto cbs_on_core = program.circular_buffers_on_core(logical_core); for (auto circular_buffer : cbs_on_core) { for (uint32_t buffer_index : circular_buffer->buffer_indices()) { - llrt::set_config_for_circular_buffer( - circular_buffer_config_vec, - buffer_index, - circular_buffer->address(), - circular_buffer->size(), - circular_buffer->num_pages(buffer_index)); + uint32_t addr_in_bytes = circular_buffer->address(); + uint32_t size_in_bytes = circular_buffer->size(); + uint32_t num_pages = circular_buffer->num_pages(buffer_index); + uint32_t page_size = size_in_bytes / num_pages; + circular_buffer_config_vec.at(UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG * buffer_index) = + addr_in_bytes >> 4; // convert to addr in 16B words + circular_buffer_config_vec.at(UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG * buffer_index + 1) = + size_in_bytes >> 4; // convert to addr in 16B words + circular_buffer_config_vec.at(UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG * buffer_index + 2) = num_pages; + circular_buffer_config_vec.at(UINT32_WORDS_PER_CIRCULAR_BUFFER_CONFIG * buffer_index + 3) = page_size >> 4; } } // PROF_END("CBS") if (cbs_on_core.size()) { - llrt::write_circular_buffer_config_vector_to_core( - device_id, physical_core, circular_buffer_config_vec); + uint64_t kernel_config_base = hal.get_dev_addr(index, HalMemAddrType::KERNEL_CONFIG); + uint64_t addr = kernel_config_base + program.get_program_config(index).cb_offset; + llrt::write_hex_vec_to_core(device_id, physical_core, circular_buffer_config_vec, addr); } } program.init_semaphores(*device, logical_core, index);