From 6eef86bc3eafd5467df68df298f5102dcf076f61 Mon Sep 17 00:00:00 2001 From: Austin Ho Date: Thu, 10 Oct 2024 14:25:48 +0000 Subject: [PATCH] #13656: Deprecate support for mutating program after initial compilation on any device --- .../tt_metal/test_compile_program.cpp | 6 +- .../test_compile_sets_kernel_binaries.cpp | 133 ++++++++++-------- .../basic/initialize_semaphores.cpp | 10 +- tt_metal/impl/program/program.cpp | 17 +-- tt_metal/impl/program/program.hpp | 4 +- 5 files changed, 85 insertions(+), 85 deletions(-) diff --git a/tests/tt_metal/tt_metal/test_compile_program.cpp b/tests/tt_metal/tt_metal/test_compile_program.cpp index d8cd49f83a0..dac634833e2 100644 --- a/tests/tt_metal/tt_metal/test_compile_program.cpp +++ b/tests/tt_metal/tt_metal/test_compile_program.cpp @@ -210,9 +210,9 @@ bool test_compile_program_after_clean_kernel_binary_directory(Device *device) { std::unordered_map kernel_name_to_hash = kernel_cache_status.kernel_name_to_hash_str; ClearKernelCache(device->build_key()); - program.invalidate_compile(); - auto second_kernel_cache_status = CompileProgramTestWrapper(device, program); - assert_program_cache_hit_status(program, /*hit_expected=*/false, second_kernel_cache_status); + auto second_program = create_program(device, default_attributes); + auto second_kernel_cache_status = CompileProgramTestWrapper(device, second_program); + assert_program_cache_hit_status(second_program, /*hit_expected=*/false, second_kernel_cache_status); assert_kernel_hash_matches(kernel_name_to_hash, second_kernel_cache_status); return pass; diff --git a/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp b/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp index aaf315cc31d..558651e9152 100644 --- a/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp +++ b/tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp @@ -40,6 +40,68 @@ std::string get_latest_kernel_binary_path(uint32_t mask, const std::shared_ptrname() + "/" + latest_hash; } +void construct_program(Program& program, Device * device, CoreCoord& core) { + uint32_t single_tile_size = 2 * 1024; + uint32_t num_tiles = 2048; + uint32_t dram_buffer_size = + single_tile_size * num_tiles; // num_tiles of FP16_B, hard-coded in the reader/writer kernels + + tt_metal::InterleavedBufferConfig buff_config{ + .device = device, + .size = dram_buffer_size, + .page_size = dram_buffer_size, + .buffer_type = tt_metal::BufferType::DRAM}; + + auto src_dram_buffer = CreateBuffer(buff_config); + uint32_t dram_buffer_src_addr = src_dram_buffer->address(); + auto dst_dram_buffer = CreateBuffer(buff_config); + uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); + + auto dram_src_noc_xy = src_dram_buffer->noc_coordinates(); + auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); + + // input CB is larger than the output CB, to test the backpressure from the output CB all the way into the + // input CB CB_out size = 1 forces the serialization of packer and writer kernel, generating backpressure to + // math kernel, input CB and reader + uint32_t src0_cb_index = 0; + uint32_t num_input_tiles = 8; + tt_metal::CircularBufferConfig cb_src0_config = + tt_metal::CircularBufferConfig( + num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}) + .set_page_size(src0_cb_index, single_tile_size); + auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); + + uint32_t ouput_cb_index = 16; // output operands start at index 16 + uint32_t num_output_tiles = 1; + tt_metal::CircularBufferConfig cb_output_config = + tt_metal::CircularBufferConfig( + num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}}) + .set_page_size(ouput_cb_index, single_tile_size); + auto cb_output = tt_metal::CreateCircularBuffer(program, core, cb_output_config); + + auto unary_reader_kernel = tt_metal::CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp", + core, + DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}); + + auto unary_writer_kernel = tt_metal::CreateKernel( + program, + "tt_metal/kernels/dataflow/writer_unary.cpp", + core, + DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + + vector compute_kernel_args = { + uint(num_tiles) // per_core_tile_cnt + }; + + auto eltwise_unary_kernel = tt_metal::CreateKernel( + program, + "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_3m.cpp", + core, + tt_metal::ComputeConfig{.compile_args = compute_kernel_args}); +} + int main(int argc, char **argv) { bool pass = true; @@ -72,65 +134,7 @@ int main(int argc, char **argv) { programs.push_back(Program()); Program& program = programs.back(); - uint32_t single_tile_size = 2 * 1024; - uint32_t num_tiles = 2048; - uint32_t dram_buffer_size = - single_tile_size * num_tiles; // num_tiles of FP16_B, hard-coded in the reader/writer kernels - - tt_metal::InterleavedBufferConfig buff_config{ - .device = device, - .size = dram_buffer_size, - .page_size = dram_buffer_size, - .buffer_type = tt_metal::BufferType::DRAM}; - - auto src_dram_buffer = CreateBuffer(buff_config); - uint32_t dram_buffer_src_addr = src_dram_buffer->address(); - auto dst_dram_buffer = CreateBuffer(buff_config); - uint32_t dram_buffer_dst_addr = dst_dram_buffer->address(); - - auto dram_src_noc_xy = src_dram_buffer->noc_coordinates(); - auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates(); - - // input CB is larger than the output CB, to test the backpressure from the output CB all the way into the - // input CB CB_out size = 1 forces the serialization of packer and writer kernel, generating backpressure to - // math kernel, input CB and reader - uint32_t src0_cb_index = 0; - uint32_t num_input_tiles = 8; - tt_metal::CircularBufferConfig cb_src0_config = - tt_metal::CircularBufferConfig( - num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}) - .set_page_size(src0_cb_index, single_tile_size); - auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config); - - uint32_t ouput_cb_index = 16; // output operands start at index 16 - uint32_t num_output_tiles = 1; - tt_metal::CircularBufferConfig cb_output_config = - tt_metal::CircularBufferConfig( - num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}}) - .set_page_size(ouput_cb_index, single_tile_size); - auto cb_output = tt_metal::CreateCircularBuffer(program, core, cb_output_config); - - auto unary_reader_kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp", - core, - DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}); - - auto unary_writer_kernel = tt_metal::CreateKernel( - program, - "tt_metal/kernels/dataflow/writer_unary.cpp", - core, - DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); - - vector compute_kernel_args = { - uint(num_tiles) // per_core_tile_cnt - }; - - auto eltwise_unary_kernel = tt_metal::CreateKernel( - program, - "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_3m.cpp", - core, - tt_metal::ComputeConfig{.compile_args = compute_kernel_args}); + construct_program(program, device, core); //////////////////////////////////////////////////////////////////////////// // Compile Application @@ -166,14 +170,19 @@ int main(int argc, char **argv) { } } tt_metal::detail::ClearKernelCache(); - for (auto& program : programs) { - program.invalidate_compile(); + std::vector new_programs; + for (int i = 0; i < num_devices; i++) { + auto& device = devices[i]; + new_programs.push_back(Program()); + Program& program = new_programs.back(); + construct_program(program, device, core); } + std::vector ths; ths.reserve(num_devices); for (int i = 0; i < num_devices; i++) { auto& device = devices[i]; - auto& program = programs[i]; + auto& program = new_programs[i]; ths.emplace_back([&] { for (int j = 0; j < num_compiles; j++) { uint32_t mask = device->build_key(); diff --git a/tests/tt_metal/tt_metal/unit_tests/basic/initialize_semaphores.cpp b/tests/tt_metal/tt_metal/unit_tests/basic/initialize_semaphores.cpp index 0d539b8a3fb..c2aa6e0379f 100644 --- a/tests/tt_metal/tt_metal/unit_tests/basic/initialize_semaphores.cpp +++ b/tests/tt_metal/tt_metal/unit_tests/basic/initialize_semaphores.cpp @@ -18,7 +18,7 @@ using namespace tt; namespace unit_tests::initialize_semaphores { -void initialize_and_compile_program(tt_metal::Device *device, tt_metal::Program &program, const CoreRange &core_range) { +void initialize_program(tt_metal::Device *device, tt_metal::Program &program, const CoreRange &core_range) { uint32_t single_tile_size = tt_metal::detail::TileSize(tt::DataFormat::Float16_b); uint32_t num_tiles = 2048; @@ -57,8 +57,6 @@ void initialize_and_compile_program(tt_metal::Device *device, tt_metal::Program "tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_3m.cpp", core_range, tt_metal::ComputeConfig{.compile_args = compute_kernel_args}); - - tt_metal::detail::CompileProgram(device, program); } void create_and_read_max_num_semaphores( @@ -71,6 +69,8 @@ void create_and_read_max_num_semaphores( ASSERT_TRUE(semaphore_id == i); } + tt_metal::detail::CompileProgram(device, program); + program.finalize(); ASSERT_TRUE(tt_metal::detail::ConfigureDeviceWithProgram(device, program)); @@ -106,7 +106,7 @@ TEST_F(DeviceFixture, InitializeLegalSemaphores) { for (unsigned int id = 0; id < num_devices_; id++) { tt_metal::Program program = tt_metal::CreateProgram(); CoreRange core_range({0, 0}, {1, 1}); - unit_tests::initialize_semaphores::initialize_and_compile_program(devices_.at(id), program, core_range); + unit_tests::initialize_semaphores::initialize_program(devices_.at(id), program, core_range); unit_tests::initialize_semaphores::create_and_read_max_num_semaphores(devices_.at(id), program, core_range); } } @@ -115,7 +115,7 @@ TEST_F(DeviceFixture, InitializeIllegalSemaphores) { for (unsigned int id = 0; id < num_devices_; id++) { tt_metal::Program program = tt_metal::CreateProgram(); CoreRange core_range({0, 0}, {1, 1}); - unit_tests::initialize_semaphores::initialize_and_compile_program(devices_.at(id), program, core_range); + unit_tests::initialize_semaphores::initialize_program(devices_.at(id), program, core_range); unit_tests::initialize_semaphores::try_creating_more_than_max_num_semaphores( devices_.at(id), program, core_range); } diff --git a/tt_metal/impl/program/program.cpp b/tt_metal/impl/program/program.cpp index 3c3fbfef000..3fcef7fae7f 100644 --- a/tt_metal/impl/program/program.cpp +++ b/tt_metal/impl/program/program.cpp @@ -122,7 +122,7 @@ Program::Program() : } KernelHandle Program::add_kernel(std::shared_ptr kernel, const HalProgrammableCoreType &programmable_core_type) { - this->invalidate_compile(); + TT_FATAL(this->compiled_.empty(), "Cannot add kernel to an already compiled program {}", this->id); // Id is unique across all kernels on all core types KernelHandle id = this->num_kernels(); uint32_t index = hal.get_programmable_core_type_index(programmable_core_type); @@ -362,7 +362,7 @@ void Program::CircularBufferAllocator::mark_address(uint64_t address, uint64_t s } CBHandle Program::add_circular_buffer(const CoreRangeSet &core_range_set, const CircularBufferConfig &config) { - this->invalidate_compile(); + TT_FATAL(this->compiled_.empty(), "Cannot add circular buffer to an already compiled program {}", this->id); std::shared_ptr circular_buffer = std::make_shared(core_range_set, config); // Globally allocated circular buffer do not invalidate allocation because their addresses are tracked by memory // allocator @@ -558,7 +558,7 @@ void Program::init_semaphores(const Device &device, const CoreCoord &logical_cor } void Program::add_semaphore(const CoreRangeSet &crs, uint32_t semaphore_id, uint32_t init_value, CoreType core_type) { - this->invalidate_compile(); + TT_FATAL(this->compiled_.empty(), "Cannot add semaphore to an already compiled program {}", this->id); semaphores_.emplace_back(Semaphore(crs, semaphore_id, init_value, core_type)); } @@ -638,12 +638,6 @@ void Program::set_cb_tile_dims(Device *device, const std::vector &crs } } -void Program::invalidate_compile() { - for (auto &[device_id, compile_needed] : compile_needed_) { - compile_needed = true; - } -} - void Program::populate_dispatch_data(Device *device) { static const uint32_t processor_to_firmware_base[] = { MEM_BRISC_FIRMWARE_BASE, @@ -1035,8 +1029,7 @@ void Program::finalize() { void Program::compile(Device *device, bool fd_bootloader_mode) { ZoneScoped; - bool first_compile_on_device = compile_needed_.find(device->id()) == compile_needed_.end(); - if (not first_compile_on_device and (not compile_needed_.at(device->id()))) { + if (compiled_.contains(device->id())) { return; } @@ -1142,7 +1135,7 @@ void Program::compile(Device *device, bool fd_bootloader_mode) { if (detail::MemoryReporter::enabled()) { detail::MemoryReporter::inst().flush_program_memory_usage(*this, device); } - compile_needed_[device->id()] = false; + compiled_.insert(device->id()); } void Program::set_runtime_id(uint64_t id) { this->runtime_id = id; } diff --git a/tt_metal/impl/program/program.hpp b/tt_metal/impl/program/program.hpp index 74b148a8544..7d2535893bb 100644 --- a/tt_metal/impl/program/program.hpp +++ b/tt_metal/impl/program/program.hpp @@ -141,8 +141,6 @@ class Program { void compile(Device * device, bool fd_bootloader_mode = false); - void invalidate_compile(); - void invalidate_circular_buffer_allocation(); void allocate_circular_buffers(const Device *device); @@ -213,7 +211,7 @@ class Program { std::vector semaphores_; CoreRangeSet worker_crs_; - std::unordered_map compile_needed_; + std::unordered_set compiled_; bool local_circular_buffer_allocation_needed_; static constexpr uint8_t core_to_kernel_group_invalid_index = 0xff;