Skip to content

Commit

Permalink
#13656: Deprecate support for mutating program after initial compilat…
Browse files Browse the repository at this point in the history
…ion on any device
  • Loading branch information
tt-aho committed Oct 10, 2024
1 parent 49f764c commit a11f11f
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 80 deletions.
6 changes: 3 additions & 3 deletions tests/tt_metal/tt_metal/test_compile_program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -210,9 +210,9 @@ bool test_compile_program_after_clean_kernel_binary_directory(Device *device) {
std::unordered_map<std::string, std::string> kernel_name_to_hash = kernel_cache_status.kernel_name_to_hash_str;

ClearKernelCache(device->build_key());
program.invalidate_compile();
auto second_kernel_cache_status = CompileProgramTestWrapper(device, program);
assert_program_cache_hit_status(program, /*hit_expected=*/false, second_kernel_cache_status);
auto second_program = create_program(device, default_attributes);
auto second_kernel_cache_status = CompileProgramTestWrapper(device, second_program);
assert_program_cache_hit_status(second_program, /*hit_expected=*/false, second_kernel_cache_status);
assert_kernel_hash_matches(kernel_name_to_hash, second_kernel_cache_status);

return pass;
Expand Down
133 changes: 71 additions & 62 deletions tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,68 @@ std::string get_latest_kernel_binary_path(uint32_t mask, const std::shared_ptr<K
return kernel->name() + "/" + latest_hash;
}

void construct_program(Program& program, Device * device, CoreCoord& core) {
uint32_t single_tile_size = 2 * 1024;
uint32_t num_tiles = 2048;
uint32_t dram_buffer_size =
single_tile_size * num_tiles; // num_tiles of FP16_B, hard-coded in the reader/writer kernels

tt_metal::InterleavedBufferConfig buff_config{
.device = device,
.size = dram_buffer_size,
.page_size = dram_buffer_size,
.buffer_type = tt_metal::BufferType::DRAM};

auto src_dram_buffer = CreateBuffer(buff_config);
uint32_t dram_buffer_src_addr = src_dram_buffer->address();
auto dst_dram_buffer = CreateBuffer(buff_config);
uint32_t dram_buffer_dst_addr = dst_dram_buffer->address();

auto dram_src_noc_xy = src_dram_buffer->noc_coordinates();
auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates();

// input CB is larger than the output CB, to test the backpressure from the output CB all the way into the
// input CB CB_out size = 1 forces the serialization of packer and writer kernel, generating backpressure to
// math kernel, input CB and reader
uint32_t src0_cb_index = 0;
uint32_t num_input_tiles = 8;
tt_metal::CircularBufferConfig cb_src0_config =
tt_metal::CircularBufferConfig(
num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}})
.set_page_size(src0_cb_index, single_tile_size);
auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config);

uint32_t ouput_cb_index = 16; // output operands start at index 16
uint32_t num_output_tiles = 1;
tt_metal::CircularBufferConfig cb_output_config =
tt_metal::CircularBufferConfig(
num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}})
.set_page_size(ouput_cb_index, single_tile_size);
auto cb_output = tt_metal::CreateCircularBuffer(program, core, cb_output_config);

auto unary_reader_kernel = tt_metal::CreateKernel(
program,
"tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp",
core,
DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default});

auto unary_writer_kernel = tt_metal::CreateKernel(
program,
"tt_metal/kernels/dataflow/writer_unary.cpp",
core,
DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default});

vector<uint32_t> compute_kernel_args = {
uint(num_tiles) // per_core_tile_cnt
};

auto eltwise_unary_kernel = tt_metal::CreateKernel(
program,
"tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_3m.cpp",
core,
tt_metal::ComputeConfig{.compile_args = compute_kernel_args});
}

int main(int argc, char **argv) {
bool pass = true;

Expand Down Expand Up @@ -72,65 +134,7 @@ int main(int argc, char **argv) {
programs.push_back(Program());
Program& program = programs.back();

uint32_t single_tile_size = 2 * 1024;
uint32_t num_tiles = 2048;
uint32_t dram_buffer_size =
single_tile_size * num_tiles; // num_tiles of FP16_B, hard-coded in the reader/writer kernels

tt_metal::InterleavedBufferConfig buff_config{
.device = device,
.size = dram_buffer_size,
.page_size = dram_buffer_size,
.buffer_type = tt_metal::BufferType::DRAM};

auto src_dram_buffer = CreateBuffer(buff_config);
uint32_t dram_buffer_src_addr = src_dram_buffer->address();
auto dst_dram_buffer = CreateBuffer(buff_config);
uint32_t dram_buffer_dst_addr = dst_dram_buffer->address();

auto dram_src_noc_xy = src_dram_buffer->noc_coordinates();
auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates();

// input CB is larger than the output CB, to test the backpressure from the output CB all the way into the
// input CB CB_out size = 1 forces the serialization of packer and writer kernel, generating backpressure to
// math kernel, input CB and reader
uint32_t src0_cb_index = 0;
uint32_t num_input_tiles = 8;
tt_metal::CircularBufferConfig cb_src0_config =
tt_metal::CircularBufferConfig(
num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}})
.set_page_size(src0_cb_index, single_tile_size);
auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config);

uint32_t ouput_cb_index = 16; // output operands start at index 16
uint32_t num_output_tiles = 1;
tt_metal::CircularBufferConfig cb_output_config =
tt_metal::CircularBufferConfig(
num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}})
.set_page_size(ouput_cb_index, single_tile_size);
auto cb_output = tt_metal::CreateCircularBuffer(program, core, cb_output_config);

auto unary_reader_kernel = tt_metal::CreateKernel(
program,
"tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp",
core,
DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default});

auto unary_writer_kernel = tt_metal::CreateKernel(
program,
"tt_metal/kernels/dataflow/writer_unary.cpp",
core,
DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default});

vector<uint32_t> compute_kernel_args = {
uint(num_tiles) // per_core_tile_cnt
};

auto eltwise_unary_kernel = tt_metal::CreateKernel(
program,
"tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_3m.cpp",
core,
tt_metal::ComputeConfig{.compile_args = compute_kernel_args});
construct_program(program, device, core);

////////////////////////////////////////////////////////////////////////////
// Compile Application
Expand Down Expand Up @@ -166,14 +170,19 @@ int main(int argc, char **argv) {
}
}
tt_metal::detail::ClearKernelCache();
for (auto& program : programs) {
program.invalidate_compile();
std::vector<Program> new_programs;
for (int i = 0; i < num_devices; i++) {
auto& device = devices[i];
new_programs.push_back(Program());
Program& program = new_programs.back();
construct_program(program, device, core);
}

std::vector<std::thread> ths;
ths.reserve(num_devices);
for (int i = 0; i < num_devices; i++) {
auto& device = devices[i];
auto& program = programs[i];
auto& program = new_programs[i];
ths.emplace_back([&] {
for (int j = 0; j < num_compiles; j++) {
uint32_t mask = device->build_key();
Expand Down
17 changes: 5 additions & 12 deletions tt_metal/impl/program/program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ Program::Program() :
}

KernelHandle Program::add_kernel(std::shared_ptr<Kernel> kernel, const HalProgrammableCoreType &programmable_core_type) {
this->invalidate_compile();
TT_FATAL(this->compiled_.empty(), "Cannot add kernel to an already compiled program {}", this->id);
// Id is unique across all kernels on all core types
KernelHandle id = this->num_kernels();
uint32_t index = hal.get_programmable_core_type_index(programmable_core_type);
Expand Down Expand Up @@ -362,7 +362,7 @@ void Program::CircularBufferAllocator::mark_address(uint64_t address, uint64_t s
}

CBHandle Program::add_circular_buffer(const CoreRangeSet &core_range_set, const CircularBufferConfig &config) {
this->invalidate_compile();
TT_FATAL(this->compiled_.empty(), "Cannot add circular buffer to an already compiled program {}", this->id);
std::shared_ptr<CircularBuffer> circular_buffer = std::make_shared<CircularBuffer>(core_range_set, config);
// Globally allocated circular buffer do not invalidate allocation because their addresses are tracked by memory
// allocator
Expand Down Expand Up @@ -558,7 +558,7 @@ void Program::init_semaphores(const Device &device, const CoreCoord &logical_cor
}

void Program::add_semaphore(const CoreRangeSet &crs, uint32_t semaphore_id, uint32_t init_value, CoreType core_type) {
this->invalidate_compile();
TT_FATAL(this->compiled_.empty(), "Cannot add semaphore to an already compiled program {}", this->id);
semaphores_.emplace_back(Semaphore(crs, semaphore_id, init_value, core_type));
}

Expand Down Expand Up @@ -638,12 +638,6 @@ void Program::set_cb_tile_dims(Device *device, const std::vector<CoreRange> &crs
}
}

void Program::invalidate_compile() {
for (auto &[device_id, compile_needed] : compile_needed_) {
compile_needed = true;
}
}

void Program::populate_dispatch_data(Device *device) {
static const uint32_t processor_to_firmware_base[] = {
MEM_BRISC_FIRMWARE_BASE,
Expand Down Expand Up @@ -1035,8 +1029,7 @@ void Program::finalize() {

void Program::compile(Device *device, bool fd_bootloader_mode) {
ZoneScoped;
bool first_compile_on_device = compile_needed_.find(device->id()) == compile_needed_.end();
if (not first_compile_on_device and (not compile_needed_.at(device->id()))) {
if (compiled_.contains(device->id())) {
return;
}

Expand Down Expand Up @@ -1142,7 +1135,7 @@ void Program::compile(Device *device, bool fd_bootloader_mode) {
if (detail::MemoryReporter::enabled()) {
detail::MemoryReporter::inst().flush_program_memory_usage(*this, device);
}
compile_needed_[device->id()] = false;
compiled_.insert(device->id());
}

void Program::set_runtime_id(uint64_t id) { this->runtime_id = id; }
Expand Down
4 changes: 1 addition & 3 deletions tt_metal/impl/program/program.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,8 +141,6 @@ class Program {

void compile(Device * device, bool fd_bootloader_mode = false);

void invalidate_compile();

void invalidate_circular_buffer_allocation();

void allocate_circular_buffers(const Device *device);
Expand Down Expand Up @@ -213,7 +211,7 @@ class Program {
std::vector<Semaphore> semaphores_;

CoreRangeSet worker_crs_;
std::unordered_map<chip_id_t, bool> compile_needed_;
std::unordered_set<chip_id_t> compiled_;
bool local_circular_buffer_allocation_needed_;

static constexpr uint8_t core_to_kernel_group_invalid_index = 0xff;
Expand Down

0 comments on commit a11f11f

Please sign in to comment.