Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#13656: Deprecate support for mutating program after initial compilation on any device #13684

Merged
merged 1 commit into from
Oct 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions tests/tt_metal/tt_metal/test_compile_program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -210,9 +210,9 @@ bool test_compile_program_after_clean_kernel_binary_directory(Device *device) {
std::unordered_map<std::string, std::string> kernel_name_to_hash = kernel_cache_status.kernel_name_to_hash_str;

ClearKernelCache(device->build_key());
program.invalidate_compile();
auto second_kernel_cache_status = CompileProgramTestWrapper(device, program);
assert_program_cache_hit_status(program, /*hit_expected=*/false, second_kernel_cache_status);
auto second_program = create_program(device, default_attributes);
auto second_kernel_cache_status = CompileProgramTestWrapper(device, second_program);
assert_program_cache_hit_status(second_program, /*hit_expected=*/false, second_kernel_cache_status);
assert_kernel_hash_matches(kernel_name_to_hash, second_kernel_cache_status);

return pass;
Expand Down
133 changes: 71 additions & 62 deletions tests/tt_metal/tt_metal/test_compile_sets_kernel_binaries.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,68 @@ std::string get_latest_kernel_binary_path(uint32_t mask, const std::shared_ptr<K
return kernel->name() + "/" + latest_hash;
}

void construct_program(Program& program, Device * device, CoreCoord& core) {
uint32_t single_tile_size = 2 * 1024;
uint32_t num_tiles = 2048;
uint32_t dram_buffer_size =
single_tile_size * num_tiles; // num_tiles of FP16_B, hard-coded in the reader/writer kernels

tt_metal::InterleavedBufferConfig buff_config{
.device = device,
.size = dram_buffer_size,
.page_size = dram_buffer_size,
.buffer_type = tt_metal::BufferType::DRAM};

auto src_dram_buffer = CreateBuffer(buff_config);
uint32_t dram_buffer_src_addr = src_dram_buffer->address();
auto dst_dram_buffer = CreateBuffer(buff_config);
uint32_t dram_buffer_dst_addr = dst_dram_buffer->address();

auto dram_src_noc_xy = src_dram_buffer->noc_coordinates();
auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates();

// input CB is larger than the output CB, to test the backpressure from the output CB all the way into the
// input CB CB_out size = 1 forces the serialization of packer and writer kernel, generating backpressure to
// math kernel, input CB and reader
uint32_t src0_cb_index = 0;
uint32_t num_input_tiles = 8;
tt_metal::CircularBufferConfig cb_src0_config =
tt_metal::CircularBufferConfig(
num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}})
.set_page_size(src0_cb_index, single_tile_size);
auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config);

uint32_t ouput_cb_index = 16; // output operands start at index 16
uint32_t num_output_tiles = 1;
tt_metal::CircularBufferConfig cb_output_config =
tt_metal::CircularBufferConfig(
num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}})
.set_page_size(ouput_cb_index, single_tile_size);
auto cb_output = tt_metal::CreateCircularBuffer(program, core, cb_output_config);

auto unary_reader_kernel = tt_metal::CreateKernel(
program,
"tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp",
core,
DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default});

auto unary_writer_kernel = tt_metal::CreateKernel(
program,
"tt_metal/kernels/dataflow/writer_unary.cpp",
core,
DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default});

vector<uint32_t> compute_kernel_args = {
uint(num_tiles) // per_core_tile_cnt
};

auto eltwise_unary_kernel = tt_metal::CreateKernel(
program,
"tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_3m.cpp",
core,
tt_metal::ComputeConfig{.compile_args = compute_kernel_args});
}

int main(int argc, char **argv) {
bool pass = true;

Expand Down Expand Up @@ -72,65 +134,7 @@ int main(int argc, char **argv) {
programs.push_back(Program());
Program& program = programs.back();

uint32_t single_tile_size = 2 * 1024;
uint32_t num_tiles = 2048;
uint32_t dram_buffer_size =
single_tile_size * num_tiles; // num_tiles of FP16_B, hard-coded in the reader/writer kernels

tt_metal::InterleavedBufferConfig buff_config{
.device = device,
.size = dram_buffer_size,
.page_size = dram_buffer_size,
.buffer_type = tt_metal::BufferType::DRAM};

auto src_dram_buffer = CreateBuffer(buff_config);
uint32_t dram_buffer_src_addr = src_dram_buffer->address();
auto dst_dram_buffer = CreateBuffer(buff_config);
uint32_t dram_buffer_dst_addr = dst_dram_buffer->address();

auto dram_src_noc_xy = src_dram_buffer->noc_coordinates();
auto dram_dst_noc_xy = dst_dram_buffer->noc_coordinates();

// input CB is larger than the output CB, to test the backpressure from the output CB all the way into the
// input CB CB_out size = 1 forces the serialization of packer and writer kernel, generating backpressure to
// math kernel, input CB and reader
uint32_t src0_cb_index = 0;
uint32_t num_input_tiles = 8;
tt_metal::CircularBufferConfig cb_src0_config =
tt_metal::CircularBufferConfig(
num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}})
.set_page_size(src0_cb_index, single_tile_size);
auto cb_src0 = tt_metal::CreateCircularBuffer(program, core, cb_src0_config);

uint32_t ouput_cb_index = 16; // output operands start at index 16
uint32_t num_output_tiles = 1;
tt_metal::CircularBufferConfig cb_output_config =
tt_metal::CircularBufferConfig(
num_output_tiles * single_tile_size, {{ouput_cb_index, tt::DataFormat::Float16_b}})
.set_page_size(ouput_cb_index, single_tile_size);
auto cb_output = tt_metal::CreateCircularBuffer(program, core, cb_output_config);

auto unary_reader_kernel = tt_metal::CreateKernel(
program,
"tests/tt_metal/tt_metal/test_kernels/dataflow/reader_unary_push_4.cpp",
core,
DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default});

auto unary_writer_kernel = tt_metal::CreateKernel(
program,
"tt_metal/kernels/dataflow/writer_unary.cpp",
core,
DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default});

vector<uint32_t> compute_kernel_args = {
uint(num_tiles) // per_core_tile_cnt
};

auto eltwise_unary_kernel = tt_metal::CreateKernel(
program,
"tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_3m.cpp",
core,
tt_metal::ComputeConfig{.compile_args = compute_kernel_args});
construct_program(program, device, core);

////////////////////////////////////////////////////////////////////////////
// Compile Application
Expand Down Expand Up @@ -166,14 +170,19 @@ int main(int argc, char **argv) {
}
}
tt_metal::detail::ClearKernelCache();
for (auto& program : programs) {
program.invalidate_compile();
std::vector<Program> new_programs;
for (int i = 0; i < num_devices; i++) {
auto& device = devices[i];
new_programs.push_back(Program());
Program& program = new_programs.back();
construct_program(program, device, core);
}

std::vector<std::thread> ths;
ths.reserve(num_devices);
for (int i = 0; i < num_devices; i++) {
auto& device = devices[i];
auto& program = programs[i];
auto& program = new_programs[i];
ths.emplace_back([&] {
for (int j = 0; j < num_compiles; j++) {
uint32_t mask = device->build_key();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ using namespace tt;

namespace unit_tests::initialize_semaphores {

void initialize_and_compile_program(tt_metal::Device *device, tt_metal::Program &program, const CoreRange &core_range) {
void initialize_program(tt_metal::Device *device, tt_metal::Program &program, const CoreRange &core_range) {
uint32_t single_tile_size = tt_metal::detail::TileSize(tt::DataFormat::Float16_b);
uint32_t num_tiles = 2048;

Expand Down Expand Up @@ -57,8 +57,6 @@ void initialize_and_compile_program(tt_metal::Device *device, tt_metal::Program
"tests/tt_metal/tt_metal/test_kernels/compute/eltwise_copy_3m.cpp",
core_range,
tt_metal::ComputeConfig{.compile_args = compute_kernel_args});

tt_metal::detail::CompileProgram(device, program);
}

void create_and_read_max_num_semaphores(
Expand All @@ -71,6 +69,8 @@ void create_and_read_max_num_semaphores(
ASSERT_TRUE(semaphore_id == i);
}

tt_metal::detail::CompileProgram(device, program);

program.finalize();

ASSERT_TRUE(tt_metal::detail::ConfigureDeviceWithProgram(device, program));
Expand Down Expand Up @@ -106,7 +106,7 @@ TEST_F(DeviceFixture, InitializeLegalSemaphores) {
for (unsigned int id = 0; id < num_devices_; id++) {
tt_metal::Program program = tt_metal::CreateProgram();
CoreRange core_range({0, 0}, {1, 1});
unit_tests::initialize_semaphores::initialize_and_compile_program(devices_.at(id), program, core_range);
unit_tests::initialize_semaphores::initialize_program(devices_.at(id), program, core_range);
unit_tests::initialize_semaphores::create_and_read_max_num_semaphores(devices_.at(id), program, core_range);
}
}
Expand All @@ -115,7 +115,7 @@ TEST_F(DeviceFixture, InitializeIllegalSemaphores) {
for (unsigned int id = 0; id < num_devices_; id++) {
tt_metal::Program program = tt_metal::CreateProgram();
CoreRange core_range({0, 0}, {1, 1});
unit_tests::initialize_semaphores::initialize_and_compile_program(devices_.at(id), program, core_range);
unit_tests::initialize_semaphores::initialize_program(devices_.at(id), program, core_range);
unit_tests::initialize_semaphores::try_creating_more_than_max_num_semaphores(
devices_.at(id), program, core_range);
}
Expand Down
17 changes: 5 additions & 12 deletions tt_metal/impl/program/program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ Program::Program() :
}

KernelHandle Program::add_kernel(std::shared_ptr<Kernel> kernel, const HalProgrammableCoreType &programmable_core_type) {
this->invalidate_compile();
TT_FATAL(this->compiled_.empty(), "Cannot add kernel to an already compiled program {}", this->id);
// Id is unique across all kernels on all core types
KernelHandle id = this->num_kernels();
uint32_t index = hal.get_programmable_core_type_index(programmable_core_type);
Expand Down Expand Up @@ -362,7 +362,7 @@ void Program::CircularBufferAllocator::mark_address(uint64_t address, uint64_t s
}

CBHandle Program::add_circular_buffer(const CoreRangeSet &core_range_set, const CircularBufferConfig &config) {
this->invalidate_compile();
TT_FATAL(this->compiled_.empty(), "Cannot add circular buffer to an already compiled program {}", this->id);
std::shared_ptr<CircularBuffer> circular_buffer = std::make_shared<CircularBuffer>(core_range_set, config);
// Globally allocated circular buffer do not invalidate allocation because their addresses are tracked by memory
// allocator
Expand Down Expand Up @@ -558,7 +558,7 @@ void Program::init_semaphores(const Device &device, const CoreCoord &logical_cor
}

void Program::add_semaphore(const CoreRangeSet &crs, uint32_t semaphore_id, uint32_t init_value, CoreType core_type) {
this->invalidate_compile();
TT_FATAL(this->compiled_.empty(), "Cannot add semaphore to an already compiled program {}", this->id);
semaphores_.emplace_back(Semaphore(crs, semaphore_id, init_value, core_type));
}

Expand Down Expand Up @@ -638,12 +638,6 @@ void Program::set_cb_tile_dims(Device *device, const std::vector<CoreRange> &crs
}
}

void Program::invalidate_compile() {
for (auto &[device_id, compile_needed] : compile_needed_) {
compile_needed = true;
}
}

void Program::populate_dispatch_data(Device *device) {
static const uint32_t processor_to_firmware_base[] = {
MEM_BRISC_FIRMWARE_BASE,
Expand Down Expand Up @@ -1035,8 +1029,7 @@ void Program::finalize() {

void Program::compile(Device *device, bool fd_bootloader_mode) {
ZoneScoped;
bool first_compile_on_device = compile_needed_.find(device->id()) == compile_needed_.end();
if (not first_compile_on_device and (not compile_needed_.at(device->id()))) {
if (compiled_.contains(device->id())) {
return;
}

Expand Down Expand Up @@ -1142,7 +1135,7 @@ void Program::compile(Device *device, bool fd_bootloader_mode) {
if (detail::MemoryReporter::enabled()) {
detail::MemoryReporter::inst().flush_program_memory_usage(*this, device);
}
compile_needed_[device->id()] = false;
compiled_.insert(device->id());
}

void Program::set_runtime_id(uint64_t id) { this->runtime_id = id; }
Expand Down
4 changes: 1 addition & 3 deletions tt_metal/impl/program/program.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,8 +141,6 @@ class Program {

void compile(Device * device, bool fd_bootloader_mode = false);

void invalidate_compile();

void invalidate_circular_buffer_allocation();

void allocate_circular_buffers(const Device *device);
Expand Down Expand Up @@ -213,7 +211,7 @@ class Program {
std::vector<Semaphore> semaphores_;

CoreRangeSet worker_crs_;
std::unordered_map<chip_id_t, bool> compile_needed_;
std::unordered_set<chip_id_t> compiled_;
bool local_circular_buffer_allocation_needed_;

static constexpr uint8_t core_to_kernel_group_invalid_index = 0xff;
Expand Down