Skip to content

Commit

Permalink
#9514: refactor dispatch core manager init
Browse files Browse the repository at this point in the history
  • Loading branch information
aliuTT committed Jul 30, 2024
1 parent 32f45ba commit 087c0fe
Show file tree
Hide file tree
Showing 11 changed files with 104 additions and 93 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ void measure_latency(string kernel_name) {

uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(device->id());
uint8_t num_hw_cqs = device->num_hw_cqs();
CoreCoord producer_logical_core = tt_metal::dispatch_core_manager::get(num_hw_cqs).prefetcher_core(device->id(), channel, 0);
CoreCoord consumer_logical_core = tt_metal::dispatch_core_manager::get(num_hw_cqs).dispatcher_core(device->id(), channel, 0);
CoreCoord producer_logical_core = tt_metal::dispatch_core_manager::instance().prefetcher_core(device->id(), channel, 0);
CoreCoord consumer_logical_core = tt_metal::dispatch_core_manager::instance().dispatcher_core(device->id(), channel, 0);

TT_ASSERT(producer_logical_core != consumer_logical_core, "Producer and consumer core are {}. They should not be the same!", producer_logical_core.str());

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -375,7 +375,7 @@ TEST_F(CommandQueueSingleCardFixture, TestPageLargerThanAndUnalignedToTransferPa
TEST_F(CommandQueueSingleCardFixture, TestPageLargerThanMaxPrefetchCommandSize) {
constexpr uint32_t num_round_robins = 1;
for (Device *device : devices_) {
CoreType dispatch_core_type = dispatch_core_manager::get(device->num_hw_cqs()).get_dispatch_core_type(device->id());
CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id());
const uint32_t max_prefetch_command_size = dispatch_constants::get(dispatch_core_type).max_prefetch_command_size();
TestBufferConfig config = {
.num_pages = 1,
Expand All @@ -389,7 +389,7 @@ TEST_F(CommandQueueSingleCardFixture, TestPageLargerThanMaxPrefetchCommandSize)
TEST_F(CommandQueueSingleCardFixture, TestUnalignedPageLargerThanMaxPrefetchCommandSize) {
constexpr uint32_t num_round_robins = 1;
for (Device *device : devices_) {
CoreType dispatch_core_type = dispatch_core_manager::get(device->num_hw_cqs()).get_dispatch_core_type(device->id());
CoreType dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id());
const uint32_t max_prefetch_command_size = dispatch_constants::get(dispatch_core_type).max_prefetch_command_size();
uint32_t unaligned_page_size = max_prefetch_command_size + 4;
TestBufferConfig config = {
Expand Down
2 changes: 1 addition & 1 deletion tt_metal/core_descriptors/grayskull_120_arch.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ E150:
[[2, -1],[3, -1],[4, -1],[5, -1],[8, -1],[9, -1],[10, -1],[11, -1]]

dispatch_cores:
[[0, -1], [1, -1], [6, -1], [7, -1]]
[[0, -1], [6, -1], [1, -1], [7, -1]]

E75:
1:
Expand Down
107 changes: 53 additions & 54 deletions tt_metal/impl/device/device.cpp

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion tt_metal/impl/device/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,6 @@ class Device {
friend class SystemMemoryManager;

static constexpr MemoryAllocator allocator_scheme_ = MemoryAllocator::L1_BANKING;
static constexpr uint32_t max_num_hw_cqs = 2;
chip_id_t id_;
uint32_t build_key_;
std::unique_ptr<Allocator> allocator_ = nullptr;
Expand Down
1 change: 1 addition & 0 deletions tt_metal/impl/device/device_pool.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ void bind_current_thread_to_free_cores(const std::unordered_set<uint32_t>& free_
} // namespace device_cpu_allocator

DevicePool* DevicePool::_inst = nullptr;
tt_metal::dispatch_core_manager* tt_metal::dispatch_core_manager::_inst = nullptr;

void DevicePool::initialize_device(Device* dev) const {
detail::ClearProfilerControlBuffer(dev);
Expand Down
2 changes: 2 additions & 0 deletions tt_metal/impl/device/device_pool.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ class DevicePool {
size_t trace_region_size = DEFAULT_TRACE_REGION_SIZE,
const std::vector<uint32_t> &l1_bank_remap = {}) noexcept {
log_debug(tt::LogMetal, "DevicePool initialize");
tt::tt_metal::dispatch_core_manager::initialize();

if (_inst == nullptr) {
static DevicePool device_pool(device_ids, num_hw_cqs, l1_small_size, trace_region_size, l1_bank_remap);
_inst = &device_pool;
Expand Down
34 changes: 17 additions & 17 deletions tt_metal/impl/dispatch/command_queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ EnqueueReadBufferCommand::EnqueueReadBufferCommand(
TT_ASSERT(buffer.is_dram() or buffer.is_l1(), "Trying to read an invalid buffer");

this->device = device;
this->dispatch_core_type = dispatch_core_manager::get(device->num_hw_cqs()).get_dispatch_core_type(device->id());
this->dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id());
}

void EnqueueReadInterleavedBufferCommand::add_prefetch_relay(HugepageDeviceCommand& command) {
Expand Down Expand Up @@ -148,7 +148,7 @@ EnqueueWriteBufferCommand::EnqueueWriteBufferCommand(
pages_to_write(pages_to_write.has_value() ? pages_to_write.value() : buffer.num_pages()) {
TT_ASSERT(buffer.is_dram() or buffer.is_l1(), "Trying to write to an invalid buffer");
this->device = device;
this->dispatch_core_type = dispatch_core_manager::get(device->num_hw_cqs()).get_dispatch_core_type(device->id());
this->dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id());
}

void EnqueueWriteInterleavedBufferCommand::add_dispatch_write(HugepageDeviceCommand& command_sequence) {
Expand Down Expand Up @@ -307,7 +307,7 @@ EnqueueProgramCommand::EnqueueProgramCommand(
program(program),
dispatch_core(dispatch_core) {
this->device = device;
this->dispatch_core_type = dispatch_core_manager::get(device->num_hw_cqs()).get_dispatch_core_type(device->id());
this->dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id());
this->packed_write_max_unicast_sub_cmds = get_packed_write_max_unicast_sub_cmds(this->device);
}

Expand Down Expand Up @@ -493,7 +493,7 @@ void EnqueueProgramCommand::assemble_runtime_args_commands() {
static vector<CoreType> core_types = {CoreType::WORKER, CoreType::ETH};

CoreType dispatch_core_type =
dispatch_core_manager::get(this->device->num_hw_cqs()).get_dispatch_core_type(this->device->id());
dispatch_core_manager::instance().get_dispatch_core_type(this->device->id());
const uint32_t max_prefetch_command_size = dispatch_constants::get(dispatch_core_type).max_prefetch_command_size();

// Note: each sub_cmd contain data for multiple kernels (DM*, COMPUTE)
Expand Down Expand Up @@ -1245,7 +1245,7 @@ void EnqueueProgramCommand::process() {
stall_fetch_size_bytes + preamble_fetch_size_bytes + runtime_args_fetch_size_bytes + program_fetch_size_bytes;

CoreType dispatch_core_type =
dispatch_core_manager::get(this->device->num_hw_cqs()).get_dispatch_core_type(this->device->id());
dispatch_core_manager::instance().get_dispatch_core_type(this->device->id());
if (total_fetch_size_bytes <= dispatch_constants::get(dispatch_core_type).max_prefetch_command_size()) {
this->manager.issue_queue_reserve(total_fetch_size_bytes, this->command_queue_id);
uint32_t write_ptr = this->manager.get_issue_queue_write_ptr(this->command_queue_id);
Expand Down Expand Up @@ -1391,7 +1391,7 @@ void EnqueueRecordEventCommand::process() {
command_sequence.add_dispatch_wait(
false, DISPATCH_MESSAGE_ADDR, this->expected_num_workers_completed, this->clear_count);

CoreType core_type = dispatch_core_manager::get(num_hw_cqs).get_dispatch_core_type(this->device->id());
CoreType core_type = dispatch_core_manager::instance().get_dispatch_core_type(this->device->id());
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device->id());
std::vector<CQDispatchWritePackedUnicastSubCmd> unicast_sub_cmds(num_hw_cqs);
std::vector<std::pair<const void*, uint32_t>> event_payloads(num_hw_cqs);
Expand All @@ -1400,10 +1400,10 @@ void EnqueueRecordEventCommand::process() {
tt_cxy_pair dispatch_location;
if (device->is_mmio_capable()) {
dispatch_location =
dispatch_core_manager::get(num_hw_cqs).dispatcher_core(this->device->id(), channel, cq_id);
dispatch_core_manager::instance().dispatcher_core(this->device->id(), channel, cq_id);
} else {
dispatch_location =
dispatch_core_manager::get(num_hw_cqs).dispatcher_d_core(this->device->id(), channel, cq_id);
dispatch_core_manager::instance().dispatcher_d_core(this->device->id(), channel, cq_id);
}

CoreCoord dispatch_physical_core = get_physical_core_coordinate(dispatch_location, core_type);
Expand All @@ -1426,7 +1426,7 @@ void EnqueueRecordEventCommand::process() {
if (not device->is_mmio_capable()) {
for (uint8_t cq_id = 0; cq_id < num_hw_cqs; cq_id++) {
tt_cxy_pair prefetch_location =
dispatch_core_manager::get(num_hw_cqs).prefetcher_core(this->device->id(), channel, cq_id);
dispatch_core_manager::instance().prefetcher_core(this->device->id(), channel, cq_id);
CoreCoord prefetch_physical_core = get_physical_core_coordinate(prefetch_location, core_type);
command_sequence.add_dispatch_write_remote(
this->event_id,
Expand Down Expand Up @@ -1456,7 +1456,7 @@ EnqueueWaitForEventCommand::EnqueueWaitForEventCommand(
manager(manager),
sync_event(sync_event),
clear_count(clear_count) {
this->dispatch_core_type = dispatch_core_manager::get(device->num_hw_cqs()).get_dispatch_core_type(device->id());
this->dispatch_core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id());
// Should not be encountered under normal circumstances (record, wait) unless user is modifying sync event ID.
// TT_ASSERT(command_queue_id != sync_event.cq_id || event != sync_event.event_id,
// "EnqueueWaitForEventCommand cannot wait on it's own event id on the same CQ. Event ID: {} CQ ID: {}",
Expand Down Expand Up @@ -1575,17 +1575,17 @@ HWCommandQueue::HWCommandQueue(Device* device, uint32_t id, NOC noc_index) :
CoreCoord enqueue_program_dispatch_core;
if (device->is_mmio_capable()) {
enqueue_program_dispatch_core =
dispatch_core_manager::get(device->num_hw_cqs()).dispatcher_core(device->id(), channel, id);
dispatch_core_manager::instance().dispatcher_core(device->id(), channel, id);
} else {
enqueue_program_dispatch_core =
dispatch_core_manager::get(device->num_hw_cqs()).dispatcher_d_core(device->id(), channel, id);
dispatch_core_manager::instance().dispatcher_d_core(device->id(), channel, id);
}
CoreType core_type = dispatch_core_manager::get(device->num_hw_cqs()).get_dispatch_core_type(device->id());
CoreType core_type = dispatch_core_manager::instance().get_dispatch_core_type(device->id());
this->physical_enqueue_program_dispatch_core =
device->physical_core_from_logical_core(enqueue_program_dispatch_core, core_type);

tt_cxy_pair completion_q_writer_location =
dispatch_core_manager::get(device->num_hw_cqs()).completion_queue_writer_core(device->id(), channel, this->id);
dispatch_core_manager::instance().completion_queue_writer_core(device->id(), channel, this->id);

this->completion_queue_writer_core = CoreCoord(completion_q_writer_location.x, completion_q_writer_location.y);

Expand Down Expand Up @@ -1654,7 +1654,7 @@ void HWCommandQueue::enqueue_read_buffer(Buffer& buffer, void* dst, bool blockin
chip_id_t mmio_device_id = tt::Cluster::instance().get_associated_mmio_device(this->device->id());
uint16_t channel = tt::Cluster::instance().get_assigned_channel_for_device(this->device->id());
CoreType dispatch_core_type =
dispatch_core_manager::get(this->device->num_hw_cqs()).get_dispatch_core_type(this->device->id());
dispatch_core_manager::instance().get_dispatch_core_type(this->device->id());

uint32_t padded_page_size = buffer.aligned_page_size();
uint32_t pages_to_read = buffer.num_pages();
Expand Down Expand Up @@ -1792,7 +1792,7 @@ void HWCommandQueue::enqueue_write_buffer(
}

CoreType HWCommandQueue::get_dispatch_core_type() {
return dispatch_core_manager::get(device->num_hw_cqs()).get_dispatch_core_type(device->id());
return dispatch_core_manager::instance().get_dispatch_core_type(device->id());
}

void HWCommandQueue::enqueue_write_buffer(const Buffer& buffer, const void* src, bool blocking) {
Expand All @@ -1803,7 +1803,7 @@ void HWCommandQueue::enqueue_write_buffer(const Buffer& buffer, const void* src,

const uint32_t command_issue_limit = this->manager.get_issue_queue_limit(this->id);
CoreType dispatch_core_type =
dispatch_core_manager::get(this->device->num_hw_cqs()).get_dispatch_core_type(this->device->id());
dispatch_core_manager::instance().get_dispatch_core_type(this->device->id());
const uint32_t max_prefetch_command_size = dispatch_constants::get(dispatch_core_type).max_prefetch_command_size();
uint32_t max_data_sizeB =
max_prefetch_command_size - ((sizeof(CQPrefetchCmd) + sizeof(CQDispatchCmd)) * 2); // * 2 to account for issue
Expand Down
11 changes: 6 additions & 5 deletions tt_metal/impl/dispatch/command_queue_interface.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ struct dispatch_constants {
return inst;
}

static constexpr uint8_t MAX_NUM_HW_CQS = 2;
typedef uint16_t prefetch_q_entry_type;
static constexpr uint32_t PREFETCH_Q_LOG_MINSIZE = 4;
static constexpr uint32_t PREFETCH_Q_BASE = DISPATCH_L1_UNRESERVED_BASE;
Expand Down Expand Up @@ -387,17 +388,17 @@ class SystemMemoryManager {
}
this->channel_offset = MAX_HUGEPAGE_SIZE * get_umd_channel(channel) + (channel >> 2) * MAX_DEV_CHANNEL_SIZE;

CoreType core_type = dispatch_core_manager::get(num_hw_cqs).get_dispatch_core_type(device_id);
CoreType core_type = dispatch_core_manager::instance().get_dispatch_core_type(device_id);
for (uint8_t cq_id = 0; cq_id < num_hw_cqs; cq_id++) {
tt_cxy_pair prefetcher_core =
dispatch_core_manager::get(num_hw_cqs).prefetcher_core(device_id, channel, cq_id);
dispatch_core_manager::instance().prefetcher_core(device_id, channel, cq_id);
tt_cxy_pair prefetcher_physical_core =
tt_cxy_pair(prefetcher_core.chip, tt::get_physical_core_coordinate(prefetcher_core, core_type));
this->prefetcher_cores[cq_id] = prefetcher_physical_core;
this->prefetch_q_writers.emplace_back(tt::Cluster::instance().get_static_tlb_writer(prefetcher_physical_core));

tt_cxy_pair completion_queue_writer_core =
dispatch_core_manager::get(num_hw_cqs).completion_queue_writer_core(device_id, channel, cq_id);
dispatch_core_manager::instance().completion_queue_writer_core(device_id, channel, cq_id);
const std::tuple<uint32_t, uint32_t> completion_interface_tlb_data =
tt::Cluster::instance()
.get_tlb_data(tt_cxy_pair(
Expand Down Expand Up @@ -688,7 +689,7 @@ class SystemMemoryManager {
wait_for_fetch_q_space();

// Wrap FetchQ if possible
CoreType core_type = dispatch_core_manager::get(num_hw_cqs).get_dispatch_core_type(device_id);
CoreType core_type = dispatch_core_manager::instance().get_dispatch_core_type(device_id);
uint32_t prefetch_q_base = DISPATCH_L1_UNRESERVED_BASE;
uint32_t prefetch_q_limit = prefetch_q_base + dispatch_constants::get(core_type).prefetch_q_entries() *
sizeof(dispatch_constants::prefetch_q_entry_type);
Expand All @@ -700,7 +701,7 @@ class SystemMemoryManager {

void fetch_queue_write(uint32_t command_size_B, const uint8_t cq_id, bool stall_prefetcher = false) {
CoreType dispatch_core_type =
dispatch_core_manager::get(this->num_hw_cqs).get_dispatch_core_type(this->device_id);
dispatch_core_manager::instance().get_dispatch_core_type(this->device_id);
uint32_t max_command_size_B = dispatch_constants::get(dispatch_core_type).max_prefetch_command_size();
TT_ASSERT(
command_size_B <= max_command_size_B,
Expand Down
30 changes: 19 additions & 11 deletions tt_metal/impl/dispatch/dispatch_core_manager.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,22 +75,27 @@ struct dispatch_core_placement_t {
std::optional<tt_cxy_pair> tunneler_d = std::nullopt; // ethernet tunneler
};


class dispatch_core_manager {
public:
dispatch_core_manager &operator=(const dispatch_core_manager &) = delete;
dispatch_core_manager &operator=(dispatch_core_manager &&other) noexcept = delete;
dispatch_core_manager(const dispatch_core_manager &) = delete;
dispatch_core_manager(dispatch_core_manager &&other) noexcept = delete;

// Ugly to accept num HW CQs here but it is needed to pull the correct number of initially available dispatch cores for assignment
static dispatch_core_manager &get(uint8_t num_hw_cqs) {
static std::unordered_map<uint8_t, std::unique_ptr<dispatch_core_manager>> dispatch_core_managers;
if (dispatch_core_managers[num_hw_cqs] == nullptr) {
// Need to do this since dispatch_core_manager constructor is private
dispatch_core_managers[num_hw_cqs] = std::unique_ptr<dispatch_core_manager>(new dispatch_core_manager(num_hw_cqs));

//TODO: this should probably be in command_queue_interface.hpp, but it's here for now due to circular dependency
static constexpr uint8_t MAX_NUM_HW_CQS = 2;
static void initialize() noexcept {
log_debug(tt::LogMetal, "DevicePool initialize");
if (_inst == nullptr) {
static dispatch_core_manager dispatch_core_manager;
_inst = &dispatch_core_manager;
}
return *dispatch_core_managers[num_hw_cqs];
}

static dispatch_core_manager &instance() {
TT_ASSERT(_inst != nullptr, "Trying to get dispatch_core_manager without initializing it");
return *_inst;
}

/// @brief Gets the location of the kernel desginated to read from the issue queue region from a particular command queue
Expand Down Expand Up @@ -343,6 +348,7 @@ class dispatch_core_manager {
}

void add_dispatch_core_to_device(chip_id_t device_id, const CoreCoord& core) {
// TODO: remove this API, we should read the core descriptor once, should not have backdoors like this to add cores
auto& dispatch_cores = available_dispatch_cores_by_device.at(device_id);
if (std::find(dispatch_cores.begin(), dispatch_cores.end(), core) == dispatch_cores.end()) {
dispatch_cores.push_back(core);
Expand All @@ -353,13 +359,13 @@ class dispatch_core_manager {
/// @brief dispatch_core_manager constructor initializes a list of cores per device that are designated for any dispatch functionality
/// This list contains dispatch cores that have not been assigned to a particular dispatch function
/// @param num_hw_cqs is used to get the correct collection of dispatch cores for a particular device
dispatch_core_manager(uint8_t num_hw_cqs) {
dispatch_core_manager() {
for (chip_id_t device_id = 0; device_id < tt::Cluster::instance().number_of_devices(); device_id++) {
std::list<CoreCoord> &logical_dispatch_cores = this->available_dispatch_cores_by_device[device_id];
for (const CoreCoord &logical_dispatch_core : tt::get_logical_dispatch_cores(device_id, num_hw_cqs)) {
for (const CoreCoord &logical_dispatch_core : tt::get_logical_dispatch_cores(device_id, MAX_NUM_HW_CQS)) {
logical_dispatch_cores.push_back(logical_dispatch_core);
}
this->dispatch_core_type_by_device[device_id] = tt::get_dispatch_core_type(device_id, num_hw_cqs);
this->dispatch_core_type_by_device[device_id] = tt::get_dispatch_core_type(device_id, MAX_NUM_HW_CQS);
}
}

Expand All @@ -383,6 +389,8 @@ class dispatch_core_manager {
std::unordered_map<chip_id_t, std::unordered_map<uint16_t, std::unordered_map<uint8_t, dispatch_core_placement_t>>> dispatch_core_assignments;
std::unordered_map<chip_id_t, std::list<CoreCoord>> available_dispatch_cores_by_device;
std::unordered_map<chip_id_t, CoreType> dispatch_core_type_by_device;
static dispatch_core_manager *_inst;

};


Expand Down
1 change: 1 addition & 0 deletions tt_metal/tt_metal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -734,6 +734,7 @@ Device *CreateDevice(

Device *CreateDeviceMinimal(chip_id_t device_id, const uint8_t num_hw_cqs) {
ZoneScoped;
tt::tt_metal::dispatch_core_manager::initialize();
Device *dev = new Device(device_id, num_hw_cqs, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, {}, true);
tt::Cluster::instance().set_internal_routing_info_for_ethernet_cores(true);
return dev;
Expand Down

0 comments on commit 087c0fe

Please sign in to comment.