Skip to content

Commit

Permalink
Cluster descriptor API for active/idle cores (#541)
Browse files Browse the repository at this point in the history
### Issue

/

### Description

Add structs and functions to get active/idle ETH channels from the cluster
descriptor. This is implemented only for Blackhole, to unblock metal.
The issue to port it to Wormhole is #540.

### List of the changes

- Add struct for active cores
- Add struct for idle cores
- Populate during creation of cluster desc
- Add API for getting ETH channels

### Testing
CI + additional test

### API Changes
/
  • Loading branch information
pjanevskiTT authored Feb 26, 2025
1 parent eef73bf commit da0cc30
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 2 deletions.
5 changes: 5 additions & 0 deletions device/api/umd/device/tt_cluster_descriptor.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ class tt_ClusterDescriptor {
std::unordered_map<chip_id_t, tt::ARCH> chip_arch = {};
std::map<ChipUID, chip_id_t> chip_uid_to_chip_id = {};
std::map<chip_id_t, ChipUID> chip_id_to_chip_uid = {};
// Per-chip set of ETH channels recorded as active while the cluster descriptor is created
// (channels whose port status is PORT_UP or PORT_DOWN).
std::map<chip_id_t, std::set<uint32_t>> active_eth_channels = {};
// Per-chip set of ETH channels recorded as idle while the cluster descriptor is created
// (channels whose port status is PORT_UNUSED).
std::map<chip_id_t, std::set<uint32_t>> idle_eth_channels = {};

// one-to-many chip connections
struct Chip2ChipConnection {
Expand Down Expand Up @@ -129,4 +131,7 @@ class tt_ClusterDescriptor {
std::string serialize() const;

std::filesystem::path serialize_to_file() const;

// Returns a copy of the active ETH channels recorded for chip_id,
// or an empty set if nothing was recorded for that chip.
std::set<uint32_t> get_active_eth_channels(chip_id_t chip_id);
// Returns a copy of the idle ETH channels recorded for chip_id,
// or an empty set if nothing was recorded for that chip.
std::set<uint32_t> get_idle_eth_channels(chip_id_t chip_id);
};
7 changes: 5 additions & 2 deletions device/cluster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3504,8 +3504,9 @@ std::unique_ptr<tt_ClusterDescriptor> Cluster::create_cluster_descriptor(
sizeof(boot_results));

if (boot_results.eth_status.port_status == port_status_e::PORT_UP) {
log_debug(LogSiliconDriver, "Eth core ({}, {}) on chip {} is active", eth_core.x, eth_core.y, chip_id);
// active eth core
desc->active_eth_channels[chip_id].insert(eth_channel);
log_debug(LogSiliconDriver, "Eth core ({}, {}) on chip {} is active", eth_core.x, eth_core.y, chip_id);
const chip_info_t& local_info = boot_results.local_info;
const chip_info_t& remote_info = boot_results.remote_info;

Expand All @@ -3525,12 +3526,14 @@ std::unique_ptr<tt_ClusterDescriptor> Cluster::create_cluster_descriptor(
desc->ethernet_connections[local_chip_id][local_info.eth_id] = {
remote_chip_id.value(), remote_info.eth_id};
}

} else if (boot_results.eth_status.port_status == port_status_e::PORT_DOWN) {
// active eth core, just with link being down.
desc->active_eth_channels[chip_id].insert(eth_channel);
log_debug(
LogSiliconDriver, "Port on eth core ({}, {}) on chip {} is down", eth_core.x, eth_core.y, chip_id);
} else if (boot_results.eth_status.port_status == port_status_e::PORT_UNUSED) {
// idle core
desc->idle_eth_channels[chip_id].insert(eth_channel);
// Pass the core coordinates and chip id that the three {} placeholders expect,
// matching the log calls in the sibling port-status branches.
log_debug(LogSiliconDriver, "Eth core ({}, {}) on chip {} is idle", eth_core.x, eth_core.y, chip_id);
} else if (boot_results.eth_status.port_status == port_status_e::PORT_UNKNOWN) {
log_debug(
Expand Down
18 changes: 18 additions & 0 deletions device/tt_cluster_descriptor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -997,3 +997,21 @@ std::filesystem::path tt_ClusterDescriptor::serialize_to_file() const {

return cluster_path;
}

// Looks up the active ETH channels recorded for the given chip.
// Returns a copy of the stored set, or an empty set when the chip has no entry.
std::set<uint32_t> tt_ClusterDescriptor::get_active_eth_channels(chip_id_t chip_id) {
    const auto entry = active_eth_channels.find(chip_id);
    const bool has_entry = entry != active_eth_channels.end();
    return has_entry ? entry->second : std::set<uint32_t>{};
}

// Looks up the idle ETH channels recorded for the given chip.
// Returns a copy of the stored set, or an empty set when the chip has no entry.
std::set<uint32_t> tt_ClusterDescriptor::get_idle_eth_channels(chip_id_t chip_id) {
    const auto entry = idle_eth_channels.find(chip_id);
    const bool has_entry = entry != idle_eth_channels.end();
    return has_entry ? entry->second : std::set<uint32_t>{};
}
13 changes: 13 additions & 0 deletions tests/blackhole/test_cluster_bh.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -978,3 +978,16 @@ TEST(SiliconDriverBH, RandomSysmemTestWithPcie) {
}
}
}

// Verifies that all ETH channels are classified as either active/idle.
// For chip 0, the ETH core count from the SoC descriptor must equal the combined size of the
// active and idle ETH channel sets reported by the cluster descriptor.
TEST(ClusterBH, TotalNumberOfEthCores) {
    auto cluster = std::make_unique<Cluster>();

    // Total number of ETH cores known to the SoC descriptor of chip 0.
    const uint32_t num_eth_cores = cluster->get_soc_descriptor(0).get_cores(CoreType::ETH).size();

    // Sizes of the two channel classifications for the same chip.
    tt_ClusterDescriptor* descriptor = cluster->get_cluster_description();
    const uint32_t num_active_channels = descriptor->get_active_eth_channels(0).size();
    const uint32_t num_idle_channels = descriptor->get_idle_eth_channels(0).size();

    EXPECT_EQ(num_eth_cores, num_active_channels + num_idle_channels);
}

0 comments on commit da0cc30

Please sign in to comment.