Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

detect_arch in ClusterDescriptor #345

Merged
merged 6 commits into from
Dec 4, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions device/api/umd/device/cluster.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,6 @@

using TLB_DATA = tt::umd::tlb_data;

// TODO: Remove this - it's here for Metal backwards compatibility.
// Implementation is in cluster.cpp.
tt::ARCH detect_arch(int pci_device_num);
tt::ARCH detect_arch();

namespace boost::interprocess {
class named_mutex;
}
Expand Down
6 changes: 5 additions & 1 deletion device/api/umd/device/tt_cluster_descriptor.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ enum BoardType : uint32_t {
E150 = 2,
P150A = 3,
GALAXY = 4,
DEFAULT = 5,
UNKNOWN = 5,
};

class tt_ClusterDescriptor {
Expand All @@ -53,6 +53,7 @@ class tt_ClusterDescriptor {
std::unordered_map<chip_id_t, chip_id_t> closest_mmio_chip_cache = {};
std::unordered_map<chip_id_t, BoardType> chip_board_type = {};
std::unordered_map<chip_id_t, std::unordered_set<chip_id_t>> chips_grouped_by_closest_mmio;
std::unordered_map<chip_id_t, tt::ARCH> chip_arch = {};

// one-to-many chip connections
struct Chip2ChipConnection {
Expand All @@ -77,6 +78,7 @@ class tt_ClusterDescriptor {
static void load_harvesting_information(YAML::Node &yaml, tt_ClusterDescriptor &desc);

void fill_chips_grouped_by_closest_mmio();
static tt::ARCH arch_from_string(std::string arch_str);

public:
/*
Expand All @@ -96,6 +98,7 @@ class tt_ClusterDescriptor {
static std::string get_cluster_descriptor_file_path();
static std::unique_ptr<tt_ClusterDescriptor> create_from_yaml(const std::string &cluster_descriptor_file_path);
static std::unique_ptr<tt_ClusterDescriptor> create();
static tt::ARCH detect_arch(const chip_id_t chip_id);

// This function is used to create mock cluster descriptor yaml files, for example for simulation.
static std::unique_ptr<tt_ClusterDescriptor> create_mock_cluster(
Expand All @@ -115,6 +118,7 @@ class tt_ClusterDescriptor {
int get_ethernet_link_distance(chip_id_t chip_a, chip_id_t chip_b) const;

BoardType get_board_type(chip_id_t chip_id) const;
tt::ARCH get_arch(chip_id_t chip_id) const;

bool ethernet_core_has_active_ethernet_link(chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const;
std::tuple<chip_id_t, ethernet_channel_t> get_chip_and_channel_of_remote_ethernet_core(
Expand Down
33 changes: 0 additions & 33 deletions device/cluster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,39 +59,6 @@ const uint64_t BH_4GB_TLB_SIZE = 4ULL * 1024 * 1024 * 1024;
// Remove 256MB from full 1GB for channel 3 (iATU limitation)
static constexpr uint32_t HUGEPAGE_CHANNEL_3_SIZE_LIMIT = 805306368;

// TODO: Remove in favor of cluster descriptor method, when it becomes available.
// Metal uses this function to determine the architecture of the first PCIe chip
// and then verifies that all subsequent chips are of the same architecture. It
// looks like Metal is doing this because we don't provide any other way... When
// we are further along in our refactoring efforts and `tt_device` is more of a
// Cluster abstraction, we should provide Metal with interfaces for:
// 1. Checking that all chips are of the same architecture (we may not care
// about this, but the application might).
// 2. Getting the architecture of a specific chip.
// Until then... I'm putting this function back so that Metal will still build
// next time someone bumps its UMD submodule version.
// Looks up the architecture of the PCIe device with the given device number.
//
// pci_device_num: key into the map returned by PCIDevice::enumerate_devices_info().
// Returns the architecture reported by the matching entry, or tt::ARCH::Invalid
// when the enumeration has no entry for that device number.
tt::ARCH detect_arch(int pci_device_num) {
    const auto devices_info = PCIDevice::enumerate_devices_info();
    const auto it = devices_info.find(pci_device_num);
    if (it == devices_info.end()) {
        return tt::ARCH::Invalid;
    }

    // Query the entry in place: copying the mapped value just to read its
    // architecture is unnecessary.
    return it->second.get_arch();
}

// TODO: Remove in favor of cluster descriptor method, when it becomes available.
// There is also a function which just wants to get any architecture, since it
// presumably already checked that all archs are the same.
// Reports the architecture of the first entry returned by
// PCIDevice::enumerate_devices_info(), or tt::ARCH::Invalid when the
// enumeration is empty.
tt::ARCH detect_arch() {
    const auto pci_infos = PCIDevice::enumerate_devices_info();
    const auto first_entry = pci_infos.begin();
    return first_entry == pci_infos.end() ? tt::ARCH::Invalid : first_entry->second.get_arch();
}

template <typename T>
void size_buffer_to_capacity(std::vector<T>& data_buf, std::size_t size_in_bytes) {
std::size_t target_size = 0;
Expand Down
41 changes: 36 additions & 5 deletions device/tt_cluster_descriptor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,7 @@ std::unique_ptr<tt_ClusterDescriptor> tt_ClusterDescriptor::create_mock_cluster(
log_debug(tt::LogSiliconDriver, "{} - adding logical: {}", __FUNCTION__, logical_id);
desc->chip_board_type.insert({logical_id, board_type});
desc->chips_with_mmio.insert({logical_id, logical_id});
desc->chip_arch.insert({logical_id, arch});
}

desc->enable_all_devices();
Expand Down Expand Up @@ -693,7 +694,9 @@ void tt_ClusterDescriptor::merge_cluster_ids(tt_ClusterDescriptor &desc) {
void tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc) {
for (YAML::const_iterator node = yaml["arch"].begin(); node != yaml["arch"].end(); ++node) {
chip_id_t chip_id = node->first.as<int>();
std::string arch_str = node->second.as<std::string>();
desc.all_chips.insert(chip_id);
desc.chip_arch.insert({chip_id, arch_from_string(arch_str)});
}

for (YAML::const_iterator node = yaml["chips"].begin(); node != yaml["chips"].end(); ++node) {
Expand Down Expand Up @@ -747,15 +750,15 @@ void tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(YAML::Node &y
log_warning(
LogSiliconDriver,
"Unknown board type for chip {}. This might happen because chip is running old firmware. "
"Defaulting to DEFAULT",
"Defaulting to UNKNOWN",
chip);
board_type = BoardType::DEFAULT;
board_type = BoardType::UNKNOWN;
}
desc.chip_board_type.insert({chip, board_type});
}
} else {
for (const auto &chip : desc.all_chips) {
desc.chip_board_type.insert({chip, BoardType::DEFAULT});
desc.chip_board_type.insert({chip, BoardType::UNKNOWN});
}
}
}
Expand All @@ -781,6 +784,19 @@ void tt_ClusterDescriptor::fill_chips_grouped_by_closest_mmio() {
}
}

// Translates an architecture name into the matching tt::ARCH value.
//
// The comparison is exact and case-sensitive. Any name not listed in the
// table below yields tt::ARCH::Invalid.
tt::ARCH tt_ClusterDescriptor::arch_from_string(std::string arch_str) {
    struct NamedArch {
        const char *name;
        tt::ARCH arch;
    };

    static const NamedArch known_archs[] = {
        {"Grayskull", tt::ARCH::GRAYSKULL},
        {"Wormhole", tt::ARCH::WORMHOLE_B0},
        {"Blackhole", tt::ARCH::BLACKHOLE},
    };

    for (const NamedArch &candidate : known_archs) {
        if (arch_str == candidate.name) {
            return candidate.arch;
        }
    }
    return tt::ARCH::Invalid;
}

const std::unordered_map<chip_id_t, std::unordered_map<ethernet_channel_t, std::tuple<chip_id_t, ethernet_channel_t>>>
tt_ClusterDescriptor::get_ethernet_connections() const {
auto eth_connections = std::
Expand Down Expand Up @@ -856,8 +872,23 @@ int tt_ClusterDescriptor::get_ethernet_link_distance(chip_id_t chip_a, chip_id_t
}

// Returns the board type recorded for `chip_id` in this cluster descriptor.
//
// The membership check runs before the lookup so that a chip without a board
// type entry is reported through log_assert with a descriptive message. An
// earlier unconditional lookup-and-return ahead of the check made the
// assertion unreachable; it has been removed.
BoardType tt_ClusterDescriptor::get_board_type(chip_id_t chip_id) const {
    log_assert(
        chip_board_type.find(chip_id) != chip_board_type.end(),
        "Chip {} does not have a board type in the cluster descriptor",
        chip_id);
    return chip_board_type.at(chip_id);
}

// Returns the architecture stored in chip_arch for `chip_id`.
// The chip is expected to be present in chip_arch; this is checked with
// log_assert before the lookup.
tt::ARCH tt_ClusterDescriptor::get_arch(chip_id_t chip_id) const {
// Check membership first so a missing chip is reported with a descriptive message.
log_assert(
chip_arch.find(chip_id) != chip_arch.end(),
"Chip {} does not have an architecture in the cluster descriptor",
chip_id);
return chip_arch.at(chip_id);
}

// Convenience entry point: builds a cluster descriptor via create() on every
// call and returns the architecture that descriptor records for `chip_id`.
/* static */ tt::ARCH tt_ClusterDescriptor::detect_arch(chip_id_t chip_id) {
    const auto cluster_desc = tt_ClusterDescriptor::create();
    return cluster_desc->get_arch(chip_id);
}

const std::unordered_map<chip_id_t, std::unordered_set<chip_id_t>> &
Expand Down
31 changes: 21 additions & 10 deletions tests/api/test_cluster_descriptor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,25 +13,36 @@
#include "umd/device/pci_device.hpp"
#include "umd/device/tt_cluster_descriptor.h"

// TODO: Needed for detect_arch, remove when it is part of cluster descriptor.
#include "umd/device/cluster.h"

TEST(ApiClusterDescriptorTest, DetectArch) {
// TODO: This should be part of cluster descriptor. It is currently used like this from tt_metal.
tt::ARCH arch = detect_arch();
tt::ARCH arch = tt_ClusterDescriptor::detect_arch(0);

// Expect it to be invalid if no devices are found.
if (PCIDevice::enumerate_devices().empty()) {
EXPECT_EQ(arch, tt::ARCH::Invalid);
} else {
EXPECT_NE(arch, tt::ARCH::Invalid);

// TODO: This should be the only available API, previous call should be routed to this one to get any arch.
tt::ARCH arch2 = detect_arch(PCIDevice::enumerate_devices()[0]);
EXPECT_NE(arch2, tt::ARCH::Invalid);
std::unique_ptr<tt_ClusterDescriptor> cluster_desc = tt_ClusterDescriptor::create();

// Test that cluster descriptor and PCIDevice::enumerate_devices_info() return the same set of chips.
std::map<int, PciDeviceInfo> pci_device_infos = PCIDevice::enumerate_devices_info();
std::unordered_set<chip_id_t> pci_chips_set;
for (auto [pci_device_number, _] : pci_device_infos) {
pci_chips_set.insert(pci_device_number);
}

// In our current setup, we expect all architectures to be the same.
EXPECT_EQ(arch, arch2);
std::unordered_map<chip_id_t, chip_id_t> chips_with_mmio = cluster_desc->get_chips_with_mmio();
std::unordered_set<chip_id_t> cluster_chips_set;
for (auto [_, pci_device_number] : chips_with_mmio) {
cluster_chips_set.insert(pci_device_number);
}

EXPECT_EQ(pci_chips_set, cluster_chips_set);

// Test that cluster descriptor holds the same arch as pci_device.
for (auto [chip, pci_device_number] : cluster_desc->get_chips_with_mmio()) {
EXPECT_EQ(cluster_desc->get_arch(chip), pci_device_infos.at(pci_device_number).get_arch());
}
}
}

Expand Down
Loading