Skip to content

Commit

Permalink
Revert "detect_arch in ClusterDescriptor (#345)"
Browse files Browse the repository at this point in the history
This reverts commit dca4e49.
  • Loading branch information
broskoTT committed Dec 5, 2024
1 parent dc5e371 commit 5914bb1
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 64 deletions.
5 changes: 5 additions & 0 deletions device/api/umd/device/cluster.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@

using TLB_DATA = tt::umd::tlb_data;

// TODO: Remove this - it's here for Metal backwards compatibility.
// Implementation is in cluster.cpp.
tt::ARCH detect_arch(int pci_device_num);
tt::ARCH detect_arch();

namespace boost::interprocess {
class named_mutex;
}
Expand Down
6 changes: 1 addition & 5 deletions device/api/umd/device/tt_cluster_descriptor.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ enum BoardType : uint32_t {
E150 = 2,
P150A = 3,
GALAXY = 4,
UNKNOWN = 5,
DEFAULT = 5,
};

class tt_ClusterDescriptor {
Expand All @@ -53,7 +53,6 @@ class tt_ClusterDescriptor {
std::unordered_map<chip_id_t, chip_id_t> closest_mmio_chip_cache = {};
std::unordered_map<chip_id_t, BoardType> chip_board_type = {};
std::unordered_map<chip_id_t, std::unordered_set<chip_id_t>> chips_grouped_by_closest_mmio;
std::unordered_map<chip_id_t, tt::ARCH> chip_arch = {};

// one-to-many chip connections
struct Chip2ChipConnection {
Expand All @@ -78,7 +77,6 @@ class tt_ClusterDescriptor {
static void load_harvesting_information(YAML::Node &yaml, tt_ClusterDescriptor &desc);

void fill_chips_grouped_by_closest_mmio();
static tt::ARCH arch_from_string(std::string arch_str);

public:
/*
Expand All @@ -98,7 +96,6 @@ class tt_ClusterDescriptor {
static std::string get_cluster_descriptor_file_path();
static std::unique_ptr<tt_ClusterDescriptor> create_from_yaml(const std::string &cluster_descriptor_file_path);
static std::unique_ptr<tt_ClusterDescriptor> create();
static tt::ARCH detect_arch(const chip_id_t chip_id);

// This function is used to create mock cluster descriptor yaml files, for example for simulation.
static std::unique_ptr<tt_ClusterDescriptor> create_mock_cluster(
Expand All @@ -118,7 +115,6 @@ class tt_ClusterDescriptor {
int get_ethernet_link_distance(chip_id_t chip_a, chip_id_t chip_b) const;

BoardType get_board_type(chip_id_t chip_id) const;
tt::ARCH get_arch(chip_id_t chip_id) const;

bool ethernet_core_has_active_ethernet_link(chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const;
std::tuple<chip_id_t, ethernet_channel_t> get_chip_and_channel_of_remote_ethernet_core(
Expand Down
33 changes: 33 additions & 0 deletions device/cluster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,39 @@ const uint64_t BH_4GB_TLB_SIZE = 4ULL * 1024 * 1024 * 1024;
// Remove 256MB from full 1GB for channel 3 (iATU limitation)
static constexpr uint32_t HUGEPAGE_CHANNEL_3_SIZE_LIMIT = 805306368;

// TODO: Remove in favor of cluster descriptor method, when it becomes available.
// Metal uses this function to determine the architecture of the first PCIe chip
// and then verifies that all subsequent chips are of the same architecture. It
// looks like Metal is doing this because we don't provide any other way... When
// we are further along in our refactoring efforts and `tt_device` is more of a
// Cluster abstraction, we should provide Metal with interfaces for:
// 1. Checking that all chips are of the same architecture (we may not care
// about this, but the application might).
// 2. Getting the architecture of a specific chip.
// Until then... I'm putting this function back so that Metal will still build
// next time someone bumps its UMD submodule version.
tt::ARCH detect_arch(int pci_device_num) {
    // Enumerate the PCIe devices and look up the requested device number.
    const auto pci_devices = PCIDevice::enumerate_devices_info();
    const auto entry = pci_devices.find(pci_device_num);

    if (entry != pci_devices.end()) {
        // Device is present: report the architecture stored in its info record.
        return entry->second.get_arch();
    }

    // No device is enumerated under this number.
    return tt::ARCH::Invalid;
}

// TODO: Remove in favor of cluster descriptor method, when it becomes available.
// There is also a function which just wants to get any architecture, since it
// presumably already checked that all archs are the same.
tt::ARCH detect_arch() {
    const auto pci_devices = PCIDevice::enumerate_devices_info();
    const auto first = pci_devices.begin();

    // Nothing was enumerated, so there is no architecture to report.
    if (first == pci_devices.end()) {
        return tt::ARCH::Invalid;
    }

    // Otherwise report the architecture of the first enumerated device.
    return first->second.get_arch();
}

template <typename T>
void size_buffer_to_capacity(std::vector<T>& data_buf, std::size_t size_in_bytes) {
std::size_t target_size = 0;
Expand Down
41 changes: 5 additions & 36 deletions device/tt_cluster_descriptor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -467,7 +467,6 @@ std::unique_ptr<tt_ClusterDescriptor> tt_ClusterDescriptor::create_mock_cluster(
log_debug(tt::LogSiliconDriver, "{} - adding logical: {}", __FUNCTION__, logical_id);
desc->chip_board_type.insert({logical_id, board_type});
desc->chips_with_mmio.insert({logical_id, logical_id});
desc->chip_arch.insert({logical_id, arch});
}

desc->enable_all_devices();
Expand Down Expand Up @@ -694,9 +693,7 @@ void tt_ClusterDescriptor::merge_cluster_ids(tt_ClusterDescriptor &desc) {
void tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc) {
for (YAML::const_iterator node = yaml["arch"].begin(); node != yaml["arch"].end(); ++node) {
chip_id_t chip_id = node->first.as<int>();
std::string arch_str = node->second.as<std::string>();
desc.all_chips.insert(chip_id);
desc.chip_arch.insert({chip_id, arch_from_string(arch_str)});
}

for (YAML::const_iterator node = yaml["chips"].begin(); node != yaml["chips"].end(); ++node) {
Expand Down Expand Up @@ -750,15 +747,15 @@ void tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(YAML::Node &y
log_warning(
LogSiliconDriver,
"Unknown board type for chip {}. This might happen because chip is running old firmware. "
"Defaulting to UNKNOWN",
"Defaulting to DEFAULT",
chip);
board_type = BoardType::UNKNOWN;
board_type = BoardType::DEFAULT;
}
desc.chip_board_type.insert({chip, board_type});
}
} else {
for (const auto &chip : desc.all_chips) {
desc.chip_board_type.insert({chip, BoardType::UNKNOWN});
desc.chip_board_type.insert({chip, BoardType::DEFAULT});
}
}
}
Expand All @@ -784,19 +781,6 @@ void tt_ClusterDescriptor::fill_chips_grouped_by_closest_mmio() {
}
}

// Maps an architecture name string to its tt::ARCH value.
// Recognised names are "Grayskull", "Wormhole" and "Blackhole" (exact match);
// any other string yields tt::ARCH::Invalid.
tt::ARCH tt_ClusterDescriptor::arch_from_string(std::string arch_str) {
    tt::ARCH parsed = tt::ARCH::Invalid;

    if (arch_str == "Grayskull") {
        parsed = tt::ARCH::GRAYSKULL;
    } else if (arch_str == "Wormhole") {
        parsed = tt::ARCH::WORMHOLE_B0;
    } else if (arch_str == "Blackhole") {
        parsed = tt::ARCH::BLACKHOLE;
    }

    return parsed;
}

const std::unordered_map<chip_id_t, std::unordered_map<ethernet_channel_t, std::tuple<chip_id_t, ethernet_channel_t>>>
tt_ClusterDescriptor::get_ethernet_connections() const {
auto eth_connections = std::
Expand Down Expand Up @@ -872,23 +856,8 @@ int tt_ClusterDescriptor::get_ethernet_link_distance(chip_id_t chip_a, chip_id_t
}

BoardType tt_ClusterDescriptor::get_board_type(chip_id_t chip_id) const {
log_assert(
chip_board_type.find(chip_id) != chip_board_type.end(),
"Chip {} does not have a board type in the cluster descriptor",
chip_id);
return chip_board_type.at(chip_id);
}

tt::ARCH tt_ClusterDescriptor::get_arch(chip_id_t chip_id) const {
log_assert(
chip_arch.find(chip_id) != chip_arch.end(),
"Chip {} does not have an architecture in the cluster descriptor",
chip_id);
return chip_arch.at(chip_id);
}

/* static */ tt::ARCH tt_ClusterDescriptor::detect_arch(chip_id_t chip_id) {
return tt_ClusterDescriptor::create()->get_arch(chip_id);
BoardType board_type = this->chip_board_type.at(chip_id);
return board_type;
}

const std::unordered_map<chip_id_t, std::unordered_set<chip_id_t>> &
Expand Down
36 changes: 13 additions & 23 deletions tests/api/test_cluster_descriptor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,35 +13,25 @@
#include "umd/device/pci_device.hpp"
#include "umd/device/tt_cluster_descriptor.h"

// TODO: Needed for detect_arch, remove when it is part of cluster descriptor.
#include "umd/device/cluster.h"

TEST(ApiClusterDescriptorTest, DetectArch) {
std::unique_ptr<tt_ClusterDescriptor> cluster_desc = tt_ClusterDescriptor::create();
// TODO: This should be part of cluster descriptor. It is currently used like this from tt_metal.
tt::ARCH arch = detect_arch();

if (cluster_desc->get_number_of_chips() == 0) {
// Expect it to be invalid if no devices are found.
EXPECT_THROW(tt_ClusterDescriptor::detect_arch(0), std::runtime_error);
// Expect it to be invalid if no devices are found.
if (PCIDevice::enumerate_devices().empty()) {
EXPECT_EQ(arch, tt::ARCH::Invalid);
} else {
tt::ARCH arch = tt_ClusterDescriptor::detect_arch(0);
EXPECT_NE(arch, tt::ARCH::Invalid);

// Test that cluster descriptor and PCIDevice::enumerate_devices_info() return the same set of chips.
std::map<int, PciDeviceInfo> pci_device_infos = PCIDevice::enumerate_devices_info();
std::unordered_set<chip_id_t> pci_chips_set;
for (auto [pci_device_number, _] : pci_device_infos) {
pci_chips_set.insert(pci_device_number);
}
// TODO: This should be the only available API, previous call should be routed to this one to get any arch.
tt::ARCH arch2 = detect_arch(PCIDevice::enumerate_devices()[0]);
EXPECT_NE(arch2, tt::ARCH::Invalid);

std::unordered_map<chip_id_t, chip_id_t> chips_with_mmio = cluster_desc->get_chips_with_mmio();
std::unordered_set<chip_id_t> cluster_chips_set;
for (auto [_, pci_device_number] : chips_with_mmio) {
cluster_chips_set.insert(pci_device_number);
}

EXPECT_EQ(pci_chips_set, cluster_chips_set);

// Test that cluster descriptor holds the same arch as pci_device.
for (auto [chip, pci_device_number] : cluster_desc->get_chips_with_mmio()) {
EXPECT_EQ(cluster_desc->get_arch(chip), pci_device_infos.at(pci_device_number).get_arch());
}
// In our current setup, we expect all chips to have the same architecture.
EXPECT_EQ(arch, arch2);
}
}

Expand Down

0 comments on commit 5914bb1

Please sign in to comment.