From 89d073d0369386c7618e0d23521c73d69e3ec75e Mon Sep 17 00:00:00 2001 From: Bojan Rosko Date: Wed, 27 Nov 2024 11:11:20 +0000 Subject: [PATCH 1/4] detect_arch --- device/api/umd/device/cluster.h | 5 --- device/api/umd/device/tt_cluster_descriptor.h | 4 +++ device/cluster.cpp | 33 ------------------- device/tt_cluster_descriptor.cpp | 33 +++++++++++++++++-- tests/api/test_cluster_descriptor.cpp | 31 +++++++++++------ 5 files changed, 56 insertions(+), 50 deletions(-) diff --git a/device/api/umd/device/cluster.h b/device/api/umd/device/cluster.h index 9f60adef..dd405a45 100644 --- a/device/api/umd/device/cluster.h +++ b/device/api/umd/device/cluster.h @@ -24,11 +24,6 @@ using TLB_DATA = tt::umd::tlb_data; -// TODO: Remove this - it's here for Metal backwards compatibility. -// Implementation is in cluster.cpp. -tt::ARCH detect_arch(int pci_device_num); -tt::ARCH detect_arch(); - namespace boost::interprocess { class named_mutex; } diff --git a/device/api/umd/device/tt_cluster_descriptor.h b/device/api/umd/device/tt_cluster_descriptor.h index 8fea01f0..1f4b21b3 100644 --- a/device/api/umd/device/tt_cluster_descriptor.h +++ b/device/api/umd/device/tt_cluster_descriptor.h @@ -53,6 +53,7 @@ class tt_ClusterDescriptor { std::unordered_map closest_mmio_chip_cache = {}; std::unordered_map chip_board_type = {}; std::unordered_map> chips_grouped_by_closest_mmio; + std::unordered_map chip_arch = {}; // one-to-many chip connections struct Chip2ChipConnection { @@ -77,6 +78,7 @@ class tt_ClusterDescriptor { static void load_harvesting_information(YAML::Node &yaml, tt_ClusterDescriptor &desc); void fill_chips_grouped_by_closest_mmio(); + static tt::ARCH arch_from_string(std::string arch_str); public: /* @@ -96,6 +98,7 @@ class tt_ClusterDescriptor { static std::string get_cluster_descriptor_file_path(); static std::unique_ptr create_from_yaml(const std::string &cluster_descriptor_file_path); static std::unique_ptr create(); + static tt::ARCH detect_arch(const chip_id_t chip_id); // This function is used to create mock cluster descriptor yaml files, for example for simulation. static std::unique_ptr create_mock_cluster( @@ -115,6 +118,7 @@ class tt_ClusterDescriptor { int get_ethernet_link_distance(chip_id_t chip_a, chip_id_t chip_b) const; BoardType get_board_type(chip_id_t chip_id) const; + tt::ARCH get_arch(chip_id_t chip_id) const; bool ethernet_core_has_active_ethernet_link(chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const; std::tuple get_chip_and_channel_of_remote_ethernet_core( diff --git a/device/cluster.cpp b/device/cluster.cpp index 28d18dc5..d3975804 100644 --- a/device/cluster.cpp +++ b/device/cluster.cpp @@ -57,39 +57,6 @@ const uint64_t BH_4GB_TLB_SIZE = 4ULL * 1024 * 1024 * 1024; // Remove 256MB from full 1GB for channel 3 (iATU limitation) static constexpr uint32_t HUGEPAGE_CHANNEL_3_SIZE_LIMIT = 805306368; -// TODO: Remove in favor of cluster descriptor method, when it becomes available. -// Metal uses this function to determine the architecture of the first PCIe chip -// and then verifies that all subsequent chips are of the same architecture. It -// looks like Metal is doing this because we don't provide any other way... When -// we are further along in our refactoring efforts and `tt_device` is more of a -// Cluster abstraction, we should provide Metal with interfaces for: -// 1. Checking that all chips are of the same architecture (we may not care -// about this, but the application might). -// 2. Getting the architecture of a specific chip. -// Until then... I'm putting this function back so that Metal will still build -// next time someone bumps its UMD submodule version. -tt::ARCH detect_arch(int pci_device_num) { - const auto devices_info = PCIDevice::enumerate_devices_info(); - const auto it = devices_info.find(pci_device_num); - if (it == devices_info.end()) { - return tt::ARCH::Invalid; - } - - const auto info = it->second; - return info.get_arch(); -} - -// TODO: Remove in favor of cluster descriptor method, when it becomes available. -// There is also a function which just wants to get any architecture, since it -// presumably already checked that all archs are the same. -tt::ARCH detect_arch() { - const auto devices_info = PCIDevice::enumerate_devices_info(); - if (devices_info.empty()) { - return tt::ARCH::Invalid; - } - return devices_info.begin()->second.get_arch(); -} - template void size_buffer_to_capacity(std::vector& data_buf, std::size_t size_in_bytes) { std::size_t target_size = 0; diff --git a/device/tt_cluster_descriptor.cpp b/device/tt_cluster_descriptor.cpp index dc9b0b42..215b77dc 100644 --- a/device/tt_cluster_descriptor.cpp +++ b/device/tt_cluster_descriptor.cpp @@ -467,6 +467,7 @@ std::unique_ptr tt_ClusterDescriptor::create_mock_cluster( log_debug(tt::LogSiliconDriver, "{} - adding logical: {}", __FUNCTION__, logical_id); desc->chip_board_type.insert({logical_id, board_type}); desc->chips_with_mmio.insert({logical_id, logical_id}); + desc->chip_arch.insert({logical_id, arch}); } desc->enable_all_devices(); @@ -693,7 +694,9 @@ void tt_ClusterDescriptor::merge_cluster_ids(tt_ClusterDescriptor &desc) { void tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc) { for (YAML::const_iterator node = yaml["arch"].begin(); node != yaml["arch"].end(); ++node) { chip_id_t chip_id = node->first.as(); + std::string arch_str = node->second.as(); desc.all_chips.insert(chip_id); + desc.chip_arch.insert({chip_id, arch_from_string(arch_str)}); } for (YAML::const_iterator node = yaml["chips"].begin(); node != yaml["chips"].end(); ++node) { @@ -781,6 +784,19 @@ void tt_ClusterDescriptor::fill_chips_grouped_by_closest_mmio() { } } +tt::ARCH tt_ClusterDescriptor::arch_from_string(std::string arch_str) { + if (arch_str == "Grayskull") { + return tt::ARCH::GRAYSKULL; + } + if (arch_str == "Wormhole") { + return tt::ARCH::WORMHOLE_B0; + } + if (arch_str == "Blackhole") { + return tt::ARCH::BLACKHOLE; + } + return tt::ARCH::Invalid; +} + const std::unordered_map>> tt_ClusterDescriptor::get_ethernet_connections() const { auto eth_connections = std:: @@ -856,8 +872,21 @@ int tt_ClusterDescriptor::get_ethernet_link_distance(chip_id_t chip_a, chip_id_t } BoardType tt_ClusterDescriptor::get_board_type(chip_id_t chip_id) const { - BoardType board_type = this->chip_board_type.at(chip_id); - return board_type; + if (chip_board_type.find(chip_id) == chip_board_type.end()) { + return BoardType::DEFAULT; + } + return chip_board_type.at(chip_id); +} + +tt::ARCH tt_ClusterDescriptor::get_arch(chip_id_t chip_id) const { + if (chip_arch.find(chip_id) == chip_arch.end()) { + return tt::ARCH::Invalid; + } + return chip_arch.at(chip_id); +} + +/* static */ tt::ARCH tt_ClusterDescriptor::detect_arch(chip_id_t chip_id) { + return tt_ClusterDescriptor::create()->get_arch(chip_id); } const std::unordered_map> & diff --git a/tests/api/test_cluster_descriptor.cpp b/tests/api/test_cluster_descriptor.cpp index 97be7ab1..fa9f65b1 100644 --- a/tests/api/test_cluster_descriptor.cpp +++ b/tests/api/test_cluster_descriptor.cpp @@ -13,12 +13,8 @@ #include "umd/device/pci_device.hpp" #include "umd/device/tt_cluster_descriptor.h" -// TODO: Needed for detect_arch, remove when it is part of cluster descriptor. -#include "umd/device/cluster.h" - TEST(ApiClusterDescriptorTest, DetectArch) { - // TODO: This should be part of cluster descriptor. It is currently used like this from tt_metal. - tt::ARCH arch = detect_arch(); + tt::ARCH arch = tt_ClusterDescriptor::detect_arch(0); // Expect it to be invalid if no devices are found. if (PCIDevice::enumerate_devices().empty()) { @@ -26,12 +22,27 @@ TEST(ApiClusterDescriptorTest, DetectArch) { } else { EXPECT_NE(arch, tt::ARCH::Invalid); - // TODO: This should be the only available API, previous call should be routed to this one to get any arch. - tt::ARCH arch2 = detect_arch(PCIDevice::enumerate_devices()[0]); - EXPECT_NE(arch2, tt::ARCH::Invalid); + std::unique_ptr cluster_desc = tt_ClusterDescriptor::create(); + + // Test that cluster descriptor and PCIDevice::enumerate_devices_info() return the same set of chips. + std::map pci_device_infos = PCIDevice::enumerate_devices_info(); + std::unordered_set pci_chips_set; + for (auto [pci_device_number, _] : pci_device_infos) { + pci_chips_set.insert(pci_device_number); + } - // In our current setup, we expect all arch to be the same. - EXPECT_EQ(arch, arch2); + std::unordered_map chips_with_mmio = cluster_desc->get_chips_with_mmio(); + std::unordered_set cluster_chips_set; + for (auto [_, pci_device_number] : chips_with_mmio) { + cluster_chips_set.insert(pci_device_number); + } + + EXPECT_EQ(pci_chips_set, cluster_chips_set); + + // Test that cluster descriptor holds the same arch as pci_device. + for (auto [chip, pci_device_number] : cluster_desc->get_chips_with_mmio()) { + EXPECT_EQ(cluster_desc->get_arch(chip), pci_device_infos.at(pci_device_number).get_arch()); + } } } From a17dd0a1db9873a7f1436784ce3bb5bbcc4691d2 Mon Sep 17 00:00:00 2001 From: Bojan Rosko Date: Fri, 29 Nov 2024 08:14:44 +0000 Subject: [PATCH 2/4] resolved comment --- device/api/umd/device/tt_cluster_descriptor.h | 2 +- device/tt_cluster_descriptor.cpp | 20 ++++++++++--------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/device/api/umd/device/tt_cluster_descriptor.h b/device/api/umd/device/tt_cluster_descriptor.h index 1f4b21b3..df9ea6ed 100644 --- a/device/api/umd/device/tt_cluster_descriptor.h +++ b/device/api/umd/device/tt_cluster_descriptor.h @@ -30,7 +30,7 @@ enum BoardType : uint32_t { E150 = 2, P150A = 3, GALAXY = 4, - DEFAULT = 5, + UNKNOWN = 5, }; class tt_ClusterDescriptor { diff --git a/device/tt_cluster_descriptor.cpp b/device/tt_cluster_descriptor.cpp index 215b77dc..a62eb0c4 100644 --- a/device/tt_cluster_descriptor.cpp +++ b/device/tt_cluster_descriptor.cpp @@ -750,15 +750,15 @@ void tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(YAML::Node &y log_warning( LogSiliconDriver, "Unknown board type for chip {}. This might happen because chip is running old firmware. " - "Defaulting to DEFAULT", + "Defaulting to UNKNOWN", chip); - board_type = BoardType::DEFAULT; + board_type = BoardType::UNKNOWN; } desc.chip_board_type.insert({chip, board_type}); } } else { for (const auto &chip : desc.all_chips) { - desc.chip_board_type.insert({chip, BoardType::DEFAULT}); + desc.chip_board_type.insert({chip, BoardType::UNKNOWN}); } } } @@ -872,16 +872,18 @@ int tt_ClusterDescriptor::get_ethernet_link_distance(chip_id_t chip_a, chip_id_t } BoardType tt_ClusterDescriptor::get_board_type(chip_id_t chip_id) const { - if (chip_board_type.find(chip_id) == chip_board_type.end()) { - return BoardType::DEFAULT; - } + log_assert( + chip_board_type.find(chip_id) != chip_board_type.end(), + "Chip {} does not have a board type in the cluster descriptor", + chip_id); return chip_board_type.at(chip_id); } tt::ARCH tt_ClusterDescriptor::get_arch(chip_id_t chip_id) const { - if (chip_arch.find(chip_id) == chip_arch.end()) { - return tt::ARCH::Invalid; - } + log_assert( + chip_arch.find(chip_id) != chip_arch.end(), + "Chip {} does not have an architecture in the cluster descriptor", + chip_id); return chip_arch.at(chip_id); } From f57816a755fa02921a2f79a67d70a03f093add67 Mon Sep 17 00:00:00 2001 From: Bojan Rosko Date: Tue, 3 Dec 2024 14:42:19 +0000 Subject: [PATCH 3/4] minor fix for compute machine --- tests/api/test_cluster_descriptor.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/api/test_cluster_descriptor.cpp b/tests/api/test_cluster_descriptor.cpp index fa9f65b1..a0ee9860 100644 --- a/tests/api/test_cluster_descriptor.cpp +++ b/tests/api/test_cluster_descriptor.cpp @@ -14,16 +14,15 @@ #include "umd/device/tt_cluster_descriptor.h" TEST(ApiClusterDescriptorTest, DetectArch) { - tt::ARCH arch = tt_ClusterDescriptor::detect_arch(0); + EXPECT_THROW(tt_ClusterDescriptor::detect_arch(0), std::runtime_error); + + std::unique_ptr cluster_desc = tt_ClusterDescriptor::create(); // Expect it to be invalid if no devices are found. - if (PCIDevice::enumerate_devices().empty()) { - EXPECT_EQ(arch, tt::ARCH::Invalid); - } else { + if (cluster_desc->get_number_of_chips() > 0) { + tt::ARCH arch = tt_ClusterDescriptor::detect_arch(0); EXPECT_NE(arch, tt::ARCH::Invalid); - std::unique_ptr cluster_desc = tt_ClusterDescriptor::create(); - // Test that cluster descriptor and PCIDevice::enumerate_devices_info() return the same set of chips. std::map pci_device_infos = PCIDevice::enumerate_devices_info(); std::unordered_set pci_chips_set; From d22d6bc0d75fc17f455bd70b9e06d7d69de12dd1 Mon Sep 17 00:00:00 2001 From: Bojan Rosko Date: Wed, 4 Dec 2024 15:58:15 +0000 Subject: [PATCH 4/4] fix test --- tests/api/test_cluster_descriptor.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/api/test_cluster_descriptor.cpp b/tests/api/test_cluster_descriptor.cpp index a0ee9860..68a0a61e 100644 --- a/tests/api/test_cluster_descriptor.cpp +++ b/tests/api/test_cluster_descriptor.cpp @@ -14,12 +14,12 @@ #include "umd/device/tt_cluster_descriptor.h" TEST(ApiClusterDescriptorTest, DetectArch) { - EXPECT_THROW(tt_ClusterDescriptor::detect_arch(0), std::runtime_error); - std::unique_ptr cluster_desc = tt_ClusterDescriptor::create(); - // Expect it to be invalid if no devices are found. - if (cluster_desc->get_number_of_chips() > 0) { + if (cluster_desc->get_number_of_chips() == 0) { + // Expect it to be invalid if no devices are found. + EXPECT_THROW(tt_ClusterDescriptor::detect_arch(0), std::runtime_error); + } else { tt::ARCH arch = tt_ClusterDescriptor::detect_arch(0); EXPECT_NE(arch, tt::ARCH::Invalid);