From b8f5010f490b2a59a9c61977582a2b3e997905e3 Mon Sep 17 00:00:00 2001 From: Joel Smith Date: Wed, 16 Oct 2024 11:34:06 -0700 Subject: [PATCH] Fix Metal build: put detect_arch() back This family of functions was removed during a refactor. Metal is relying on one of them. This change reintroduces it. --- device/pcie/pci_device.cpp | 28 ++++++++++++++++++++++++++++ device/pcie/pci_device.hpp | 16 +++++++++++++--- device/tt_device.h | 3 +++ device/tt_silicon_driver.cpp | 21 +++++++++++++++++++++ 4 files changed, 65 insertions(+), 3 deletions(-) diff --git a/device/pcie/pci_device.cpp b/device/pcie/pci_device.cpp index 1db58674..931de4ce 100644 --- a/device/pcie/pci_device.cpp +++ b/device/pcie/pci_device.cpp @@ -190,6 +190,17 @@ inline void memcpy_from_device(void *dest, const void *src, std::size_t num_byte } } +tt::ARCH PciDeviceInfo::get_arch() const { /* Maps the PCI device ID to an architecture; unrecognized IDs yield tt::ARCH::Invalid. */ + if (this->device_id == GS_PCIE_DEVICE_ID){ + return tt::ARCH::GRAYSKULL; + } else if (this->device_id == WH_PCIE_DEVICE_ID) { + return tt::ARCH::WORMHOLE_B0; + } else if (this->device_id == BH_PCIE_DEVICE_ID){ /* Blackhole is matched on its own device ID, not the Wormhole one. */ + return tt::ARCH::BLACKHOLE; + } + return tt::ARCH::Invalid; +} + /* static */ std::vector PCIDevice::enumerate_devices() { std::vector device_ids; @@ -212,6 +223,23 @@ inline void memcpy_from_device(void *dest, const void *src, std::size_t num_byte return device_ids; } +/* static */ std::map PCIDevice::enumerate_devices_info() { + std::map infos; + for (int n : PCIDevice::enumerate_devices()) { + int fd = open(fmt::format("/dev/tenstorrent/{}", n).c_str(), O_RDWR | O_CLOEXEC); + if (fd == -1) { + continue; + } + + try { + infos[n] = read_device_info(fd); + } catch (...) 
{ /* Deliberately ignored: a failure to read one device's info must not abort enumeration of the others. */ } + close(fd); + } + return infos; +} + PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) : device_path(fmt::format("/dev/tenstorrent/{}", pci_device_number)) , pci_device_num(pci_device_number) diff --git a/device/pcie/pci_device.hpp b/device/pcie/pci_device.hpp index e0a5230c..af325ce1 100644 --- a/device/pcie/pci_device.hpp +++ b/device/pcie/pci_device.hpp @@ -8,8 +8,9 @@ #include #include -#include +#include #include +#include #include "device/tt_xy_pair.h" #include "device/tt_arch_types.h" @@ -41,13 +42,17 @@ struct PciDeviceInfo uint16_t pci_bus; uint16_t pci_device; uint16_t pci_function; + + tt::ARCH get_arch() const; /* Architecture derived from device_id; tt::ARCH::Invalid if the ID is not recognized. */ + // TODO: does it make sense to move attributes that we can read from sysfs + // onto this struct as methods? e.g. current_link_width etc. }; class PCIDevice { const std::string device_path; // Path to character device: /dev/tenstorrent/N - const int pci_device_num; // N in /dev/tenstorrent/N + const int pci_device_num; // N in /dev/tenstorrent/N const int logical_id; // Unique identifier for each device in entire network topology - const int pci_device_file_desc; // Character device file descriptor + const int pci_device_file_desc; // Character device file descriptor const PciDeviceInfo info; // PCI device info const int numa_node; // -1 if non-NUMA const int revision; // PCI revision value from sysfs @@ -60,6 +65,11 @@ class PCIDevice { */ static std::vector enumerate_devices(); + /** + * @return a map of PCI device numbers (/dev/tenstorrent/N) to PciDeviceInfo + */ + static std::map enumerate_devices_info(); + /** * PCI device constructor. * diff --git a/device/tt_device.h b/device/tt_device.h index ee0ea940..96c1e729 100644 --- a/device/tt_device.h +++ b/device/tt_device.h @@ -25,6 +25,9 @@ using TLB_DATA = tt::umd::tlb_data; +// TODO: Remove this - it's here for Metal backwards compatibility. +// Implementation is in tt_silicon_driver.cpp. 
+tt::ARCH detect_arch(int pci_device_num); namespace boost::interprocess{ class named_mutex; diff --git a/device/tt_silicon_driver.cpp b/device/tt_silicon_driver.cpp index 988a73c7..a9285b64 100644 --- a/device/tt_silicon_driver.cpp +++ b/device/tt_silicon_driver.cpp @@ -66,6 +66,27 @@ std::string hugepage_dir = hugepage_dir_env ? hugepage_dir_env : "/dev/hugepages // TLB size for DRAM on blackhole - 4GB const uint64_t BH_4GB_TLB_SIZE = 4ULL * 1024 * 1024 * 1024; +// Metal uses this function to determine the architecture of the first PCIe chip +// and then verifies that all subsequent chips are of the same architecture. It +// looks like Metal is doing this because we don't provide any other way... When +// we are further along in our refactoring efforts and `tt_device` is more of a +// Cluster abstraction, we should provide Metal with interfaces for: +// 1. Checking that all chips are of the same architecture (we may not care +// about this, but the application might). +// 2. Getting the architecture of a specific chip. +// Until then... I'm putting this function back so that Metal will still build +// next time someone bumps its UMD submodule version. +tt::ARCH detect_arch(int pci_device_num) { + const auto devices_info = PCIDevice::enumerate_devices_info(); + const auto it = devices_info.find(pci_device_num); + if (it == devices_info.end()) { + return tt::ARCH::Invalid; /* pci_device_num is not among the enumerated devices */ + } + + const auto &info = it->second; /* bind by reference: no need to copy the entry out of the map */ + return info.get_arch(); +} + template void size_buffer_to_capacity(std::vector &data_buf, std::size_t size_in_bytes) { std::size_t target_size = 0;