Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Metal build: put detect_arch() back #175

Merged
merged 1 commit into from
Oct 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions device/pcie/pci_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,17 @@ inline void memcpy_from_device(void *dest, const void *src, std::size_t num_byte
}
}

/**
 * Translate this device's PCI device ID into the chip architecture.
 *
 * @return tt::ARCH::GRAYSKULL, tt::ARCH::WORMHOLE_B0 or tt::ARCH::BLACKHOLE
 *         when device_id matches the corresponding PCIe device ID constant;
 *         tt::ARCH::Invalid for any other device ID.
 */
tt::ARCH PciDeviceInfo::get_arch() const {
    if (this->device_id == GS_PCIE_DEVICE_ID) {
        return tt::ARCH::GRAYSKULL;
    } else if (this->device_id == WH_PCIE_DEVICE_ID) {
        return tt::ARCH::WORMHOLE_B0;
    } else if (this->device_id == BH_PCIE_DEVICE_ID) {
        // This branch previously re-tested WH_PCIE_DEVICE_ID, which made it
        // unreachable: Blackhole devices fell through to ARCH::Invalid.
        // NOTE(review): assumes BH_PCIE_DEVICE_ID is defined alongside the
        // GS_/WH_ constants - confirm.
        return tt::ARCH::BLACKHOLE;
    }
    return tt::ARCH::Invalid;
}


/* static */ std::vector<int> PCIDevice::enumerate_devices() {
std::vector<int> device_ids;
Expand All @@ -212,6 +223,23 @@ inline void memcpy_from_device(void *dest, const void *src, std::size_t num_byte
return device_ids;
}

/* static */ std::map<int, PciDeviceInfo> PCIDevice::enumerate_devices_info() {
std::map<int, PciDeviceInfo> infos;
for (int n : PCIDevice::enumerate_devices()) {
int fd = open(fmt::format("/dev/tenstorrent/{}", n).c_str(), O_RDWR | O_CLOEXEC);
if (fd == -1) {
continue;
}

try {
infos[n] = read_device_info(fd);
} catch (...) {}

close(fd);
}
return infos;
}

PCIDevice::PCIDevice(int pci_device_number, int logical_device_id)
: device_path(fmt::format("/dev/tenstorrent/{}", pci_device_number))
, pci_device_num(pci_device_number)
Expand Down
16 changes: 13 additions & 3 deletions device/pcie/pci_device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,9 @@

#include <cstdint>
#include <cstdio>
#include <vector>
#include <map>
#include <unordered_map>
#include <vector>

#include "device/tt_xy_pair.h"
#include "device/tt_arch_types.h"
Expand Down Expand Up @@ -41,13 +42,17 @@ struct PciDeviceInfo
uint16_t pci_bus;
uint16_t pci_device;
uint16_t pci_function;

tt::ARCH get_arch() const;
// TODO: does it make sense to move attributes that we can read from sysfs
// onto this struct as methods? e.g. current_link_width etc.
};

class PCIDevice {
const std::string device_path; // Path to character device: /dev/tenstorrent/N
const int pci_device_num; // N in /dev/tenstorrent/N
const int pci_device_num; // N in /dev/tenstorrent/N
const int logical_id; // Unique identifier for each device in entire network topology
const int pci_device_file_desc; // Character device file descriptor
const int pci_device_file_desc; // Character device file descriptor
const PciDeviceInfo info; // PCI device info
const int numa_node; // -1 if non-NUMA
const int revision; // PCI revision value from sysfs
Expand All @@ -60,6 +65,11 @@ class PCIDevice {
*/
static std::vector<int> enumerate_devices();

/**
* @return a map of PCI device numbers (/dev/tenstorrent/N) to PciDeviceInfo
*/
static std::map<int, PciDeviceInfo> enumerate_devices_info();

/**
* PCI device constructor.
*
Expand Down
3 changes: 3 additions & 0 deletions device/tt_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@

using TLB_DATA = tt::umd::tlb_data;

// TODO: Remove this - it's here for Metal backwards compatibility.
// Implementation is in tt_silicon_driver.cpp.
tt::ARCH detect_arch(int pci_device_num);

namespace boost::interprocess{
class named_mutex;
Expand Down
21 changes: 21 additions & 0 deletions device/tt_silicon_driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,27 @@ std::string hugepage_dir = hugepage_dir_env ? hugepage_dir_env : "/dev/hugepages
// TLB size for DRAM on blackhole - 4GB
const uint64_t BH_4GB_TLB_SIZE = 4ULL * 1024 * 1024 * 1024;

// Metal uses this function to determine the architecture of the first PCIe chip
// and then verifies that all subsequent chips are of the same architecture. It
// looks like Metal is doing this because we don't provide any other way... When
// we are further along in our refactoring efforts and `tt_device` is more of a
// Cluster abstraction, we should provide Metal with interfaces for:
// 1. Checking that all chips are of the same architecture (we may not care
// about this, but the application might).
// 2. Getting the architecture of a specific chip.
// Until then... I'm putting this function back so that Metal will still build
// next time someone bumps its UMD submodule version.
tt::ARCH detect_arch(int pci_device_num) {
    // Look the requested device number up among all enumerated devices;
    // a device that is not present yields tt::ARCH::Invalid.
    const std::map<int, PciDeviceInfo> known_devices = PCIDevice::enumerate_devices_info();
    const auto entry = known_devices.find(pci_device_num);

    return entry != known_devices.end() ? entry->second.get_arch() : tt::ARCH::Invalid;
}

template <typename T>
void size_buffer_to_capacity(std::vector<T> &data_buf, std::size_t size_in_bytes) {
std::size_t target_size = 0;
Expand Down