From 1ab0621e0345eada20b1418b941353d78d640042 Mon Sep 17 00:00:00 2001 From: Joel Smith Date: Fri, 11 Oct 2024 07:53:02 -0700 Subject: [PATCH 1/2] PCIDevice cleanup --- device/pcie/pci_device.cpp | 208 +++++++++++++++----------------- device/pcie/pci_device.hpp | 155 ++++++++++++++++-------- device/pcie/utils.hpp | 87 ------------- device/tt_arch_types.h | 3 + device/tt_silicon_driver.cpp | 74 ++++++++---- tests/pcie/test_pcie_device.cpp | 4 +- 6 files changed, 258 insertions(+), 273 deletions(-) delete mode 100644 device/pcie/utils.hpp diff --git a/device/pcie/pci_device.cpp b/device/pcie/pci_device.cpp index 11272199..fedac7d0 100644 --- a/device/pcie/pci_device.cpp +++ b/device/pcie/pci_device.cpp @@ -14,7 +14,7 @@ #include // for PCI_SLOT, PCI_FUNC #include "pci_device.hpp" -#include "utils.hpp" +#include "ioctl.h" #include "ioctl.h" #include "device/tt_arch_types.h" @@ -23,6 +23,49 @@ #include "common/assert.hpp" #include "common/logger.hpp" +static const uint16_t GS_PCIE_DEVICE_ID = 0xfaca; +static const uint16_t WH_PCIE_DEVICE_ID = 0x401e; +static const uint16_t BH_PCIE_DEVICE_ID = 0xb140; + +// TODO: we'll have to rethink this when KMD takes control of the inbound PCIe +// TLB windows and there is no longer a pre-defined WC/UC split. +static const uint32_t GS_BAR0_WC_MAPPING_SIZE = (156<<20) + (10<<21) + (18<<24); + +// Defines the address for WC region. 
addresses 0 to BH_BAR0_WC_MAPPING_SIZE are in WC, above that are UC +static const uint32_t BH_BAR0_WC_MAPPING_SIZE = 188<<21; + +static const uint32_t BH_NOC_NODE_ID_OFFSET = 0x1FD04044; +static const uint32_t GS_WH_ARC_SCRATCH_6_OFFSET = 0x1FF30078; + +template +static T read_sysfs(const PciDeviceInfo &device_info, const std::string &attribute_name) { + const auto sysfs_path = fmt::format("/sys/bus/pci/devices/{:04x}:{:02x}:{:02x}.{:x}/{}", + device_info.pci_domain, device_info.pci_bus, + device_info.pci_device, device_info.pci_function, attribute_name); + std::ifstream attribute_file(sysfs_path); + std::string value_str; + T value; + + if (!std::getline(attribute_file, value_str)) { + TT_THROW("Failed reading sysfs attribute: {}", sysfs_path); + } + + std::istringstream iss(value_str); + + // Handle hexadecimal input for integer types + if constexpr (std::is_integral_v) { + if (value_str.substr(0, 2) == "0x") { + iss >> std::hex; + } + } + + if (!(iss >> value)) { + TT_THROW("Failed to parse sysfs attribute value: {}", value_str); + } + + return value; +} + static PciDeviceInfo read_device_info(int fd) { tenstorrent_get_device_info info{}; @@ -39,26 +82,21 @@ static PciDeviceInfo read_device_info(int fd) return PciDeviceInfo{info.out.vendor_id, info.out.device_id, info.out.pci_domain, bus, dev, fn}; } -static int determine_numa_node(int fd) -{ - const auto device_info = read_device_info(fd); - const auto sysfs_path = fmt::format("/sys/bus/pci/devices/{:04x}:{:02x}:{:02x}.{}/numa_node", - device_info.pci_domain, device_info.pci_bus, - device_info.pci_device, device_info.pci_function); - - std::ifstream numa_file(sysfs_path); - int numa_node = -1; - if (numa_file >> numa_node) { - return numa_node; +static tt::ARCH detect_arch(uint32_t pcie_device_id, uint32_t pcie_revision_id) { + if (pcie_device_id == GS_PCIE_DEVICE_ID){ + return tt::ARCH::GRAYSKULL; + } else if (pcie_device_id == WH_PCIE_DEVICE_ID && pcie_revision_id == 0x01){ + return tt::ARCH::WORMHOLE_B0; + } 
else if (pcie_device_id == WH_PCIE_DEVICE_ID){ + // TODO: did we ship any of these? I've never seen one. Can we stop + // having an ARCH for it if they don't exist? + TT_THROW("Wormhole is not supported. Please use Wormhole B0 instead."); + return tt::ARCH::WORMHOLE; + } else if (pcie_device_id == WH_PCIE_DEVICE_ID){ + return tt::ARCH::BLACKHOLE; + } else { + TT_THROW("Unknown pcie device id that does not match any known architecture: ", pcie_device_id); } - return -1; -} - - -tt::ARCH detect_arch(int device_id){ - std::uint32_t pcie_device_id = get_pcie_info(device_id, "pcie_device_id"); - std::uint32_t pcie_revision_id = get_pcie_info(device_id, "revision"); - return detect_arch(pcie_device_id, pcie_revision_id); } // Custom device memcpy. This is only safe for memory-like regions on the device (Tensix L1, DRAM, ARC CSM). @@ -152,9 +190,6 @@ inline void memcpy_from_device(void *dest, const void *src, std::size_t num_byte } } -// -------------------------------------------------------------------------------------------------------------- -// -------------------------------------------------------------------------------------------------------------- -// -------------------------------------------------------------------------------------------------------------- /* static */ std::vector PCIDevice::enumerate_devices() { std::vector device_ids; @@ -177,31 +212,17 @@ inline void memcpy_from_device(void *dest, const void *src, std::size_t num_byte return device_ids; } -PCIDevice::PCIDevice(int device_id, int logical_device_id) { - // TODO: use C++ constructor to do everything - // TODO: make public member vars const - // TODO: get logical_id out of here - this->device_id = device_id; - this->logical_id = logical_device_id; - setup_device(); - - this->info = read_device_info(device_fd); - -} - -PCIDevice::~PCIDevice() { - close_device(); -} - - -void PCIDevice::setup_device() { - this->device_fd = find_device(this->device_id); - this->numa_node = 
determine_numa_node(this->device_fd); - this->pcie_device_id = get_pcie_info(this->device_id, "pcie_device_id"); - this->pcie_revision_id = get_pcie_info(this->device_id, "revision"); - this->arch = detect_arch(pcie_device_id, pcie_revision_id); - this->architecture_implementation = tt::umd::architecture_implementation::create(static_cast(arch)); - +PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) + : device_path(fmt::format("/dev/tenstorrent/{}", pci_device_number)) + , pci_device_num(pci_device_number) + , logical_id(logical_device_id) + , pci_device_file_desc(open(device_path.c_str(), O_RDWR | O_CLOEXEC)) + , info(read_device_info(pci_device_file_desc)) + , numa_node(read_sysfs(info, "numa_node")) + , revision(read_sysfs(info, "revision")) + , arch(detect_arch(info.device_id, revision)) + , architecture_implementation(tt::umd::architecture_implementation::create(static_cast(arch))) +{ struct { tenstorrent_query_mappings query_mappings; tenstorrent_mapping mapping_array[8]; @@ -209,27 +230,20 @@ void PCIDevice::setup_device() { memset(&mappings, 0, sizeof(mappings)); mappings.query_mappings.in.output_mapping_count = 8; - if (ioctl(device_fd, TENSTORRENT_IOCTL_QUERY_MAPPINGS, &mappings.query_mappings) == -1) { - throw std::runtime_error(fmt::format("Query mappings failed on device {}.", device_id)); + if (ioctl(pci_device_file_desc, TENSTORRENT_IOCTL_QUERY_MAPPINGS, &mappings.query_mappings) == -1) { + throw std::runtime_error(fmt::format("Query mappings failed on device {}.", pci_device_num)); } // Mapping resource to BAR // Resource 0 -> BAR0 // Resource 1 -> BAR2 // Resource 2 -> BAR4 - tenstorrent_mapping bar0_uc_mapping; - tenstorrent_mapping bar0_wc_mapping; - tenstorrent_mapping bar2_uc_mapping; - tenstorrent_mapping bar2_wc_mapping; - tenstorrent_mapping bar4_uc_mapping; - tenstorrent_mapping bar4_wc_mapping; - - memset(&bar0_uc_mapping, 0, sizeof(bar0_uc_mapping)); - memset(&bar0_wc_mapping, 0, sizeof(bar0_wc_mapping)); - 
memset(&bar2_uc_mapping, 0, sizeof(bar2_uc_mapping)); - memset(&bar2_wc_mapping, 0, sizeof(bar2_wc_mapping)); - memset(&bar4_uc_mapping, 0, sizeof(bar4_uc_mapping)); - memset(&bar4_wc_mapping, 0, sizeof(bar4_wc_mapping)); + tenstorrent_mapping bar0_uc_mapping{}; + tenstorrent_mapping bar0_wc_mapping{}; + tenstorrent_mapping bar2_uc_mapping{}; + tenstorrent_mapping bar2_wc_mapping{}; + tenstorrent_mapping bar4_uc_mapping{}; + tenstorrent_mapping bar4_wc_mapping{}; for (unsigned int i = 0; i < mappings.query_mappings.in.output_mapping_count; i++) { if (mappings.mapping_array[i].mapping_id == TENSTORRENT_MAPPING_RESOURCE0_UC) { @@ -263,7 +277,7 @@ void PCIDevice::setup_device() { } if (bar0_uc_mapping.mapping_id != TENSTORRENT_MAPPING_RESOURCE0_UC) { - throw std::runtime_error(fmt::format("Device {} has no BAR0 UC mapping.", device_id)); + throw std::runtime_error(fmt::format("Device {} has no BAR0 UC mapping.", pci_device_num)); } auto wc_mapping_size = arch == tt::ARCH::BLACKHOLE ? BH_BAR0_WC_MAPPING_SIZE : GS_BAR0_WC_MAPPING_SIZE; @@ -271,7 +285,7 @@ void PCIDevice::setup_device() { // Attempt WC mapping first so we can fall back to all-UC if it fails. 
if (bar0_wc_mapping.mapping_id == TENSTORRENT_MAPPING_RESOURCE0_WC) { bar0_wc_size = std::min(bar0_wc_mapping.mapping_size, wc_mapping_size); - bar0_wc = mmap(NULL, bar0_wc_size, PROT_READ | PROT_WRITE, MAP_SHARED, device_fd, bar0_wc_mapping.mapping_base); + bar0_wc = mmap(NULL, bar0_wc_size, PROT_READ | PROT_WRITE, MAP_SHARED, pci_device_file_desc, bar0_wc_mapping.mapping_base); if (bar0_wc == MAP_FAILED) { bar0_wc_size = 0; bar0_wc = nullptr; @@ -288,10 +302,10 @@ void PCIDevice::setup_device() { bar0_uc_offset = 0; } - bar0_uc = mmap(NULL, bar0_uc_size, PROT_READ | PROT_WRITE, MAP_SHARED, device_fd, bar0_uc_mapping.mapping_base + bar0_uc_offset); + bar0_uc = mmap(NULL, bar0_uc_size, PROT_READ | PROT_WRITE, MAP_SHARED, pci_device_file_desc, bar0_uc_mapping.mapping_base + bar0_uc_offset); if (bar0_uc == MAP_FAILED) { - throw std::runtime_error(fmt::format("BAR0 UC mapping failed for device {}.", device_id)); + throw std::runtime_error(fmt::format("BAR0 UC mapping failed for device {}.", pci_device_num)); } if (!bar0_wc) { @@ -300,43 +314,43 @@ void PCIDevice::setup_device() { if (arch == tt::ARCH::WORMHOLE_B0) { if (bar4_uc_mapping.mapping_id != TENSTORRENT_MAPPING_RESOURCE2_UC) { - throw std::runtime_error(fmt::format("Device {} has no BAR4 UC mapping.", device_id)); + throw std::runtime_error(fmt::format("Device {} has no BAR4 UC mapping.", pci_device_num)); } system_reg_mapping_size = bar4_uc_mapping.mapping_size; - system_reg_mapping = mmap(NULL, bar4_uc_mapping.mapping_size, PROT_READ | PROT_WRITE, MAP_SHARED, device_fd, bar4_uc_mapping.mapping_base); + system_reg_mapping = mmap(NULL, bar4_uc_mapping.mapping_size, PROT_READ | PROT_WRITE, MAP_SHARED, pci_device_file_desc, bar4_uc_mapping.mapping_base); if (system_reg_mapping == MAP_FAILED) { - throw std::runtime_error(fmt::format("BAR4 UC mapping failed for device {}.", device_id)); + throw std::runtime_error(fmt::format("BAR4 UC mapping failed for device {}.", pci_device_num)); } system_reg_start_offset = 
(512 - 16) * 1024*1024; system_reg_offset_adjust = (512 - 32) * 1024*1024; } else if(arch == tt::ARCH::BLACKHOLE) { if (bar2_uc_mapping.mapping_id != TENSTORRENT_MAPPING_RESOURCE1_UC) { - throw std::runtime_error(fmt::format("Device {} has no BAR2 UC mapping.", device_id)); + throw std::runtime_error(fmt::format("Device {} has no BAR2 UC mapping.", pci_device_num)); } // Using UnCachable memory mode. This is used for accessing registers on Blackhole. bar2_uc_size = bar2_uc_mapping.mapping_size; - bar2_uc = mmap(NULL, bar2_uc_mapping.mapping_size, PROT_READ | PROT_WRITE, MAP_SHARED, device_fd, bar2_uc_mapping.mapping_base); + bar2_uc = mmap(NULL, bar2_uc_mapping.mapping_size, PROT_READ | PROT_WRITE, MAP_SHARED, pci_device_file_desc, bar2_uc_mapping.mapping_base); if (bar2_uc == MAP_FAILED) { - throw std::runtime_error(fmt::format("BAR2 UC mapping failed for device {}.", device_id)); + throw std::runtime_error(fmt::format("BAR2 UC mapping failed for device {}.", pci_device_num)); } if (bar4_wc_mapping.mapping_id != TENSTORRENT_MAPPING_RESOURCE2_WC) { - throw std::runtime_error(fmt::format("Device {} has no BAR4 WC mapping.", device_id)); + throw std::runtime_error(fmt::format("Device {} has no BAR4 WC mapping.", pci_device_num)); } // Using Write-Combine memory mode. This is used for accessing DRAM on Blackhole. // WC doesn't guarantee write ordering but has better performance. 
bar4_wc_size = bar4_wc_mapping.mapping_size; - bar4_wc = mmap(NULL, bar4_wc_mapping.mapping_size, PROT_READ | PROT_WRITE, MAP_SHARED, device_fd, bar4_wc_mapping.mapping_base); + bar4_wc = mmap(NULL, bar4_wc_mapping.mapping_size, PROT_READ | PROT_WRITE, MAP_SHARED, pci_device_file_desc, bar4_wc_mapping.mapping_base); if (bar4_wc == MAP_FAILED) { - throw std::runtime_error(fmt::format("BAR4 WC mapping failed for device {}.", device_id)); + throw std::runtime_error(fmt::format("BAR4 WC mapping failed for device {}.", pci_device_num)); } } @@ -344,19 +358,20 @@ void PCIDevice::setup_device() { read_checking_offset = arch == tt::ARCH::BLACKHOLE ? BH_NOC_NODE_ID_OFFSET : GS_WH_ARC_SCRATCH_6_OFFSET; } -void PCIDevice::close_device() { +PCIDevice::~PCIDevice() { if (arch == tt::ARCH::BLACKHOLE && bar2_uc != nullptr && bar2_uc != MAP_FAILED) { // Disable ATU index 0 // TODO: Implement disabling for all indexes, once more host channels are enabled. + + // This is not going to happen if the application crashes, so if it's + // essential for correctness then it needs to move to the driver. 
uint64_t iatu_index = 0; uint64_t iatu_base = UNROLL_ATU_OFFSET_BAR + iatu_index * 0x200; uint32_t region_ctrl_2 = 0 << 31; // REGION_EN = 0 - write_regs(reinterpret_cast(static_cast(bar2_uc) + iatu_base + 0x04), &region_ctrl_2, 1); + write_regs(reinterpret_cast(static_cast(bar2_uc) + iatu_base + 0x04), &region_ctrl_2, 1); } - if (device_fd != -1) { - ::close(device_fd); - } + close(pci_device_file_desc); if (bar0_wc != nullptr && bar0_wc != MAP_FAILED && bar0_wc != bar0_uc) { munmap(bar0_wc, bar0_wc_size); @@ -377,33 +392,10 @@ void PCIDevice::close_device() { if (system_reg_mapping != nullptr && system_reg_mapping != MAP_FAILED) { munmap(system_reg_mapping, system_reg_mapping_size); } - - device_fd = -1; - bar0_uc = nullptr; - bar0_wc = nullptr; - bar2_uc = nullptr; - bar4_wc = nullptr; - system_reg_mapping = nullptr; -} - -// Open a unique device_id per host memory channel (workaround for ttkmd < 1.21 support for more than 1 pin per fd) -void PCIDevice::open_hugepage_per_host_mem_ch(uint32_t num_host_mem_channels) { - for (int ch = 0; ch < num_host_mem_channels; ch++) { - log_debug(LogSiliconDriver, "Opening device_fd_per_host_ch device index: {} ch: {} (num_host_mem_channels: {})", device_id, ch, num_host_mem_channels); - int device_fd_for_host_mem = find_device(device_id); - if (device_fd_for_host_mem == -1) { - throw std::runtime_error(fmt::format("Failed opening a host memory device handle for device {}.", device_id)); - } - device_fd_per_host_ch.push_back(device_fd_for_host_mem); - } -} - -tt::ARCH PCIDevice::get_arch() const { - return arch; } template -T* PCIDevice::get_register_address(std::uint32_t register_offset) { +T* PCIDevice::get_register_address(uint32_t register_offset) { // Right now, address can either be exposed register in BAR, or TLB window in BAR0 (BAR4 for Blackhole). 
// Should clarify this interface void *reg_mapping; @@ -481,11 +473,11 @@ void PCIDevice::read_regs(uint32_t byte_addr, uint32_t word_len, void *data) { } } -void PCIDevice::write_tlb_reg(uint32_t byte_addr, std::uint64_t value_lower, std::uint64_t value_upper, std::uint32_t tlb_cfg_reg_size){ +void PCIDevice::write_tlb_reg(uint32_t byte_addr, uint64_t value_lower, uint64_t value_upper, uint32_t tlb_cfg_reg_size){ log_assert((tlb_cfg_reg_size == 8) or (tlb_cfg_reg_size == 12), "Tenstorrent hardware supports only 64bit or 96bit TLB config regs"); - volatile uint64_t *dest_qw = get_register_address(byte_addr); - volatile uint32_t *dest_extra_dw = get_register_address(byte_addr+8); + volatile uint64_t *dest_qw = get_register_address(byte_addr); + volatile uint32_t *dest_extra_dw = get_register_address(byte_addr+8); #if defined(__ARM_ARCH) || defined(__riscv) // The store below goes through UC memory on x86, which has implicit ordering constraints with WC accesses. // ARM has no concept of UC memory. This will not allow for implicit ordering of this store wrt other memory accesses. diff --git a/device/pcie/pci_device.hpp b/device/pcie/pci_device.hpp index d033a094..05ca83cd 100644 --- a/device/pcie/pci_device.hpp +++ b/device/pcie/pci_device.hpp @@ -5,23 +5,23 @@ */ #pragma once + #include +#include #include #include "device/tt_arch_types.h" #include "device/architecture_implementation.h" -static uint32_t GS_BAR0_WC_MAPPING_SIZE = (156<<20) + (10<<21) + (18<<24); -static uint32_t BH_BAR0_WC_MAPPING_SIZE = 188<<21; // Defines the address for WC region. addresses 0 to BH_BAR0_WC_MAPPING_SIZE are in WC, above that are UC - -static const uint32_t BH_NOC_NODE_ID_OFFSET = 0x1FD04044; -static const uint32_t GS_WH_ARC_SCRATCH_6_OFFSET = 0x1FF30078; - +// TODO: this is used up in tt_silicon_driver.cpp but that logic ought to be +// lowered into the PCIDevice class since it is specific to PCIe cards. 
// See /vendor_ip/synopsys/052021/bh_pcie_ctl_gen5/export/configuration/DWC_pcie_ctl.h static const uint64_t UNROLL_ATU_OFFSET_BAR = 0x1200; +// TODO: this is a bit of a hack... something to revisit when we formalize an +// abstraction for IO. // BAR0 size for Blackhole, used to determine whether write block should use BAR0 or BAR4 -const uint64_t BAR0_BH_SIZE = 512 * 1024 * 1024; +static const uint64_t BAR0_BH_SIZE = 512 * 1024 * 1024; constexpr unsigned int c_hang_read_value = 0xffffffffu; struct PciDeviceInfo @@ -35,19 +35,99 @@ struct PciDeviceInfo }; class PCIDevice { + const std::string device_path; // Path to character device: /dev/tenstorrent/N + const int pci_device_num; // N in /dev/tenstorrent/N + const int logical_id; // Unique identifier for each device in entire network topology + const int pci_device_file_desc; // Character device file descriptor + const PciDeviceInfo info; // PCI device info + const int numa_node; // -1 if non-NUMA + const int revision; // PCI revision value from sysfs + const tt::ARCH arch; // e.g. Grayskull, Wormhole, Blackhole + std::unique_ptr architecture_implementation; + public: /** - * Return a list of integers corresponding to character devices in /dev/tenstorrent/ + * @return a list of integers corresponding to character devices in /dev/tenstorrent/ */ static std::vector enumerate_devices(); - PCIDevice(int device_id, int logical_device_id); + /** + * PCI device constructor. + * + * Opens the character device file descriptor, reads device information from + * sysfs, and maps device memory region(s) into the process address space. + * + * @param pci_device_number N in /dev/tenstorrent/N + * @param logical_device_id unique identifier for this device in the network topology + */ + PCIDevice(int pci_device_number, int logical_device_id = 0); + + /** + * PCIDevice destructor. + * Unmaps device memory and closes chardev file descriptor. 
+ */ ~PCIDevice(); + PCIDevice(const PCIDevice&) = delete; // copy void operator=(const PCIDevice&) = delete; // copy assignment + /** + * @return PCI device info + */ const PciDeviceInfo get_device_info() const { return info; } + /** + * @return which NUMA node this device is associated with, or -1 if non-NUMA + */ + int get_numa_node() const { return numa_node; } + + /** + * @return underlying file descriptor + * TODO: this is an abstraction violation to be removed when this class + * assumes control over hugepage/DMA mapping code. + */ + int get_fd() const { return pci_device_file_desc; } + + /** + * @return N in /dev/tenstorrent/N + * TODO: target for removal; upper layers should not care about this. + */ + int get_device_num() const { return pci_device_num; } + + /** + * @return unique integer for each device in entire network topology + * TODO: target for removal; upper layers shouldn't need to pass this in here. It + * is unused by this class. + */ + int get_logical_id() const { return logical_id; } + + /** + * @return PCI device id + */ + int get_pci_device_id() const { return info.device_id; } + + /** + * @return PCI revision value from sysfs. + * TODO: target for removal; upper layers should not care about this. + */ + int get_pci_revision() const { return revision; } + + /** + * @return what architecture this device is (e.g. Wormhole, Blackhole, etc.) + */ + tt::ARCH get_arch() const { return arch; } + + // Note: byte_addr is (mostly but not always) offset into BAR0. This + // interface assumes the caller knows what they are doing - but it's unclear + // how to use this interface correctly without knowing details of the chip + // and its state. + // TODO: build a proper abstraction for IO. 
At this level, that is access + // to registers in BAR0 (although possibly the right abstraction is to add + // methods that perform specific operations as opposed to generic register + // read/write methods) and access to segments of BAR0/4 that are mapped to + // NOC endpoints. Probably worth waiting for the KMD to start owning the + // resource management aspect of these PCIe->NOC mappings (the "TLBs") + // before doing too much work here... void write_block(uint64_t byte_addr, uint64_t num_bytes, const uint8_t* buffer_addr); void read_block(uint64_t byte_addr, uint64_t num_bytes, uint8_t* buffer_addr); void write_regs(uint32_t byte_addr, uint32_t word_len, const void *data); @@ -55,63 +135,38 @@ class PCIDevice { void read_regs(uint32_t byte_addr, uint32_t word_len, void *data); void write_tlb_reg(uint32_t byte_addr, std::uint64_t value_lower, std::uint64_t value_upper, std::uint32_t tlb_cfg_reg_size); - void open_hugepage_per_host_mem_ch(uint32_t num_host_mem_channels); tt::umd::architecture_implementation* get_architecture_implementation() const { return architecture_implementation.get(); } + void detect_hang_read(uint32_t data_read = c_hang_read_value); - PciDeviceInfo info; - - int device_id; // N in /dev/tenstorrent/N - int logical_id; // TODO: does not belong in here - int device_fd = -1; - - // PCIe device info - int numa_node; - std::uint32_t pcie_device_id; - std::uint32_t pcie_revision_id; - - // BAR and regs mapping setup - std::vector device_fd_per_host_ch; +public: + // TODO: we can and should make all of these private. 
void *bar0_uc = nullptr; - std::size_t bar0_uc_size = 0; - std::size_t bar0_uc_offset = 0; + size_t bar0_uc_size = 0; + size_t bar0_uc_offset = 0; void *bar0_wc = nullptr; - std::size_t bar0_wc_size = 0; + size_t bar0_wc_size = 0; void *bar2_uc = nullptr; - std::size_t bar2_uc_size; + size_t bar2_uc_size; void *bar4_wc = nullptr; - std::uint64_t bar4_wc_size; + uint64_t bar4_wc_size; + // TODO: let's get rid of this unless we need to run UMD on WH systems with + // shrunk BAR0. If we don't (and we shouldn't), then we can just use BAR0 + // and simplify the code. void *system_reg_mapping = nullptr; - std::size_t system_reg_mapping_size; - - // These two are currently not used. - void *system_reg_wc_mapping = nullptr; - std::size_t system_reg_wc_mapping_size; - - std::uint32_t system_reg_start_offset; // Registers >= this are system regs, use the mapping. - std::uint32_t system_reg_offset_adjust; // This is the offset of the first reg in the system reg mapping. + size_t system_reg_mapping_size; + uint32_t system_reg_start_offset; // Registers >= this are system regs, use the mapping. + uint32_t system_reg_offset_adjust; // This is the offset of the first reg in the system reg mapping. - std::uint32_t read_checking_offset; + uint32_t read_checking_offset; - tt::ARCH get_arch() const; - - void detect_hang_read(std::uint32_t data_read = c_hang_read_value); - private: - void setup_device(); - void close_device(); - bool is_hardware_hung(); template - T* get_register_address(std::uint32_t register_offset); - - tt::ARCH arch; - std::unique_ptr architecture_implementation; - + T* get_register_address(uint32_t register_offset); }; -tt::ARCH detect_arch(int device_id=0); diff --git a/device/pcie/utils.hpp b/device/pcie/utils.hpp deleted file mode 100644 index 0a91d784..00000000 --- a/device/pcie/utils.hpp +++ /dev/null @@ -1,87 +0,0 @@ -/* - * SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. 
- * - * SPDX-License-Identifier: Apache-2.0 - */ - -#pragma once - -#include -#include -#include -#include -#include // for ::open -#include // for ioctl -#include // for PCI_SLOT, PCI_FUNC - -#include "ioctl.h" -#include "common/assert.hpp" -#include "device/tt_arch_types.h" - -// PCIe device IDs through ioctl -static const uint16_t GS_PCIE_DEVICE_ID = 0xfaca; -static const uint16_t WH_PCIE_DEVICE_ID = 0x401e; -static const uint16_t BH_PCIE_DEVICE_ID = 0xb140; - -int find_device(const uint16_t device_id) { - const char device_name_pattern [] = "/dev/tenstorrent/%u"; - char device_name[sizeof(device_name_pattern) + std::numeric_limits::digits10]; - snprintf(device_name, sizeof(device_name), device_name_pattern, (unsigned int)device_id); - int device_fd = open(device_name, O_RDWR | O_CLOEXEC); - if (device_fd == -1) { - TT_THROW("Failed opening a handle for device ", device_id); - } - return device_fd; -} - -tenstorrent_get_device_info get_pcie_device_info(int device_fd) { - tenstorrent_get_device_info device_info; - memset(&device_info, 0, sizeof(device_info)); - device_info.in.output_size_bytes = sizeof(device_info.out); - if (ioctl(device_fd, TENSTORRENT_IOCTL_GET_DEVICE_INFO, &device_info) == -1) { - TT_THROW("Get PCIe device info failed on device fd: ", device_fd); - } - return device_info; -} - -std::uint32_t get_pcie_info(int device_id, const std::string &info_needed) { - // Get PCIe device info through iotcl - int device_fd = find_device(device_id); - auto device_info = get_pcie_device_info(device_fd); - - if(info_needed == "pcie_device_id"){ - return device_info.out.device_id; - } - - std::uint16_t pcie_domain = device_info.out.pci_domain; - std::uint8_t pcie_bus = device_info.out.bus_dev_fn >> 8; - std::uint8_t pcie_device = PCI_SLOT(device_info.out.bus_dev_fn); - std::uint8_t pcie_function = PCI_FUNC(device_info.out.bus_dev_fn); - - // Get the PCIe info from sysfs - static const char sys_pattern[] = "/sys/bus/pci/devices/%04x:%02x:%02x.%u/%s"; - char 
buf[sizeof(sys_pattern) + 10]; - snprintf(buf, sizeof(buf), sys_pattern, pcie_domain, pcie_bus, pcie_device, pcie_function, info_needed.c_str()); - std::ifstream pcie_info_file(buf); - std::string pcie_info_string; - - if (!std::getline(pcie_info_file, pcie_info_string)) { - TT_THROW("/sys/* read failed for device: ", device_id); - } - return std::stoul(pcie_info_string, nullptr, 0); -} - -tt::ARCH detect_arch(std::uint32_t pcie_device_id, std::uint32_t pcie_revision_id) { - if (pcie_device_id == GS_PCIE_DEVICE_ID){ - return tt::ARCH::GRAYSKULL; - } else if (pcie_device_id == WH_PCIE_DEVICE_ID && pcie_revision_id == 0x01){ - return tt::ARCH::WORMHOLE_B0; - } else if (pcie_device_id == WH_PCIE_DEVICE_ID){ - TT_THROW("Wormhole is not supported. Please use Wormhole B0 instead."); - return tt::ARCH::WORMHOLE; - } else if (pcie_device_id == WH_PCIE_DEVICE_ID){ - return tt::ARCH::BLACKHOLE; - } else { - TT_THROW("Unknown pcie device id that does not match any known architecture: ", pcie_device_id); - } -} diff --git a/device/tt_arch_types.h b/device/tt_arch_types.h index bd77e2be..8344db6b 100644 --- a/device/tt_arch_types.h +++ b/device/tt_arch_types.h @@ -10,6 +10,9 @@ #include "device/architecture.h" namespace tt { + +// TODO: why do we have ARCH and architecture? This is a mess. Can we have just one? +// Can we get rid of the entries that (for all practical purposes) do not exist? /** * @brief ARCH Enums */ diff --git a/device/tt_silicon_driver.cpp b/device/tt_silicon_driver.cpp index f098ecae..97287928 100644 --- a/device/tt_silicon_driver.cpp +++ b/device/tt_silicon_driver.cpp @@ -52,8 +52,6 @@ using namespace boost::interprocess; using namespace tt; -// Workaround for tkmd < 1.21 use device_fd_per_host_ch[ch] instead of device_fd once per channel. 
-const bool g_SINGLE_PIN_PAGE_PER_FD_WORKAROND = true; const uint32_t g_MAX_HOST_MEM_CHANNELS = 4; const uint32_t HUGEPAGE_REGION_SIZE = 1 << 30; // 1GB @@ -225,9 +223,23 @@ namespace { }; } // Get TLB index (from zero), check if it's in 16MB, 2MB or 1MB TLB range, and dynamically program it. -dynamic_tlb set_dynamic_tlb(PCIDevice *dev, unsigned int tlb_index, tt_xy_pair start, tt_xy_pair end, - std::uint64_t address, bool multicast, std::unordered_map>& harvested_coord_translation, std::uint64_t ordering) { +dynamic_tlb set_dynamic_tlb( + PCIDevice* dev, + unsigned int tlb_index, + tt_xy_pair start, + tt_xy_pair end, + std::uint64_t address, + bool multicast, + std::unordered_map>& harvested_coord_translation, + std::uint64_t ordering) +{ auto architecture_implementation = dev->get_architecture_implementation(); + + // TODO(Joel): the PCIDevice should not really be carrying this around - this + // is one of two places that extracts it from the PCIDevice. Since KMD will + // eventually take over the TLB programming, this can get removed later on. 
+ auto logical_id = dev->get_logical_id(); + if (multicast) { std::tie(start, end) = architecture_implementation->multicast_workaround(start, end); } @@ -237,8 +249,8 @@ dynamic_tlb set_dynamic_tlb(PCIDevice *dev, unsigned int tlb_index, tt_xy_pair s tt::umd::tlb_configuration tlb_config = architecture_implementation->get_tlb_configuration(tlb_index); std::uint32_t TLB_CFG_REG_SIZE_BYTES = architecture_implementation->get_tlb_cfg_reg_size_bytes(); - auto translated_start_coords = harvested_coord_translation.at(dev->logical_id).at(start); - auto translated_end_coords = harvested_coord_translation.at(dev->logical_id).at(end); + auto translated_start_coords = harvested_coord_translation.at(logical_id).at(start); + auto translated_end_coords = harvested_coord_translation.at(logical_id).at(end); uint32_t tlb_address = address / tlb_config.size; uint32_t local_offset = address % tlb_config.size; uint64_t tlb_base = tlb_config.base + (tlb_config.size * tlb_config.index_offset); @@ -347,7 +359,10 @@ void tt_SiliconDevice::create_device(const std::unordered_set &target } auto dev = m_pci_device_map.at(logical_device_id).get(); - m_num_host_mem_channels = get_available_num_host_mem_channels(num_host_mem_ch_per_mmio_device, dev->pcie_device_id, dev->pcie_revision_id); + uint16_t pcie_device_id = dev->get_pci_device_id(); + uint32_t pcie_revision = dev->get_pci_revision(); + // TODO: get rid of this, it doesn't make any sense. + m_num_host_mem_channels = get_available_num_host_mem_channels(num_host_mem_ch_per_mmio_device, pcie_device_id, pcie_revision); if (dev->get_arch() == tt::ARCH::BLACKHOLE && m_num_host_mem_channels > 1) { // TODO: Implement support for multiple host channels on BLACKHOLE. log_warning(LogSiliconDriver, "Forcing a single channel for Blackhole device. 
Multiple host channels not supported."); @@ -355,11 +370,7 @@ void tt_SiliconDevice::create_device(const std::unordered_set &target } log_debug(LogSiliconDriver, "Using {} Hugepages/NumHostMemChannels for PCIDevice (logical_device_id: {} pci_interface_id: {} device_id: 0x{:x} revision: {})", - m_num_host_mem_channels, logical_device_id, pci_interface_id, pci_device->device_id, pci_device->revision_id); - - if (g_SINGLE_PIN_PAGE_PER_FD_WORKAROND) { - dev->open_hugepage_per_host_mem_ch(m_num_host_mem_channels); - } + m_num_host_mem_channels, logical_device_id, pci_interface_id, pci_device->get_device_num(), pci_device->revision_id); // Initialize these. Used to be in header file. for (int ch = 0; ch < g_MAX_HOST_MEM_CHANNELS; ch ++) { @@ -817,12 +828,23 @@ void tt_SiliconDevice::broadcast_pcie_tensix_risc_reset(PCIDevice *device, const log_debug(LogSiliconDriver, "tt_SiliconDevice::broadcast_tensix_risc_reset"); auto valid = soft_resets & ALL_TENSIX_SOFT_RESET; + auto logical_id = device->get_logical_id(); log_debug(LogSiliconDriver, "== For all tensix set soft-reset for {} risc cores.", TensixSoftResetOptionsToString(valid).c_str()); auto architecture_implementation = device->get_architecture_implementation(); - auto [soft_reset_reg, _] = set_dynamic_tlb_broadcast(device, architecture_implementation->get_reg_tlb(), architecture_implementation->get_tensix_soft_reset_addr(), harvested_coord_translation, tt_xy_pair(0, 0), - tt_xy_pair(architecture_implementation->get_grid_size_x() - 1, architecture_implementation->get_grid_size_y() - 1 - num_rows_harvested.at(device -> logical_id)), TLB_DATA::Posted); + + // TODO: this is clumsy and difficult to read + auto [soft_reset_reg, _] = set_dynamic_tlb_broadcast( + device, + architecture_implementation->get_reg_tlb(), + architecture_implementation->get_tensix_soft_reset_addr(), + harvested_coord_translation, + tt_xy_pair(0, 0), + tt_xy_pair( + architecture_implementation->get_grid_size_x() - 1, + 
architecture_implementation->get_grid_size_y() - 1 - num_rows_harvested.at(logical_id)), + TLB_DATA::Posted); device->write_regs(soft_reset_reg, 1, &valid); tt_driver_atomics::sfence(); } @@ -975,7 +997,7 @@ void tt_SiliconDevice::write_device_memory(const void *mem_ptr, uint32_t size_in } } else { const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); - const scoped_lock lock(*get_mutex(fallback_tlb, dev->device_id)); + const scoped_lock lock(*get_mutex(fallback_tlb, dev->get_device_num())); while(size_in_bytes > 0) { @@ -1018,7 +1040,7 @@ void tt_SiliconDevice::read_device_memory(void *mem_ptr, tt_cxy_pair target, std log_debug(LogSiliconDriver, " read_block called with tlb_offset: {}, tlb_size: {}", tlb_offset, tlb_size); } else { const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); - const scoped_lock lock(*get_mutex(fallback_tlb, dev->device_id)); + const scoped_lock lock(*get_mutex(fallback_tlb, dev->get_device_num())); log_debug(LogSiliconDriver, " dynamic tlb_index: {}", tlb_index); while(size_in_bytes > 0) { @@ -1342,7 +1364,7 @@ bool tt_SiliconDevice::init_hugepage(chip_id_t device_id) { // Convert from logical (device_id in netlist) to physical device_id (in case of virtualization) auto dev = m_pci_device_map.at(device_id).get(); - auto physical_device_id = dev->device_id; + auto physical_device_id = dev->get_device_num(); std::string hugepage_dir = find_hugepage_dir(hugepage_size); if (hugepage_dir.empty()) { @@ -1389,7 +1411,7 @@ bool tt_SiliconDevice::init_hugepage(chip_id_t device_id) { pin_pages.in.virtual_address = reinterpret_cast(mapping); pin_pages.in.size = hugepage_size; - auto &fd = g_SINGLE_PIN_PAGE_PER_FD_WORKAROND ? dev->device_fd_per_host_ch[ch] : dev->device_fd; + auto fd = dev->get_fd(); if (ioctl(fd, TENSTORRENT_IOCTL_PIN_PAGES, &pin_pages) == -1) { log_warning(LogSiliconDriver, "---- ttSiliconDevice::init_hugepage: physical_device_id: {} ch: {} TENSTORRENT_IOCTL_PIN_PAGES failed (errno: {}). 
Common Issue: Requires TTMKD >= 1.11, see following file contents...", physical_device_id, ch, strerror(errno)); @@ -1488,7 +1510,7 @@ int tt_SiliconDevice::pcie_arc_msg(int logical_device_id, uint32_t msg_code, boo // Exclusive access for a single process at a time. Based on physical pci interface id. std::string msg_type = "ARC_MSG"; - const scoped_lock lock(*get_mutex(msg_type, pci_device->device_id)); + const scoped_lock lock(*get_mutex(msg_type, pci_device->get_device_num())); uint32_t fw_arg = arg0 | (arg1<<16); int exit_code = 0; @@ -1797,7 +1819,7 @@ void tt_SiliconDevice::write_to_non_mmio_device( // MUTEX ACQUIRE (NON-MMIO) // do not locate any ethernet core reads/writes before this acquire // - const scoped_lock lock(*get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->device_id)); + const scoped_lock lock(*get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->get_device_num())); int& active_core_for_txn = non_mmio_transfer_cores_customized ? 
active_eth_core_idx_per_chip.at(mmio_capable_chip_logical) : active_core; tt_cxy_pair remote_transfer_ethernet_core = remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn]; @@ -1967,7 +1989,7 @@ void tt_SiliconDevice::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core // MUTEX ACQUIRE (NON-MMIO) // do not locate any ethernet core reads/writes before this acquire // - const scoped_lock lock(*get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->device_id)); + const scoped_lock lock(*get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->get_device_num())); const tt_cxy_pair remote_transfer_ethernet_core = remote_transfer_ethernet_cores[mmio_capable_chip_logical].at(0); read_device_memory(erisc_q_ptrs.data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb); @@ -2267,7 +2289,7 @@ void tt_SiliconDevice::pcie_broadcast_write(chip_id_t chip, const void* mem_ptr, PCIDevice *pci_device = get_pci_device(chip); const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); const uint8_t* buffer_addr = static_cast(mem_ptr); - const scoped_lock lock(*get_mutex(fallback_tlb, pci_device->device_id)); + const scoped_lock lock(*get_mutex(fallback_tlb, pci_device->get_device_num())); while(size_in_bytes > 0) { auto [mapped_address, tlb_size] = set_dynamic_tlb_broadcast(pci_device, tlb_index, addr, harvested_coord_translation, start, end, dynamic_tlb_ordering_modes.at(fallback_tlb)); uint64_t transfer_size = std::min((uint64_t)size_in_bytes, tlb_size); @@ -2532,7 +2554,7 @@ void tt_SiliconDevice::set_membar_flag(const chip_id_t chip, const std::unordere void tt_SiliconDevice::insert_host_to_device_barrier(const chip_id_t chip, const std::unordered_set& cores, const uint32_t barrier_addr, const std::string& fallback_tlb) { // Ensure that this memory barrier is atomic 
across processes/threads - const scoped_lock lock(*get_mutex(MEM_BARRIER_MUTEX_NAME, this->get_pci_device(chip)->device_id)); + const scoped_lock lock(*get_mutex(MEM_BARRIER_MUTEX_NAME, this->get_pci_device(chip)->get_device_num())); set_membar_flag(chip, cores, tt_MemBarFlag::SET, barrier_addr, fallback_tlb); set_membar_flag(chip, cores, tt_MemBarFlag::RESET, barrier_addr, fallback_tlb); } @@ -2633,7 +2655,7 @@ void tt_SiliconDevice::read_mmio_device_register(void* mem_ptr, tt_cxy_pair core PCIDevice *pci_device = get_pci_device(core.chip); const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); - const scoped_lock lock(*get_mutex(fallback_tlb, pci_device->device_id)); + const scoped_lock lock(*get_mutex(fallback_tlb, pci_device->get_device_num())); log_debug(LogSiliconDriver, " dynamic tlb_index: {}", tlb_index); auto [mapped_address, tlb_size] = set_dynamic_tlb(pci_device, tlb_index, core, addr, harvested_coord_translation, TLB_DATA::Strict); @@ -2652,7 +2674,7 @@ void tt_SiliconDevice::write_mmio_device_register(const void* mem_ptr, tt_cxy_pa PCIDevice *pci_device = get_pci_device(core.chip); const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); - const scoped_lock lock(*get_mutex(fallback_tlb, pci_device->device_id)); + const scoped_lock lock(*get_mutex(fallback_tlb, pci_device->get_device_num())); log_debug(LogSiliconDriver, " dynamic tlb_index: {}", tlb_index); auto [mapped_address, tlb_size] = set_dynamic_tlb(pci_device, tlb_index, core, addr, harvested_coord_translation, TLB_DATA::Strict); @@ -2919,7 +2941,7 @@ std::uint32_t tt_SiliconDevice::get_host_channel_size(std::uint32_t device_id, s } std::uint32_t tt_SiliconDevice::get_numa_node_for_pcie_device(std::uint32_t device_id) { - return get_pci_device(device_id)->numa_node; + return get_pci_device(device_id)->get_numa_node(); } std::uint64_t tt_SiliconDevice::get_pcie_base_addr_from_device() const { diff --git a/tests/pcie/test_pcie_device.cpp b/tests/pcie/test_pcie_device.cpp index 
613b089f..a9d07790 100644 --- a/tests/pcie/test_pcie_device.cpp +++ b/tests/pcie/test_pcie_device.cpp @@ -14,8 +14,8 @@ TEST(PcieDeviceTest, Numa) { std::vector nodes; for (auto device_id : PCIDevice::enumerate_devices()) { - PCIDevice device(device_id, 0); - nodes.push_back(device.numa_node); + PCIDevice device(device_id); + nodes.push_back(device.get_numa_node()); } // Acceptable outcomes: From 212ebe04fd3679aeead816a2d82dc27f18c7b9f3 Mon Sep 17 00:00:00 2001 From: Joel Smith Date: Tue, 15 Oct 2024 10:38:00 -0700 Subject: [PATCH 2/2] Generate Doxygen for PCIDevice class Also, update the docs/README.md explaining how to add source files to Doxygen's input. --- Doxyfile | 2 +- docs/README.md | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/Doxyfile b/Doxyfile index cfd463db..e4edd8c5 100644 --- a/Doxyfile +++ b/Doxyfile @@ -917,7 +917,7 @@ WARN_LOGFILE = # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. -INPUT = device/tt_device.h +INPUT = device/tt_device.h device/pcie/pci_device.hpp # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses diff --git a/docs/README.md b/docs/README.md index 15cbdc43..06e68be6 100644 --- a/docs/README.md +++ b/docs/README.md @@ -10,4 +10,8 @@ In order to build docs environment variable `TT_UMD_HOME` needs to be set to roo After that you can run [`build_docs.sh`](build_docs.sh) -In `build/docs` directory you will find multiple formats of the docs. \ No newline at end of file +In `build/docs` directory you will find multiple formats of the docs. + +## Adding documentation + +Augment the `INPUT` line in [`Doxyfile`](../Doxyfile)