Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PCIDevice class cleanup #149

Merged
merged 3 commits into from
Oct 15, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Doxyfile
Original file line number Diff line number Diff line change
Expand Up @@ -917,7 +917,7 @@ WARN_LOGFILE =
# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
# Note: If this tag is empty the current directory is searched.

INPUT = device/tt_device.h
INPUT = device/tt_device.h device/pcie/pci_device.hpp

# This tag can be used to specify the character encoding of the source files
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
Expand Down
208 changes: 100 additions & 108 deletions device/pcie/pci_device.cpp

Large diffs are not rendered by default.

155 changes: 105 additions & 50 deletions device/pcie/pci_device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,23 @@
*/

#pragma once

#include <cstdint>
#include <cstdio>
#include <vector>

#include "device/tt_arch_types.h"
#include "device/architecture_implementation.h"

// File-scope constants shared by the PCIe device code.
// NOTE(review): this listing is a rendered diff, so BAR0_BH_SIZE appears twice
// below (with and without `static`) — presumably the removed and the added
// form of the same line; confirm against the actual header.

// 156 MiB + 20 MiB + 288 MiB = 464 MiB.
// NOTE(review): presumably the Grayskull counterpart of BH_BAR0_WC_MAPPING_SIZE — confirm.
static uint32_t GS_BAR0_WC_MAPPING_SIZE = (156<<20) + (10<<21) + (18<<24);
static uint32_t BH_BAR0_WC_MAPPING_SIZE = 188<<21; // 376 MiB. Defines the extent of the WC region: addresses 0 to BH_BAR0_WC_MAPPING_SIZE are in WC; addresses above that are UC.

// Fixed 32-bit offsets; the names suggest a Blackhole NOC node id register and
// a Grayskull/Wormhole ARC scratch register — TODO confirm where they are used.
static const uint32_t BH_NOC_NODE_ID_OFFSET = 0x1FD04044;
static const uint32_t GS_WH_ARC_SCRATCH_6_OFFSET = 0x1FF30078;

// TODO: this is used up in tt_silicon_driver.cpp but that logic ought to be
// lowered into the PCIDevice class since it is specific to PCIe cards.
// See /vendor_ip/synopsys/052021/bh_pcie_ctl_gen5/export/configuration/DWC_pcie_ctl.h
static const uint64_t UNROLL_ATU_OFFSET_BAR = 0x1200;

// TODO: this is a bit of a hack... something to revisit when we formalize an
// abstraction for IO.
// BAR0 size for Blackhole (512 MiB), used to determine whether write block should use BAR0 or BAR4
const uint64_t BAR0_BH_SIZE = 512 * 1024 * 1024;
static const uint64_t BAR0_BH_SIZE = 512 * 1024 * 1024;

// All 32 bits set; this is the default argument of PCIDevice::detect_hang_read().
constexpr unsigned int c_hang_read_value = 0xffffffffu;
struct PciDeviceInfo
Expand All @@ -35,83 +35,138 @@ struct PciDeviceInfo
};

// PCIDevice wraps a single PCIe character device at /dev/tenstorrent/N.
// Per the constructor/destructor documentation below, construction opens the
// character device, reads device information from sysfs and maps device
// memory; destruction unmaps the memory and closes the file descriptor.
// The class is non-copyable (copy constructor and copy assignment are deleted).
//
// NOTE(review): this listing comes from a rendered diff without +/- markers,
// so several members are declared twice (e.g. info, logical_id, numa_node,
// arch, architecture_implementation, the bar*_size fields, get_arch,
// detect_hang_read, get_register_address). These are presumably the removed
// and the added form of the same declaration — confirm against the real
// header before relying on either one.
class PCIDevice {
const std::string device_path; // Path to character device: /dev/tenstorrent/N
const int pci_device_num; // N in /dev/tenstorrent/N
const int logical_id; // Unique identifier for each device in entire network topology
const int pci_device_file_desc; // Character device file descriptor
const PciDeviceInfo info; // PCI device info
const int numa_node; // -1 if non-NUMA
const int revision; // PCI revision value from sysfs
const tt::ARCH arch; // e.g. Grayskull, Wormhole, Blackhole
// Owned architecture-specific object; exposed (non-owning) through
// get_architecture_implementation().
std::unique_ptr<tt::umd::architecture_implementation> architecture_implementation;

public:
/**
* Return a list of integers corresponding to character devices in /dev/tenstorrent/
* @return a list of integers corresponding to character devices in /dev/tenstorrent/
*/
static std::vector<int> enumerate_devices();

PCIDevice(int device_id, int logical_device_id);
/**
* PCI device constructor.
*
* Opens the character device file descriptor, reads device information from
* sysfs, and maps device memory region(s) into the process address space.
*
* @param pci_device_number N in /dev/tenstorrent/N
* @param logical_device_id unique identifier for this device in the network topology
*/
PCIDevice(int pci_device_number, int logical_device_id = 0);

/**
* PCIDevice destructor.
* Unmaps device memory and closes chardev file descriptor.
*/
~PCIDevice();

PCIDevice(const PCIDevice&) = delete; // copy
void operator=(const PCIDevice&) = delete; // copy assignment

/**
* @return PCI device info (returned by value, i.e. a copy of the stored struct)
*/
const PciDeviceInfo get_device_info() const { return info; }

/**
* @return which NUMA node this device is associated with, or -1 if non-NUMA
*/
int get_numa_node() const { return numa_node; }

/**
* @return underlying file descriptor
* TODO: this is an abstraction violation to be removed when this class
* assumes control over hugepage/DMA mapping code.
*/
int get_fd() const { return pci_device_file_desc; }

/**
* @return N in /dev/tenstorrent/N
* TODO: target for removal; upper layers should not care about this.
*/
int get_device_num() const { return pci_device_num; }

/**
* @return unique integer for each device in entire network topology
* TODO: target for removal; upper layers shouldn't have to pass this in here. It
* is unused by this class.
*/
int get_logical_id() const { return logical_id; }

/**
* @return PCI device id
*/
int get_pci_device_id() const { return info.device_id; }

/**
* @return PCI revision value from sysfs.
* TODO: target for removal; upper layers should not care about this.
*/
int get_pci_revision() const { return revision; }

/**
* @return what architecture this device is (e.g. Wormhole, Blackhole, etc.)
*/
tt::ARCH get_arch() const { return arch; }

// Note: byte_addr is (mostly but not always) offset into BAR0. This
// interface assumes the caller knows what they are doing - but it's unclear
// how to use this interface correctly without knowing details of the chip
// and its state.
// TODO: build a proper abstraction for IO. At this level, that is access
// to registers in BAR0 (although possibly the right abstraction is to add
// methods that perform specific operations as opposed to generic register
// read/write methods) and access to segments of BAR0/4 that are mapped to
// NOC endpoints. Probably worth waiting for the KMD to start owning the
// resource management aspect of these PCIe->NOC mappings (the "TLBs")
// before doing too much work here...
void write_block(uint64_t byte_addr, uint64_t num_bytes, const uint8_t* buffer_addr);
void read_block(uint64_t byte_addr, uint64_t num_bytes, uint8_t* buffer_addr);
// NOTE(review): word_len is presumably a count of 32-bit words rather than
// bytes (the pointer overload below takes uint32_t*) — confirm in pci_device.cpp.
void write_regs(uint32_t byte_addr, uint32_t word_len, const void *data);
void write_regs(volatile uint32_t *dest, const uint32_t *src, uint32_t word_len);
void read_regs(uint32_t byte_addr, uint32_t word_len, void *data);
void write_tlb_reg(uint32_t byte_addr, std::uint64_t value_lower, std::uint64_t value_upper, std::uint32_t tlb_cfg_reg_size);

void open_hugepage_per_host_mem_ch(uint32_t num_host_mem_channels);
// Non-owning pointer to the architecture-specific object held by this device.
tt::umd::architecture_implementation* get_architecture_implementation() const { return architecture_implementation.get(); }
// data_read defaults to c_hang_read_value (0xffffffff).
// NOTE(review): presumably checks whether a value just read indicates a hung
// device — confirm against the definition.
void detect_hang_read(uint32_t data_read = c_hang_read_value);

PciDeviceInfo info;

int device_id; // N in /dev/tenstorrent/N
int logical_id; // TODO: does not belong in here
int device_fd = -1;

// PCIe device info
int numa_node;
std::uint32_t pcie_device_id;
std::uint32_t pcie_revision_id;

// BAR and regs mapping setup
std::vector<int> device_fd_per_host_ch;
public:
// TODO: we can and should make all of these private.
// Note: bar2_uc_size, bar4_wc_size and the system_reg_* sizes/offsets below
// have no in-class initializer, unlike the bar0_* fields.
void *bar0_uc = nullptr;
std::size_t bar0_uc_size = 0;
std::size_t bar0_uc_offset = 0;
size_t bar0_uc_size = 0;
size_t bar0_uc_offset = 0;

void *bar0_wc = nullptr;
std::size_t bar0_wc_size = 0;
size_t bar0_wc_size = 0;

void *bar2_uc = nullptr;
std::size_t bar2_uc_size;
size_t bar2_uc_size;

void *bar4_wc = nullptr;
std::uint64_t bar4_wc_size;
uint64_t bar4_wc_size;

// TODO: let's get rid of this unless we need to run UMD on WH systems with
// shrunk BAR0. If we don't (and we shouldn't), then we can just use BAR0
// and simplify the code.
void *system_reg_mapping = nullptr;
std::size_t system_reg_mapping_size;

// These two are currently not used.
void *system_reg_wc_mapping = nullptr;
std::size_t system_reg_wc_mapping_size;

std::uint32_t system_reg_start_offset; // Registers >= this are system regs, use the mapping.
std::uint32_t system_reg_offset_adjust; // This is the offset of the first reg in the system reg mapping.
size_t system_reg_mapping_size;
uint32_t system_reg_start_offset; // Registers >= this are system regs, use the mapping.
uint32_t system_reg_offset_adjust; // This is the offset of the first reg in the system reg mapping.

std::uint32_t read_checking_offset;
uint32_t read_checking_offset;

tt::ARCH get_arch() const;

void detect_hang_read(std::uint32_t data_read = c_hang_read_value);

private:
void setup_device();
void close_device();

bool is_hardware_hung();

// Declared here only; the definition is not visible in this listing.
// NOTE(review): presumably returns a T* into a mapped register region for the
// given register_offset — confirm in pci_device.cpp.
template <typename T>
T* get_register_address(std::uint32_t register_offset);

tt::ARCH arch;
std::unique_ptr<tt::umd::architecture_implementation> architecture_implementation;

T* get_register_address(uint32_t register_offset);
};

// Free-function declaration: returns the tt::ARCH for the given device_id
// (defaults to 0). The definition is not visible in this listing.
// NOTE(review): device_id is presumably N in /dev/tenstorrent/N — confirm.
tt::ARCH detect_arch(int device_id=0);
87 changes: 0 additions & 87 deletions device/pcie/utils.hpp

This file was deleted.

3 changes: 3 additions & 0 deletions device/tt_arch_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
#include "device/architecture.h"

namespace tt {

// TODO: why do we have ARCH and architecture? This is a mess. Can we have just one?
// Can we get rid of the entries that (for all practical purposes) do not exist?
/**
* @brief ARCH Enums
*/
Expand Down
Loading