Add IOMMU support (#338)
### Issue
#257

### Description
Adds support for using a normal buffer (i.e. one not backed by 1G hugepages) for
sysmem. This requires a system with the IOMMU enabled and not in passthrough
mode, as well as a recent (>= 1.29.0) KMD.
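
The IOMMU translation mode can be verified from userspace before relying on this. Below is a minimal standalone sketch of the same sysfs probe that this PR's `detect_iommu()` performs; the PCI address is a placeholder, so substitute your device's BDF:

```cpp
// check_iommu.cpp: standalone sketch of the sysfs probe this PR uses.
#include <fstream>
#include <iostream>
#include <string>

int main() {
    // Hypothetical BDF; substitute your device's PCI address.
    std::ifstream f("/sys/bus/pci/devices/0000:01:00.0/iommu_group/type");
    std::string type;
    if (!(f >> type)) {
        std::cout << "No iommu_group/type attribute; IOMMU absent or sysfs unavailable\n";
        return 1;
    }
    // "DMA" or "DMA-FQ" means translated mode; "identity" means passthrough.
    const bool translated = type.rfind("DMA", 0) == 0;
    std::cout << "iommu_group/type = " << type << '\n'
              << (translated ? "Translated: sysmem without hugepages is possible\n"
                             : "Passthrough or disabled: hugepages are still required\n");
    return 0;
}
```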

### List of the changes
* Adds IOMMU detection logic to pci_device.cpp
* Adds a KMD version check (>= 1.29.0) that runs when the IOMMU is enabled
* Adds an API for initializing per-device sysmem without hugepages (see the usage sketch after this list)
* Defaults to not using hugepages if the system IOMMU is enabled
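
A minimal sketch of the intended flow, assuming a caller that constructs `PCIDevice` directly (real callers go through `Cluster`), that the include path matches the header's location in the diff below, and that the names live at the top-level namespace as the header excerpt suggests; error handling is elided:

```cpp
#include <cstdint>
#include <stdexcept>

#include "umd/device/pci_device.hpp"

void setup_sysmem(int pci_device_number, uint32_t num_channels) {
    PCIDevice device(pci_device_number);

    // With the IOMMU in translated mode (and KMD >= 1.29.0), init_hugepage()
    // routes to init_iommu() and backs sysmem with ordinary anonymous memory;
    // otherwise it falls back to real 1G hugepages.
    if (!device.init_hugepage(num_channels)) {
        throw std::runtime_error("sysmem initialization failed");
    }

    // Channels look identical to the caller either way.
    for (int ch = 0; ch < device.get_num_host_mem_channels(); ++ch) {
        hugepage_mapping m = device.get_hugepage_mapping(ch);
        // m.physical_address is a PA without an IOMMU, an IOVA with one.
        (void)m;
    }
}
```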

### Testing
Manual testing on my development machine.

### API Changes
No existing APIs change in this PR; the new sysmem-initialization methods are additive.
joelsmithTT authored Dec 4, 2024
1 parent dca4e49 commit dc5e371
Showing 4 changed files with 149 additions and 7 deletions.
31 changes: 30 additions & 1 deletion device/api/umd/device/pci_device.hpp
@@ -41,10 +41,11 @@ struct dynamic_tlb {
uint64_t remaining_size; // Bytes remaining between bar_offset and end of the TLB.
};

// These are not necessarily hugepages if IOMMU is enabled.
struct hugepage_mapping {
void *mapping = nullptr;
size_t mapping_size = 0;
uint64_t physical_address = 0;
uint64_t physical_address = 0; // or IOVA, if IOMMU is enabled
};

struct PciDeviceInfo {
@@ -72,6 +73,7 @@ class PCIDevice {
const int revision; // PCI revision value from sysfs
const tt::ARCH arch; // e.g. Grayskull, Wormhole, Blackhole
const semver_t kmd_version; // KMD version
const bool iommu_enabled; // Whether the system is protected from this device by an IOMMU
std::unique_ptr<tt::umd::architecture_implementation> architecture_implementation;

public:
@@ -143,6 +145,11 @@ class PCIDevice {
*/
tt::ARCH get_arch() const { return arch; }

/**
* @return whether the system is protected from this device by an IOMMU
*/
bool is_iommu_enabled() const { return iommu_enabled; }

// Note: byte_addr is (mostly but not always) offset into BAR0. This
// interface assumes the caller knows what they are doing - but it's unclear
// how to use this interface correctly without knowing details of the chip
@@ -191,9 +198,31 @@

// TODO: this also probably has more sense to live in the future TTDevice class.
bool init_hugepage(uint32_t num_host_mem_channels);

/**
* Allocate sysmem without hugepages and map it through IOMMU.
* This is used when the system is protected by an IOMMU. The mappings will
* still appear as hugepages to the caller.
* @param size sysmem size in bytes; size % (1UL << 30) == 0
* @return whether allocation/mapping succeeded.
*/
bool init_iommu(size_t size);

int get_num_host_mem_channels() const;
hugepage_mapping get_hugepage_mapping(int channel) const;

/**
* Map a buffer for DMA access by the device.
*
* Supports mapping physically-contiguous buffers (e.g. hugepages) for the
* no-IOMMU case.
*
* @param buffer must be page-aligned
* @param size must be a multiple of the page size
* @return uint64_t PA (no IOMMU) or IOVA (with IOMMU) for use by the device
*/
uint64_t map_for_dma(void *buffer, size_t size);

public:
// TODO: we can and should make all of these private.
void *bar0_uc = nullptr;
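For reference, a hedged sketch of the `map_for_dma()` contract documented above (page-aligned buffer, size a multiple of the page size), assuming an already-open `PCIDevice` and the include path used in the sketch earlier:

```cpp
#include <cstdint>
#include <cstdlib>
#include <unistd.h>

#include "umd/device/pci_device.hpp"

// Map an anonymous buffer for device DMA. Returns the device-visible address
// (PA without an IOMMU, IOVA with one), or 0 on allocation failure.
uint64_t dma_map_buffer(PCIDevice &device, size_t size) {
    const size_t page_size = static_cast<size_t>(sysconf(_SC_PAGESIZE));
    size = (size + page_size - 1) & ~(page_size - 1);  // round up to a page multiple

    void *buffer = nullptr;
    if (posix_memalign(&buffer, page_size, size) != 0) {
        return 0;
    }
    // The buffer is intentionally not freed: it must stay alive for as long
    // as the device may DMA into it.
    return device.map_for_dma(buffer, size);
}
```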
8 changes: 8 additions & 0 deletions device/cluster.cpp
@@ -1375,6 +1375,14 @@ void Cluster::init_pcie_iatus() {
int logical_id = src_device_it.first;
PCIDevice* src_pci_device = src_device_it.second.get();

// TODO: with the IOMMU case, I think we can get away with using just
// one iATU region for WH. (On BH, we don't need iATU). We can only
// cover slightly less than 4GB with WH, and the iATU can cover 4GB.
// Splitting it into multiple regions is fine, but it's not necessary.
//
// ... something to consider when this code is refactored into PCIDevice
// where it belongs.

// Device to Host (multiple channels)
for (int channel_id = 0; channel_id < src_pci_device->get_num_host_mem_channels(); channel_id++) {
hugepage_mapping hugepage_map = src_pci_device->get_hugepage_mapping(channel_id);
112 changes: 109 additions & 3 deletions device/pcie/pci_device.cpp
@@ -88,6 +88,15 @@ T read_sysfs(const PciDeviceInfo &device_info, const std::string &attribute_name
}
}

static bool detect_iommu(const PciDeviceInfo &device_info) {
try {
auto iommu_type = read_sysfs<std::string>(device_info, "iommu_group/type");
return iommu_type.substr(0, 3) == "DMA"; // DMA or DMA-FQ
} catch (...) {
return false;
}
}

static PciDeviceInfo read_device_info(int fd) {
tenstorrent_get_device_info info{};
info.in.output_size_bytes = sizeof(info.out);
@@ -258,6 +267,8 @@ tt::ARCH PciDeviceInfo::get_arch() const {
return infos;
}

static const semver_t kmd_ver_for_iommu = semver_t(1, 29, 0);

PCIDevice::PCIDevice(int pci_device_number) :
device_path(fmt::format("/dev/tenstorrent/{}", pci_device_number)),
pci_device_num(pci_device_number),
@@ -266,9 +277,19 @@ PCIDevice::PCIDevice(int pci_device_number) :
numa_node(read_sysfs<int>(info, "numa_node", -1)), // default to -1 if not found
revision(read_sysfs<int>(info, "revision")),
arch(detect_arch(info.device_id, revision)),
architecture_implementation(tt::umd::architecture_implementation::create(arch)),
kmd_version(read_kmd_version()) {
log_info(LogSiliconDriver, "Opened PCI device {}; KMD version: {}", pci_device_num, kmd_version.to_string());
kmd_version(read_kmd_version()),
iommu_enabled(detect_iommu(info)),
architecture_implementation(tt::umd::architecture_implementation::create(arch)) {
if (iommu_enabled && kmd_version < kmd_ver_for_iommu) {
TT_THROW("Running with IOMMU support requires KMD version {} or newer", kmd_ver_for_iommu.to_string());
}

log_info(
LogSiliconDriver,
"Opened PCI device {}; KMD version: {}, IOMMU: {}",
pci_device_num,
kmd_version.to_string(),
iommu_enabled ? "enabled" : "disabled");

struct {
tenstorrent_query_mappings query_mappings;
@@ -687,6 +708,11 @@ tt::umd::architecture_implementation *PCIDevice::get_architecture_implementation
bool PCIDevice::init_hugepage(uint32_t num_host_mem_channels) {
const size_t hugepage_size = HUGEPAGE_REGION_SIZE;

if (is_iommu_enabled()) {
size_t size = hugepage_size * num_host_mem_channels;
return init_iommu(size);
}

auto physical_device_id = get_device_num();

std::string hugepage_dir = find_hugepage_dir(hugepage_size);
@@ -800,6 +826,37 @@ bool PCIDevice::init_hugepage(uint32_t num_host_mem_channels) {
return success;
}

bool PCIDevice::init_iommu(size_t size) {
const size_t num_fake_mem_channels = size / HUGEPAGE_REGION_SIZE;

if (!is_iommu_enabled()) {
TT_THROW("IOMMU is required for sysmem without hugepages.");
}

log_info(LogSiliconDriver, "Allocating sysmem without hugepages (size: {:#x}).", size);
void *mapping = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_POPULATE, -1, 0);

if (mapping == MAP_FAILED) {
TT_THROW(
"UMD: Failed to allocate memory for device/host shared buffer (size: {} errno: {}).",
size,
strerror(errno));
}

uint64_t iova = map_for_dma(mapping, size);
log_info(LogSiliconDriver, "Mapped sysmem without hugepages to IOVA {:#x}.", iova);

hugepage_mapping_per_channel.resize(num_fake_mem_channels);

// Support for more than 1GB host memory accessible per device, via channels.
for (size_t ch = 0; ch < num_fake_mem_channels; ch++) {
uint8_t *base = static_cast<uint8_t *>(mapping) + ch * HUGEPAGE_REGION_SIZE;
hugepage_mapping_per_channel[ch] = {base, HUGEPAGE_REGION_SIZE, iova + ch * HUGEPAGE_REGION_SIZE};
}

return true;
}

int PCIDevice::get_num_host_mem_channels() const { return hugepage_mapping_per_channel.size(); }

hugepage_mapping PCIDevice::get_hugepage_mapping(int channel) const {
@@ -810,6 +867,55 @@ hugepage_mapping PCIDevice::get_hugepage_mapping(int channel) const {
}
}

uint64_t PCIDevice::map_for_dma(void *buffer, size_t size) {
static const auto page_size = sysconf(_SC_PAGESIZE);

const uint64_t vaddr = reinterpret_cast<uint64_t>(buffer);
const uint32_t flags = is_iommu_enabled() ? 0 : TENSTORRENT_PIN_PAGES_CONTIGUOUS;

if (vaddr % page_size != 0 || size % page_size != 0) {
TT_THROW("Buffer must be page-aligned with a size that is a multiple of the page size");
}

tenstorrent_pin_pages pin_pages{};
pin_pages.in.output_size_bytes = sizeof(pin_pages.out);
pin_pages.in.flags = flags;
pin_pages.in.virtual_address = vaddr;
pin_pages.in.size = size;

// With IOMMU, this will probably fail on you if you're mapping something
// large. The situation today is that the kernel driver uses a 32-bit DMA
// address mask, so all DMA allocations and mappings show up in the IOVA
// range of 0x0 to 0xffff'ffff. According to syseng, we can get up to 3GB
// on Intel, 3.75GB on AMD, but this requires multiple mappings with small
// chunks, down to 2MB. It's possible to make such non-contiguous mappings
// appear both virtually contiguous (to the application) and physically
// contiguous (to the NOC, using iATU), but it's not clear that this is
// worth the effort... the scheme this is intended to replace supports up
// to 4GB which is what application developers want.
//
// What can we do here?
// 1. Use hugepages (part of what we are trying to avoid here).
// 2. Use a larger value for the driver's dma_address_bits (currently 32;
// has implications for non-UMD based applications -- basically that any
// DMA buffer mapped beyond the 4GB boundary requires iATU configuration
// for the hardware to be able to reach it).
// 3. Use multiple mappings with small chunks (won't get us to 4GB; adds
// complexity).
// 4. Modify the driver so that DMA allocations are in the low 4GB IOVA
// range but mappings from userspace can be further up (requires driver
// changes).
// 5. ???
//
// If you need a quick workaround here, I suggest:
// sudo insmod ./tenstorrent.ko dma_address_bits=48
if (ioctl(pci_device_file_desc, TENSTORRENT_IOCTL_PIN_PAGES, &pin_pages) == -1) {
TT_THROW("Failed to pin pages for DMA: {}", strerror(errno));
}

return pin_pages.out.physical_address;
}

void PCIDevice::print_file_contents(std::string filename, std::string hint) {
if (std::filesystem::exists(filename)) {
std::ifstream meminfo(filename);
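The 4 GB ceiling discussed in the long comment in `map_for_dma()` falls directly out of the KMD's 32-bit DMA address mask; a back-of-the-envelope illustration (not part of the change):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    const uint64_t iova_window = 1ULL << 32;   // 32-bit DMA mask: IOVAs below 4 GiB
    const uint64_t channel_size = 1ULL << 30;  // one sysmem channel is 1 GiB

    for (int n = 1; n <= 4; ++n) {
        const uint64_t needed = n * channel_size;
        std::printf("%d channel(s): need %#llx bytes -> %s\n", n,
                    static_cast<unsigned long long>(needed),
                    needed < iova_window
                        ? "fits under the mask"
                        : "saturates the window (which is shared with the KMD's own allocations)");
    }
    return 0;
}
```

This matches the practical limits quoted from syseng: roughly 3 GB on Intel and 3.75 GB on AMD once the window's other occupants are accounted for.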
5 changes: 2 additions & 3 deletions tests/microbenchmark/device_fixture.hpp
@@ -9,10 +9,10 @@
#include <iostream>
#include <random>

#include "cluster.h"
#include "device/tt_soc_descriptor.h"
#include "l1_address_map.h"
#include "tests/test_utils/generate_cluster_desc.hpp"
#include "umd/device/cluster.h"
#include "umd/device/tt_soc_descriptor.h"

using tt::umd::Cluster;

Expand All @@ -33,7 +33,6 @@ class uBenchmarkFixture : public ::testing::Test {
uint32_t num_host_mem_ch_per_mmio_device = 1;
device = std::make_shared<Cluster>(
test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"),
"",
target_devices,
num_host_mem_ch_per_mmio_device,
false,
