Skip to content

Commit

Permalink
Remove cpuset_lib code
Browse files Browse the repository at this point in the history
Also remove the cruft that sprouted up to support its continued
existence in UMD

Remove const int& antipattern

Use libnuma
  • Loading branch information
joelsmithTT committed Dec 2, 2024
1 parent 1d8c9dd commit 18a73fe
Show file tree
Hide file tree
Showing 7 changed files with 34 additions and 109 deletions.
16 changes: 8 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ Usermode Driver for Tenstorrent AI Accelerators
## Dependencies
Required Ubuntu dependencies:
```
sudo apt install -y libhwloc-dev cmake ninja-build
sudo apt install -y cmake ninja-build libnuma-dev
```

Suggested third-party dependency is Clang 17:
Expand All @@ -17,7 +17,7 @@ sudo ./llvm.sh 17

## Build flow

To build `libdevice.so`:
To build `libdevice.so`:
```
cmake -B build -G Ninja
cmake --build build
Expand Down Expand Up @@ -70,7 +70,7 @@ add_subdirectory(<path to umd>)

### Ubuntu
```
apt install ./umd-dev-x.y.z-Linux.deb
apt install ./umd-dev-x.y.z-Linux.deb
```

## Simulator Integration
Expand Down Expand Up @@ -100,7 +100,7 @@ To set up pre-commit on your local machine, follow these steps:
Ensure you have Python installed, then run:
```bash
pip install pre-commit
```
```
2. **Install the Git Hook Scripts**:
In your local repository, run the following command to install the pre-commit hooks:
```bash
Expand All @@ -113,10 +113,10 @@ To set up pre-commit on your local machine, follow these steps:
pre-commit run --all-files
```
## Why You Should Use Pre-commit
By setting up pre-commit locally, you can help maintain the quality of the codebase and ensure that commits consistently meet the project's formatting standards. This saves time during code reviews and reduces the likelihood of code formatting issues slipping into the repository.
Since the hooks run automatically before each commit, you don't need to remember to manually format or check your code, making it easier to maintain consistency.
By setting up pre-commit locally, you can help maintain the quality of the codebase and ensure that commits consistently meet the project's formatting standards. This saves time during code reviews and reduces the likelihood of code formatting issues slipping into the repository.

Since the hooks run automatically before each commit, you don't need to remember to manually format or check your code, making it easier to maintain consistency.

We strongly encourage all developers to integrate pre-commit into their workflow.

# Formatting C++ code
Expand Down
3 changes: 1 addition & 2 deletions device/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ target_sources(
architecture_implementation.cpp
cluster.cpp
coordinate_manager.cpp
cpuset_lib.cpp
grayskull/grayskull_implementation.cpp
wormhole/wormhole_implementation.cpp
blackhole/blackhole_implementation.cpp
Expand Down Expand Up @@ -64,7 +63,7 @@ target_link_libraries(
PRIVATE
umd::Common
umd::Firmware
hwloc
numa
nng
rt
uv_a
Expand Down
10 changes: 5 additions & 5 deletions device/api/umd/device/cluster.h
Original file line number Diff line number Diff line change
Expand Up @@ -669,7 +669,7 @@ class Cluster : public tt_device {
Cluster(
const std::string& sdesc_path,
const std::set<chip_id_t>& target_devices,
const uint32_t& num_host_mem_ch_per_mmio_device = 1,
const uint32_t num_host_mem_ch_per_mmio_device = 1,
const bool skip_driver_allocs = false,
const bool clean_system_resources = false,
bool perform_harvesting = true,
Expand All @@ -686,7 +686,7 @@ class Cluster : public tt_device {
* @param simulated_harvesting_masks
*/
Cluster(
const uint32_t& num_host_mem_ch_per_mmio_device = 1,
const uint32_t num_host_mem_ch_per_mmio_device = 1,
const bool skip_driver_allocs = false,
const bool clean_system_resources = false,
bool perform_harvesting = true,
Expand All @@ -704,7 +704,7 @@ class Cluster : public tt_device {
*/
Cluster(
const std::set<chip_id_t>& target_devices,
const uint32_t& num_host_mem_ch_per_mmio_device = 1,
const uint32_t num_host_mem_ch_per_mmio_device = 1,
const bool skip_driver_allocs = false,
const bool clean_system_resources = false,
bool perform_harvesting = true,
Expand Down Expand Up @@ -834,7 +834,7 @@ class Cluster : public tt_device {
// Startup + teardown
void create_device(
const std::unordered_set<chip_id_t>& target_mmio_device_ids,
const uint32_t& num_host_mem_ch_per_mmio_device,
const uint32_t num_host_mem_ch_per_mmio_device,
const bool skip_driver_allocs,
const bool clean_system_resources);
void initialize_interprocess_mutexes(int pci_interface_id, bool cleanup_mutexes_in_shm);
Expand Down Expand Up @@ -962,7 +962,7 @@ class Cluster : public tt_device {

void construct_cluster(
const std::string& sdesc_path,
const uint32_t& num_host_mem_ch_per_mmio_device,
const uint32_t num_host_mem_ch_per_mmio_device,
const bool skip_driver_allocs,
const bool clean_system_resources,
bool perform_harvesting,
Expand Down
16 changes: 6 additions & 10 deletions device/cluster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ void Cluster::initialize_interprocess_mutexes(int pci_interface_id, bool cleanup

void Cluster::create_device(
const std::unordered_set<chip_id_t>& target_mmio_device_ids,
const uint32_t& num_host_mem_ch_per_mmio_device,
const uint32_t num_host_mem_ch_per_mmio_device,
const bool skip_driver_allocs,
const bool clean_system_resources) {
log_debug(LogSiliconDriver, "Cluster::Cluster");
Expand Down Expand Up @@ -272,11 +272,7 @@ void Cluster::create_device(
}
auto dev = m_pci_device_map.at(logical_device_id).get();

uint16_t pcie_device_id = dev->get_pci_device_id();
uint32_t pcie_revision = dev->get_pci_revision();
// TODO: get rid of this, it doesn't make any sense.
int num_host_mem_channels =
get_available_num_host_mem_channels(num_host_mem_ch_per_mmio_device, pcie_device_id, pcie_revision);
int num_host_mem_channels = num_host_mem_ch_per_mmio_device;
if (dev->get_arch() == tt::ARCH::BLACKHOLE && num_host_mem_channels > 1) {
// TODO: Implement support for multiple host channels on BLACKHOLE.
log_warning(
Expand Down Expand Up @@ -350,7 +346,7 @@ std::unordered_map<chip_id_t, uint32_t> Cluster::get_harvesting_masks_for_soc_de

void Cluster::construct_cluster(
const std::string& sdesc_path,
const uint32_t& num_host_mem_ch_per_mmio_device,
const uint32_t num_host_mem_ch_per_mmio_device,
const bool skip_driver_allocs,
const bool clean_system_resources,
bool perform_harvesting,
Expand Down Expand Up @@ -519,7 +515,7 @@ void Cluster::construct_cluster(
}

Cluster::Cluster(
const uint32_t& num_host_mem_ch_per_mmio_device,
const uint32_t num_host_mem_ch_per_mmio_device,
const bool skip_driver_allocs,
const bool clean_system_resources,
bool perform_harvesting,
Expand Down Expand Up @@ -567,7 +563,7 @@ Cluster::Cluster(

Cluster::Cluster(
const std::set<chip_id_t>& target_devices,
const uint32_t& num_host_mem_ch_per_mmio_device,
const uint32_t num_host_mem_ch_per_mmio_device,
const bool skip_driver_allocs,
const bool clean_system_resources,
bool perform_harvesting,
Expand Down Expand Up @@ -612,7 +608,7 @@ Cluster::Cluster(
Cluster::Cluster(
const std::string& sdesc_path,
const std::set<chip_id_t>& target_devices,
const uint32_t& num_host_mem_ch_per_mmio_device,
const uint32_t num_host_mem_ch_per_mmio_device,
const bool skip_driver_allocs,
const bool clean_system_resources,
bool perform_harvesting,
Expand Down
71 changes: 3 additions & 68 deletions device/hugepage.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,12 @@

#include <fcntl.h> // for O_RDWR and other constants
#include <sys/stat.h> // for umask
#include <unistd.h> // for unlink

#include "cpuset_lib.hpp"
#include "logger.hpp"
#include <regex>

const uint32_t g_MAX_HOST_MEM_CHANNELS = 4;

// Hardcode (but allow override) of path now, to support environments with other 1GB hugepage mounts not for runtime.
const char* hugepage_dir_env = std::getenv("TT_BACKEND_HUGEPAGE_DIR");
std::string hugepage_dir = hugepage_dir_env ? hugepage_dir_env : "/dev/hugepages-1G";
static const std::string hugepage_dir = "/dev/hugepages-1G";

namespace tt::umd {

Expand All @@ -37,68 +34,6 @@ uint32_t get_num_hugepages() {
return num_hugepages;
}

uint32_t get_available_num_host_mem_channels(
const uint32_t num_channels_per_device_target, const uint16_t device_id, const uint16_t revision_id) {
// To minimally support hybrid dev systems with mix of ARCH, get only devices matching current ARCH's device_id.
uint32_t total_num_tt_mmio_devices = tt::cpuset::tt_cpuset_allocator::get_num_tt_pci_devices();
uint32_t num_tt_mmio_devices_for_arch =
tt::cpuset::tt_cpuset_allocator::get_num_tt_pci_devices_by_pci_device_id(device_id, revision_id);
uint32_t total_hugepages = get_num_hugepages();

// This shouldn't happen on silicon machines.
if (num_tt_mmio_devices_for_arch == 0) {
log_warning(
LogSiliconDriver,
"No TT devices found that match PCI device_id: 0x{:x} revision: {}, returning NumHostMemChannels:0",
device_id,
revision_id);
return 0;
}

// GS will use P2P + 1 channel, others may support 4 host channels. Apply min of 1 to not completely break setups
// that were incomplete ie fewer hugepages than devices, which would partially work previously for some devices.
uint32_t num_channels_per_device_available =
std::min(num_channels_per_device_target, std::max((uint32_t)1, total_hugepages / num_tt_mmio_devices_for_arch));

// Perform some helpful assertion checks to guard against common pitfalls that would show up as runtime issues later
// on.
if (total_num_tt_mmio_devices > num_tt_mmio_devices_for_arch) {
log_warning(
LogSiliconDriver,
"Hybrid system mixing different TTDevices - this is not well supported. Ensure sufficient "
"Hugepages/HostMemChannels per device.");
}

if (total_hugepages < num_tt_mmio_devices_for_arch) {
log_warning(
LogSiliconDriver,
"Insufficient NumHugepages: {} should be at least NumMMIODevices: {} for device_id: 0x{:x} revision: {}. "
"NumHostMemChannels would be 0, bumping to 1.",
total_hugepages,
num_tt_mmio_devices_for_arch,
device_id,
revision_id);
}

if (num_channels_per_device_available < num_channels_per_device_target) {
log_warning(
LogSiliconDriver,
"NumHostMemChannels: {} used for device_id: 0x{:x} less than target: {}. Workload will fail if it exceeds "
"NumHostMemChannels. Increase Number of Hugepages.",
num_channels_per_device_available,
device_id,
num_channels_per_device_target);
}

log_assert(
num_channels_per_device_available <= g_MAX_HOST_MEM_CHANNELS,
"NumHostMemChannels: {} exceeds supported maximum: {}, this is unexpected.",
num_channels_per_device_available,
g_MAX_HOST_MEM_CHANNELS);

return num_channels_per_device_available;
}

std::string find_hugepage_dir(std::size_t pagesize) {
static const std::regex hugetlbfs_mount_re(
fmt::format("^(nodev|hugetlbfs) ({}) hugetlbfs ([^ ]+) 0 0$", hugepage_dir));
Expand Down
23 changes: 9 additions & 14 deletions device/pcie/pci_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include <fcntl.h> // for ::open
#include <linux/pci.h> // for PCI_SLOT, PCI_FUNC
#include <numa.h>
#include <sys/ioctl.h> // for ioctl
#include <sys/mman.h> // for mmap, munmap
#include <sys/stat.h> // for fstat
Expand All @@ -18,7 +19,6 @@
#include <vector>

#include "assert.hpp"
#include "cpuset_lib.hpp"
#include "ioctl.h"
#include "logger.hpp"
#include "umd/device/architecture_implementation.h"
Expand Down Expand Up @@ -687,6 +687,14 @@ tt::umd::architecture_implementation *PCIDevice::get_architecture_implementation
bool PCIDevice::init_hugepage(uint32_t num_host_mem_channels) {
const size_t hugepage_size = HUGEPAGE_REGION_SIZE;

if (numa_node > numa_max_node()) {
log_warning(LogSiliconDriver, "numa_node: {} is greater than numa_max_node: {}.", numa_node, numa_max_node());
}

// Prefer allocations on the NUMA node associated with the device.
numa_set_preferred(numa_node);

// Convert from logical (device_id in netlist) to physical device_id (in case of virtualization)
auto physical_device_id = get_device_num();

std::string hugepage_dir = find_hugepage_dir(hugepage_size);
Expand Down Expand Up @@ -748,19 +756,6 @@ bool PCIDevice::init_hugepage(uint32_t num_host_mem_channels) {
continue;
}

// Beter performance if hugepage just allocated (populate flag to prevent lazy alloc) is migrated to same
// numanode as TT device.
if (!tt::cpuset::tt_cpuset_allocator::bind_area_to_memory_nodeset(physical_device_id, mapping, hugepage_size)) {
log_warning(
LogSiliconDriver,
"---- ttSiliconDevice::init_hugepage: bind_area_to_memory_nodeset() failed (physical_device_id: {} ch: "
"{}). "
"Hugepage allocation is not on NumaNode matching TT Device. Side-Effect is decreased Device->Host perf "
"(Issue #893).",
physical_device_id,
ch);
}

tenstorrent_pin_pages pin_pages;
memset(&pin_pages, 0, sizeof(pin_pages));
pin_pages.in.output_size_bytes = sizeof(pin_pages.out);
Expand Down
4 changes: 2 additions & 2 deletions tests/microbenchmark/device_fixture.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@
#include <iostream>
#include <random>

#include "cluster.h"
#include "device/tt_soc_descriptor.h"
#include "l1_address_map.h"
#include "tests/test_utils/generate_cluster_desc.hpp"
#include "umd/device/cluster.h"
#include "umd/device/tt_soc_descriptor.h"

using tt::umd::Cluster;

Expand Down

0 comments on commit 18a73fe

Please sign in to comment.