Remove cpuset_lib code

Also remove the cruft that sprouted up to support its continued existence in UMD Remove const int& antipattern Use libnuma
tenstorrent · Dec 2, 2024 · 18a73fe · 18a73fe
1 parent 1d8c9dd
commit 18a73fe
Show file tree

Hide file tree

Showing 7 changed files with 34 additions and 109 deletions.
diff --git a/README.md b/README.md
@@ -5,7 +5,7 @@ Usermode Driver for Tenstorrent AI Accelerators
 ## Dependencies
 Required Ubuntu dependencies:
 ```
-sudo apt install -y libhwloc-dev cmake ninja-build
+sudo apt install -y cmake ninja-build libnuma-dev
 ```
 
 Suggested third-party dependency is Clang 17:
@@ -17,7 +17,7 @@ sudo ./llvm.sh 17
 
 ## Build flow
 
-To build `libdevice.so`: 
+To build `libdevice.so`:
 ```
 cmake -B build -G Ninja
 cmake --build build
@@ -70,7 +70,7 @@ add_subdirectory(<path to umd>)
 
 ### Ubuntu
 ```
-apt install ./umd-dev-x.y.z-Linux.deb 
+apt install ./umd-dev-x.y.z-Linux.deb
 ```
 
 ## Simulator Integration
@@ -100,7 +100,7 @@ To set up pre-commit on your local machine, follow these steps:
    Ensure you have Python installed, then run:
    ```bash
    pip install pre-commit
-   ```  
+   ```
 2. **Install the Git Hook Scripts**:
    In your local repository, run the following command to install the pre-commit hooks:
    ```bash
@@ -113,10 +113,10 @@ To set up pre-commit on your local machine, follow these steps:
    pre-commit run --all-files
    ```
 ## Why You Should Use Pre-commit
-By setting up pre-commit locally, you can help maintain the quality of the codebase and ensure that commits consistently meet the project's formatting standards. This saves time during code reviews and reduces the likelihood of code formatting issues slipping into the repository.  
-  
-Since the hooks run automatically before each commit, you don't need to remember to manually format or check your code, making it easier to maintain consistency.  
-  
+By setting up pre-commit locally, you can help maintain the quality of the codebase and ensure that commits consistently meet the project's formatting standards. This saves time during code reviews and reduces the likelihood of code formatting issues slipping into the repository.
+
+Since the hooks run automatically before each commit, you don't need to remember to manually format or check your code, making it easier to maintain consistency.
+
 We strongly encourage all developers to integrate pre-commit into their workflow.
 
 # Formatting C++ code

diff --git a/device/CMakeLists.txt b/device/CMakeLists.txt
@@ -26,7 +26,6 @@ target_sources(
         architecture_implementation.cpp
         cluster.cpp
         coordinate_manager.cpp
-        cpuset_lib.cpp
         grayskull/grayskull_implementation.cpp
         wormhole/wormhole_implementation.cpp
         blackhole/blackhole_implementation.cpp
@@ -64,7 +63,7 @@ target_link_libraries(
     PRIVATE
         umd::Common
         umd::Firmware
-        hwloc
+        numa
         nng
         rt
         uv_a

diff --git a/device/api/umd/device/cluster.h b/device/api/umd/device/cluster.h
@@ -669,7 +669,7 @@ class Cluster : public tt_device {
     Cluster(
         const std::string& sdesc_path,
         const std::set<chip_id_t>& target_devices,
-        const uint32_t& num_host_mem_ch_per_mmio_device = 1,
+        const uint32_t num_host_mem_ch_per_mmio_device = 1,
         const bool skip_driver_allocs = false,
         const bool clean_system_resources = false,
         bool perform_harvesting = true,
@@ -686,7 +686,7 @@ class Cluster : public tt_device {
      * @param simulated_harvesting_masks
      */
     Cluster(
-        const uint32_t& num_host_mem_ch_per_mmio_device = 1,
+        const uint32_t num_host_mem_ch_per_mmio_device = 1,
         const bool skip_driver_allocs = false,
         const bool clean_system_resources = false,
         bool perform_harvesting = true,
@@ -704,7 +704,7 @@ class Cluster : public tt_device {
      */
     Cluster(
         const std::set<chip_id_t>& target_devices,
-        const uint32_t& num_host_mem_ch_per_mmio_device = 1,
+        const uint32_t num_host_mem_ch_per_mmio_device = 1,
         const bool skip_driver_allocs = false,
         const bool clean_system_resources = false,
         bool perform_harvesting = true,
@@ -834,7 +834,7 @@ class Cluster : public tt_device {
     // Startup + teardown
     void create_device(
         const std::unordered_set<chip_id_t>& target_mmio_device_ids,
-        const uint32_t& num_host_mem_ch_per_mmio_device,
+        const uint32_t num_host_mem_ch_per_mmio_device,
         const bool skip_driver_allocs,
         const bool clean_system_resources);
     void initialize_interprocess_mutexes(int pci_interface_id, bool cleanup_mutexes_in_shm);
@@ -962,7 +962,7 @@ class Cluster : public tt_device {
 
     void construct_cluster(
         const std::string& sdesc_path,
-        const uint32_t& num_host_mem_ch_per_mmio_device,
+        const uint32_t num_host_mem_ch_per_mmio_device,
         const bool skip_driver_allocs,
         const bool clean_system_resources,
         bool perform_harvesting,

diff --git a/device/cluster.cpp b/device/cluster.cpp
@@ -240,7 +240,7 @@ void Cluster::initialize_interprocess_mutexes(int pci_interface_id, bool cleanup
 
 void Cluster::create_device(
     const std::unordered_set<chip_id_t>& target_mmio_device_ids,
-    const uint32_t& num_host_mem_ch_per_mmio_device,
+    const uint32_t num_host_mem_ch_per_mmio_device,
     const bool skip_driver_allocs,
     const bool clean_system_resources) {
     log_debug(LogSiliconDriver, "Cluster::Cluster");
@@ -272,11 +272,7 @@ void Cluster::create_device(
         }
         auto dev = m_pci_device_map.at(logical_device_id).get();
 
-        uint16_t pcie_device_id = dev->get_pci_device_id();
-        uint32_t pcie_revision = dev->get_pci_revision();
-        // TODO: get rid of this, it doesn't make any sense.
-        int num_host_mem_channels =
-            get_available_num_host_mem_channels(num_host_mem_ch_per_mmio_device, pcie_device_id, pcie_revision);
+        int num_host_mem_channels = num_host_mem_ch_per_mmio_device;
         if (dev->get_arch() == tt::ARCH::BLACKHOLE && num_host_mem_channels > 1) {
             // TODO: Implement support for multiple host channels on BLACKHOLE.
             log_warning(
@@ -350,7 +346,7 @@ std::unordered_map<chip_id_t, uint32_t> Cluster::get_harvesting_masks_for_soc_de
 
 void Cluster::construct_cluster(
     const std::string& sdesc_path,
-    const uint32_t& num_host_mem_ch_per_mmio_device,
+    const uint32_t num_host_mem_ch_per_mmio_device,
     const bool skip_driver_allocs,
     const bool clean_system_resources,
     bool perform_harvesting,
@@ -519,7 +515,7 @@ void Cluster::construct_cluster(
 }
 
 Cluster::Cluster(
-    const uint32_t& num_host_mem_ch_per_mmio_device,
+    const uint32_t num_host_mem_ch_per_mmio_device,
     const bool skip_driver_allocs,
     const bool clean_system_resources,
     bool perform_harvesting,
@@ -567,7 +563,7 @@ Cluster::Cluster(
 
 Cluster::Cluster(
     const std::set<chip_id_t>& target_devices,
-    const uint32_t& num_host_mem_ch_per_mmio_device,
+    const uint32_t num_host_mem_ch_per_mmio_device,
     const bool skip_driver_allocs,
     const bool clean_system_resources,
     bool perform_harvesting,
@@ -612,7 +608,7 @@ Cluster::Cluster(
 Cluster::Cluster(
     const std::string& sdesc_path,
     const std::set<chip_id_t>& target_devices,
-    const uint32_t& num_host_mem_ch_per_mmio_device,
+    const uint32_t num_host_mem_ch_per_mmio_device,
     const bool skip_driver_allocs,
     const bool clean_system_resources,
     bool perform_harvesting,

diff --git a/device/hugepage.cpp b/device/hugepage.cpp
@@ -8,15 +8,12 @@
 
 #include <fcntl.h>     // for O_RDWR and other constants
 #include <sys/stat.h>  // for umask
+#include <unistd.h>    // for unlink
 
-#include "cpuset_lib.hpp"
 #include "logger.hpp"
+#include <regex>
 
-const uint32_t g_MAX_HOST_MEM_CHANNELS = 4;
-
-// Hardcode (but allow override) of path now, to support environments with other 1GB hugepage mounts not for runtime.
-const char* hugepage_dir_env = std::getenv("TT_BACKEND_HUGEPAGE_DIR");
-std::string hugepage_dir = hugepage_dir_env ? hugepage_dir_env : "/dev/hugepages-1G";
+static const std::string hugepage_dir = "/dev/hugepages-1G";
 
 namespace tt::umd {
 
@@ -37,68 +34,6 @@ uint32_t get_num_hugepages() {
     return num_hugepages;
 }
 
-uint32_t get_available_num_host_mem_channels(
-    const uint32_t num_channels_per_device_target, const uint16_t device_id, const uint16_t revision_id) {
-    // To minimally support hybrid dev systems with mix of ARCH, get only devices matching current ARCH's device_id.
-    uint32_t total_num_tt_mmio_devices = tt::cpuset::tt_cpuset_allocator::get_num_tt_pci_devices();
-    uint32_t num_tt_mmio_devices_for_arch =
-        tt::cpuset::tt_cpuset_allocator::get_num_tt_pci_devices_by_pci_device_id(device_id, revision_id);
-    uint32_t total_hugepages = get_num_hugepages();
-
-    // This shouldn't happen on silicon machines.
-    if (num_tt_mmio_devices_for_arch == 0) {
-        log_warning(
-            LogSiliconDriver,
-            "No TT devices found that match PCI device_id: 0x{:x} revision: {}, returning NumHostMemChannels:0",
-            device_id,
-            revision_id);
-        return 0;
-    }
-
-    // GS will use P2P + 1 channel, others may support 4 host channels. Apply min of 1 to not completely break setups
-    // that were incomplete ie fewer hugepages than devices, which would partially work previously for some devices.
-    uint32_t num_channels_per_device_available =
-        std::min(num_channels_per_device_target, std::max((uint32_t)1, total_hugepages / num_tt_mmio_devices_for_arch));
-
-    // Perform some helpful assertion checks to guard against common pitfalls that would show up as runtime issues later
-    // on.
-    if (total_num_tt_mmio_devices > num_tt_mmio_devices_for_arch) {
-        log_warning(
-            LogSiliconDriver,
-            "Hybrid system mixing different TTDevices - this is not well supported. Ensure sufficient "
-            "Hugepages/HostMemChannels per device.");
-    }
-
-    if (total_hugepages < num_tt_mmio_devices_for_arch) {
-        log_warning(
-            LogSiliconDriver,
-            "Insufficient NumHugepages: {} should be at least NumMMIODevices: {} for device_id: 0x{:x} revision: {}. "
-            "NumHostMemChannels would be 0, bumping to 1.",
-            total_hugepages,
-            num_tt_mmio_devices_for_arch,
-            device_id,
-            revision_id);
-    }
-
-    if (num_channels_per_device_available < num_channels_per_device_target) {
-        log_warning(
-            LogSiliconDriver,
-            "NumHostMemChannels: {} used for device_id: 0x{:x} less than target: {}. Workload will fail if it exceeds "
-            "NumHostMemChannels. Increase Number of Hugepages.",
-            num_channels_per_device_available,
-            device_id,
-            num_channels_per_device_target);
-    }
-
-    log_assert(
-        num_channels_per_device_available <= g_MAX_HOST_MEM_CHANNELS,
-        "NumHostMemChannels: {} exceeds supported maximum: {}, this is unexpected.",
-        num_channels_per_device_available,
-        g_MAX_HOST_MEM_CHANNELS);
-
-    return num_channels_per_device_available;
-}
-
 std::string find_hugepage_dir(std::size_t pagesize) {
     static const std::regex hugetlbfs_mount_re(
         fmt::format("^(nodev|hugetlbfs) ({}) hugetlbfs ([^ ]+) 0 0$", hugepage_dir));

diff --git a/device/pcie/pci_device.cpp b/device/pcie/pci_device.cpp
@@ -8,6 +8,7 @@
 
 #include <fcntl.h>      // for ::open
 #include <linux/pci.h>  // for PCI_SLOT, PCI_FUNC
+#include <numa.h>
 #include <sys/ioctl.h>  // for ioctl
 #include <sys/mman.h>   // for mmap, munmap
 #include <sys/stat.h>   // for fstat
@@ -18,7 +19,6 @@
 #include <vector>
 
 #include "assert.hpp"
-#include "cpuset_lib.hpp"
 #include "ioctl.h"
 #include "logger.hpp"
 #include "umd/device/architecture_implementation.h"
@@ -687,6 +687,14 @@ tt::umd::architecture_implementation *PCIDevice::get_architecture_implementation
 bool PCIDevice::init_hugepage(uint32_t num_host_mem_channels) {
     const size_t hugepage_size = HUGEPAGE_REGION_SIZE;
 
+    if (numa_node > numa_max_node()) {
+        log_warning(LogSiliconDriver, "numa_node: {} is greater than numa_max_node: {}.", numa_node, numa_max_node());
+    }
+
+    // Prefer allocations on the NUMA node associated with the device.
+    numa_set_preferred(numa_node);
+
+    // Convert from logical (device_id in netlist) to physical device_id (in case of virtualization)
     auto physical_device_id = get_device_num();
 
     std::string hugepage_dir = find_hugepage_dir(hugepage_size);
@@ -748,19 +756,6 @@ bool PCIDevice::init_hugepage(uint32_t num_host_mem_channels) {
             continue;
         }
 
-        // Beter performance if hugepage just allocated (populate flag to prevent lazy alloc) is migrated to same
-        // numanode as TT device.
-        if (!tt::cpuset::tt_cpuset_allocator::bind_area_to_memory_nodeset(physical_device_id, mapping, hugepage_size)) {
-            log_warning(
-                LogSiliconDriver,
-                "---- ttSiliconDevice::init_hugepage: bind_area_to_memory_nodeset() failed (physical_device_id: {} ch: "
-                "{}). "
-                "Hugepage allocation is not on NumaNode matching TT Device. Side-Effect is decreased Device->Host perf "
-                "(Issue #893).",
-                physical_device_id,
-                ch);
-        }
-
         tenstorrent_pin_pages pin_pages;
         memset(&pin_pages, 0, sizeof(pin_pages));
         pin_pages.in.output_size_bytes = sizeof(pin_pages.out);

diff --git a/tests/microbenchmark/device_fixture.hpp b/tests/microbenchmark/device_fixture.hpp
@@ -9,10 +9,10 @@
 #include <iostream>
 #include <random>
 
-#include "cluster.h"
-#include "device/tt_soc_descriptor.h"
 #include "l1_address_map.h"
 #include "tests/test_utils/generate_cluster_desc.hpp"
+#include "umd/device/cluster.h"
+#include "umd/device/tt_soc_descriptor.h"
 
 using tt::umd::Cluster;