Simplify hugepage code, Chapter I #332

Open · wants to merge 10 commits into main
2 changes: 1 addition & 1 deletion .github/docker_install_common.sh
@@ -8,7 +8,7 @@ apt-get update && apt-get install -y \
ninja-build \
git \
git-lfs \
-libhwloc-dev \
+libnuma-dev \
libgtest-dev \
libyaml-cpp-dev \
libboost-all-dev \
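A note on the dependency swap above: hwloc is replaced by libnuma throughout the build. A minimal sketch, not part of this PR, of the kind of NUMA-aware allocation libnuma enables; the node index is assumed to come from an accessor like `PCIDevice::get_numa_node()` (the NUMA-node getter documented in the pci_device.hpp hunk below, whose exact name this diff does not show):

```
// Sketch only: place a host buffer on the device's NUMA node via libnuma,
// falling back to plain malloc on non-NUMA systems (node < 0).
#include <numa.h>    // from libnuma-dev; link with -lnuma
#include <cstddef>
#include <cstdlib>

void* alloc_near_device(int device_numa_node, std::size_t size) {
    if (numa_available() < 0 || device_numa_node < 0) {
        return std::malloc(size);                       // release with std::free()
    }
    return numa_alloc_onnode(size, device_numa_node);   // release with numa_free(ptr, size)
}
```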
2 changes: 1 addition & 1 deletion README.md
@@ -7,7 +7,7 @@ UMD requires Tenstorrent's [kernel-mode driver](https://github.com/tenstorrent/t

Required Ubuntu dependencies:
```
-sudo apt install -y libhwloc-dev cmake ninja-build
+sudo apt install -y cmake ninja-build libnuma-dev
```

Suggested third-party dependency is Clang 17:
3 changes: 1 addition & 2 deletions device/CMakeLists.txt
@@ -29,7 +29,6 @@ target_sources(
tt_device/tlb_manager.cpp
cluster.cpp
coordinate_manager.cpp
-cpuset_lib.cpp
grayskull/grayskull_implementation.cpp
wormhole/wormhole_implementation.cpp
blackhole/blackhole_implementation.cpp
@@ -71,7 +70,7 @@ target_link_libraries(
PRIVATE
umd::Common
umd::Firmware
-hwloc
+numa
nng
rt
uv_a
14 changes: 8 additions & 6 deletions device/api/umd/device/cluster.h
@@ -494,7 +494,7 @@ class Cluster : public tt_device {
* The ones defined by the devices itself have to be used, they will be merged with the ones passed here.
*/
Cluster(
-const uint32_t& num_host_mem_ch_per_mmio_device = 1,
+const uint32_t num_host_mem_ch_per_mmio_device = 1,
const bool skip_driver_allocs = false,
const bool clean_system_resources = false,
bool perform_harvesting = true,
@@ -514,7 +514,7 @@
*/
Cluster(
const std::set<chip_id_t>& target_devices,
-const uint32_t& num_host_mem_ch_per_mmio_device = 1,
+const uint32_t num_host_mem_ch_per_mmio_device = 1,
const bool skip_driver_allocs = false,
const bool clean_system_resources = false,
bool perform_harvesting = true,
@@ -538,7 +538,7 @@
Cluster(
const std::string& sdesc_path,
const std::set<chip_id_t>& target_devices,
-const uint32_t& num_host_mem_ch_per_mmio_device = 1,
+const uint32_t num_host_mem_ch_per_mmio_device = 1,
const bool skip_driver_allocs = false,
const bool clean_system_resources = false,
bool perform_harvesting = true,
@@ -560,7 +560,7 @@
*/
Cluster(
std::unordered_map<chip_id_t, std::unique_ptr<Chip>>& chips,
-const uint32_t& num_host_mem_ch_per_mmio_device = 1,
+const uint32_t num_host_mem_ch_per_mmio_device = 1,
const bool skip_driver_allocs = false,
const bool clean_system_resources = false,
bool perform_harvesting = true,
@@ -737,7 +737,7 @@
// Startup + teardown
void create_device(
const std::set<chip_id_t>& target_mmio_device_ids,
-const uint32_t& num_host_mem_ch_per_mmio_device,
+const uint32_t num_host_mem_ch_per_mmio_device,
const bool skip_driver_allocs,
const bool clean_system_resources);
void initialize_interprocess_mutexes(int logical_device_id, bool cleanup_mutexes_in_shm);
@@ -875,14 +875,16 @@
bool perform_harvesting,
std::unordered_map<chip_id_t, uint32_t>& simulated_harvesting_masks);
void construct_cluster(
-const uint32_t& num_host_mem_ch_per_mmio_device,
+const uint32_t num_host_mem_ch_per_mmio_device,
const bool skip_driver_allocs,
const bool clean_system_resources,
bool perform_harvesting,
std::unordered_map<chip_id_t, uint32_t> simulated_harvesting_masks);
tt::umd::CoreCoord translate_chip_coord(
const chip_id_t chip, const tt::umd::CoreCoord core_coord, const CoordSystem coord_system) const;

+void remote_io_sysmem_sanity_check(chip_id_t logical_device_id) const;
+
// State variables
tt_device_dram_address_params dram_address_params;
tt_device_l1_address_params l1_address_params;
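A side note on the recurring signature change above (`const uint32_t&` → `const uint32_t`): for a four-byte trivially copyable type, pass-by-value is at least as cheap as a const reference and removes an indirection. An illustrative pair, not from the PR:

```
#include <cstdint>

// By const reference, the callee receives an address and loads through it;
// by value, the argument can travel in a register with no aliasing concerns.
uint32_t add_by_ref(const uint32_t& a, const uint32_t& b) { return a + b; }
uint32_t add_by_val(uint32_t a, uint32_t b) { return a + b; }
```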
10 changes: 7 additions & 3 deletions device/api/umd/device/pci_device.hpp
@@ -91,6 +91,11 @@ class PCIDevice {
*/
const PciDeviceInfo get_device_info() const { return info; }

+/**
+ * @return PCI character device path
+ */
+const std::string &get_device_path() const { return device_path; }
+
/**
* @return which NUMA node this device is associated with, or -1 if non-NUMA
*/
@@ -131,16 +136,15 @@
bool is_iommu_enabled() const { return iommu_enabled; }

// TODO: this also probably has more sense to live in the future TTDevice class.
-bool init_hugepage(uint32_t num_host_mem_channels);
+void init_hugepage(uint32_t num_host_mem_channels);

/**
* Allocate sysmem without hugepages and map it through IOMMU.
* This is used when the system is protected by an IOMMU. The mappings will
* still appear as hugepages to the caller.
* @param size sysmem size in bytes; size % (1UL << 30) == 0
-* @return whether allocation/mapping succeeded.
*/
-bool init_iommu(size_t size);
+void init_iommu(size_t size);

size_t get_num_host_mem_channels() const;
hugepage_mapping get_hugepage_mapping(size_t channel) const;
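Since `init_hugepage` and `init_iommu` now return `void`, callers can no longer branch on a boolean result; failures are presumably reported by throwing (the cluster.cpp call site below stops checking a return value, and the new sanity check there throws `std::runtime_error`, though the exact exception type thrown by these initializers is not visible in this diff). A caller sketch under that assumption; `setup_sysmem` is a hypothetical helper and the PCIDevice header above is assumed to be included:

```
#include <cstdint>
#include <exception>
#include <iostream>

// Hypothetical wrapper: assumes init_hugepage() signals failure by throwing
// rather than by returning bool, per the signature change above.
void setup_sysmem(PCIDevice& dev, uint32_t num_host_mem_channels) {
    try {
        dev.init_hugepage(num_host_mem_channels);
    } catch (const std::exception& e) {
        std::cerr << "sysmem setup failed: " << e.what() << '\n';
        throw;  // rethrow; a query-only workload could instead continue without sysmem
    }
}
```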
96 changes: 50 additions & 46 deletions device/cluster.cpp
@@ -199,7 +199,7 @@ void Cluster::initialize_interprocess_mutexes(int logical_device_id, bool cleanu

void Cluster::create_device(
const std::set<chip_id_t>& target_mmio_device_ids,
-const uint32_t& num_host_mem_ch_per_mmio_device,
+const uint32_t num_host_mem_ch_per_mmio_device,
const bool skip_driver_allocs,
const bool clean_system_resources) {
log_debug(LogSiliconDriver, "Cluster::Cluster");
@@ -211,46 +211,21 @@
target_mmio_device_ids.size() > 0, "Must provide set of target_mmio_device_ids to Cluster constructor now.");

for (const chip_id_t& logical_device_id : target_mmio_device_ids) {
-auto pci_device = get_tt_device(logical_device_id)->get_pci_device();
-
-uint16_t pcie_device_id = pci_device->get_pci_device_id();
-uint32_t pcie_revision = pci_device->get_pci_revision();
-// TODO: get rid of this, it doesn't make any sense.
-// Update: I did get rid of it and it broke Metal CI, which is passing
-// tests that ask for more hugepages than exist. That's wrong, but it
-// isn't fixed yet, so until then...
-int num_host_mem_channels =
-    get_available_num_host_mem_channels(num_host_mem_ch_per_mmio_device, pcie_device_id, pcie_revision);
-
-log_debug(
-    LogSiliconDriver,
-    "Using {} Hugepages/NumHostMemChannels for PCIDevice (logical_device_id: {} pci_interface_id: {} "
-    "device_id: 0x{:x} revision: {})",
-    num_host_mem_channels,
-    logical_device_id,
-    pci_device->get_device_num(),
-    pci_device->get_device_num(),
-    pci_device->revision_id);
-
// TODO: This will be moved to a dedicated Locking class.
initialize_interprocess_mutexes(logical_device_id, clean_system_resources);

-// MT: Initial BH - hugepages will fail init
-// For using silicon driver without workload to query mission mode params, no need for hugepage.
-if (!skip_driver_allocs) {
-    bool hugepages_initialized = pci_device->init_hugepage(num_host_mem_channels);
-    // Large writes to remote chips require hugepages to be initialized.
-    // Conservative assert - end workload if remote chips present but hugepages not initialized (failures caused
-    // if using remote only for small transactions)
-    if (remote_chip_ids_.size()) {
-        log_assert(
-            hugepages_initialized,
-            "Hugepages must be successfully initialized if workload contains remote chips!");
-    }
-    if (not pci_device->get_hugepage_mapping(0).mapping) {
-        log_warning(LogSiliconDriver, "No hugepage mapping at device {}.", logical_device_id);
-    }
+// Host memory channel (i.e. hugepage or equivalent) allocation/setup:
+if (!skip_driver_allocs && num_host_mem_ch_per_mmio_device > 0) {
+    auto pci_device = get_tt_device(logical_device_id)->get_pci_device();
+    log_info(
+        LogSiliconDriver,
+        "Using {} Host Memory Channels for {} (logical id: {})",
+        num_host_mem_ch_per_mmio_device,
+        pci_device->get_device_path(),
+        logical_device_id);
+    pci_device->init_hugepage(num_host_mem_ch_per_mmio_device);
+}

// translation layer for harvested coords. Default is identity map
harvested_coord_translation.insert({logical_device_id, create_harvested_coord_translation(arch_name, true)});
}
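With the rewritten guard (`!skip_driver_allocs && num_host_mem_ch_per_mmio_device > 0`), a caller that needs no sysmem simply requests zero channels rather than relying on the deleted clamping helper. A sketch of the two call patterns, assuming the constructor defaults from the cluster.h hunk above and an installed include path of `umd/device/cluster.h` (the path is an assumption; the two objects are shown together only for illustration):

```
#include "umd/device/cluster.h"  // assumed install path for the header above

int main() {
    // Query-only session: zero host memory channels, so hugepage/sysmem
    // setup is skipped entirely.
    tt::umd::Cluster query_only(/*num_host_mem_ch_per_mmio_device=*/0);

    // Typical session: one channel per MMIO device (the default). With this
    // PR, a hugepage shortage is no longer silently clamped away.
    tt::umd::Cluster with_sysmem(/*num_host_mem_ch_per_mmio_device=*/1);
    return 0;
}
```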
@@ -277,7 +252,7 @@ std::unordered_map<chip_id_t, uint32_t> Cluster::get_harvesting_masks_for_soc_de
}

void Cluster::construct_cluster(
-const uint32_t& num_host_mem_ch_per_mmio_device,
+const uint32_t num_host_mem_ch_per_mmio_device,
const bool skip_driver_allocs,
const bool clean_system_resources,
bool perform_harvesting,
@@ -513,7 +488,7 @@ uint32_t Cluster::get_tensix_harvesting_mask(
}

Cluster::Cluster(
-const uint32_t& num_host_mem_ch_per_mmio_device,
+const uint32_t num_host_mem_ch_per_mmio_device,
const bool skip_driver_allocs,
const bool clean_system_resources,
bool perform_harvesting,
@@ -539,7 +514,7 @@

Cluster::Cluster(
const std::set<chip_id_t>& target_devices,
-const uint32_t& num_host_mem_ch_per_mmio_device,
+const uint32_t num_host_mem_ch_per_mmio_device,
const bool skip_driver_allocs,
const bool clean_system_resources,
bool perform_harvesting,
@@ -570,7 +545,7 @@ Cluster::Cluster(
Cluster::Cluster(
const std::string& sdesc_path,
const std::set<chip_id_t>& target_devices,
-const uint32_t& num_host_mem_ch_per_mmio_device,
+const uint32_t num_host_mem_ch_per_mmio_device,
const bool skip_driver_allocs,
const bool clean_system_resources,
bool perform_harvesting,
@@ -607,7 +582,7 @@ Cluster::Cluster(

Cluster::Cluster(
std::unordered_map<chip_id_t, std::unique_ptr<Chip>>& chips,
-const uint32_t& num_host_mem_ch_per_mmio_device,
+const uint32_t num_host_mem_ch_per_mmio_device,
const bool skip_driver_allocs,
const bool clean_system_resources,
bool perform_harvesting,
@@ -1845,6 +1820,11 @@ void Cluster::write_to_non_mmio_device(
use_dram = broadcast || (size_in_bytes > 256 * DATA_WORD_SIZE);
max_block_size = use_dram ? host_address_params.eth_routing_block_size : eth_interface_params.max_block_size;

+// See the remark in the equivalent read function about this sanity check.
+if (use_dram) {
+    remote_io_sysmem_sanity_check(mmio_capable_chip_logical);
+}
+
//
// MUTEX ACQUIRE (NON-MMIO)
// do not locate any ethernet core reads/writes before this acquire
@@ -2101,11 +2081,20 @@ void Cluster::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_
erisc_q_rptr.resize(1);
erisc_q_rptr[0] = erisc_q_ptrs[4];

-bool use_dram;
-uint32_t max_block_size;
+bool use_dram = size_in_bytes > 1024;
+uint32_t max_block_size =
+    use_dram ? host_address_params.eth_routing_block_size : eth_interface_params.max_block_size;

-use_dram = size_in_bytes > 1024;
-max_block_size = use_dram ? host_address_params.eth_routing_block_size : eth_interface_params.max_block_size;
+// DRAM in this case is the host memory, not the device memory. Elsewhere
+// in the code we refer to this memory as sysmem. So if use_dram is true,
+// make sure that we actually have some sysmem to use.
+//
+// If you are wondering how the application knows which chunk of sysmem is
+// used for remote IO so that it can avoid stepping on it, the answer is
+// that it doesn't.
+if (use_dram) {
+    remote_io_sysmem_sanity_check(mmio_capable_chip_logical);
+}

uint32_t offset = 0;
uint32_t block_size;
@@ -3365,4 +3354,19 @@ tt::umd::CoreCoord Cluster::translate_chip_coord(
return get_soc_descriptor(chip).translate_coord_to(core_coord, coord_system);
}

+/**
+ * The remote IO logic has an implicit dependency on the first host memory
+ * channel for the local, PCIe-attached device that is initiating the transfer.
+ * This function ensures that the first host memory channel exists.
+ */
+void Cluster::remote_io_sysmem_sanity_check(chip_id_t logical_device_id) const {
+    auto* tt_device = get_tt_device(logical_device_id);
+    auto* pci_device = tt_device->get_pci_device();
+    auto channel_0 = pci_device->get_hugepage_mapping(0);
+    if (channel_0.mapping == nullptr) {
+        log_error("No sysmem is configured for logical chip id {}", logical_device_id);
+        throw std::runtime_error("One or more host memory channels (sysmem) must exist for large remote IO");
+    }
+}
+
} // namespace tt::umd
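For users of the driver, the practical effect of the new check: a large remote transfer (more than 1024 bytes on the read path) against a cluster with no configured sysmem now fails fast with `std::runtime_error` instead of proceeding without a buffer. A sketch of the observable behavior; `read_from_device` here is a hypothetical stand-in for whichever public entry point reaches `read_from_non_mmio_device`, which this diff does not show:

```
#include <cstdint>
#include <stdexcept>
#include <vector>

void read_big_remote_block(tt::umd::Cluster& cluster, tt_cxy_pair remote_core, uint64_t address) {
    std::vector<uint8_t> buffer(4096);  // > 1024 bytes forces use_dram = true
    try {
        // Hypothetical public entry point; the real call site may differ.
        cluster.read_from_device(buffer.data(), remote_core, address, buffer.size());
    } catch (const std::runtime_error&) {
        // "One or more host memory channels (sysmem) must exist for large remote IO"
    }
}
```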