Simplify hugepage code, Chapter I #332

Open · wants to merge 10 commits into main
2 changes: 1 addition & 1 deletion .github/docker_install_common.sh
@@ -8,7 +8,7 @@ apt-get update && apt-get install -y \
ninja-build \
git \
git-lfs \
-libhwloc-dev \
+libnuma-dev \
libgtest-dev \
libyaml-cpp-dev \
libboost-all-dev \
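A note on the dependency swap above: hwloc is replaced by libnuma throughout the build. A minimal sketch, not part of this PR, of the kind of NUMA-aware allocation libnuma enables; the node index is assumed to come from an accessor like `PCIDevice::get_numa_node()` (the NUMA-node getter documented in the pci_device.hpp hunk below, whose exact name this diff does not show):

```
// Sketch only: place a host buffer on the device's NUMA node via libnuma,
// falling back to plain malloc on non-NUMA systems (node < 0).
#include <numa.h>    // from libnuma-dev; link with -lnuma
#include <cstddef>
#include <cstdlib>

void* alloc_near_device(int device_numa_node, std::size_t size) {
    if (numa_available() < 0 || device_numa_node < 0) {
        return std::malloc(size);                       // release with std::free()
    }
    return numa_alloc_onnode(size, device_numa_node);   // release with numa_free(ptr, size)
}
```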
2 changes: 1 addition & 1 deletion README.md
@@ -7,7 +7,7 @@ UMD requires Tenstorrent's [kernel-mode driver](https://github.com/tenstorrent/t

Required Ubuntu dependencies:
```
-sudo apt install -y libhwloc-dev cmake ninja-build
+sudo apt install -y cmake ninja-build libnuma-dev
```

Suggested third-party dependency is Clang 17:
3 changes: 1 addition & 2 deletions device/CMakeLists.txt
@@ -29,7 +29,6 @@ target_sources(
tt_device/tlb_manager.cpp
cluster.cpp
coordinate_manager.cpp
-cpuset_lib.cpp
grayskull/grayskull_implementation.cpp
wormhole/wormhole_implementation.cpp
blackhole/blackhole_implementation.cpp
@@ -71,7 +70,7 @@ target_link_libraries(
PRIVATE
umd::Common
umd::Firmware
-hwloc
+numa
nng
rt
uv_a
14 changes: 8 additions & 6 deletions device/api/umd/device/cluster.h
@@ -494,7 +494,7 @@ class Cluster : public tt_device {
* The ones defined by the devices itself have to be used, they will be merged with the ones passed here.
*/
Cluster(
-const uint32_t& num_host_mem_ch_per_mmio_device = 1,
+const uint32_t num_host_mem_ch_per_mmio_device = 1,
const bool skip_driver_allocs = false,
const bool clean_system_resources = false,
bool perform_harvesting = true,
@@ -514,7 +514,7 @@
*/
Cluster(
const std::set<chip_id_t>& target_devices,
-const uint32_t& num_host_mem_ch_per_mmio_device = 1,
+const uint32_t num_host_mem_ch_per_mmio_device = 1,
const bool skip_driver_allocs = false,
const bool clean_system_resources = false,
bool perform_harvesting = true,
@@ -538,7 +538,7 @@
Cluster(
const std::string& sdesc_path,
const std::set<chip_id_t>& target_devices,
-const uint32_t& num_host_mem_ch_per_mmio_device = 1,
+const uint32_t num_host_mem_ch_per_mmio_device = 1,
const bool skip_driver_allocs = false,
const bool clean_system_resources = false,
bool perform_harvesting = true,
@@ -560,7 +560,7 @@
*/
Cluster(
std::unordered_map<chip_id_t, std::unique_ptr<Chip>>& chips,
-const uint32_t& num_host_mem_ch_per_mmio_device = 1,
+const uint32_t num_host_mem_ch_per_mmio_device = 1,
const bool skip_driver_allocs = false,
const bool clean_system_resources = false,
bool perform_harvesting = true,
@@ -737,7 +737,7 @@
// Startup + teardown
void create_device(
const std::set<chip_id_t>& target_mmio_device_ids,
-const uint32_t& num_host_mem_ch_per_mmio_device,
+const uint32_t num_host_mem_ch_per_mmio_device,
const bool skip_driver_allocs,
const bool clean_system_resources);
void initialize_interprocess_mutexes(int logical_device_id, bool cleanup_mutexes_in_shm);
@@ -875,14 +875,16 @@
bool perform_harvesting,
std::unordered_map<chip_id_t, uint32_t>& simulated_harvesting_masks);
void construct_cluster(
-const uint32_t& num_host_mem_ch_per_mmio_device,
+const uint32_t num_host_mem_ch_per_mmio_device,
const bool skip_driver_allocs,
const bool clean_system_resources,
bool perform_harvesting,
std::unordered_map<chip_id_t, uint32_t> simulated_harvesting_masks);
tt::umd::CoreCoord translate_chip_coord(
const chip_id_t chip, const tt::umd::CoreCoord core_coord, const CoordSystem coord_system) const;

+void remote_io_sysmem_sanity_check(chip_id_t logical_device_id) const;
+
// State variables
tt_device_dram_address_params dram_address_params;
tt_device_l1_address_params l1_address_params;
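A side note on the recurring signature change above (`const uint32_t&` → `const uint32_t`): for a four-byte trivially copyable type, pass-by-value is at least as cheap as a const reference and removes an indirection. An illustrative pair, not from the PR:

```
#include <cstdint>

// By const reference, the callee receives an address and loads through it;
// by value, the argument can travel in a register with no aliasing concerns.
uint32_t add_by_ref(const uint32_t& a, const uint32_t& b) { return a + b; }
uint32_t add_by_val(uint32_t a, uint32_t b) { return a + b; }
```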
10 changes: 7 additions & 3 deletions device/api/umd/device/pci_device.hpp
@@ -91,6 +91,11 @@ class PCIDevice {
*/
const PciDeviceInfo get_device_info() const { return info; }

+/**
+ * @return PCI character device path
+ */
+const std::string &get_device_path() const { return device_path; }
+
/**
* @return which NUMA node this device is associated with, or -1 if non-NUMA
*/
@@ -131,16 +136,15 @@
bool is_iommu_enabled() const { return iommu_enabled; }

// TODO: this also probably has more sense to live in the future TTDevice class.
-bool init_hugepage(uint32_t num_host_mem_channels);
+void init_hugepage(uint32_t num_host_mem_channels);

/**
* Allocate sysmem without hugepages and map it through IOMMU.
* This is used when the system is protected by an IOMMU. The mappings will
* still appear as hugepages to the caller.
* @param size sysmem size in bytes; size % (1UL << 30) == 0
-* @return whether allocation/mapping succeeded.
*/
-bool init_iommu(size_t size);
+void init_iommu(size_t size);

size_t get_num_host_mem_channels() const;
hugepage_mapping get_hugepage_mapping(size_t channel) const;
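Since `init_hugepage` and `init_iommu` now return `void`, callers can no longer branch on a boolean result; failures are presumably reported by throwing (the cluster.cpp call site below stops checking a return value, and the new sanity check there throws `std::runtime_error`, though the exact exception type thrown by these initializers is not visible in this diff). A caller sketch under that assumption; `setup_sysmem` is a hypothetical helper and the PCIDevice header above is assumed to be included:

```
#include <cstdint>
#include <exception>
#include <iostream>

// Hypothetical wrapper: assumes init_hugepage() signals failure by throwing
// rather than by returning bool, per the signature change above.
void setup_sysmem(PCIDevice& dev, uint32_t num_host_mem_channels) {
    try {
        dev.init_hugepage(num_host_mem_channels);
    } catch (const std::exception& e) {
        std::cerr << "sysmem setup failed: " << e.what() << '\n';
        throw;  // rethrow; a query-only workload could instead continue without sysmem
    }
}
```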
96 changes: 50 additions & 46 deletions device/cluster.cpp
@@ -199,7 +199,7 @@ void Cluster::initialize_interprocess_mutexes(int logical_device_id, bool cleanu

void Cluster::create_device(
const std::set<chip_id_t>& target_mmio_device_ids,
-const uint32_t& num_host_mem_ch_per_mmio_device,
+const uint32_t num_host_mem_ch_per_mmio_device,
const bool skip_driver_allocs,
const bool clean_system_resources) {
log_debug(LogSiliconDriver, "Cluster::Cluster");
@@ -211,46 +211,21 @@
target_mmio_device_ids.size() > 0, "Must provide set of target_mmio_device_ids to Cluster constructor now.");

for (const chip_id_t& logical_device_id : target_mmio_device_ids) {
-auto pci_device = get_tt_device(logical_device_id)->get_pci_device();
-
-uint16_t pcie_device_id = pci_device->get_pci_device_id();
-uint32_t pcie_revision = pci_device->get_pci_revision();
-// TODO: get rid of this, it doesn't make any sense.
-// Update: I did get rid of it and it broke Metal CI, which is passing
-// tests that ask for more hugepages than exist. That's wrong, but it
-// isn't fixed yet, so until then...
-int num_host_mem_channels =
-    get_available_num_host_mem_channels(num_host_mem_ch_per_mmio_device, pcie_device_id, pcie_revision);
-
-log_debug(
-    LogSiliconDriver,
-    "Using {} Hugepages/NumHostMemChannels for PCIDevice (logical_device_id: {} pci_interface_id: {} "
-    "device_id: 0x{:x} revision: {})",
-    num_host_mem_channels,
-    logical_device_id,
-    pci_device->get_device_num(),
-    pci_device->get_device_num(),
-    pci_device->revision_id);
-
// TODO: This will be moved to a dedicated Locking class.
initialize_interprocess_mutexes(logical_device_id, clean_system_resources);

-// MT: Initial BH - hugepages will fail init
-// For using silicon driver without workload to query mission mode params, no need for hugepage.
-if (!skip_driver_allocs) {
-    bool hugepages_initialized = pci_device->init_hugepage(num_host_mem_channels);
-    // Large writes to remote chips require hugepages to be initialized.
-    // Conservative assert - end workload if remote chips present but hugepages not initialized (failures caused
-    // if using remote only for small transactions)
-    if (remote_chip_ids_.size()) {
-        log_assert(
-            hugepages_initialized,
-            "Hugepages must be successfully initialized if workload contains remote chips!");
-    }
-    if (not pci_device->get_hugepage_mapping(0).mapping) {
-        log_warning(LogSiliconDriver, "No hugepage mapping at device {}.", logical_device_id);
-    }
+// Host memory channel (i.e. hugepage or equivalent) allocation/setup:
+if (!skip_driver_allocs && num_host_mem_ch_per_mmio_device > 0) {
+    auto pci_device = get_tt_device(logical_device_id)->get_pci_device();
+    log_info(
+        LogSiliconDriver,
+        "Using {} Host Memory Channels for {} (logical id: {})",
+        num_host_mem_ch_per_mmio_device,
+        pci_device->get_device_path(),
+        logical_device_id);
+    pci_device->init_hugepage(num_host_mem_ch_per_mmio_device);
+}

// translation layer for harvested coords. Default is identity map
harvested_coord_translation.insert({logical_device_id, create_harvested_coord_translation(arch_name, true)});
}
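With the rewritten guard (`!skip_driver_allocs && num_host_mem_ch_per_mmio_device > 0`), a caller that needs no sysmem simply requests zero channels rather than relying on the deleted clamping helper. A sketch of the two call patterns, assuming the constructor defaults from the cluster.h hunk above and an installed include path of `umd/device/cluster.h` (the path is an assumption; the two objects are shown together only for illustration):

```
#include "umd/device/cluster.h"  // assumed install path for the header above

int main() {
    // Query-only session: zero host memory channels, so hugepage/sysmem
    // setup is skipped entirely.
    tt::umd::Cluster query_only(/*num_host_mem_ch_per_mmio_device=*/0);

    // Typical session: one channel per MMIO device (the default). With this
    // PR, a hugepage shortage is no longer silently clamped away.
    tt::umd::Cluster with_sysmem(/*num_host_mem_ch_per_mmio_device=*/1);
    return 0;
}
```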
@@ -277,7 +252,7 @@ std::unordered_map<chip_id_t, uint32_t> Cluster::get_harvesting_masks_for_soc_de
}

void Cluster::construct_cluster(
-const uint32_t& num_host_mem_ch_per_mmio_device,
+const uint32_t num_host_mem_ch_per_mmio_device,
const bool skip_driver_allocs,
const bool clean_system_resources,
bool perform_harvesting,
@@ -513,7 +488,7 @@ uint32_t Cluster::get_tensix_harvesting_mask(
}

Cluster::Cluster(
-const uint32_t& num_host_mem_ch_per_mmio_device,
+const uint32_t num_host_mem_ch_per_mmio_device,
const bool skip_driver_allocs,
const bool clean_system_resources,
bool perform_harvesting,
@@ -539,7 +514,7 @@

Cluster::Cluster(
const std::set<chip_id_t>& target_devices,
-const uint32_t& num_host_mem_ch_per_mmio_device,
+const uint32_t num_host_mem_ch_per_mmio_device,
const bool skip_driver_allocs,
const bool clean_system_resources,
bool perform_harvesting,
@@ -570,7 +545,7 @@ Cluster::Cluster(
Cluster::Cluster(
const std::string& sdesc_path,
const std::set<chip_id_t>& target_devices,
-const uint32_t& num_host_mem_ch_per_mmio_device,
+const uint32_t num_host_mem_ch_per_mmio_device,
const bool skip_driver_allocs,
const bool clean_system_resources,
bool perform_harvesting,
@@ -607,7 +582,7 @@ Cluster::Cluster(

Cluster::Cluster(
std::unordered_map<chip_id_t, std::unique_ptr<Chip>>& chips,
-const uint32_t& num_host_mem_ch_per_mmio_device,
+const uint32_t num_host_mem_ch_per_mmio_device,
const bool skip_driver_allocs,
const bool clean_system_resources,
bool perform_harvesting,
@@ -1845,6 +1820,11 @@ void Cluster::write_to_non_mmio_device(
use_dram = broadcast || (size_in_bytes > 256 * DATA_WORD_SIZE);
max_block_size = use_dram ? host_address_params.eth_routing_block_size : eth_interface_params.max_block_size;

+// See the remark in the equivalent read function about this sanity check.
+if (use_dram) {
+    remote_io_sysmem_sanity_check(mmio_capable_chip_logical);
+}
+
//
// MUTEX ACQUIRE (NON-MMIO)
// do not locate any ethernet core reads/writes before this acquire
@@ -2101,11 +2081,20 @@ void Cluster::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_
erisc_q_rptr.resize(1);
erisc_q_rptr[0] = erisc_q_ptrs[4];

-bool use_dram;
-uint32_t max_block_size;
+bool use_dram = size_in_bytes > 1024;
+uint32_t max_block_size =
+    use_dram ? host_address_params.eth_routing_block_size : eth_interface_params.max_block_size;

-use_dram = size_in_bytes > 1024;
-max_block_size = use_dram ? host_address_params.eth_routing_block_size : eth_interface_params.max_block_size;
+// DRAM in this case is the host memory, not the device memory. Elsewhere
+// in the code we refer to this memory as sysmem. So if use_dram is true,
+// make sure that we actually have some sysmem to use.
+//
+// If you are wondering how the application knows which chunk of sysmem is
+// used for remote IO so that it can avoid stepping on it, the answer is
+// that it doesn't.
+if (use_dram) {
+    remote_io_sysmem_sanity_check(mmio_capable_chip_logical);
+}

uint32_t offset = 0;
uint32_t block_size;
@@ -3365,4 +3354,19 @@ tt::umd::CoreCoord Cluster::translate_chip_coord(
return get_soc_descriptor(chip).translate_coord_to(core_coord, coord_system);
}

+/**
+ * The remote IO logic has an implicit dependency on the first host memory
+ * channel for the local, PCIe-attached device that is initiating the transfer.
+ * This function ensures that the first host memory channel exists.
+ */
+void Cluster::remote_io_sysmem_sanity_check(chip_id_t logical_device_id) const {
+    auto* tt_device = get_tt_device(logical_device_id);
+    auto* pci_device = tt_device->get_pci_device();
+    auto channel_0 = pci_device->get_hugepage_mapping(0);
+    if (channel_0.mapping == nullptr) {
+        log_error("No sysmem is configured for logical chip id {}", logical_device_id);
+        throw std::runtime_error("One or more host memory channels (sysmem) must exist for large remote IO");
+    }
+}
+
} // namespace tt::umd
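For users of the driver, the practical effect of the new check: a large remote transfer (more than 1024 bytes on the read path) against a cluster with no configured sysmem now fails fast with `std::runtime_error` instead of proceeding without a buffer. A sketch of the observable behavior; `read_from_device` here is a hypothetical stand-in for whichever public entry point reaches `read_from_non_mmio_device`, which this diff does not show:

```
#include <cstdint>
#include <stdexcept>
#include <vector>

void read_big_remote_block(tt::umd::Cluster& cluster, tt_cxy_pair remote_core, uint64_t address) {
    std::vector<uint8_t> buffer(4096);  // > 1024 bytes forces use_dram = true
    try {
        // Hypothetical public entry point; the real call site may differ.
        cluster.read_from_device(buffer.data(), remote_core, address, buffer.size());
    } catch (const std::runtime_error&) {
        // "One or more host memory channels (sysmem) must exist for large remote IO"
    }
}
```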