From e03bc51e2aa03feb24f0f2f06697f298f06fda23 Mon Sep 17 00:00:00 2001 From: Joel Smith <140545543+joelsmithTT@users.noreply.github.com> Date: Thu, 14 Nov 2024 10:23:03 -0600 Subject: [PATCH 1/8] More sysmem tests (#294) ### Issue https://github.com/tenstorrent/tt-umd/issues/293 ### Description Adds more unit tests for sysmem. I need a reliable way to ensure it is functioning as intended as work ramps up to enable IOMMU and provide a transition away from huge pages. ### List of the changes * Adds a second sysmem test for Wormhole that examines more of the address space. * Adds a Grayskull version of the simple Wormhole sysmem test. * Some whitespace cleanup ### Testing Manually tested; code paths are in CI ### API Changes There are no API changes in this PR. --- tests/grayskull/test_silicon_driver.cpp | 76 +++++++-- tests/microbenchmark/device_fixture.hpp | 4 +- tests/test_utils/device_test_utils.hpp | 16 +- tests/wormhole/test_silicon_driver_wh.cpp | 180 +++++++++++++++------- 4 files changed, 206 insertions(+), 70 deletions(-) diff --git a/tests/grayskull/test_silicon_driver.cpp b/tests/grayskull/test_silicon_driver.cpp index c48548480..c8fca4bf1 100644 --- a/tests/grayskull/test_silicon_driver.cpp +++ b/tests/grayskull/test_silicon_driver.cpp @@ -91,7 +91,7 @@ TEST(SiliconDriverGS, HarvestingRuntime) { // Iterate over devices and only setup static TLBs for functional worker cores auto& sdesc = device.get_virtual_soc_descriptors().at(i); for(auto& core : sdesc.workers) { - // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. + // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. 
device.configure_tlb(i, core, get_static_tlb_index(core), l1_mem::address_map::DATA_BUFFER_SPACE_BASE); } device.setup_core_to_tlb_map(i, get_static_tlb_index); @@ -146,19 +146,19 @@ TEST(SiliconDriverGS, StaticTLB_RW) { return flat_index; }; std::set target_devices = {0}; - + uint32_t num_host_mem_ch_per_mmio_device = 1; Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true); for(int i = 0; i < target_devices.size(); i++) { // Iterate over devices and only setup static TLBs for worker cores auto& sdesc = device.get_virtual_soc_descriptors().at(i); for(auto& core : sdesc.workers) { - // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. + // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. device.configure_tlb(i, core, get_static_tlb_index(core), l1_mem::address_map::DATA_BUFFER_SPACE_BASE, TLB_DATA::Posted); } device.setup_core_to_tlb_map(i, get_static_tlb_index); } - + tt_device_params default_params; device.start_device(default_params); device.deassert_risc_reset(); @@ -188,7 +188,7 @@ TEST(SiliconDriverGS, StaticTLB_RW) { address += 0x20; // Increment by uint32_t size for each write } } - device.close_device(); + device.close_device(); } TEST(SiliconDriverGS, DynamicTLB_RW) { @@ -239,7 +239,6 @@ TEST(SiliconDriverGS, MultiThreadedDevice) { uint32_t num_host_mem_ch_per_mmio_device = 1; Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true); - tt_device_params default_params; device.start_device(default_params); device.deassert_risc_reset(); @@ -299,7 +298,7 @@ TEST(SiliconDriverGS, MultiThreadedDevice) { TEST(SiliconDriverGS, MultiThreadedMemBar) { // this tests takes ~5 mins to run 
// Have 2 threads read and write from a single device concurrently - // All (fairly large) transactions go through a static TLB. + // All (fairly large) transactions go through a static TLB. // We want to make sure the memory barrier is thread/process safe. // Memory barrier flags get sent to address 0 for all channels in this test @@ -317,12 +316,12 @@ TEST(SiliconDriverGS, MultiThreadedMemBar) { // this tests takes ~5 mins to run uint32_t num_host_mem_ch_per_mmio_device = 1; Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true); - + for(int i = 0; i < target_devices.size(); i++) { // Iterate over devices and only setup static TLBs for functional worker cores auto& sdesc = device.get_virtual_soc_descriptors().at(i); for(auto& core : sdesc.workers) { - // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. + // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. device.configure_tlb(i, core, get_static_tlb_index(core), base_addr); } device.setup_core_to_tlb_map(i, get_static_tlb_index); @@ -404,3 +403,62 @@ TEST(SiliconDriverGS, MultiThreadedMemBar) { // this tests takes ~5 mins to run device.close_device(); } + +/** + * Copied from Wormhole unit tests. 
+ */ +TEST(SiliconDriverGS, SysmemTestWithPcie) { + Cluster cluster(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), + "", // test_utils::GetClusterDescYAML(), + {0}, + 1, // one "host memory channel", currently a 1G huge page + false, // skip driver allocs - no (don't skip) + true, // clean system resources - yes + true); // perform harvesting - yes + + + cluster.start_device(tt_device_params{}); // no special parameters + + const chip_id_t mmio_chip_id = 0; + const auto PCIE = cluster.get_soc_descriptor(mmio_chip_id).pcie_cores.at(0); + const tt_cxy_pair PCIE_CORE(mmio_chip_id, PCIE.x, PCIE.y); + const size_t test_size_bytes = 0x4000; // Arbitrarilly chosen, but small size so the test runs quickly. + + // PCIe core is at (x=0, y=4) on Grayskull NOC0. + ASSERT_EQ(PCIE.x, 0); + ASSERT_EQ(PCIE.y, 4); + + // Bad API: how big is the buffer? How do we know it's big enough? + // Situation today is that there's a 1G hugepage behind it, although this is + // unclear from the API and may change in the future. + uint8_t *sysmem = (uint8_t*)cluster.host_dma_address(0, 0, 0); + ASSERT_NE(sysmem, nullptr); + + uint64_t base_address = cluster.get_pcie_base_addr_from_device(mmio_chip_id); + + // Buffer that we will use to read sysmem into, then write sysmem from. + std::vector buffer(test_size_bytes, 0x0); + + // Step 1: Fill sysmem with random bytes. + test_utils::fill_with_random_bytes(sysmem, test_size_bytes); + + // Step 2: Read sysmem into buffer. + cluster.read_from_device(&buffer[0], PCIE_CORE, base_address, buffer.size(), "REG_TLB"); + + // Step 3: Verify that buffer matches sysmem. + ASSERT_EQ(buffer, std::vector(sysmem, sysmem + test_size_bytes)); + + // Step 4: Fill buffer with random bytes. + test_utils::fill_with_random_bytes(&buffer[0], test_size_bytes); + + // Step 5: Write buffer into sysmem, overwriting what was there. 
+ cluster.write_to_device(&buffer[0], buffer.size(), PCIE_CORE, base_address, "REG_TLB"); + + // Step 5b: Read back sysmem into a throwaway buffer. The intent is to + // ensure the write has completed before we check sysmem against buffer. + std::vector throwaway(test_size_bytes, 0x0); + cluster.read_from_device(&throwaway[0], PCIE_CORE, base_address, throwaway.size(), "REG_TLB"); + + // Step 6: Verify that sysmem matches buffer. + ASSERT_EQ(buffer, std::vector(sysmem, sysmem + test_size_bytes)); +} diff --git a/tests/microbenchmark/device_fixture.hpp b/tests/microbenchmark/device_fixture.hpp index 3e20679a3..c53d5f234 100644 --- a/tests/microbenchmark/device_fixture.hpp +++ b/tests/microbenchmark/device_fixture.hpp @@ -13,6 +13,8 @@ #include "device/tt_soc_descriptor.h" #include "tests/test_utils/generate_cluster_desc.hpp" +using tt::umd::Cluster; + class uBenchmarkFixture : public ::testing::Test { protected: void SetUp() override { @@ -34,7 +36,7 @@ class uBenchmarkFixture : public ::testing::Test { // Iterate over devices and only setup static TLBs for functional worker cores auto& sdesc = device->get_virtual_soc_descriptors().at(i); for(auto& core : sdesc.workers) { - // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. + // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. 
device->configure_tlb(i, core, get_static_tlb_index(core), l1_mem::address_map::DATA_BUFFER_SPACE_BASE); } } diff --git a/tests/test_utils/device_test_utils.hpp b/tests/test_utils/device_test_utils.hpp index f7a789484..87446be3f 100644 --- a/tests/test_utils/device_test_utils.hpp +++ b/tests/test_utils/device_test_utils.hpp @@ -6,8 +6,9 @@ #pragma once #include -#include +#include #include +#include #include "cluster.h" @@ -27,4 +28,17 @@ static void read_data_from_device(tt_device& device, std::vector &vec, device.read_from_device(vec.data(), core, addr, size, tlb_to_use); } +inline void fill_with_random_bytes(uint8_t* data, size_t n) +{ + static std::random_device rd; + static std::mt19937_64 gen(rd()); + uint64_t* data64 = reinterpret_cast(data); + std::generate_n(data64, n/8, [&]() { return gen(); }); + + // Handle remaining bytes + for (size_t i = (n/8)*8; i < n; ++i) { + data[i] = static_cast(gen()); + } +} + } diff --git a/tests/wormhole/test_silicon_driver_wh.cpp b/tests/wormhole/test_silicon_driver_wh.cpp index c425ce83a..0f8f90999 100644 --- a/tests/wormhole/test_silicon_driver_wh.cpp +++ b/tests/wormhole/test_silicon_driver_wh.cpp @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 #include #include -#include #include "gtest/gtest.h" #include "cluster.h" @@ -18,14 +17,6 @@ using namespace tt::umd; -inline void fill_with_random_bytes(uint8_t* data, size_t n) -{ - static std::random_device rd; - static std::mt19937 gen(rd()); - static std::uniform_int_distribution dis(0, 255); - - std::generate(data, data + n, [&]() { return dis(gen); }); -} void set_params_for_remote_txn(Cluster& device) { // Populate address map and NOC parameters that the driver needs for remote transactions @@ -122,7 +113,7 @@ TEST(SiliconDriverWH, CustomSocDesc) { // Initialize the driver with a 1x1 descriptor and explictly do not perform harvesting Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_1x1.yaml"), 
tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, false, simulated_harvesting_masks); auto sdesc_per_chip = device.get_virtual_soc_descriptors(); - + ASSERT_EQ(device.using_harvested_soc_descriptors(), false) << "SOC descriptors should not be modified when harvesting is disabled"; for(const auto& chip : sdesc_per_chip) { ASSERT_EQ(chip.second.workers.size(), 1) << "Expected 1x1 SOC descriptor to be unmodified by driver"; @@ -142,23 +133,23 @@ TEST(SiliconDriverWH, HarvestingRuntime) { std::unordered_map simulated_harvesting_masks = {{0, 30}, {1, 60}}; uint32_t num_host_mem_ch_per_mmio_device = 1; - + Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true, simulated_harvesting_masks); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); - + for(int i = 0; i < target_devices.size(); i++) { // Iterate over MMIO devices and only setup static TLBs for worker cores if(std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { auto& sdesc = device.get_virtual_soc_descriptors().at(i); for(auto& core : sdesc.workers) { - // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. + // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. 
device.configure_tlb(i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); } - } + } } device.setup_core_to_tlb_map(get_static_tlb_index_callback); - + tt_device_params default_params; device.start_device(default_params); device.deassert_risc_reset(); @@ -177,13 +168,13 @@ TEST(SiliconDriverWH, HarvestingRuntime) { device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, ""); device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), dynamic_write_address, "SMALL_READ_WRITE_TLB"); device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited - + test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, 40, ""); test_utils::read_data_from_device(device, dynamic_readback_vec, tt_cxy_pair(i, core), dynamic_write_address, 40, "SMALL_READ_WRITE_TLB"); ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; ASSERT_EQ(vector_to_write, dynamic_readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; device.wait_for_non_mmio_flush(); - + device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), dynamic_write_address, "SMALL_READ_WRITE_TLB"); // Clear any written data device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, ""); // Clear any written data device.wait_for_non_mmio_flush(); @@ -207,7 +198,6 @@ TEST(SiliconDriverWH, UnalignedStaticTLB_RW) { int num_devices = target_devices.size(); uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, 
num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); @@ -217,13 +207,13 @@ TEST(SiliconDriverWH, UnalignedStaticTLB_RW) { if(std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { auto& sdesc = device.get_virtual_soc_descriptors().at(i); for(auto& core : sdesc.workers) { - // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. + // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. device.configure_tlb(i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); } device.setup_core_to_tlb_map(i, get_static_tlb_index_callback); } } - + tt_device_params default_params; device.start_device(default_params); device.deassert_risc_reset(); @@ -266,7 +256,6 @@ TEST(SiliconDriverWH, StaticTLB_RW) { std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); @@ -276,14 +265,14 @@ TEST(SiliconDriverWH, StaticTLB_RW) { if(std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { auto& sdesc = device.get_virtual_soc_descriptors().at(i); for(auto& core : sdesc.workers) { - // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. + // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. 
device.configure_tlb(i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); } device.setup_core_to_tlb_map(i, get_static_tlb_index_callback); - } + } } - + tt_device_params default_params; device.start_device(default_params); device.deassert_risc_reset(); @@ -308,7 +297,7 @@ TEST(SiliconDriverWH, StaticTLB_RW) { address += 0x20; // Increment by uint32_t size for each write } } - device.close_device(); + device.close_device(); } TEST(SiliconDriverWH, DynamicTLB_RW) { @@ -355,7 +344,6 @@ TEST(SiliconDriverWH, MultiThreadedDevice) { uint32_t num_host_mem_ch_per_mmio_device = 1; Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); - set_params_for_remote_txn(device); tt_device_params default_params; @@ -401,7 +389,7 @@ TEST(SiliconDriverWH, MultiThreadedDevice) { TEST(SiliconDriverWH, MultiThreadedMemBar) { // Have 2 threads read and write from a single device concurrently - // All (fairly large) transactions go through a static TLB. + // All (fairly large) transactions go through a static TLB. // We want to make sure the memory barrier is thread/process safe. 
// Memory barrier flags get sent to address 0 for all channels in this test @@ -416,13 +404,13 @@ TEST(SiliconDriverWH, MultiThreadedMemBar) { Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); - + for(int i = 0; i < target_devices.size(); i++) { // Iterate over devices and only setup static TLBs for functional worker cores if(std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { auto& sdesc = device.get_virtual_soc_descriptors().at(i); for(auto& core : sdesc.workers) { - // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. + // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. device.configure_tlb(i, core, get_static_tlb_index_callback(core), base_addr); } device.setup_core_to_tlb_map(i, get_static_tlb_index_callback); @@ -432,7 +420,7 @@ TEST(SiliconDriverWH, MultiThreadedMemBar) { tt_device_params default_params; device.start_device(default_params); device.deassert_risc_reset(); - + std::vector readback_membar_vec = {}; for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), l1_mem::address_map::L1_BARRIER_BASE, 4, "SMALL_READ_WRITE_TLB"); @@ -446,7 +434,7 @@ TEST(SiliconDriverWH, MultiThreadedMemBar) { ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all DRAM readback_membar_vec = {}; } - + for(auto& core : device.get_virtual_soc_descriptors().at(0).ethernet_cores) { test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), eth_l1_mem::address_map::ERISC_BARRIER_BASE, 4, "SMALL_READ_WRITE_TLB"); 
ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all ethernet cores @@ -477,7 +465,7 @@ TEST(SiliconDriverWH, MultiThreadedMemBar) { device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); readback_vec = {}; } - + } }); @@ -519,7 +507,6 @@ TEST(SiliconDriverWH, BroadcastWrite) { std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); @@ -545,7 +532,7 @@ TEST(SiliconDriverWH, BroadcastWrite) { // Broadcast to Tensix device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude, cols_to_exclude, "LARGE_WRITE_TLB"); // Broadcast to DRAM - device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude_for_dram_broadcast, cols_to_exclude_for_dram_broadcast, "LARGE_WRITE_TLB"); + device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude_for_dram_broadcast, cols_to_exclude_for_dram_broadcast, "LARGE_WRITE_TLB"); device.wait_for_non_mmio_flush(); for(const auto i : target_devices) { @@ -567,7 +554,7 @@ TEST(SiliconDriverWH, BroadcastWrite) { // Wait for data to be cleared before writing next block device.wait_for_non_mmio_flush(); } - device.close_device(); + device.close_device(); } TEST(SiliconDriverWH, VirtualCoordinateBroadcast) { @@ -575,7 +562,6 @@ TEST(SiliconDriverWH, VirtualCoordinateBroadcast) { std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = 
Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); @@ -588,7 +574,7 @@ TEST(SiliconDriverWH, VirtualCoordinateBroadcast) { device.close_device(); GTEST_SKIP() << "SiliconDriverWH.VirtualCoordinateBroadcast skipped since ethernet version does not support Virtual Coordinate Broadcast or NOC translation is not enabled"; } - + device.deassert_risc_reset(); std::vector broadcast_sizes = {1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384}; uint32_t address = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; @@ -608,7 +594,7 @@ TEST(SiliconDriverWH, VirtualCoordinateBroadcast) { // Broadcast to Tensix device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude, cols_to_exclude, "LARGE_WRITE_TLB"); // Broadcast to DRAM - device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude_for_dram_broadcast, cols_to_exclude_for_dram_broadcast, "LARGE_WRITE_TLB"); + device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude_for_dram_broadcast, cols_to_exclude_for_dram_broadcast, "LARGE_WRITE_TLB"); device.wait_for_non_mmio_flush(); for(const auto i : target_devices) { @@ -630,7 +616,7 @@ TEST(SiliconDriverWH, VirtualCoordinateBroadcast) { // Wait for data to be cleared before writing next block device.wait_for_non_mmio_flush(); } - device.close_device(); + device.close_device(); } @@ -658,58 +644,134 @@ TEST(SiliconDriverWH, VirtualCoordinateBroadcast) { TEST(SiliconDriverWH, SysmemTestWithPcie) { auto target_devices = get_target_devices(); - Cluster device(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), - 
tt_ClusterDescriptor::get_cluster_descriptor_file_path(), - target_devices, - 1, // one "host memory channel", currently a 1G huge page - false, // skip driver allocs - no (don't skip) - true, // clean system resources - yes - true); // perform harvesting - yes + Cluster cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), + tt_ClusterDescriptor::get_cluster_descriptor_file_path(), + target_devices, + 1, // one "host memory channel", currently a 1G huge page + false, // skip driver allocs - no (don't skip) + true, // clean system resources - yes + true); // perform harvesting - yes - set_params_for_remote_txn(device); - device.start_device(tt_device_params{}); // no special parameters + set_params_for_remote_txn(cluster); + cluster.start_device(tt_device_params{}); // no special parameters - // PCIe core is at (x=0, y=3) on Wormhole NOC0. const chip_id_t mmio_chip_id = 0; - const size_t PCIE_X = 0; // NOC0 - const size_t PCIE_Y = 3; // NOC0 - const tt_cxy_pair PCIE_CORE(mmio_chip_id, PCIE_X, PCIE_Y); + const auto PCIE = cluster.get_soc_descriptor(mmio_chip_id).pcie_cores.at(0); + const tt_cxy_pair PCIE_CORE(mmio_chip_id, PCIE.x, PCIE.y); const size_t test_size_bytes = 0x4000; // Arbitrarilly chosen, but small size so the test runs quickly. + // PCIe core is at (x=0, y=3) on Wormhole NOC0. + ASSERT_EQ(PCIE.x, 0); + ASSERT_EQ(PCIE.y, 3); + // Bad API: how big is the buffer? How do we know it's big enough? // Situation today is that there's a 1G hugepage behind it, although this is // unclear from the API and may change in the future. - uint8_t *sysmem = (uint8_t*)device.host_dma_address(0, 0, 0); + uint8_t *sysmem = (uint8_t*)cluster.host_dma_address(0, 0, 0); ASSERT_NE(sysmem, nullptr); // This is the address inside the Wormhole PCIe block that is mapped to the // system bus. In Wormhole, this is a fixed address, 0x8'0000'0000. // The driver should have mapped this address to the bottom of sysmem. 
- uint64_t base_address = device.get_pcie_base_addr_from_device(mmio_chip_id); + uint64_t base_address = cluster.get_pcie_base_addr_from_device(mmio_chip_id); // Buffer that we will use to read sysmem into, then write sysmem from. std::vector buffer(test_size_bytes, 0x0); // Step 1: Fill sysmem with random bytes. - fill_with_random_bytes(sysmem, test_size_bytes); + test_utils::fill_with_random_bytes(sysmem, test_size_bytes); // Step 2: Read sysmem into buffer. - device.read_from_device(&buffer[0], PCIE_CORE, base_address, buffer.size(), "REG_TLB"); + cluster.read_from_device(&buffer[0], PCIE_CORE, base_address, buffer.size(), "REG_TLB"); // Step 3: Verify that buffer matches sysmem. ASSERT_EQ(buffer, std::vector(sysmem, sysmem + test_size_bytes)); // Step 4: Fill buffer with random bytes. - fill_with_random_bytes(&buffer[0], test_size_bytes); + test_utils::fill_with_random_bytes(&buffer[0], test_size_bytes); // Step 5: Write buffer into sysmem, overwriting what was there. - device.write_to_device(&buffer[0], buffer.size(), PCIE_CORE, base_address, "REG_TLB"); + cluster.write_to_device(&buffer[0], buffer.size(), PCIE_CORE, base_address, "REG_TLB"); // Step 5b: Read back sysmem into a throwaway buffer. The intent is to // ensure the write has completed before we check sysmem against buffer. std::vector throwaway(test_size_bytes, 0x0); - device.read_from_device(&throwaway[0], PCIE_CORE, base_address, throwaway.size(), "REG_TLB"); + cluster.read_from_device(&throwaway[0], PCIE_CORE, base_address, throwaway.size(), "REG_TLB"); // Step 6: Verify that sysmem matches buffer. ASSERT_EQ(buffer, std::vector(sysmem, sysmem + test_size_bytes)); } + +/** + * Same idea as above, but with four channels of sysmem and random addresses. + * The hardware mechanism is too slow to sweep the entire range. + */ +TEST(SiliconDriverWH, RandomSysmemTestWithPcie) { + const size_t num_channels = 2; // ideally 4, but CI seems to have 2... 
+ auto target_devices = get_target_devices(); + + Cluster cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), + tt_ClusterDescriptor::get_cluster_descriptor_file_path(), + target_devices, + num_channels, + false, // skip driver allocs - no (don't skip) + true, // clean system resources - yes + true); // perform harvesting - yes + + set_params_for_remote_txn(cluster); + cluster.start_device(tt_device_params{}); // no special parameters + + const chip_id_t mmio_chip_id = 0; + const auto PCIE = cluster.get_soc_descriptor(mmio_chip_id).pcie_cores.at(0); + const tt_cxy_pair PCIE_CORE(mmio_chip_id, PCIE.x, PCIE.y); + const size_t ONE_GIG = 1 << 30; + const size_t num_tests = 0x20000; // runs in a reasonable amount of time + + // PCIe core is at (x=0, y=3) on Wormhole NOC0. + ASSERT_EQ(PCIE.x, 0); + ASSERT_EQ(PCIE.y, 3); + + const uint64_t ALIGNMENT = sizeof(uint32_t); + auto generate_aligned_address = [&](uint64_t lo, uint64_t hi) -> uint64_t { + static std::random_device rd; + static std::mt19937_64 gen(rd()); + std::uniform_int_distribution dis(lo/ALIGNMENT, hi/ALIGNMENT); + return dis(gen) * ALIGNMENT; + }; + + uint64_t base_address = cluster.get_pcie_base_addr_from_device(mmio_chip_id); + for (size_t channel = 0; channel < num_channels; ++channel) { + uint8_t *sysmem = (uint8_t*)cluster.host_dma_address(0, 0, channel); + ASSERT_NE(sysmem, nullptr); + + test_utils::fill_with_random_bytes(sysmem, ONE_GIG); + + uint64_t lo = (ONE_GIG * channel); + uint64_t hi = (lo + ONE_GIG) - 1; + + if (channel == 3) { + // TODO: I thought everything past 0xffff'dddd was registers or + // something, but a) I don't know what's actually there, and b) + // the unusable range seems to be bigger than that... so + // restricting to 0x8'f000'0000. 
+ hi &= ~0x0fff'ffffULL; + } + + for (size_t i = 0; i < num_tests; ++i) { + uint64_t address = generate_aligned_address(lo, hi); + uint64_t noc_addr = base_address + address; + uint64_t sysmem_address = address - lo; + + ASSERT_GE(address, lo) << "Address too low"; + ASSERT_LE(address, hi) << "Address too high"; + ASSERT_EQ(address % ALIGNMENT, 0) << "Address not properly aligned"; + + uint32_t value = 0; + cluster.read_from_device(&value, PCIE_CORE, noc_addr, sizeof(uint32_t), "REG_TLB"); + + uint32_t expected = *reinterpret_cast(&sysmem[sysmem_address]); + ASSERT_EQ(value, expected) << fmt::format("Mismatch at address {:#x}", address); + } + } +} + From 427357eec740d0ef2354f71f7a3ec3b1139a6179 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bojan=20Ro=C5=A1ko?= <156314064+broskoTT@users.noreply.github.com> Date: Fri, 15 Nov 2024 16:03:11 +0100 Subject: [PATCH 2/8] Offline cluster test descriptor (#302) ### Issue Related to #226 ### Description Making this non-functionality more explicit. Also eases up testing. ### List of the changes - Minor fix of test_chip on machines with no cards. - Added offline examples of cluster descriptors. - Test all offline cluster descriptors. - Change the non-functional test to explicitly consume unsuppored configuration ### Testing This PR is about tests. ### API Changes There are no API changes in this PR. 
--- .github/workflows/build-tests.yml | 4 +- .../blackhole_P150.yaml | 23 ++ .../cluster_descriptor_examples/galaxy.yaml | 383 ++++++++++++++++++ .../grayskull_E150.yaml | 23 ++ .../grayskull_E300.yaml | 23 ++ .../wormhole_2xN300_unconnected.yaml | 41 ++ .../wormhole_N150.yaml | 24 ++ .../wormhole_N300.yaml | 30 ++ tests/api/test_chip.cpp | 46 +-- tests/api/test_cluster.cpp | 33 +- tests/api/test_cluster_descriptor.cpp | 63 +-- 11 files changed, 604 insertions(+), 89 deletions(-) create mode 100644 tests/api/cluster_descriptor_examples/blackhole_P150.yaml create mode 100644 tests/api/cluster_descriptor_examples/galaxy.yaml create mode 100644 tests/api/cluster_descriptor_examples/grayskull_E150.yaml create mode 100644 tests/api/cluster_descriptor_examples/grayskull_E300.yaml create mode 100644 tests/api/cluster_descriptor_examples/wormhole_2xN300_unconnected.yaml create mode 100644 tests/api/cluster_descriptor_examples/wormhole_N150.yaml create mode 100644 tests/api/cluster_descriptor_examples/wormhole_N300.yaml diff --git a/.github/workflows/build-tests.yml b/.github/workflows/build-tests.yml index 5edd35eb0..08dc84ee8 100644 --- a/.github/workflows/build-tests.yml +++ b/.github/workflows/build-tests.yml @@ -33,6 +33,7 @@ env: LIB_OUTPUT_DIR: ./build/lib DEPS_OUTPUT_DIR: ./build/_deps TEST_OUTPUT_DIR: ./build/test + CLUSTER_DESCRIPTORS_DIR: ./tests/api/cluster_descriptor_examples jobs: build: @@ -74,7 +75,8 @@ jobs: run: | tar cvf artifact.tar ${{ env.TEST_OUTPUT_DIR }} \ ${{ env.LIB_OUTPUT_DIR }} \ - ${{ env.DEPS_OUTPUT_DIR }} + ${{ env.DEPS_OUTPUT_DIR }} \ + ${{ env.CLUSTER_DESCRIPTORS_DIR }} - name: Upload build artifacts archive uses: actions/upload-artifact@v4 diff --git a/tests/api/cluster_descriptor_examples/blackhole_P150.yaml b/tests/api/cluster_descriptor_examples/blackhole_P150.yaml new file mode 100644 index 000000000..06232d981 --- /dev/null +++ b/tests/api/cluster_descriptor_examples/blackhole_P150.yaml @@ -0,0 +1,23 @@ +arch: { + 0: Blackhole, +} + 
+chips: { +} + +ethernet_connections: [ +] + +chips_with_mmio: [ + 0: 0, +] + +# harvest_mask is the bit indicating which tensix row is harvested. So bit 0 = first tensix row; bit 1 = second tensix row etc... +harvesting: { + 0: {noc_translation: false, harvest_mask: 0}, +} + +# This value will be null if the boardtype is unknown, should never happen in practice but to be defensive it would be useful to throw an error on this case. +boardtype: { + 0: null, +} \ No newline at end of file diff --git a/tests/api/cluster_descriptor_examples/galaxy.yaml b/tests/api/cluster_descriptor_examples/galaxy.yaml new file mode 100644 index 000000000..d2ca245c2 --- /dev/null +++ b/tests/api/cluster_descriptor_examples/galaxy.yaml @@ -0,0 +1,383 @@ +arch: { + 0: Wormhole, + 1: Wormhole, + 2: Wormhole, + 3: Wormhole, + 4: Wormhole, + 5: Wormhole, + 6: Wormhole, + 7: Wormhole, + 8: Wormhole, + 9: Wormhole, + 10: Wormhole, + 11: Wormhole, + 12: Wormhole, + 13: Wormhole, + 14: Wormhole, + 15: Wormhole, + 16: Wormhole, + 17: Wormhole, + 18: Wormhole, + 19: Wormhole, + 20: Wormhole, + 21: Wormhole, + 22: Wormhole, + 23: Wormhole, + 24: Wormhole, + 25: Wormhole, + 26: Wormhole, + 27: Wormhole, + 28: Wormhole, + 29: Wormhole, + 30: Wormhole, + 31: Wormhole, + 32: Wormhole, + 33: Wormhole, + 34: Wormhole, + 35: Wormhole, +} + +chips: { + 0: [0,3,0,0], + 1: [0,2,0,0], + 2: [0,1,0,0], + 3: [0,0,0,0], + 4: [3,6,0,1], + 5: [3,5,0,1], + 6: [2,5,0,1], + 7: [2,6,0,1], + 8: [1,6,0,1], + 9: [1,7,0,1], + 10: [2,7,0,1], + 11: [3,7,0,1], + 12: [0,7,0,1], + 13: [0,6,0,1], + 14: [0,5,0,1], + 15: [1,5,0,1], + 16: [1,4,0,1], + 17: [2,4,0,1], + 18: [3,4,0,1], + 19: [3,3,0,1], + 20: [2,3,0,1], + 21: [1,3,0,1], + 22: [1,2,0,1], + 23: [2,2,0,1], + 24: [3,2,0,1], + 25: [3,1,0,1], + 26: [2,1,0,1], + 27: [1,1,0,1], + 28: [1,0,0,1], + 29: [2,0,0,1], + 30: [3,0,0,1], + 31: [0,0,0,1], + 32: [0,1,0,1], + 33: [0,2,0,1], + 34: [0,3,0,1], + 35: [0,4,0,1], +} + +ethernet_connections: [ + [{chip: 0, chan: 6}, {chip: 11, 
chan: 12}], + [{chip: 0, chan: 7}, {chip: 4, chan: 12}], + [{chip: 1, chan: 6}, {chip: 5, chan: 12}], + [{chip: 1, chan: 7}, {chip: 18, chan: 12}], + [{chip: 2, chan: 6}, {chip: 19, chan: 12}], + [{chip: 2, chan: 7}, {chip: 24, chan: 12}], + [{chip: 3, chan: 6}, {chip: 25, chan: 12}], + [{chip: 3, chan: 7}, {chip: 30, chan: 12}], + [{chip: 4, chan: 0}, {chip: 11, chan: 0}], + [{chip: 4, chan: 1}, {chip: 11, chan: 1}], + [{chip: 4, chan: 2}, {chip: 11, chan: 2}], + [{chip: 4, chan: 3}, {chip: 11, chan: 3}], + [{chip: 4, chan: 4}, {chip: 7, chan: 12}], + [{chip: 4, chan: 5}, {chip: 7, chan: 13}], + [{chip: 4, chan: 6}, {chip: 7, chan: 14}], + [{chip: 4, chan: 7}, {chip: 7, chan: 15}], + [{chip: 4, chan: 8}, {chip: 5, chan: 8}], + [{chip: 4, chan: 9}, {chip: 5, chan: 9}], + [{chip: 4, chan: 10}, {chip: 5, chan: 10}], + [{chip: 4, chan: 11}, {chip: 5, chan: 11}], + [{chip: 5, chan: 0}, {chip: 18, chan: 0}], + [{chip: 5, chan: 1}, {chip: 18, chan: 1}], + [{chip: 5, chan: 2}, {chip: 18, chan: 2}], + [{chip: 5, chan: 3}, {chip: 18, chan: 3}], + [{chip: 5, chan: 4}, {chip: 6, chan: 12}], + [{chip: 5, chan: 5}, {chip: 6, chan: 13}], + [{chip: 5, chan: 6}, {chip: 6, chan: 14}], + [{chip: 5, chan: 7}, {chip: 6, chan: 15}], + [{chip: 6, chan: 0}, {chip: 17, chan: 0}], + [{chip: 6, chan: 1}, {chip: 17, chan: 1}], + [{chip: 6, chan: 2}, {chip: 17, chan: 2}], + [{chip: 6, chan: 3}, {chip: 17, chan: 3}], + [{chip: 6, chan: 4}, {chip: 15, chan: 12}], + [{chip: 6, chan: 5}, {chip: 15, chan: 13}], + [{chip: 6, chan: 6}, {chip: 15, chan: 14}], + [{chip: 6, chan: 7}, {chip: 15, chan: 15}], + [{chip: 6, chan: 8}, {chip: 7, chan: 8}], + [{chip: 6, chan: 9}, {chip: 7, chan: 9}], + [{chip: 6, chan: 10}, {chip: 7, chan: 10}], + [{chip: 6, chan: 11}, {chip: 7, chan: 11}], + [{chip: 7, chan: 0}, {chip: 10, chan: 0}], + [{chip: 7, chan: 1}, {chip: 10, chan: 1}], + [{chip: 7, chan: 2}, {chip: 10, chan: 2}], + [{chip: 7, chan: 3}, {chip: 10, chan: 3}], + [{chip: 7, chan: 4}, {chip: 8, chan: 
12}], + [{chip: 7, chan: 5}, {chip: 8, chan: 13}], + [{chip: 7, chan: 6}, {chip: 8, chan: 14}], + [{chip: 7, chan: 7}, {chip: 8, chan: 15}], + [{chip: 8, chan: 0}, {chip: 15, chan: 0}], + [{chip: 8, chan: 1}, {chip: 15, chan: 1}], + [{chip: 8, chan: 2}, {chip: 15, chan: 2}], + [{chip: 8, chan: 3}, {chip: 15, chan: 3}], + [{chip: 8, chan: 4}, {chip: 13, chan: 12}], + [{chip: 8, chan: 5}, {chip: 13, chan: 13}], + [{chip: 8, chan: 6}, {chip: 13, chan: 14}], + [{chip: 8, chan: 7}, {chip: 13, chan: 15}], + [{chip: 8, chan: 8}, {chip: 9, chan: 8}], + [{chip: 8, chan: 9}, {chip: 9, chan: 9}], + [{chip: 8, chan: 10}, {chip: 9, chan: 10}], + [{chip: 8, chan: 11}, {chip: 9, chan: 11}], + [{chip: 9, chan: 4}, {chip: 12, chan: 12}], + [{chip: 9, chan: 5}, {chip: 12, chan: 13}], + [{chip: 9, chan: 6}, {chip: 12, chan: 14}], + [{chip: 9, chan: 7}, {chip: 12, chan: 15}], + [{chip: 9, chan: 12}, {chip: 10, chan: 4}], + [{chip: 9, chan: 13}, {chip: 10, chan: 5}], + [{chip: 9, chan: 14}, {chip: 10, chan: 6}], + [{chip: 9, chan: 15}, {chip: 10, chan: 7}], + [{chip: 10, chan: 12}, {chip: 11, chan: 4}], + [{chip: 10, chan: 13}, {chip: 11, chan: 5}], + [{chip: 10, chan: 14}, {chip: 11, chan: 6}], + [{chip: 10, chan: 15}, {chip: 11, chan: 7}], + [{chip: 12, chan: 8}, {chip: 13, chan: 8}], + [{chip: 12, chan: 9}, {chip: 13, chan: 9}], + [{chip: 12, chan: 10}, {chip: 13, chan: 10}], + [{chip: 12, chan: 11}, {chip: 13, chan: 11}], + [{chip: 13, chan: 0}, {chip: 14, chan: 0}], + [{chip: 13, chan: 1}, {chip: 14, chan: 1}], + [{chip: 13, chan: 2}, {chip: 14, chan: 2}], + [{chip: 13, chan: 3}, {chip: 14, chan: 3}], + [{chip: 14, chan: 8}, {chip: 35, chan: 8}], + [{chip: 14, chan: 9}, {chip: 35, chan: 9}], + [{chip: 14, chan: 10}, {chip: 35, chan: 10}], + [{chip: 14, chan: 11}, {chip: 35, chan: 11}], + [{chip: 14, chan: 12}, {chip: 15, chan: 4}], + [{chip: 14, chan: 13}, {chip: 15, chan: 5}], + [{chip: 14, chan: 14}, {chip: 15, chan: 6}], + [{chip: 14, chan: 15}, {chip: 15, chan: 7}], + [{chip: 
15, chan: 8}, {chip: 16, chan: 8}], + [{chip: 15, chan: 9}, {chip: 16, chan: 9}], + [{chip: 15, chan: 10}, {chip: 16, chan: 10}], + [{chip: 15, chan: 11}, {chip: 16, chan: 11}], + [{chip: 16, chan: 0}, {chip: 21, chan: 0}], + [{chip: 16, chan: 1}, {chip: 21, chan: 1}], + [{chip: 16, chan: 2}, {chip: 21, chan: 2}], + [{chip: 16, chan: 3}, {chip: 21, chan: 3}], + [{chip: 16, chan: 4}, {chip: 35, chan: 12}], + [{chip: 16, chan: 5}, {chip: 35, chan: 13}], + [{chip: 16, chan: 6}, {chip: 35, chan: 14}], + [{chip: 16, chan: 7}, {chip: 35, chan: 15}], + [{chip: 16, chan: 12}, {chip: 17, chan: 4}], + [{chip: 16, chan: 13}, {chip: 17, chan: 5}], + [{chip: 16, chan: 14}, {chip: 17, chan: 6}], + [{chip: 16, chan: 15}, {chip: 17, chan: 7}], + [{chip: 17, chan: 8}, {chip: 20, chan: 8}], + [{chip: 17, chan: 9}, {chip: 20, chan: 9}], + [{chip: 17, chan: 10}, {chip: 20, chan: 10}], + [{chip: 17, chan: 11}, {chip: 20, chan: 11}], + [{chip: 17, chan: 12}, {chip: 18, chan: 4}], + [{chip: 17, chan: 13}, {chip: 18, chan: 5}], + [{chip: 17, chan: 14}, {chip: 18, chan: 6}], + [{chip: 17, chan: 15}, {chip: 18, chan: 7}], + [{chip: 18, chan: 8}, {chip: 19, chan: 8}], + [{chip: 18, chan: 9}, {chip: 19, chan: 9}], + [{chip: 18, chan: 10}, {chip: 19, chan: 10}], + [{chip: 18, chan: 11}, {chip: 19, chan: 11}], + [{chip: 19, chan: 0}, {chip: 24, chan: 0}], + [{chip: 19, chan: 1}, {chip: 24, chan: 1}], + [{chip: 19, chan: 2}, {chip: 24, chan: 2}], + [{chip: 19, chan: 3}, {chip: 24, chan: 3}], + [{chip: 19, chan: 4}, {chip: 20, chan: 12}], + [{chip: 19, chan: 5}, {chip: 20, chan: 13}], + [{chip: 19, chan: 6}, {chip: 20, chan: 14}], + [{chip: 19, chan: 7}, {chip: 20, chan: 15}], + [{chip: 20, chan: 0}, {chip: 23, chan: 0}], + [{chip: 20, chan: 1}, {chip: 23, chan: 1}], + [{chip: 20, chan: 2}, {chip: 23, chan: 2}], + [{chip: 20, chan: 3}, {chip: 23, chan: 3}], + [{chip: 20, chan: 4}, {chip: 21, chan: 12}], + [{chip: 20, chan: 5}, {chip: 21, chan: 13}], + [{chip: 20, chan: 6}, {chip: 21, chan: 14}], 
+ [{chip: 20, chan: 7}, {chip: 21, chan: 15}], + [{chip: 21, chan: 4}, {chip: 34, chan: 12}], + [{chip: 21, chan: 5}, {chip: 34, chan: 13}], + [{chip: 21, chan: 6}, {chip: 34, chan: 14}], + [{chip: 21, chan: 7}, {chip: 34, chan: 15}], + [{chip: 21, chan: 8}, {chip: 22, chan: 8}], + [{chip: 21, chan: 9}, {chip: 22, chan: 9}], + [{chip: 21, chan: 10}, {chip: 22, chan: 10}], + [{chip: 21, chan: 11}, {chip: 22, chan: 11}], + [{chip: 22, chan: 0}, {chip: 27, chan: 0}], + [{chip: 22, chan: 1}, {chip: 27, chan: 1}], + [{chip: 22, chan: 2}, {chip: 27, chan: 2}], + [{chip: 22, chan: 3}, {chip: 27, chan: 3}], + [{chip: 22, chan: 4}, {chip: 33, chan: 12}], + [{chip: 22, chan: 5}, {chip: 33, chan: 13}], + [{chip: 22, chan: 6}, {chip: 33, chan: 14}], + [{chip: 22, chan: 7}, {chip: 33, chan: 15}], + [{chip: 22, chan: 12}, {chip: 23, chan: 4}], + [{chip: 22, chan: 13}, {chip: 23, chan: 5}], + [{chip: 22, chan: 14}, {chip: 23, chan: 6}], + [{chip: 22, chan: 15}, {chip: 23, chan: 7}], + [{chip: 23, chan: 8}, {chip: 26, chan: 8}], + [{chip: 23, chan: 9}, {chip: 26, chan: 9}], + [{chip: 23, chan: 10}, {chip: 26, chan: 10}], + [{chip: 23, chan: 11}, {chip: 26, chan: 11}], + [{chip: 23, chan: 12}, {chip: 24, chan: 4}], + [{chip: 23, chan: 13}, {chip: 24, chan: 5}], + [{chip: 23, chan: 14}, {chip: 24, chan: 6}], + [{chip: 23, chan: 15}, {chip: 24, chan: 7}], + [{chip: 24, chan: 8}, {chip: 25, chan: 8}], + [{chip: 24, chan: 9}, {chip: 25, chan: 9}], + [{chip: 24, chan: 10}, {chip: 25, chan: 10}], + [{chip: 24, chan: 11}, {chip: 25, chan: 11}], + [{chip: 25, chan: 0}, {chip: 30, chan: 0}], + [{chip: 25, chan: 1}, {chip: 30, chan: 1}], + [{chip: 25, chan: 2}, {chip: 30, chan: 2}], + [{chip: 25, chan: 3}, {chip: 30, chan: 3}], + [{chip: 25, chan: 4}, {chip: 26, chan: 12}], + [{chip: 25, chan: 5}, {chip: 26, chan: 13}], + [{chip: 25, chan: 6}, {chip: 26, chan: 14}], + [{chip: 25, chan: 7}, {chip: 26, chan: 15}], + [{chip: 26, chan: 0}, {chip: 29, chan: 0}], + [{chip: 26, chan: 1}, {chip: 29, 
chan: 1}], + [{chip: 26, chan: 2}, {chip: 29, chan: 2}], + [{chip: 26, chan: 3}, {chip: 29, chan: 3}], + [{chip: 26, chan: 4}, {chip: 27, chan: 12}], + [{chip: 26, chan: 5}, {chip: 27, chan: 13}], + [{chip: 26, chan: 6}, {chip: 27, chan: 14}], + [{chip: 26, chan: 7}, {chip: 27, chan: 15}], + [{chip: 27, chan: 4}, {chip: 32, chan: 12}], + [{chip: 27, chan: 5}, {chip: 32, chan: 13}], + [{chip: 27, chan: 6}, {chip: 32, chan: 14}], + [{chip: 27, chan: 7}, {chip: 32, chan: 15}], + [{chip: 27, chan: 8}, {chip: 28, chan: 8}], + [{chip: 27, chan: 9}, {chip: 28, chan: 9}], + [{chip: 27, chan: 10}, {chip: 28, chan: 10}], + [{chip: 27, chan: 11}, {chip: 28, chan: 11}], + [{chip: 28, chan: 4}, {chip: 31, chan: 12}], + [{chip: 28, chan: 5}, {chip: 31, chan: 13}], + [{chip: 28, chan: 6}, {chip: 31, chan: 14}], + [{chip: 28, chan: 7}, {chip: 31, chan: 15}], + [{chip: 28, chan: 12}, {chip: 29, chan: 4}], + [{chip: 28, chan: 13}, {chip: 29, chan: 5}], + [{chip: 28, chan: 14}, {chip: 29, chan: 6}], + [{chip: 28, chan: 15}, {chip: 29, chan: 7}], + [{chip: 29, chan: 12}, {chip: 30, chan: 4}], + [{chip: 29, chan: 13}, {chip: 30, chan: 5}], + [{chip: 29, chan: 14}, {chip: 30, chan: 6}], + [{chip: 29, chan: 15}, {chip: 30, chan: 7}], + [{chip: 31, chan: 8}, {chip: 32, chan: 8}], + [{chip: 31, chan: 9}, {chip: 32, chan: 9}], + [{chip: 31, chan: 10}, {chip: 32, chan: 10}], + [{chip: 31, chan: 11}, {chip: 32, chan: 11}], + [{chip: 32, chan: 0}, {chip: 33, chan: 0}], + [{chip: 32, chan: 1}, {chip: 33, chan: 1}], + [{chip: 32, chan: 2}, {chip: 33, chan: 2}], + [{chip: 32, chan: 3}, {chip: 33, chan: 3}], + [{chip: 33, chan: 8}, {chip: 34, chan: 8}], + [{chip: 33, chan: 9}, {chip: 34, chan: 9}], + [{chip: 33, chan: 10}, {chip: 34, chan: 10}], + [{chip: 33, chan: 11}, {chip: 34, chan: 11}], + [{chip: 34, chan: 0}, {chip: 35, chan: 0}], + [{chip: 34, chan: 1}, {chip: 35, chan: 1}], + [{chip: 34, chan: 2}, {chip: 35, chan: 2}], + [{chip: 34, chan: 3}, {chip: 35, chan: 3}], +] + +chips_with_mmio: [ 
+ 0: 0, + 1: 1, + 2: 2, + 3: 3, +] + +# harvest_mask is the bit indicating which tensix row is harvested. So bit 0 = first tensix row; bit 1 = second tensix row etc... +harvesting: { + 0: {noc_translation: true, harvest_mask: 1}, + 1: {noc_translation: true, harvest_mask: 1}, + 2: {noc_translation: true, harvest_mask: 4}, + 3: {noc_translation: true, harvest_mask: 8}, + 4: {noc_translation: true, harvest_mask: 0}, + 5: {noc_translation: true, harvest_mask: 0}, + 6: {noc_translation: true, harvest_mask: 0}, + 7: {noc_translation: true, harvest_mask: 0}, + 8: {noc_translation: true, harvest_mask: 0}, + 9: {noc_translation: true, harvest_mask: 0}, + 10: {noc_translation: true, harvest_mask: 0}, + 11: {noc_translation: true, harvest_mask: 0}, + 12: {noc_translation: true, harvest_mask: 0}, + 13: {noc_translation: true, harvest_mask: 0}, + 14: {noc_translation: true, harvest_mask: 0}, + 15: {noc_translation: true, harvest_mask: 0}, + 16: {noc_translation: true, harvest_mask: 0}, + 17: {noc_translation: true, harvest_mask: 0}, + 18: {noc_translation: true, harvest_mask: 0}, + 19: {noc_translation: true, harvest_mask: 0}, + 20: {noc_translation: true, harvest_mask: 0}, + 21: {noc_translation: true, harvest_mask: 0}, + 22: {noc_translation: true, harvest_mask: 0}, + 23: {noc_translation: true, harvest_mask: 0}, + 24: {noc_translation: true, harvest_mask: 0}, + 25: {noc_translation: true, harvest_mask: 0}, + 26: {noc_translation: true, harvest_mask: 0}, + 27: {noc_translation: true, harvest_mask: 0}, + 28: {noc_translation: true, harvest_mask: 0}, + 29: {noc_translation: true, harvest_mask: 0}, + 30: {noc_translation: true, harvest_mask: 0}, + 31: {noc_translation: true, harvest_mask: 0}, + 32: {noc_translation: true, harvest_mask: 0}, + 33: {noc_translation: true, harvest_mask: 0}, + 34: {noc_translation: true, harvest_mask: 0}, + 35: {noc_translation: true, harvest_mask: 0}, +} + +# This value will be null if the boardtype is unknown, should never happen in practice but 
to be defensive it would be useful to throw an error on this case. +boardtype: { + 0: n150, + 1: n150, + 2: n150, + 3: n150, + 4: GALAXY, + 5: GALAXY, + 6: GALAXY, + 7: GALAXY, + 8: GALAXY, + 9: GALAXY, + 10: GALAXY, + 11: GALAXY, + 12: GALAXY, + 13: GALAXY, + 14: GALAXY, + 15: GALAXY, + 16: GALAXY, + 17: GALAXY, + 18: GALAXY, + 19: GALAXY, + 20: GALAXY, + 21: GALAXY, + 22: GALAXY, + 23: GALAXY, + 24: GALAXY, + 25: GALAXY, + 26: GALAXY, + 27: GALAXY, + 28: GALAXY, + 29: GALAXY, + 30: GALAXY, + 31: GALAXY, + 32: GALAXY, + 33: GALAXY, + 34: GALAXY, + 35: GALAXY, +} \ No newline at end of file diff --git a/tests/api/cluster_descriptor_examples/grayskull_E150.yaml b/tests/api/cluster_descriptor_examples/grayskull_E150.yaml new file mode 100644 index 000000000..6545cdad2 --- /dev/null +++ b/tests/api/cluster_descriptor_examples/grayskull_E150.yaml @@ -0,0 +1,23 @@ +arch: { + 0: Grayskull, +} + +chips: { +} + +ethernet_connections: [ +] + +chips_with_mmio: [ + 0: 0, +] + +# harvest_mask is the bit indicating which tensix row is harvested. So bit 0 = first tensix row; bit 1 = second tensix row etc... +harvesting: { + 0: {noc_translation: false, harvest_mask: 0}, +} + +# This value will be null if the boardtype is unknown, should never happen in practice but to be defensive it would be useful to throw an error on this case. +boardtype: { + 0: e150, +} \ No newline at end of file diff --git a/tests/api/cluster_descriptor_examples/grayskull_E300.yaml b/tests/api/cluster_descriptor_examples/grayskull_E300.yaml new file mode 100644 index 000000000..16a571687 --- /dev/null +++ b/tests/api/cluster_descriptor_examples/grayskull_E300.yaml @@ -0,0 +1,23 @@ +arch: { + 0: Grayskull, +} + +chips: { +} + +ethernet_connections: [ +] + +chips_with_mmio: [ + 0: 0, +] + +# harvest_mask is the bit indicating which tensix row is harvested. So bit 0 = first tensix row; bit 1 = second tensix row etc... 
+harvesting: { + 0: {noc_translation: false, harvest_mask: 514}, +} + +# This value will be null if the boardtype is unknown, should never happen in practice but to be defensive it would be useful to throw an error on this case. +boardtype: { + 0: e300, +} \ No newline at end of file diff --git a/tests/api/cluster_descriptor_examples/wormhole_2xN300_unconnected.yaml b/tests/api/cluster_descriptor_examples/wormhole_2xN300_unconnected.yaml new file mode 100644 index 000000000..896888d07 --- /dev/null +++ b/tests/api/cluster_descriptor_examples/wormhole_2xN300_unconnected.yaml @@ -0,0 +1,41 @@ +arch: { + 0: Wormhole, + 1: Wormhole, + 2: Wormhole, + 3: Wormhole, +} + +chips: { + 0: [0,0,0,0], + 1: [0,0,0,0], + 2: [1,0,0,0], + 3: [1,0,0,0], +} + +ethernet_connections: [ + [{chip: 0, chan: 8}, {chip: 2, chan: 0}], + [{chip: 0, chan: 9}, {chip: 2, chan: 1}], + [{chip: 1, chan: 8}, {chip: 3, chan: 0}], + [{chip: 1, chan: 9}, {chip: 3, chan: 1}], +] + +chips_with_mmio: [ + 0: 0, + 1: 1, +] + +# harvest_mask is the bit indicating which tensix row is harvested. So bit 0 = first tensix row; bit 1 = second tensix row etc... +harvesting: { + 0: {noc_translation: true, harvest_mask: 65}, + 1: {noc_translation: true, harvest_mask: 3}, + 2: {noc_translation: true, harvest_mask: 5}, + 3: {noc_translation: true, harvest_mask: 33}, +} + +# This value will be null if the boardtype is unknown, should never happen in practice but to be defensive it would be useful to throw an error on this case. 
+boardtype: { + 0: n300, + 1: n300, + 2: n300, + 3: n300, +} \ No newline at end of file diff --git a/tests/api/cluster_descriptor_examples/wormhole_N150.yaml b/tests/api/cluster_descriptor_examples/wormhole_N150.yaml new file mode 100644 index 000000000..c2dd123aa --- /dev/null +++ b/tests/api/cluster_descriptor_examples/wormhole_N150.yaml @@ -0,0 +1,24 @@ +arch: { + 0: Wormhole, +} + +chips: { + 0: [0,0,0,0], +} + +ethernet_connections: [ +] + +chips_with_mmio: [ + 0: 0, +] + +# harvest_mask is the bit indicating which tensix row is harvested. So bit 0 = first tensix row; bit 1 = second tensix row etc... +harvesting: { + 0: {noc_translation: true, harvest_mask: 32}, +} + +# This value will be null if the boardtype is unknown, should never happen in practice but to be defensive it would be useful to throw an error on this case. +boardtype: { + 0: n150, +} \ No newline at end of file diff --git a/tests/api/cluster_descriptor_examples/wormhole_N300.yaml b/tests/api/cluster_descriptor_examples/wormhole_N300.yaml new file mode 100644 index 000000000..78f7822a7 --- /dev/null +++ b/tests/api/cluster_descriptor_examples/wormhole_N300.yaml @@ -0,0 +1,30 @@ +arch: { + 0: Wormhole, + 1: Wormhole, +} + +chips: { + 0: [0,0,0,0], + 1: [1,0,0,0], +} + +ethernet_connections: [ + [{chip: 0, chan: 8}, {chip: 1, chan: 0}], + [{chip: 0, chan: 9}, {chip: 1, chan: 1}], +] + +chips_with_mmio: [ + 0: 0, +] + +# harvest_mask is the bit indicating which tensix row is harvested. So bit 0 = first tensix row; bit 1 = second tensix row etc... +harvesting: { + 0: {noc_translation: true, harvest_mask: 65}, + 1: {noc_translation: true, harvest_mask: 5}, +} + +# This value will be null if the boardtype is unknown, should never happen in practice but to be defensive it would be useful to throw an error on this case. 
+boardtype: { + 0: n300, + 1: n300, +} \ No newline at end of file diff --git a/tests/api/test_chip.cpp b/tests/api/test_chip.cpp index caea4245b..713cf464c 100644 --- a/tests/api/test_chip.cpp +++ b/tests/api/test_chip.cpp @@ -22,35 +22,10 @@ using namespace tt::umd; inline std::unique_ptr get_cluster_desc() { - // TODO: This should not be needed. And could be part of the cluster descriptor probably. - // Note that cluster descriptor holds logical ids of chips. - // Which are different than physical PCI ids, which are /dev/tenstorrent/N ones. - // You have to see if physical PCIe is GS before constructing a cluster descriptor. - std::vector pci_device_ids = PCIDevice::enumerate_devices(); - std::set pci_device_ids_set (pci_device_ids.begin(), pci_device_ids.end()); - - tt::ARCH device_arch = tt::ARCH::GRAYSKULL; - if (!pci_device_ids.empty()) { - // TODO: This should be removed from the API, the driver itself should do it. - int physical_device_id = pci_device_ids[0]; - // TODO: remove logical_device_id - PCIDevice pci_device (physical_device_id, 0); - device_arch = pci_device.get_arch(); - } - - // TODO: Make this test work on a host system without any tt devices. - if (pci_device_ids.empty()) { - std::cout << "No Tenstorrent devices found. Skipping test." << std::endl; - return nullptr; - } - - // TODO: Remove different branch for different archs - std::unique_ptr cluster_desc; // TODO: remove getting manually cluster descriptor from yaml. std::string yaml_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path(); - cluster_desc = tt_ClusterDescriptor::create_from_yaml(yaml_path); - return cluster_desc; + return tt_ClusterDescriptor::create_from_yaml(yaml_path); } inline tt_cxy_pair get_tensix_chip_core_coord(const std::unique_ptr &umd_cluster) { @@ -80,7 +55,6 @@ inline std::unique_ptr get_cluster() { // TODO: Make this test work on a host system without any tt devices. if (pci_device_ids.empty()) { - std::cout << "No Tenstorrent devices found. 
Skipping test." << std::endl; return nullptr; } @@ -123,8 +97,7 @@ TEST(ApiChipTest, ManualTLBConfiguration) { std::unique_ptr umd_cluster = get_cluster(); if (umd_cluster == nullptr || umd_cluster->get_all_chips_in_cluster().empty()) { - std::cout << "No chips found. Skipping test." << std::endl; - return; + GTEST_SKIP() << "No chips present on the system. Skipping test."; } // Expect to throw for remote chip for any worker core @@ -183,8 +156,7 @@ TEST(ApiChipTest, SimpleAPIShowcase) { std::unique_ptr umd_cluster = get_cluster(); if (umd_cluster == nullptr || umd_cluster->get_all_chips_in_cluster().empty()) { - std::cout << "No chips found. Skipping test." << std::endl; - return; + GTEST_SKIP() << "No chips present on the system. Skipping test."; } chip_id_t chip_id = umd_cluster->get_cluster_description()->get_chips_with_mmio().begin()->first; @@ -198,6 +170,10 @@ TEST(ApiChipTest, SimpleAPIShowcase) { // It reads back the risc reset reg to validate TEST(ApiChipTest, DeassertRiscResetOnCore) { std::unique_ptr umd_cluster = get_cluster(); + + if (umd_cluster == nullptr || umd_cluster->get_all_chips_in_cluster().empty()) { + GTEST_SKIP() << "No chips present on the system. Skipping test."; + } tt_cxy_pair chip_core_coord = get_tensix_chip_core_coord(umd_cluster); @@ -218,6 +194,10 @@ TEST(ApiChipTest, DeassertRiscResetOnCore) { TEST(ApiChipTest, SpecifyLegalDeassertRiscResetOnCore) { std::unique_ptr umd_cluster = get_cluster(); + if (umd_cluster == nullptr || umd_cluster->get_all_chips_in_cluster().empty()) { + GTEST_SKIP() << "No chips present on the system. 
Skipping test."; + } + tt_cxy_pair chip_core_coord = get_tensix_chip_core_coord(umd_cluster); umd_cluster->assert_risc_reset_at_core(chip_core_coord); @@ -236,6 +216,10 @@ TEST(ApiChipTest, SpecifyLegalDeassertRiscResetOnCore) { TEST(ApiChipTest, SpecifyIllegalDeassertRiscResetOnCore) { std::unique_ptr umd_cluster = get_cluster(); + if (umd_cluster == nullptr || umd_cluster->get_all_chips_in_cluster().empty()) { + GTEST_SKIP() << "No chips present on the system. Skipping test."; + } + tt_cxy_pair chip_core_coord = get_tensix_chip_core_coord(umd_cluster); umd_cluster->assert_risc_reset_at_core(chip_core_coord); diff --git a/tests/api/test_cluster.cpp b/tests/api/test_cluster.cpp index bc9c4a4cb..fc68f62a6 100644 --- a/tests/api/test_cluster.cpp +++ b/tests/api/test_cluster.cpp @@ -32,34 +32,10 @@ using namespace tt::umd; // TODO: This function should not exist, the API itself should be simple enough. inline std::unique_ptr get_cluster_desc() { - // TODO: This should not be needed. And could be part of the cluster descriptor probably. - // Note that cluster descriptor holds logical ids of chips. - // Which are different than physical PCI ids, which are /dev/tenstorrent/N ones. - // You have to see if physical PCIe is GS before constructing a cluster descriptor. - std::vector pci_device_ids = PCIDevice::enumerate_devices(); - std::set pci_device_ids_set(pci_device_ids.begin(), pci_device_ids.end()); - - tt::ARCH device_arch = tt::ARCH::GRAYSKULL; - if (!pci_device_ids.empty()) { - // TODO: This should be removed from the API, the driver itself should do it. - int physical_device_id = pci_device_ids[0]; - // TODO: remove logical_device_id - PCIDevice pci_device(physical_device_id, 0); - device_arch = pci_device.get_arch(); - } - - // TODO: Make this test work on a host system without any tt devices. - if (pci_device_ids.empty()) { - std::cout << "No Tenstorrent devices found. Skipping test." 
<< std::endl; - return nullptr; - } - - std::unique_ptr cluster_desc; // TODO: remove getting manually cluster descriptor from yaml. std::string yaml_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path(); - cluster_desc = tt_ClusterDescriptor::create_from_yaml(yaml_path); - return cluster_desc; + return tt_ClusterDescriptor::create_from_yaml(yaml_path); } // TODO: This function should not exist, the API itself should be simple enough. @@ -82,7 +58,6 @@ inline std::unique_ptr get_cluster() { // TODO: Make this test work on a host system without any tt devices. if (pci_device_ids.empty()) { - std::cout << "No Tenstorrent devices found. Skipping test." << std::endl; return nullptr; } @@ -144,8 +119,7 @@ TEST(ApiClusterTest, SimpleIOAllChips) { std::unique_ptr umd_cluster = get_cluster(); if (umd_cluster == nullptr || umd_cluster->get_all_chips_in_cluster().empty()) { - std::cout << "No chips found. Skipping test." << std::endl; - return; + GTEST_SKIP() << "No chips present on the system. Skipping test."; } // Initialize random data. @@ -202,8 +176,7 @@ TEST(ApiClusterTest, RemoteFlush) { std::unique_ptr umd_cluster = get_cluster(); if (umd_cluster == nullptr || umd_cluster->get_all_chips_in_cluster().empty()) { - std::cout << "No chips found. Skipping test." << std::endl; - return; + GTEST_SKIP() << "No chips present on the system. Skipping test."; } size_t data_size = 1024; diff --git a/tests/api/test_cluster_descriptor.cpp b/tests/api/test_cluster_descriptor.cpp index 51755c2f3..10e0bbe72 100644 --- a/tests/api/test_cluster_descriptor.cpp +++ b/tests/api/test_cluster_descriptor.cpp @@ -18,31 +18,10 @@ inline std::unique_ptr get_cluster_desc() { - - std::vector pci_device_ids = PCIDevice::enumerate_devices(); - std::set pci_device_ids_set (pci_device_ids.begin(), pci_device_ids.end()); - - // TODO: This test requires knowledge of the device architecture, which should not be true. 
- tt::ARCH device_arch = tt::ARCH::GRAYSKULL; - if (!pci_device_ids.empty()) { - int physical_device_id = pci_device_ids[0]; - PCIDevice pci_device (physical_device_id, 0); - device_arch = pci_device.get_arch(); - } - - // TODO: Make this test work on a host system without any tt devices. - if (pci_device_ids.empty()) { - std::cout << "No Tenstorrent devices found. Skipping test." << std::endl; - return nullptr; - } - - // TODO: Remove different branch for different archs - std::unique_ptr cluster_desc; // TODO: remove getting manually cluster descriptor from yaml. std::string yaml_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path(); - cluster_desc = tt_ClusterDescriptor::create_from_yaml(yaml_path); - return cluster_desc; + return tt_ClusterDescriptor::create_from_yaml(yaml_path); } TEST(ApiClusterDescriptorTest, DetectArch) { @@ -69,7 +48,7 @@ TEST(ApiClusterDescriptorTest, BasicFunctionality) { std::unique_ptr cluster_desc = get_cluster_desc(); if (cluster_desc == nullptr) { - return; + GTEST_SKIP() << "No chips present on the system. 
Skipping test."; } std::unordered_set all_chips = cluster_desc->get_all_chips(); @@ -90,6 +69,38 @@ TEST(ApiClusterDescriptorTest, BasicFunctionality) { std::unordered_map> chips_grouped_by_closest_mmio = cluster_desc->get_chips_grouped_by_closest_mmio(); } +TEST(ApiClusterDescriptorTest, TestAllOfflineClusterDescriptors) { + for (std::string cluster_desc_yaml : { + "blackhole_P150.yaml", + "galaxy.yaml", + "grayskull_E150.yaml", + "grayskull_E300.yaml", + "wormhole_2xN300_unconnected.yaml", + "wormhole_N150.yaml", + "wormhole_N300.yaml", + }) { + std::cout << "Testing " << cluster_desc_yaml << std::endl; + std::unique_ptr cluster_desc = tt_ClusterDescriptor::create_from_yaml(test_utils::GetAbsPath("tests/api/cluster_descriptor_examples/" + cluster_desc_yaml)); + + std::unordered_set all_chips = cluster_desc->get_all_chips(); + std::unordered_map harvesting_for_chips = cluster_desc->get_harvesting_info(); + std::unordered_map eth_chip_coords = cluster_desc->get_chip_locations(); + std::unordered_map local_chips_to_pci_device_id = cluster_desc->get_chips_with_mmio(); + std::unordered_set local_chips; + for (auto [chip, _]: local_chips_to_pci_device_id) { + local_chips.insert(chip); + } + std::unordered_set remote_chips; + for (auto chip : all_chips) { + if (local_chips.find(chip) == local_chips.end()) { + remote_chips.insert(chip); + } + } + + std::unordered_map> chips_grouped_by_closest_mmio = cluster_desc->get_chips_grouped_by_closest_mmio(); + } +} + // A standard disjoint set data structure to track connected components. class DisjointSet { public: @@ -130,11 +141,9 @@ class DisjointSet { // It works as long as all the devices that are discoverable are connected through ethernet. // Our ClusterDescriptor doesn't have a notion of multiple unconnected clusters of cards. 
TEST(ApiClusterDescriptorTest, SeparateClusters) { - std::unique_ptr cluster_desc = get_cluster_desc(); + GTEST_SKIP() << "Skipping test which documents non functional feature."; - if (cluster_desc == nullptr) { - return; - } + std::unique_ptr cluster_desc = tt_ClusterDescriptor::create_from_yaml(test_utils::GetAbsPath("tests/api/cluster_descriptor_examples/wormhole_2xN300_unconnected.yaml")); auto all_chips = cluster_desc->get_all_chips(); DisjointSet chip_clusters; From c49cbfbfd4c33468d2f695e4ec3465b77058afc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bojan=20Ro=C5=A1ko?= <156314064+broskoTT@users.noreply.github.com> Date: Mon, 18 Nov 2024 08:44:15 +0100 Subject: [PATCH 3/8] Support multiple unconnected clusters (#306) ### Issue Fixes tenstorrent/tt-umd#226 Unblocks systems with multiple blackhole cards requested by @abhullar-tt Makes situation in https://github.com/tenstorrent/tt-metal/issues/15101 better. ### Description I'm not fully certain if this was the only required change needed to support multiple unconnected clusters. There was a clear issue due to way eth coords were handled, which is fixed. Not sure if there is some other assumption somewhere else in the driver. ### List of the changes - Changed eth_coord_t to a struct. Made changes throughout the code accordingly. This somewhat deprecates tenstorrent/tt-umd#46 - Added cluster_id to eth_coord struct, which designates which connected graph component a chip belongs to. - Filling up cluster_id as part of construction. - Split filling ethernet coordinates and galaxy shelf related structures. - If cluster_id doesn't match, the get_eth_distance function will return limits::max(). ### Testing Uncommented previously skipped test, which was failing for one of the example cluster descriptors. ### API Changes There are no API changes in this PR. 
--- cmake/dependencies.cmake | 1 + common/disjoint_set.hpp | 42 ++++++ device/cluster.cpp | 18 +-- device/tt_cluster_descriptor.cpp | 188 ++++++++++++++------- device/tt_cluster_descriptor.h | 2 + device/tt_cluster_descriptor_types.h | 25 +++- tests/CMakeLists.txt | 1 + tests/api/test_cluster_descriptor.cpp | 44 +----- tests/galaxy/test_galaxy_common.h | 3 - 9 files changed, 180 insertions(+), 144 deletions(-) create mode 100644 common/disjoint_set.hpp diff --git a/cmake/dependencies.cmake b/cmake/dependencies.cmake index 96ebbaf53..b8f968859 100644 --- a/cmake/dependencies.cmake +++ b/cmake/dependencies.cmake @@ -46,6 +46,7 @@ function(fetch_dependencies) # boost::interprocess ############################################################################################################################ include(${PROJECT_SOURCE_DIR}/cmake/fetch_boost.cmake) + fetch_boost_library(container_hash) fetch_boost_library(interprocess) ############################################################################################################################ diff --git a/common/disjoint_set.hpp b/common/disjoint_set.hpp new file mode 100644 index 000000000..b21871731 --- /dev/null +++ b/common/disjoint_set.hpp @@ -0,0 +1,42 @@ +/* + * SPDX-FileCopyrightText: (c) 2024 Tenstorrent Inc. + * + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include <unordered_map> +#include <unordered_set> + +// A standard disjoint set data structure to track connected components. 
+template <typename T> +class DisjointSet { +public: + void add_item(T item) { parent[item] = item; } + + int get_set(T item) { + while (parent[item] != item) { + item = parent[item]; + } + return item; + } + + void merge(T item1, T item2) { + T set1 = get_set(item1); + T set2 = get_set(item2); + parent[set1] = set2; + } + + bool are_same_set(T item1, T item2) { return get_set(item1) == get_set(item2); } + + int get_num_sets() { + std::unordered_set<T> sets; + for (auto [item, _] : parent) { + sets.insert(get_set(item)); + } + return sets.size(); + } + +private: + std::unordered_map<T, T> parent; +}; diff --git a/device/cluster.cpp b/device/cluster.cpp index 041d7e7b0..e73af023f 100644 --- a/device/cluster.cpp +++ b/device/cluster.cpp @@ -1603,8 +1603,8 @@ void Cluster::write_to_non_mmio_device( new_cmd->sys_addr = address + offset; } else { - new_cmd->sys_addr = get_sys_addr(std::get<0>(target_chip), std::get<1>(target_chip), core.x, core.y, address + offset); - new_cmd->rack = get_sys_rack(std::get<2>(target_chip), std::get<3>(target_chip)); + new_cmd->sys_addr = get_sys_addr(target_chip.x, target_chip.y, core.x, core.y, address + offset); + new_cmd->rack = get_sys_rack(target_chip.rack, target_chip.shelf); } if(req_flags & eth_interface_params.cmd_data_block) { @@ -1740,8 +1740,8 @@ void Cluster::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_ // Send the read request log_assert((req_flags == eth_interface_params.cmd_rd_req) || (((address + offset) & 0x1F) == 0), "Block mode offset must be 32-byte aligned."); // Block mode offset must be 32-byte aligned. 
- new_cmd->sys_addr = get_sys_addr(std::get<0>(target_chip), std::get<1>(target_chip), core.x, core.y, address + offset); - new_cmd->rack = get_sys_rack(std::get<2>(target_chip), std::get<3>(target_chip)); + new_cmd->sys_addr = get_sys_addr(target_chip.x, target_chip.y, core.x, core.y, address + offset); + new_cmd->rack = get_sys_rack(target_chip.rack, target_chip.shelf); new_cmd->data = block_size; new_cmd->flags = req_flags; if (use_dram) { @@ -1935,15 +1935,15 @@ std::unordered_map>>& Cluster::get_ether chip_id_t physical_chip_id = ndesc -> get_shelf_local_physical_chip_coords(chip); eth_coord_t eth_coords = ndesc -> get_chip_locations().at(chip); // Rack word to be set in header - uint32_t rack_word = std::get<2>(eth_coords) >> 2; + uint32_t rack_word = eth_coords.rack >> 2; // Rack byte to be set in header - uint32_t rack_byte = std::get<2>(eth_coords) % 4; + uint32_t rack_byte = eth_coords.rack % 4; // 1st level grouping: Group broadcasts based on the MMIO chip they must go through // Nebula + Galaxy Topology assumption: Disjoint sets can only be present in the first shelf, with each set connected to host through its closest MMIO chip // For the first shelf, pass broadcasts to specific chips through their closest MMIO chip // All other shelves are fully connected galaxy grids. These are connected to all MMIO devices. Use any (or the first) MMIO device in the list. chip_id_t closest_mmio_chip = 0; - if (std::get<2>(eth_coords) == 0 && std::get<3>(eth_coords) == 0) { + if (eth_coords.rack == 0 && eth_coords.shelf == 0) { // Shelf 0 + Rack 0: Either an MMIO chip or a remote chip potentially connected to host through its own MMIO counterpart. 
closest_mmio_chip = ndesc -> get_closest_mmio_capable_chip(chip); } @@ -1958,14 +1958,14 @@ std::unordered_map>>& Cluster::get_ether if(broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).find(physical_chip_id) == broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).end()) { // Target seen for the first time. std::vector broadcast_mask(8, 0); - broadcast_mask.at(rack_word) |= (1 << std::get<3>(eth_coords)) << rack_byte; + broadcast_mask.at(rack_word) |= (1 << eth_coords.shelf) << rack_byte; broadcast_mask.at(3) |= 1 << physical_chip_id; broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).insert({physical_chip_id, broadcast_mask}); } else { // Target was seen before -> include curr rack and shelf in header - broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).at(physical_chip_id).at(rack_word) |= static_cast(1 << std::get<3>(eth_coords)) << rack_byte; + broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).at(physical_chip_id).at(rack_word) |= static_cast(1 << eth_coords.shelf) << rack_byte; } } } diff --git a/device/tt_cluster_descriptor.cpp b/device/tt_cluster_descriptor.cpp index 97c5a711e..0ed661203 100644 --- a/device/tt_cluster_descriptor.cpp +++ b/device/tt_cluster_descriptor.cpp @@ -10,6 +10,7 @@ #include #include +#include "common/disjoint_set.hpp" #include "common/logger.hpp" #include "yaml-cpp/yaml.h" @@ -72,50 +73,40 @@ bool tt_ClusterDescriptor::is_chip_remote(const chip_id_t chip_id) const { // the function returns the total distance of travelled between shelves and racks, plust the x&y dim difference int tt_ClusterDescriptor::get_ethernet_link_coord_distance(const eth_coord_t &location_a, const eth_coord_t &location_b) const { - log_trace(LogSiliconDriver, "get_ethernet_link_coord_distance from ({}, {}, {}, {}) to ({}, {}, {}, {})", - std::get<0>(location_a), std::get<1>(location_a), std::get<2>(location_a), std::get<3>(location_a), - std::get<0>(location_b), std::get<1>(location_b), 
std::get<2>(location_b), std::get<3>(location_b)); + log_trace(LogSiliconDriver, "get_ethernet_link_coord_distance from ({}, {}, {}, {}, {}) to ({}, {}, {}, {}, {})", + location_a.cluster_id, location_a.x, location_a.y, location_a.rack, location_a.shelf, + location_b.cluster_id, location_b.x, location_b.y, location_b.rack, location_b.shelf); - // eth_coord_t: x, y, rack, shelf - - int x_a = std::get<0>(location_a); - int x_b = std::get<0>(location_b); - - int y_a = std::get<1>(location_a); - int y_b = std::get<1>(location_b); - - int shelf_a = std::get<3>(location_a); - int shelf_b = std::get<3>(location_b); - - int rack_a = std::get<2>(location_a); - int rack_b = std::get<2>(location_b); + if (location_a.cluster_id != location_b.cluster_id) { + return std::numeric_limits::max(); + } - int x_distance = std::abs(x_a - x_b); - int y_distance = std::abs(y_a - y_b); + int x_distance = std::abs(location_a.x - location_b.x); + int y_distance = std::abs(location_a.y - location_b.y); // move along y-dim to exit from the shelf to go to a higher shelf - if(shelf_b > shelf_a) { + if(location_b.shelf > location_a.shelf) { // this is already verified where galaxy_shelves_exit_chip_coords_per_y_dim is populated, but just to be safe - log_assert(galaxy_shelves_exit_chip_coords_per_y_dim.find(shelf_a) != galaxy_shelves_exit_chip_coords_per_y_dim.end(), + log_assert(galaxy_shelves_exit_chip_coords_per_y_dim.find(location_a.shelf) != galaxy_shelves_exit_chip_coords_per_y_dim.end(), "Expected shelf-to-shelf connection"); // this row does not have a shelf-to-shelf connection - if(galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_a).find(y_a) == galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_a).end()) { + if(galaxy_shelves_exit_chip_coords_per_y_dim.at(location_a.shelf).find(location_a.y) == galaxy_shelves_exit_chip_coords_per_y_dim.at(location_a.shelf).end()) { return std::numeric_limits::max(); } - const Chip2ChipConnection& shelf_to_shelf_connection = 
galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_a).at(y_a); + const Chip2ChipConnection& shelf_to_shelf_connection = galaxy_shelves_exit_chip_coords_per_y_dim.at(location_a.shelf).at(location_a.y); log_assert(shelf_to_shelf_connection.destination_chip_coords.size(), "Expecting at least one shelf-to-shelf connection, possibly one-to-many"); - // for each shelf-to-shelf connection at y_a, find the distance to location_b, take min + // for each shelf-to-shelf connection at location_a.y, find the distance to location_b, take min int distance = std::numeric_limits::max(); eth_coord_t exit_shelf = shelf_to_shelf_connection.source_chip_coord; for(eth_coord_t next_shelf : shelf_to_shelf_connection.destination_chip_coords) { - log_assert(std::get<1>(exit_shelf) == y_a && std::get<3>(exit_shelf) == shelf_a && std::get<2>(exit_shelf) == rack_a, + log_assert(exit_shelf.y == location_a.y && exit_shelf.shelf == location_a.shelf && exit_shelf.rack == location_a.rack, "Invalid shelf exit coordinates"); // next shelf could be at a different y-dim in nebula->galaxy systems - log_assert(std::get<3>(next_shelf) == (shelf_a+1) && std::get<2>(next_shelf) == rack_a, + log_assert(next_shelf.shelf == (location_a.shelf+1) && next_shelf.rack == location_a.rack, "Invalid shelf entry coordinates"); // hop onto the next shelf and find distance from there @@ -128,32 +119,32 @@ int tt_ClusterDescriptor::get_ethernet_link_coord_distance(const eth_coord_t &lo distance = std::min(distance, distance_to_exit + distance_in_next_shelf + 1); } log_trace(LogSiliconDriver, "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", - std::get<0>(location_a), std::get<1>(location_a), std::get<2>(location_a), std::get<3>(location_a), - std::get<0>(location_b), std::get<1>(location_b), std::get<2>(location_b), std::get<3>(location_b), distance); + location_a.x, location_a.y, location_a.rack, location_a.shelf, + location_b.x, location_b.y, location_b.rack, location_b.shelf, distance); return distance; } - 
else if(shelf_a > shelf_b) { + else if(location_a.shelf > location_b.shelf) { // this is already verified where galaxy_shelves_exit_chip_coords_per_y_dim is populated, but just to be safe - log_assert(galaxy_shelves_exit_chip_coords_per_y_dim.find(shelf_b) != galaxy_shelves_exit_chip_coords_per_y_dim.end(), + log_assert(galaxy_shelves_exit_chip_coords_per_y_dim.find(location_b.shelf) != galaxy_shelves_exit_chip_coords_per_y_dim.end(), "Expected shelf-to-shelf connection"); // this row does not have a shelf-to-shelf connection - if(galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_b).find(y_b) == galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_b).end()) { + if(galaxy_shelves_exit_chip_coords_per_y_dim.at(location_b.shelf).find(location_b.y) == galaxy_shelves_exit_chip_coords_per_y_dim.at(location_b.shelf).end()) { return std::numeric_limits::max(); } - const Chip2ChipConnection& shelf_to_shelf_connection = galaxy_shelves_exit_chip_coords_per_y_dim.at(shelf_b).at(y_b); + const Chip2ChipConnection& shelf_to_shelf_connection = galaxy_shelves_exit_chip_coords_per_y_dim.at(location_b.shelf).at(location_b.y); log_assert(shelf_to_shelf_connection.destination_chip_coords.size(), "Expecting at least one shelf-to-shelf connection, possibly one-to-many") - // for each shelf-to-shelf connection at y_b, find the distance to location_a, take min + // for each shelf-to-shelf connection at location_b.y, find the distance to location_a, take min int distance = std::numeric_limits::max(); eth_coord_t exit_shelf = shelf_to_shelf_connection.source_chip_coord; for(eth_coord_t next_shelf : shelf_to_shelf_connection.destination_chip_coords) { - log_assert(std::get<1>(exit_shelf) == y_b && std::get<3>(exit_shelf) == shelf_b && std::get<2>(exit_shelf) == rack_b, + log_assert(exit_shelf.y == location_b.y && exit_shelf.shelf == location_b.shelf && exit_shelf.rack == location_b.rack, "Invalid shelf exit coordinates"); // next shelf could be at a different y-dim in nebula->galaxy systems - 
log_assert(std::get<3>(next_shelf) == (shelf_b+1) && std::get<2>(next_shelf) == rack_b, + log_assert(next_shelf.shelf == (location_b.shelf+1) && next_shelf.rack == location_b.rack, "Invalid shelf entry coordinates"); // hop onto the next shelf and find distance from there @@ -166,34 +157,34 @@ int tt_ClusterDescriptor::get_ethernet_link_coord_distance(const eth_coord_t &lo distance = std::min(distance, distance_to_exit + distance_in_next_shelf + 1); } log_trace(LogSiliconDriver, "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", - std::get<0>(location_a), std::get<1>(location_a), std::get<2>(location_a), std::get<3>(location_a), - std::get<0>(location_b), std::get<1>(location_b), std::get<2>(location_b), std::get<3>(location_b), distance); + location_a.x, location_a.y, location_a.rack, location_a.shelf, + location_b.x, location_b.y, location_b.rack, location_b.shelf, distance); return distance; } // move along y-dim to exit from the shelf to go to a higher shelf - if(rack_b > rack_a) { + if(location_b.rack > location_a.rack) { // this is already verified where galaxy_racks_exit_chip_coords_per_x_dim is populated, but just to be safe - log_assert(galaxy_racks_exit_chip_coords_per_x_dim.find(rack_a) != galaxy_racks_exit_chip_coords_per_x_dim.end(), + log_assert(galaxy_racks_exit_chip_coords_per_x_dim.find(location_a.rack) != galaxy_racks_exit_chip_coords_per_x_dim.end(), "Expected rack-to-rack connection"); // this row does not have a rack-to-rack connection - if(galaxy_racks_exit_chip_coords_per_x_dim.at(rack_a).find(x_a) == galaxy_racks_exit_chip_coords_per_x_dim.at(rack_a).end()) { + if(galaxy_racks_exit_chip_coords_per_x_dim.at(location_a.rack).find(location_a.x) == galaxy_racks_exit_chip_coords_per_x_dim.at(location_a.rack).end()) { return std::numeric_limits::max(); } - const Chip2ChipConnection& rack_to_rack_connection = galaxy_racks_exit_chip_coords_per_x_dim.at(rack_a).at(x_a); + const Chip2ChipConnection& rack_to_rack_connection = 
galaxy_racks_exit_chip_coords_per_x_dim.at(location_a.rack).at(location_a.x); log_assert(rack_to_rack_connection.destination_chip_coords.size(), "Expecting at least one rack-to-rack connection, possibly one-to-many"); - // for each rack-to-rack connection at x_a, find the distance to location_b, take min + // for each rack-to-rack connection at location_a.x, find the distance to location_b, take min int distance = std::numeric_limits::max(); eth_coord_t exit_rack = rack_to_rack_connection.source_chip_coord; for(eth_coord_t next_rack : rack_to_rack_connection.destination_chip_coords) { - log_assert(std::get<0>(exit_rack) == x_a && std::get<3>(exit_rack) == shelf_a && std::get<2>(exit_rack) == rack_a, + log_assert(exit_rack.x == location_a.x && exit_rack.shelf == location_a.shelf && exit_rack.rack == location_a.rack, "Invalid rack exit coordinates"); - log_assert(std::get<0>(next_rack) == x_a && std::get<3>(next_rack) == shelf_a && std::get<2>(next_rack) == (rack_a+1), + log_assert(next_rack.x == location_a.x && next_rack.shelf == location_a.shelf && next_rack.rack == (location_a.rack+1), "Invalid rack entry coordinates"); // hop onto the next rack and find distance from there @@ -206,33 +197,33 @@ int tt_ClusterDescriptor::get_ethernet_link_coord_distance(const eth_coord_t &lo distance = std::min(distance, distance_to_exit + distance_in_next_rack + 1); } log_trace(LogSiliconDriver, "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", - std::get<0>(location_a), std::get<1>(location_a), std::get<2>(location_a), std::get<3>(location_a), - std::get<0>(location_b), std::get<1>(location_b), std::get<2>(location_b), std::get<3>(location_b), distance); + location_a.x, location_a.y, location_a.rack, location_a.shelf, + location_b.x, location_b.y, location_b.rack, location_b.shelf, distance); return distance; } - else if(rack_a > rack_b) { + else if(location_a.rack > location_b.rack) { // this is already verified where galaxy_racks_exit_chip_coords_per_x_dim is 
populated, but just to be safe - log_assert(galaxy_racks_exit_chip_coords_per_x_dim.find(rack_b) != galaxy_racks_exit_chip_coords_per_x_dim.end(), + log_assert(galaxy_racks_exit_chip_coords_per_x_dim.find(location_b.rack) != galaxy_racks_exit_chip_coords_per_x_dim.end(), "Expected rack-to-rack connection"); // this row does not have a rack-to-rack connection - if(galaxy_racks_exit_chip_coords_per_x_dim.at(rack_b).find(x_b) == galaxy_racks_exit_chip_coords_per_x_dim.at(rack_b).end()) { + if(galaxy_racks_exit_chip_coords_per_x_dim.at(location_b.rack).find(location_b.x) == galaxy_racks_exit_chip_coords_per_x_dim.at(location_b.rack).end()) { return std::numeric_limits::max(); } - const Chip2ChipConnection& rack_to_rack_connection = galaxy_racks_exit_chip_coords_per_x_dim.at(rack_b).at(x_b); + const Chip2ChipConnection& rack_to_rack_connection = galaxy_racks_exit_chip_coords_per_x_dim.at(location_b.rack).at(location_b.x); log_assert(rack_to_rack_connection.destination_chip_coords.size(), "Expecting at least one rack-to-rack connection, possibly one-to-many"); - // for each rack-to-rack connection at x_a, find the distance to location_b, take min + // for each rack-to-rack connection at location_a.x, find the distance to location_b, take min int distance = std::numeric_limits::max(); eth_coord_t exit_rack = rack_to_rack_connection.source_chip_coord; for(eth_coord_t next_rack : rack_to_rack_connection.destination_chip_coords) { - log_assert(std::get<0>(exit_rack) == x_b && std::get<3>(exit_rack) == shelf_b && std::get<2>(exit_rack) == rack_b, + log_assert(exit_rack.x == location_b.x && exit_rack.shelf == location_b.shelf && exit_rack.rack == location_b.rack, "Invalid rack exit coordinates"); - log_assert(std::get<0>(next_rack) == x_b && std::get<3>(next_rack) == shelf_b && std::get<2>(next_rack) == (rack_b+1), + log_assert(next_rack.x == location_b.x && next_rack.shelf == location_b.shelf && next_rack.rack == (location_b.rack+1), "Invalid rack entry coordinates"); // hop 
onto the next rack and find distance from there @@ -245,15 +236,15 @@ int tt_ClusterDescriptor::get_ethernet_link_coord_distance(const eth_coord_t &lo distance = std::min(distance, distance_to_exit + distance_in_next_rack + 1); } log_trace(LogSiliconDriver, "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", - std::get<0>(location_a), std::get<1>(location_a), std::get<2>(location_a), std::get<3>(location_a), - std::get<0>(location_b), std::get<1>(location_b), std::get<2>(location_b), std::get<3>(location_b), distance); + location_a.x, location_a.y, location_a.rack, location_a.shelf, + location_b.x, location_b.y, location_b.rack, location_b.shelf, distance); return distance; } log_trace(LogSiliconDriver, "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", - std::get<0>(location_a), std::get<1>(location_a), std::get<2>(location_a), std::get<3>(location_a), - std::get<0>(location_b), std::get<1>(location_b), std::get<2>(location_b), std::get<3>(location_b), x_distance + y_distance); + location_a.x, location_a.y, location_a.rack, location_a.shelf, + location_b.x, location_b.y, location_b.rack, location_b.shelf, x_distance + y_distance); // on same shelf/rack, the distance is just x+y difference return x_distance + y_distance; @@ -280,9 +271,10 @@ chip_id_t tt_ClusterDescriptor::get_closest_mmio_capable_chip(const chip_id_t ch const chip_id_t &mmio_chip = pair.first; eth_coord_t mmio_eth_coord = this->chip_locations.at(mmio_chip); - log_debug(LogSiliconDriver, "Checking chip{} at ({}, {}, {}, {})", mmio_chip, std::get<0>(mmio_eth_coord), std::get<1>(mmio_eth_coord), std::get<2>(mmio_eth_coord), std::get<3>(mmio_eth_coord)); + log_debug(LogSiliconDriver, "Checking chip{} at ({}, {}, {}, {})", mmio_chip, mmio_eth_coord.x, mmio_eth_coord.y, mmio_eth_coord.rack, mmio_eth_coord.shelf); int distance = get_ethernet_link_coord_distance(mmio_eth_coord, chip_eth_coord); + log_debug(LogSiliconDriver, "Distance from chip{} to chip{} is {}", chip, mmio_chip, 
distance); if (distance < min_distance) { min_distance = distance; closest_chip = mmio_chip; @@ -334,6 +326,8 @@ std::unique_ptr tt_ClusterDescriptor::create_from_yaml(con YAML::Node yaml = YAML::LoadFile(cluster_descriptor_file_path); tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(yaml, *desc); tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descriptor(yaml, *desc); + tt_ClusterDescriptor::merge_cluster_ids(*desc); + tt_ClusterDescriptor::fill_galaxy_connections(*desc); tt_ClusterDescriptor::load_harvesting_information(yaml, *desc); desc->enable_all_devices(); @@ -367,7 +361,7 @@ std::unique_ptr tt_ClusterDescriptor::create_for_grayskull desc->all_chips.insert(logical_id); eth_coord_t chip_location{logical_id, 0, 0, 0}; desc->chip_locations.insert({logical_id, chip_location}); - desc->coords_to_chip_ids[std::get<2>(chip_location)][std::get<3>(chip_location)][std::get<1>(chip_location)][std::get<0>(chip_location)] = logical_id; + desc->coords_to_chip_ids[chip_location.rack][chip_location.shelf][chip_location.y][chip_location.x] = logical_id; log_debug(tt::LogSiliconDriver, "{} - adding logical: {} => physical: {}", __FUNCTION__, logical_id, physical_id); } @@ -409,7 +403,7 @@ void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descripto log_debug(LogSiliconDriver, "Ethernet Connectivity Descriptor:"); for (const auto &[chip, chan_to_chip_chan_map] : desc.ethernet_connections) { for (const auto &[chan, chip_and_chan] : chan_to_chip_chan_map) { - log_debug(LogSiliconDriver, "\tchip: {}, chan: {} <--> chip: {}, chan: {}", chip, chan, std::get<0>(chip_and_chan), std::get<1>(chip_and_chan)); + log_debug(LogSiliconDriver, "\tchip: {}, chan: {} <--> chip: {}, chan: {}", chip, chan, chip_and_chan.x, chip_and_chan.y); } } @@ -426,6 +420,9 @@ void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descripto } } } +} + +void tt_ClusterDescriptor::fill_galaxy_connections(tt_ClusterDescriptor &desc) { int 
highest_shelf_id = 0; int highest_rack_id = 0; @@ -434,8 +431,8 @@ void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descripto // determine which chips are connected to the next (i.e. higher id) shelf/rack and what the coordinate of the chip on the other shelf/rack is // this is used in get_ethernet_link_coord_distance to find the distance between two chips for (const auto &[chip_id, chip_eth_coord] : desc.chip_locations) { - highest_shelf_id = std::max(highest_shelf_id, std::get<3>(chip_eth_coord)); - highest_rack_id = std::max(highest_rack_id, std::get<2>(chip_eth_coord)); + highest_shelf_id = std::max(highest_shelf_id, chip_eth_coord.shelf); + highest_rack_id = std::max(highest_rack_id, chip_eth_coord.rack); // iterate over all neighbors if(desc.ethernet_connections.find(chip_id) == desc.ethernet_connections.end()) { continue; // chip has no eth connections @@ -444,11 +441,11 @@ void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descripto const chip_id_t &neighbor_chip = std::get<0>(chip_and_chan); eth_coord_t neighbor_eth_coord = desc.chip_locations.at(neighbor_chip); // shelves are connected in x-dim - if(std::get<3>(neighbor_eth_coord) != std::get<3>(chip_eth_coord)) { - eth_coord_t higher_shelf_coord = std::get<3>(neighbor_eth_coord) > std::get<3>(chip_eth_coord) ? neighbor_eth_coord : chip_eth_coord; - eth_coord_t lower_shelf_coord = std::get<3>(neighbor_eth_coord) < std::get<3>(chip_eth_coord) ? neighbor_eth_coord : chip_eth_coord; - int lower_shelf_id = std::get<3>(lower_shelf_coord); - int lower_shelf_y = std::get<1>(lower_shelf_coord); + if(neighbor_eth_coord.shelf != chip_eth_coord.shelf) { + eth_coord_t higher_shelf_coord = neighbor_eth_coord.shelf > chip_eth_coord.shelf ? neighbor_eth_coord : chip_eth_coord; + eth_coord_t lower_shelf_coord = neighbor_eth_coord.shelf < chip_eth_coord.shelf ? 
neighbor_eth_coord : chip_eth_coord; + int lower_shelf_id = lower_shelf_coord.shelf; + int lower_shelf_y = lower_shelf_coord.y; auto& galaxy_shelf_exit_chip_coords_per_y_dim = desc.galaxy_shelves_exit_chip_coords_per_y_dim[lower_shelf_id]; @@ -461,11 +458,11 @@ void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descripto } // racks are connected in y-dim - if(std::get<2>(neighbor_eth_coord) != std::get<2>(chip_eth_coord)) { - eth_coord_t higher_rack_coord = std::get<2>(neighbor_eth_coord) > std::get<2>(chip_eth_coord) ? neighbor_eth_coord : chip_eth_coord; - eth_coord_t lower_rack_coord = std::get<2>(neighbor_eth_coord) < std::get<2>(chip_eth_coord) ? neighbor_eth_coord : chip_eth_coord; - int lower_rack_id = std::get<2>(lower_rack_coord); - int lower_rack_x = std::get<0>(lower_rack_coord); + if(neighbor_eth_coord.rack != chip_eth_coord.rack) { + eth_coord_t higher_rack_coord = neighbor_eth_coord.rack > chip_eth_coord.rack ? neighbor_eth_coord : chip_eth_coord; + eth_coord_t lower_rack_coord = neighbor_eth_coord.rack < chip_eth_coord.rack ? 
neighbor_eth_coord : chip_eth_coord; + int lower_rack_id = lower_rack_coord.rack; + int lower_rack_x = lower_rack_coord.x; auto& galaxy_rack_exit_chip_coords_per_x_dim = desc.galaxy_racks_exit_chip_coords_per_x_dim[lower_rack_id]; @@ -493,12 +490,12 @@ void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descripto for (const auto &[y_dim, shelf_exit_chip_coords] : shelf_exit_chip_coords_per_y_dim) { log_debug(LogSiliconDriver, "shelf: {} y_dim: {} exit_coord:({}, {}, {}, {})", shelf, y_dim, - std::get<0>(shelf_exit_chip_coords.source_chip_coord), std::get<1>(shelf_exit_chip_coords.source_chip_coord), - std::get<2>(shelf_exit_chip_coords.source_chip_coord), std::get<3>(shelf_exit_chip_coords.source_chip_coord)); + shelf_exit_chip_coords.source_chip_coord.x, shelf_exit_chip_coords.source_chip_coord.y, + shelf_exit_chip_coords.source_chip_coord.rack, shelf_exit_chip_coords.source_chip_coord.shelf); for (const auto &destination_chip_coord : shelf_exit_chip_coords.destination_chip_coords) { // print shelf_exit_chip_coord in the format: (x, y, rack, shelf) log_debug(LogSiliconDriver, "\tdestination_chip_coord: ({}, {}, {}, {})", - std::get<0>(destination_chip_coord), std::get<1>(destination_chip_coord), std::get<2>(destination_chip_coord), std::get<3>(destination_chip_coord)); + destination_chip_coord.x, destination_chip_coord.y, destination_chip_coord.rack, destination_chip_coord.shelf); } } } @@ -516,16 +513,37 @@ void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descripto for (const auto &[rack, rack_exit_chip_coords_per_x_dim] : desc.galaxy_racks_exit_chip_coords_per_x_dim) { for (const auto &[x_dim, rack_exit_chip_coords] : rack_exit_chip_coords_per_x_dim) { log_debug(LogSiliconDriver, "rack: {} x_dim: {} exit_coord:({}, {}, {}, {})", rack, x_dim, - std::get<0>(rack_exit_chip_coords.source_chip_coord), std::get<1>(rack_exit_chip_coords.source_chip_coord), - std::get<2>(rack_exit_chip_coords.source_chip_coord), 
std::get<3>(rack_exit_chip_coords.source_chip_coord)); + rack_exit_chip_coords.source_chip_coord.x, rack_exit_chip_coords.source_chip_coord.y, + rack_exit_chip_coords.source_chip_coord.rack, rack_exit_chip_coords.source_chip_coord.shelf); for (const auto &destination_chip_coord : rack_exit_chip_coords.destination_chip_coords) { log_debug(LogSiliconDriver, "\tdestination_chip_coord: ({}, {}, {}, {})", - std::get<0>(destination_chip_coord), std::get<1>(destination_chip_coord), std::get<2>(destination_chip_coord), std::get<3>(destination_chip_coord)); + destination_chip_coord.x, destination_chip_coord.y, destination_chip_coord.rack, destination_chip_coord.shelf); } } } } +void tt_ClusterDescriptor::merge_cluster_ids(tt_ClusterDescriptor &desc) { + + DisjointSet chip_sets; + for (const auto &[chip, _] : desc.chip_locations) { + chip_sets.add_item(chip); + log_debug(LogSiliconDriver, "Adding chip {} to disjoint set", chip); + } + + for (const auto &[chip, chan_to_chip_chan_map] : desc.ethernet_connections) { + for (const auto &[chan, dest_chip_chan_tuple] : chan_to_chip_chan_map) { + chip_sets.merge(chip, std::get<0>(dest_chip_chan_tuple)); + log_debug(LogSiliconDriver, "Merging chip {} and chip {}", chip, std::get<0>(dest_chip_chan_tuple)); + } + } + + for (const auto &[chip, chip_eth_coords] : desc.chip_locations) { + desc.chip_locations[chip].cluster_id = chip_sets.get_set(chip); + log_debug(LogSiliconDriver, "Chip {} belongs to cluster {}", chip, chip_sets.get_set(chip)); + } +} + void tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc) { for (YAML::const_iterator node = yaml["arch"].begin(); node != yaml["arch"].end(); ++node) { @@ -538,10 +556,10 @@ void tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(YAML::Node &y std::vector chip_rack_coords = node->second.as>(); log_assert(chip_rack_coords.size() == 4, "Galaxy (x, y, rack, shelf) coords must be size 4"); eth_coord_t chip_location{ - 
chip_rack_coords.at(0), chip_rack_coords.at(1), chip_rack_coords.at(2), chip_rack_coords.at(3)}; + chip_id, chip_rack_coords.at(0), chip_rack_coords.at(1), chip_rack_coords.at(2), chip_rack_coords.at(3)}; desc.chip_locations.insert({chip_id, chip_location}); - desc.coords_to_chip_ids[std::get<2>(chip_location)][std::get<3>(chip_location)][std::get<1>(chip_location)][std::get<0>(chip_location)] = chip_id; + desc.coords_to_chip_ids[chip_location.rack][chip_location.shelf][chip_location.y][chip_location.x] = chip_id; } for(const auto& chip : yaml["chips_with_mmio"]) { @@ -561,10 +579,10 @@ void tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(YAML::Node &y LogSiliconDriver, "\tchip: {}, EthCoord(x={}, y={}, rack={}, shelf={})", chip_id, - std::get<0>(chip_location), - std::get<1>(chip_location), - std::get<2>(chip_location), - std::get<3>(chip_location)); + chip_location.x, + chip_location.y, + chip_location.rack, + chip_location.shelf); } if (yaml["boardtype"]) { @@ -650,8 +668,8 @@ chip_id_t tt_ClusterDescriptor::get_shelf_local_physical_chip_coords(chip_id_t v log_assert(!this->chip_locations.empty(), "Getting physical chip coordinates is only valid for systems where chips have coordinates"); // Physical cooridnates of chip inside a single rack. Calculated based on Galaxy topology. 
// See: https://yyz-gitlab.local.tenstorrent.com/tenstorrent/budabackend/-/wikis/uploads/23e7a5168f38dfb706f9887fde78cb03/image.png - int x = std::get<0>(get_chip_locations().at(virtual_coord)); - int y = std::get<1>(get_chip_locations().at(virtual_coord)); + int x = get_chip_locations().at(virtual_coord).x; + int y = get_chip_locations().at(virtual_coord).y; return 8 * x + y; } diff --git a/device/tt_cluster_descriptor.h b/device/tt_cluster_descriptor.h index 746a99dc4..a51e6ac4d 100644 --- a/device/tt_cluster_descriptor.h +++ b/device/tt_cluster_descriptor.h @@ -66,7 +66,9 @@ class tt_ClusterDescriptor { std::unordered_map > galaxy_racks_exit_chip_coords_per_x_dim = {}; static void load_ethernet_connections_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc); + static void fill_galaxy_connections(tt_ClusterDescriptor &desc); static void load_chips_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc); + static void merge_cluster_ids(tt_ClusterDescriptor &desc); static void load_harvesting_information(YAML::Node &yaml, tt_ClusterDescriptor &desc); void fill_chips_grouped_by_closest_mmio(); diff --git a/device/tt_cluster_descriptor_types.h b/device/tt_cluster_descriptor_types.h index 142c9fef6..e120ffd9b 100644 --- a/device/tt_cluster_descriptor_types.h +++ b/device/tt_cluster_descriptor_types.h @@ -6,22 +6,37 @@ #pragma once +#include + #include #include using chip_id_t = int; using ethernet_channel_t = int; -using eth_coord_t = std::tuple; // x, y, rack, shelf +struct eth_coord_t { + int cluster_id; // This is the same for connected chips. 
+ int x; + int y; + int rack; + int shelf; + + // in C++20 this should be defined as: + // constexpr bool operator==(const eth_coord_t &other) const noexcept = default; + constexpr bool operator==(const eth_coord_t &other) const noexcept { + return (cluster_id == other.cluster_id and x == other.x and y == other.y and rack == other.rack and shelf == other.shelf); + } +}; namespace std { template <> struct hash { std::size_t operator()(eth_coord_t const &c) const { std::size_t seed = 0; - seed = std::hash()(std::get<0>(c)) << 48 | - std::hash()(std::get<1>(c)) << 32 | - std::hash()(std::get<2>(c)) << 16 | - std::hash()(std::get<3>(c)); + boost::hash_combine(seed, c.cluster_id); + boost::hash_combine(seed, c.x); + boost::hash_combine(seed, c.y); + boost::hash_combine(seed, c.rack); + boost::hash_combine(seed, c.shelf); return seed; } }; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 6829c91b0..9afafb9d5 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -13,6 +13,7 @@ target_link_libraries( gtest pthread fmt::fmt-header-only + Boost::container_hash ) target_include_directories( test_common diff --git a/tests/api/test_cluster_descriptor.cpp b/tests/api/test_cluster_descriptor.cpp index 10e0bbe72..ff0f4c7ea 100644 --- a/tests/api/test_cluster_descriptor.cpp +++ b/tests/api/test_cluster_descriptor.cpp @@ -9,6 +9,7 @@ #include #include "tests/test_utils/generate_cluster_desc.hpp" +#include "common/disjoint_set.hpp" #include "device/pcie/pci_device.hpp" #include "device/tt_cluster_descriptor.h" @@ -101,52 +102,11 @@ TEST(ApiClusterDescriptorTest, TestAllOfflineClusterDescriptors) { } } -// A standard disjoint set data structure to track connected components. 
-class DisjointSet { - public: - void add_item(int item) { - parent[item] = item; - } - - int get_parent(int item) { - while (parent[item] != item) { - item = parent[item]; - } - return item; - } - - void merge(int item1, int item2) { - int parent1 = get_parent(item1); - int parent2 = get_parent(item2); - parent[parent1] = parent2; - } - - bool are_same_set(int item1, int item2) { - return get_parent(item1) == get_parent(item2); - } - - int get_num_sets() { - std::unordered_set sets; - for (auto [item, _]: parent) { - sets.insert(get_parent(item)); - } - return sets.size(); - } - - private: - std::unordered_map parent; -}; - -// This tests fails on a machine with multiple cards. -// It works as long as all the devices that are discoverable are connected through ethernet. -// Our ClusterDescriptor doesn't have a notion of multiple unconnected clusters of cards. TEST(ApiClusterDescriptorTest, SeparateClusters) { - GTEST_SKIP() << "Skipping test which documents non functional feature."; - std::unique_ptr cluster_desc = tt_ClusterDescriptor::create_from_yaml(test_utils::GetAbsPath("tests/api/cluster_descriptor_examples/wormhole_2xN300_unconnected.yaml")); auto all_chips = cluster_desc->get_all_chips(); - DisjointSet chip_clusters; + DisjointSet chip_clusters; for (auto chip : all_chips) { chip_clusters.add_item(chip); } diff --git a/tests/galaxy/test_galaxy_common.h b/tests/galaxy/test_galaxy_common.h index 321b33fd0..057719014 100644 --- a/tests/galaxy/test_galaxy_common.h +++ b/tests/galaxy/test_galaxy_common.h @@ -22,9 +22,6 @@ using namespace tt::umd; -using chip_id_t = int; -using ethernet_channel_t = int; -using eth_coord_t = std::tuple; // x, y, rack, shelf struct tt_multichip_core_addr { tt_multichip_core_addr() : core{}, chip{}, addr{} {} tt_multichip_core_addr(chip_id_t chip, tt_xy_pair core, std::uint64_t addr) : core(core), chip(chip), addr(addr) {} From 852999cfed8a684af29559e2b189b24043f1c3eb Mon Sep 17 00:00:00 2001 From: Pavle Janevski 
<165378935+pjanevskiTT@users.noreply.github.com> Date: Mon, 18 Nov 2024 12:23:15 +0100 Subject: [PATCH 4/8] Simplify Cluster constructors (#277) Work towards removing parameters from default Cluster constructor - Remove target devices - Remove cluster descriptor - Remove soc descriptor - Add constructor with only target devices --- common/utils.hpp | 31 +++++ device/cluster.cpp | 127 ++++++++++++++++----- device/cluster.h | 37 +++++- device/mockup/tt_mockup_device.hpp | 2 +- device/simulation/tt_simulation_device.cpp | 2 +- device/tt_cluster_descriptor.h | 6 +- device/tt_device.cpp | 32 ++++++ device/tt_soc_descriptor.cpp | 18 +++ device/tt_soc_descriptor.h | 5 +- tests/api/test_chip.cpp | 57 +-------- tests/api/test_cluster.cpp | 120 ++++++++++--------- tests/blackhole/test_bh_common.h | 2 +- tests/blackhole/test_silicon_driver_bh.cpp | 14 +-- tests/grayskull/test_silicon_driver.cpp | 19 +-- tests/wormhole/test_silicon_driver_wh.cpp | 34 +++--- tests/wormhole/test_wh_common.h | 2 +- 16 files changed, 322 insertions(+), 186 deletions(-) create mode 100644 common/utils.hpp create mode 100644 device/tt_device.cpp diff --git a/common/utils.hpp b/common/utils.hpp new file mode 100644 index 000000000..b8cba9f53 --- /dev/null +++ b/common/utils.hpp @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: (c) 2024 Tenstorrent Inc. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include +#include + +namespace tt::umd::utils { + +std::string get_abs_path(std::string path) { + // Note that __FILE__ might be resolved at compile time to an absolute or relative address, depending on the + // compiler. 
+ std::filesystem::path current_file_path = std::filesystem::path(__FILE__); + std::filesystem::path umd_root; + if (current_file_path.is_absolute()) { + umd_root = current_file_path.parent_path().parent_path(); + } else { + std::filesystem::path umd_root_relative = + std::filesystem::relative(std::filesystem::path(__FILE__).parent_path().parent_path(), "../"); + umd_root = std::filesystem::canonical(umd_root_relative); + } + std::filesystem::path abs_path = umd_root / path; + return abs_path.string(); +} + +} // namespace tt::umd::utils diff --git a/device/cluster.cpp b/device/cluster.cpp index e73af023f..72378b131 100644 --- a/device/cluster.cpp +++ b/device/cluster.cpp @@ -37,6 +37,8 @@ #include #include +#include "tt_arch_types.h" +#include "tt_cluster_descriptor.h" #include "yaml-cpp/yaml.h" #include "common/logger.hpp" @@ -103,7 +105,7 @@ void size_buffer_to_capacity(std::vector &data_buf, std::size_t size_in_bytes // TODO: To be removed when tt_device is removed -tt_device::tt_device(const std::string& sdesc_path) : soc_descriptor_per_chip({}) { +tt_device::tt_device() : soc_descriptor_per_chip({}) { } tt_device::~tt_device() { @@ -300,30 +302,12 @@ std::unordered_map Cluster::get_harvesting_masks_for_soc_de return default_harvesting_masks; } -Cluster::Cluster(const std::string &sdesc_path, const std::string &ndesc_path, const std::set &target_devices, - const uint32_t &num_host_mem_ch_per_mmio_device, const bool skip_driver_allocs, - const bool clean_system_resources, bool perform_harvesting, std::unordered_map simulated_harvesting_masks) : tt_device(sdesc_path) { - std::unordered_set target_mmio_device_ids; - target_devices_in_cluster = target_devices; - arch_name = tt_SocDescriptor(sdesc_path).arch; - perform_harvesting_on_sdesc = perform_harvesting; - - auto available_device_ids = detect_available_device_ids(); - m_num_pci_devices = available_device_ids.size(); - - if (!skip_driver_allocs) { - log_info(LogSiliconDriver, "Detected {} PCI device{} : {}", 
m_num_pci_devices, (m_num_pci_devices > 1) ? "s":"", available_device_ids); - log_debug(LogSiliconDriver, "Passed target devices: {}", target_devices); - } +void Cluster::construct_cluster(const std::string& sdesc_path, const uint32_t &num_host_mem_ch_per_mmio_device, const bool skip_driver_allocs, + const bool clean_system_resources, bool perform_harvesting, std::unordered_map simulated_harvesting_masks) { - std::string cluster_descriptor_path = ndesc_path; - if (cluster_descriptor_path == "") { - cluster_descriptor_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path(); - } - - ndesc = tt_ClusterDescriptor::create_from_yaml(cluster_descriptor_path); - - for (auto &d: target_devices){ + std::unordered_set target_mmio_device_ids; + for (auto &d: target_devices_in_cluster){ + log_assert(ndesc->get_all_chips().find(d) != ndesc->get_all_chips().end(), "Target device {} not present in current cluster!", d); if (ndesc->is_chip_mmio_capable(d)){ target_mmio_device_ids.insert(d); } @@ -357,7 +341,7 @@ Cluster::Cluster(const std::string &sdesc_path, const std::string &ndesc_path, c translation_tables_en = false; for(auto& masks : harvesting_masks) { - if(target_devices.find(masks.first) != target_devices.end()) { + if(target_devices_in_cluster.find(masks.first) != target_devices_in_cluster.end()) { harvested_rows_per_target[masks.first] = get_harvested_noc_rows(masks.second); noc_translation_enabled_for_chip[masks.first] = noc_translation_enabled.at(masks.first); num_rows_harvested.insert({masks.first, std::bitset<32>(masks.second).count()}); @@ -386,7 +370,7 @@ Cluster::Cluster(const std::string &sdesc_path, const std::string &ndesc_path, c } else if(arch_name == tt::ARCH::BLACKHOLE) { // Default harvesting info for Blackhole, describing no harvesting - for(auto chip_id = target_devices.begin(); chip_id != target_devices.end(); chip_id++){ + for(auto chip_id = target_devices_in_cluster.begin(); chip_id != target_devices_in_cluster.end(); chip_id++){ 
harvested_rows_per_target[*chip_id] = 0; //get_harvested_noc_rows_for_chip(*chip_id); num_rows_harvested.insert({*chip_id, 0}); // Only set for broadcast TLB to get RISCS out of reset. We want all rows to have a reset signal sent. if(harvested_rows_per_target[*chip_id]) { @@ -396,7 +380,7 @@ Cluster::Cluster(const std::string &sdesc_path, const std::string &ndesc_path, c } else if(arch_name == tt::ARCH::GRAYSKULL) { // Multichip harvesting is supported for GS. - for(auto chip_id = target_devices.begin(); chip_id != target_devices.end(); chip_id++){ + for(auto chip_id = target_devices_in_cluster.begin(); chip_id != target_devices_in_cluster.end(); chip_id++){ harvested_rows_per_target[*chip_id] = get_harvested_noc_rows_for_chip(*chip_id); num_rows_harvested.insert({*chip_id, 0}); // Only set for broadcast TLB to get RISCS out of reset. We want all rows to have a reset signal sent. if(harvested_rows_per_target[*chip_id]) { @@ -407,7 +391,7 @@ Cluster::Cluster(const std::string &sdesc_path, const std::string &ndesc_path, c if(simulated_harvesting_masks.size()) { performed_harvesting = true; - for (auto device_id = target_devices.begin(); device_id != target_devices.end(); device_id++) { + for (auto device_id = target_devices_in_cluster.begin(); device_id != target_devices_in_cluster.end(); device_id++) { log_assert(simulated_harvesting_masks.find(*device_id) != simulated_harvesting_masks.end(), "Could not find harvesting mask for device_id {}", *device_id); if(arch_name == tt::ARCH::GRAYSKULL) { if ((simulated_harvesting_masks.at(*device_id) & harvested_rows_per_target[*device_id]) != harvested_rows_per_target[*device_id]) { @@ -456,7 +440,94 @@ Cluster::Cluster(const std::string &sdesc_path, const std::string &ndesc_path, c // Default initialize noc_params based on detected arch noc_params = architecture_implementation->get_noc_params(); +} + +Cluster::Cluster(const uint32_t &num_host_mem_ch_per_mmio_device, const bool skip_driver_allocs, + const bool 
clean_system_resources, bool perform_harvesting, std::unordered_map simulated_harvesting_masks) : tt_device() { + // TODO: this should be fetched through ClusterDescriptor + auto available_device_ids = detect_available_device_ids(); + m_num_pci_devices = available_device_ids.size(); + + int physical_device_id = available_device_ids[0]; + // TODO: remove logical_device_id + PCIDevice pci_device (physical_device_id, 0); + tt::ARCH device_arch = pci_device.get_arch(); + + std::string sdesc_path = tt_SocDescriptor::get_soc_descriptor_path(device_arch); + + arch_name = tt_SocDescriptor(sdesc_path).arch; + perform_harvesting_on_sdesc = perform_harvesting; + + if (!skip_driver_allocs) { + log_info(LogSiliconDriver, "Detected {} PCI device{} : {}", m_num_pci_devices, (m_num_pci_devices > 1) ? "s":"", available_device_ids); + log_debug(LogSiliconDriver, "Passed target devices: {}", target_devices); + } + + std::string ndesc_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path(); + ndesc = tt_ClusterDescriptor::create_from_yaml(ndesc_path); + + std::set target_devices; + for(const chip_id_t &d : ndesc->get_all_chips()) { + target_devices.insert(d); + } + target_devices_in_cluster = target_devices; + + construct_cluster(sdesc_path, num_host_mem_ch_per_mmio_device, skip_driver_allocs, clean_system_resources, perform_harvesting, simulated_harvesting_masks); +} + +Cluster::Cluster(const std::set &target_devices, const uint32_t &num_host_mem_ch_per_mmio_device, const bool skip_driver_allocs, + const bool clean_system_resources, bool perform_harvesting, std::unordered_map simulated_harvesting_masks) : tt_device() { + // TODO: this should be fetched through ClusterDescriptor + auto available_device_ids = detect_available_device_ids(); + m_num_pci_devices = available_device_ids.size(); + + int physical_device_id = available_device_ids[0]; + // TODO: remove logical_device_id + PCIDevice pci_device (physical_device_id, 0); + tt::ARCH device_arch = pci_device.get_arch(); + + 
std::string sdesc_path = tt_SocDescriptor::get_soc_descriptor_path(device_arch); + + arch_name = tt_SocDescriptor(sdesc_path).arch; + perform_harvesting_on_sdesc = perform_harvesting; + + if (!skip_driver_allocs) { + log_info(LogSiliconDriver, "Detected {} PCI device{} : {}", m_num_pci_devices, (m_num_pci_devices > 1) ? "s":"", available_device_ids); + log_debug(LogSiliconDriver, "Passed target devices: {}", target_devices); + } + + std::string ndesc_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path(); + ndesc = tt_ClusterDescriptor::create_from_yaml(ndesc_path); + + target_devices_in_cluster = target_devices; + + construct_cluster(sdesc_path, num_host_mem_ch_per_mmio_device, skip_driver_allocs, clean_system_resources, perform_harvesting, simulated_harvesting_masks); +} + +Cluster::Cluster(const std::string &sdesc_path, const std::string &ndesc_path, const std::set &target_devices, + const uint32_t &num_host_mem_ch_per_mmio_device, const bool skip_driver_allocs, + const bool clean_system_resources, bool perform_harvesting, std::unordered_map simulated_harvesting_masks) : tt_device() { + // TODO: this should be fetched through ClusterDescriptor + auto available_device_ids = detect_available_device_ids(); + m_num_pci_devices = available_device_ids.size(); + + target_devices_in_cluster = target_devices; + arch_name = tt_SocDescriptor(sdesc_path).arch; + perform_harvesting_on_sdesc = perform_harvesting; + + if (!skip_driver_allocs) { + log_info(LogSiliconDriver, "Detected {} PCI device{} : {}", m_num_pci_devices, (m_num_pci_devices > 1) ? 
"s":"", available_device_ids); + log_debug(LogSiliconDriver, "Passed target devices: {}", target_devices); + } + + std::string cluster_descriptor_path = ndesc_path; + if (cluster_descriptor_path == "") { + cluster_descriptor_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path(); + } + + ndesc = tt_ClusterDescriptor::create_from_yaml(cluster_descriptor_path); + construct_cluster(sdesc_path, num_host_mem_ch_per_mmio_device, skip_driver_allocs, clean_system_resources, perform_harvesting, simulated_harvesting_masks); } void Cluster::configure_active_ethernet_cores_for_mmio_device(chip_id_t mmio_chip, const std::unordered_set& active_eth_cores_per_chip) { diff --git a/device/cluster.h b/device/cluster.h index f2cb6f949..b5caaa85f 100644 --- a/device/cluster.h +++ b/device/cluster.h @@ -220,7 +220,7 @@ struct tt_device_params { class tt_device { public: - tt_device(const std::string& sdesc_path); + tt_device(); virtual ~tt_device(); // Setup/Teardown Functions /** @@ -606,10 +606,10 @@ namespace tt::umd { */ class Cluster: public tt_device { - public: +public: // Constructor /** - * Silicon Driver constructor. + * Cluster constructor. * * @param sdesc_path SOC descriptor specifying single chip. * @param ndesc_path Network Descriptor specifying the network topology of the system. @@ -624,6 +624,32 @@ class Cluster: public tt_device const uint32_t &num_host_mem_ch_per_mmio_device = 1, const bool skip_driver_allocs = false, const bool clean_system_resources = false, bool perform_harvesting = true, std::unordered_map simulated_harvesting_masks = {}); + /** + * Cluster constructor. This constructor should be used to work towards removing all + * of the params from the constructor of tt_SiliconDevice (to become Cluster). + * + * @param num_host_mem_ch_per_mmio_device Requested number of host channels (hugepages). + * @param skip_driver_allocs + * @param clean_system_resource Specifies if host state from previous runs needs to be cleaned up. 
+ * @param perform_harvesting Allow the driver to modify the SOC descriptors per chip. + * @param simulated_harvesting_masks + */ + Cluster(const uint32_t &num_host_mem_ch_per_mmio_device = 1, const bool skip_driver_allocs = false, + const bool clean_system_resources = false, bool perform_harvesting = true, std::unordered_map simulated_harvesting_masks = {}); + + /** + * Cluster constructor. This constructor should be used to target specific devices in a cluster. + * + * @param target_devices Devices to target. + * @param num_host_mem_ch_per_mmio_device Requested number of host channels (hugepages). + * @param skip_driver_allocs + * @param clean_system_resources Specifies if host state from previous runs needs to be cleaned up. + * @param perform_harvesting Allow the driver to modify the SOC descriptors per chip. + * @param simulated_harvesting_masks + */ + Cluster(const std::set &target_devices, const uint32_t &num_host_mem_ch_per_mmio_device = 1, const bool skip_driver_allocs = false, + const bool clean_system_resources = false, bool perform_harvesting = true, std::unordered_map simulated_harvesting_masks = {}); + //Setup/Teardown Functions virtual std::unordered_map& get_virtual_soc_descriptors(); virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_); @@ -710,7 +736,7 @@ class Cluster: public tt_device // Destructor virtual ~Cluster (); - private: +private: // Helper functions // Startup + teardown void create_device(const std::unordered_set &target_mmio_device_ids, const uint32_t &num_host_mem_ch_per_mmio_device, const bool skip_driver_allocs, const bool clean_system_resources); @@ -771,6 +797,9 @@ class Cluster: public tt_device // This functions has to be called for local chip, and then it will wait for all connected remote chips to flush.
void wait_for_connected_non_mmio_flush(chip_id_t chip_id); + void construct_cluster(const std::string& sdesc_path, const uint32_t &num_host_mem_ch_per_mmio_device, const bool skip_driver_allocs, + const bool clean_system_resources, bool perform_harvesting, std::unordered_map simulated_harvesting_masks); + // State variables tt_device_dram_address_params dram_address_params; tt_device_l1_address_params l1_address_params; diff --git a/device/mockup/tt_mockup_device.hpp b/device/mockup/tt_mockup_device.hpp index 2d888d934..e6085b396 100644 --- a/device/mockup/tt_mockup_device.hpp +++ b/device/mockup/tt_mockup_device.hpp @@ -14,7 +14,7 @@ class tt_MockupDevice : public tt_device { public: - tt_MockupDevice(const std::string& sdesc_path) : tt_device(sdesc_path) { + tt_MockupDevice(const std::string& sdesc_path) : tt_device() { soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path)); std::set target_devices = {0}; } diff --git a/device/simulation/tt_simulation_device.cpp b/device/simulation/tt_simulation_device.cpp index e3909ecca..a77a8ad3f 100644 --- a/device/simulation/tt_simulation_device.cpp +++ b/device/simulation/tt_simulation_device.cpp @@ -48,7 +48,7 @@ void print_flatbuffer(const DeviceRequestResponse *buf){ std::cout << std::endl; } -tt_SimulationDevice::tt_SimulationDevice(const std::string &sdesc_path) : tt_device(sdesc_path){ +tt_SimulationDevice::tt_SimulationDevice(const std::string &sdesc_path) : tt_device(){ log_info(tt::LogEmulationDriver, "Instantiating simulation device"); soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path)); std::set target_devices = {0}; diff --git a/device/tt_cluster_descriptor.h b/device/tt_cluster_descriptor.h index a51e6ac4d..56ec9393c 100644 --- a/device/tt_cluster_descriptor.h +++ b/device/tt_cluster_descriptor.h @@ -33,10 +33,10 @@ enum BoardType : uint32_t { class tt_ClusterDescriptor { - private: +private: int get_ethernet_link_coord_distance(const eth_coord_t &location_a, const eth_coord_t 
&location_b) const; - protected: +protected: std::unordered_map > > ethernet_connections; std::unordered_map chip_locations; @@ -73,7 +73,7 @@ class tt_ClusterDescriptor { void fill_chips_grouped_by_closest_mmio(); - public: +public: tt_ClusterDescriptor() = default; tt_ClusterDescriptor(const tt_ClusterDescriptor&) = default; diff --git a/device/tt_device.cpp b/device/tt_device.cpp new file mode 100644 index 000000000..9df2f3923 --- /dev/null +++ b/device/tt_device.cpp @@ -0,0 +1,32 @@ +// SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + + +#ifdef TT_DEBUG_LOGGING +#define DEBUG_LOG(str) do { std::cout << str << std::endl; } while( false ) +#else +#define DEBUG_LOG(str) ((void)0) +#endif + +#include "tt_device.h" +#include "device/tt_cluster_descriptor_types.h" +#include +#include +#include +#include +#include +#include "yaml-cpp/yaml.h" + +//////// +// Device base +//////// +tt_device::tt_device() : soc_descriptor_per_chip({}) { +} + +tt_device::~tt_device() { +} + +const tt_SocDescriptor& tt_device::get_soc_descriptor(chip_id_t chip_id) const { + return soc_descriptor_per_chip.at(chip_id); +} diff --git a/device/tt_soc_descriptor.cpp b/device/tt_soc_descriptor.cpp index 9a5724209..74c35e59a 100644 --- a/device/tt_soc_descriptor.cpp +++ b/device/tt_soc_descriptor.cpp @@ -5,6 +5,8 @@ #include "yaml-cpp/yaml.h" #include "tt_soc_descriptor.h" +#include "common/utils.hpp" + #include #include #include @@ -273,6 +275,22 @@ bool tt_SocDescriptor::is_ethernet_core(const tt_xy_pair &core) const { return this->ethernet_core_channel_map.find(core) != ethernet_core_channel_map.end(); } +std::string tt_SocDescriptor::get_soc_descriptor_path(tt::ARCH arch) { + switch (arch) { + case tt::ARCH::GRAYSKULL: + // TODO: this path needs to be changed to point to soc descriptors outside of tests directory. 
+ return tt::umd::utils::get_abs_path("tests/soc_descs/grayskull_10x12.yaml"); + case tt::ARCH::WORMHOLE_B0: + // TODO: this path needs to be changed to point to soc descriptors outside of tests directory. + return tt::umd::utils::get_abs_path("tests/soc_descs/wormhole_b0_8x10.yaml"); + case tt::ARCH::BLACKHOLE: + // TODO: this path needs to be changed to point to soc descriptors outside of tests directory. + return tt::umd::utils::get_abs_path("tests/soc_descs/blackhole_140_arch_no_eth.yaml"); + default: + throw std::runtime_error("Invalid architecture"); + } +} + std::ostream &operator<<(std::ostream &out, const tt::ARCH &arch_name) { if (arch_name == tt::ARCH::Invalid) { out << "none"; diff --git a/device/tt_soc_descriptor.h b/device/tt_soc_descriptor.h index 372d0a296..e0529570a 100644 --- a/device/tt_soc_descriptor.h +++ b/device/tt_soc_descriptor.h @@ -189,11 +189,14 @@ class tt_SocDescriptor { void perform_harvesting(std::size_t harvesting_mask); + static std::string get_soc_descriptor_path(tt::ARCH arch); + private: - std::unique_ptr coordinate_manager = nullptr; void create_coordinate_manager(std::size_t harvesting_mask); void load_core_descriptors_from_device_descriptor(YAML::Node &device_descriptor_yaml); void load_soc_features_from_device_descriptor(YAML::Node &device_descriptor_yaml); + + std::unique_ptr coordinate_manager = nullptr; }; // Allocates a new soc descriptor on the heap. Returns an owning pointer. diff --git a/tests/api/test_chip.cpp b/tests/api/test_chip.cpp index 713cf464c..339d1abc6 100644 --- a/tests/api/test_chip.cpp +++ b/tests/api/test_chip.cpp @@ -21,13 +21,6 @@ using namespace tt::umd; -inline std::unique_ptr get_cluster_desc() { - // TODO: remove getting manually cluster descriptor from yaml. 
- std::string yaml_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path(); - - return tt_ClusterDescriptor::create_from_yaml(yaml_path); -} - inline tt_cxy_pair get_tensix_chip_core_coord(const std::unique_ptr &umd_cluster) { chip_id_t any_mmio_chip = *umd_cluster->get_target_mmio_device_ids().begin(); const tt_SocDescriptor& soc_desc = umd_cluster->get_soc_descriptor(any_mmio_chip); @@ -36,60 +29,12 @@ inline tt_cxy_pair get_tensix_chip_core_coord(const std::unique_ptr &um } inline std::unique_ptr get_cluster() { - - // TODO: This should not be needed. And could be part of the cluster descriptor probably. - // Note that cluster descriptor holds logical ids of chips. - // Which are different than physical PCI ids, which are /dev/tenstorrent/N ones. - // You have to see if physical PCIe is GS before constructing a cluster descriptor. std::vector pci_device_ids = PCIDevice::enumerate_devices(); - std::set pci_device_ids_set (pci_device_ids.begin(), pci_device_ids.end()); - - tt::ARCH device_arch = tt::ARCH::GRAYSKULL; - if (!pci_device_ids.empty()) { - // TODO: This should be removed from the API, the driver itself should do it. - int physical_device_id = pci_device_ids[0]; - // TODO: remove logical_device_id - PCIDevice pci_device (physical_device_id, 0); - device_arch = pci_device.get_arch(); - } - // TODO: Make this test work on a host system without any tt devices. if (pci_device_ids.empty()) { return nullptr; } - - std::string yaml_path; - if (device_arch == tt::ARCH::GRAYSKULL) { - yaml_path = ""; - } else if (device_arch == tt::ARCH::BLACKHOLE) { - yaml_path = test_utils::GetAbsPath("blackhole_1chip_cluster.yaml"); - } else { - // TODO: remove getting manually cluster descriptor from yaml. - yaml_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path(); - } - // TODO: Remove the need to do this, allow default constructor to construct with all chips. 
- std::unique_ptr cluster_desc = get_cluster_desc(); - std::unordered_set detected_num_chips = cluster_desc->get_all_chips(); - - // TODO: make this unordered vs set conversion not needed. - std::set detected_num_chips_set (detected_num_chips.begin(), detected_num_chips.end()); - - - // TODO: This would be incorporated inside SocDescriptor. - std::string soc_path; - if (device_arch == tt::ARCH::GRAYSKULL) { - soc_path = test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"); - } else if (device_arch == tt::ARCH::WORMHOLE_B0) { - soc_path = test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"); - } else if (device_arch == tt::ARCH::BLACKHOLE) { - soc_path = test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"); - } else { - throw std::runtime_error("Unsupported architecture"); - } - - - // TODO: Don't pass each of these arguments. - return std::unique_ptr(new Cluster(soc_path, tt_ClusterDescriptor::get_cluster_descriptor_file_path(), detected_num_chips_set)); + return std::unique_ptr(new Cluster()); } // TODO: Once default auto TLB setup is in, check it is setup properly. diff --git a/tests/api/test_cluster.cpp b/tests/api/test_cluster.cpp index fc68f62a6..339d628fe 100644 --- a/tests/api/test_cluster.cpp +++ b/tests/api/test_cluster.cpp @@ -30,68 +30,14 @@ using namespace tt::umd; // N150. N300 // Galaxy -// TODO: This function should not exist, the API itself should be simple enough. -inline std::unique_ptr get_cluster_desc() { - // TODO: remove getting manually cluster descriptor from yaml. - std::string yaml_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path(); - - return tt_ClusterDescriptor::create_from_yaml(yaml_path); -} - -// TODO: This function should not exist, the API itself should be simple enough. inline std::unique_ptr get_cluster() { - // TODO: This should not be needed. And could be part of the cluster descriptor probably. - // Note that cluster descriptor holds logical ids of chips. 
- // Which are different than physical PCI ids, which are /dev/tenstorrent/N ones. - // You have to see if physical PCIe is GS before constructing a cluster descriptor. std::vector pci_device_ids = PCIDevice::enumerate_devices(); - std::set pci_device_ids_set(pci_device_ids.begin(), pci_device_ids.end()); - - tt::ARCH device_arch = tt::ARCH::GRAYSKULL; - if (!pci_device_ids.empty()) { - // TODO: This should be removed from the API, the driver itself should do it. - int physical_device_id = pci_device_ids[0]; - // TODO: remove logical_device_id - PCIDevice pci_device(physical_device_id, 0); - device_arch = pci_device.get_arch(); - } - // TODO: Make this test work on a host system without any tt devices. if (pci_device_ids.empty()) { return nullptr; } - - std::string yaml_path; - if (device_arch == tt::ARCH::GRAYSKULL) { - yaml_path = ""; - } else if (device_arch == tt::ARCH::BLACKHOLE) { - yaml_path = test_utils::GetAbsPath("blackhole_1chip_cluster.yaml"); - } else { - // TODO: remove getting manually cluster descriptor from yaml. - yaml_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path(); - } - // TODO: Remove the need to do this, allow default constructor to construct with all chips. - std::unique_ptr cluster_desc = get_cluster_desc(); - std::unordered_set detected_num_chips = cluster_desc->get_all_chips(); - - // TODO: make this unordered vs set conversion not needed. - std::set detected_num_chips_set(detected_num_chips.begin(), detected_num_chips.end()); - - // TODO: This would be incorporated inside SocDescriptor. 
- std::string soc_path; - if (device_arch == tt::ARCH::GRAYSKULL) { - soc_path = test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"); - } else if (device_arch == tt::ARCH::WORMHOLE_B0) { - soc_path = test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"); - } else if (device_arch == tt::ARCH::BLACKHOLE) { - soc_path = test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"); - } else { - throw std::runtime_error("Unsupported architecture"); - } - - // TODO: Don't pass each of these arguments. return std::unique_ptr( - new Cluster(soc_path, tt_ClusterDescriptor::get_cluster_descriptor_file_path(), detected_num_chips_set)); + new Cluster()); } // TODO: Should not be wormhole specific. @@ -115,9 +61,10 @@ void setup_wormhole_remote(Cluster* umd_cluster) { TEST(ApiClusterTest, OpenAllChips) { std::unique_ptr umd_cluster = get_cluster(); } TEST(ApiClusterTest, SimpleIOAllChips) { - std::unique_ptr cluster_desc = get_cluster_desc(); std::unique_ptr umd_cluster = get_cluster(); + const tt_ClusterDescriptor* cluster_desc = umd_cluster->get_cluster_description(); + if (umd_cluster == nullptr || umd_cluster->get_all_chips_in_cluster().empty()) { GTEST_SKIP() << "No chips present on the system. Skipping test."; } @@ -172,9 +119,10 @@ TEST(ApiClusterTest, SimpleIOAllChips) { } TEST(ApiClusterTest, RemoteFlush) { - std::unique_ptr cluster_desc = get_cluster_desc(); std::unique_ptr umd_cluster = get_cluster(); + const tt_ClusterDescriptor* cluster_desc = umd_cluster->get_cluster_description(); + if (umd_cluster == nullptr || umd_cluster->get_all_chips_in_cluster().empty()) { GTEST_SKIP() << "No chips present on the system. Skipping test."; } @@ -229,3 +177,61 @@ TEST(ApiClusterTest, RemoteFlush) { std::cout << "Testing whole cluster wait for remote chip flush again, should be no-op." 
<< std::endl; umd_cluster->wait_for_non_mmio_flush(); } + +TEST(ApiClusterTest, SimpleIOSpecificChips) { + std::unique_ptr umd_cluster = std::make_unique(0); + + const tt_ClusterDescriptor* cluster_desc = umd_cluster->get_cluster_description(); + + if (umd_cluster == nullptr || umd_cluster->get_all_chips_in_cluster().empty()) { + GTEST_SKIP() << "No chips present on the system. Skipping test."; + } + + // Initialize random data. + size_t data_size = 1024; + std::vector data(data_size, 0); + for (int i = 0; i < data_size; i++) { + data[i] = i % 256; + } + + // TODO: this should be part of constructor if it is mandatory. + setup_wormhole_remote(umd_cluster.get()); + + for (auto chip_id : umd_cluster->get_all_chips_in_cluster()) { + const tt_SocDescriptor& soc_desc = umd_cluster->get_soc_descriptor(chip_id); + + // TODO: figure out if core locations should contain chip_id + tt_xy_pair any_core = soc_desc.workers[0]; + tt_cxy_pair any_core_global(chip_id, any_core); + + if (cluster_desc->is_chip_remote(chip_id) && soc_desc.arch != tt::ARCH::WORMHOLE_B0) { + std::cout << "Skipping remote chip " << chip_id << " because it is not a wormhole_b0 chip." << std::endl; + continue; + } + + std::cout << "Writing to chip " << chip_id << " core " << any_core.str() << std::endl; + + umd_cluster->write_to_device(data.data(), data_size, any_core_global, 0, "LARGE_WRITE_TLB"); + } + + // Now read back the data. + for (auto chip_id : umd_cluster->get_all_chips_in_cluster()) { + const tt_SocDescriptor& soc_desc = umd_cluster->get_soc_descriptor(chip_id); + + // TODO: figure out if core locations should contain chip_id + tt_xy_pair any_core = soc_desc.workers[0]; + tt_cxy_pair any_core_global(chip_id, any_core); + + if (cluster_desc->is_chip_remote(chip_id) && soc_desc.arch != tt::ARCH::WORMHOLE_B0) { + std::cout << "Skipping remote chip " << chip_id << " because it is not a wormhole_b0 chip." 
<< std::endl; + continue; + } + + std::cout << "Reading from chip " << chip_id << " core " << any_core.str() << std::endl; + + std::vector readback_data(data_size, 0); + umd_cluster->read_from_device(readback_data.data(), any_core_global, 0, data_size, "LARGE_READ_TLB"); + + ASSERT_EQ(data, readback_data); + } +} diff --git a/tests/blackhole/test_bh_common.h b/tests/blackhole/test_bh_common.h index 57fdf25c1..5d115e31b 100644 --- a/tests/blackhole/test_bh_common.h +++ b/tests/blackhole/test_bh_common.h @@ -54,7 +54,7 @@ class BlackholeTestFixture : public ::testing::Test { std::iota(devices.begin(), devices.end(), 0); std::set target_devices = {devices.begin(), devices.end()}; uint32_t num_host_mem_ch_per_mmio_device = 1; - device = std::make_unique(test_utils::GetAbsPath(SOC_DESC_PATH), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + device = std::make_unique(num_host_mem_ch_per_mmio_device, false, true, true); assert(device != nullptr); assert(device->get_cluster_description()->get_number_of_chips() == get_detected_num_chips()); diff --git a/tests/blackhole/test_silicon_driver_bh.cpp b/tests/blackhole/test_silicon_driver_bh.cpp index abaff80aa..b2b7bde10 100644 --- a/tests/blackhole/test_silicon_driver_bh.cpp +++ b/tests/blackhole/test_silicon_driver_bh.cpp @@ -207,7 +207,7 @@ TEST(SiliconDriverBH, UnalignedStaticTLB_RW) { uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); @@ -266,7 +266,7 @@ TEST(SiliconDriverBH, StaticTLB_RW) { uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = 
Cluster(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); @@ -316,7 +316,7 @@ TEST(SiliconDriverBH, DynamicTLB_RW) { std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); @@ -380,7 +380,7 @@ TEST(SiliconDriverBH, MultiThreadedDevice) { std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); @@ -439,7 +439,7 @@ TEST(SiliconDriverBH, MultiThreadedMemBar) { uint32_t base_addr = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); for(int i = 0; i < target_devices.size(); i++) { // Iterate over devices and only setup static TLBs 
for functional worker cores @@ -541,7 +541,7 @@ TEST(SiliconDriverBH, DISABLED_BroadcastWrite) { // Cannot broadcast to tensix/e uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); @@ -598,7 +598,7 @@ TEST(SiliconDriverBH, DISABLED_VirtualCoordinateBroadcast) { // same problem as uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); diff --git a/tests/grayskull/test_silicon_driver.cpp b/tests/grayskull/test_silicon_driver.cpp index c8fca4bf1..c61a3a2ef 100644 --- a/tests/grayskull/test_silicon_driver.cpp +++ b/tests/grayskull/test_silicon_driver.cpp @@ -20,7 +20,7 @@ TEST(SiliconDriverGS, CreateDestroySequential) { uint32_t num_host_mem_ch_per_mmio_device = 1; tt_device_params default_params; for(int i = 0; i < 100; i++) { - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true); device.start_device(default_params); device.deassert_risc_reset(); device.close_device(); @@ -34,7 +34,7 @@ TEST(SiliconDriverGS, CreateMultipleInstance) { default_params.init_device = false; 
std::unordered_map concurrent_devices = {}; for(int i = 0; i < 100; i++) { - concurrent_devices.insert({i, new Cluster(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true)}); + concurrent_devices.insert({i, new Cluster(num_host_mem_ch_per_mmio_device, false, true)}); concurrent_devices.at(i) -> start_device(default_params); } @@ -48,7 +48,7 @@ TEST(SiliconDriverGS, Harvesting) { std::set target_devices = {0}; std::unordered_map simulated_harvesting_masks = {{0, 6}, {1, 12}}; uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true, simulated_harvesting_masks); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true, simulated_harvesting_masks); auto sdesc_per_chip = device.get_virtual_soc_descriptors(); ASSERT_EQ(device.using_harvested_soc_descriptors(), true) << "Expected Driver to have performed harvesting"; @@ -85,7 +85,7 @@ TEST(SiliconDriverGS, HarvestingRuntime) { std::set target_devices = {0}; std::unordered_map simulated_harvesting_masks = {{0, 6}, {1, 12}}; uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true, simulated_harvesting_masks); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true, simulated_harvesting_masks); for(int i = 0; i < target_devices.size(); i++) { // Iterate over devices and only setup static TLBs for functional worker cores @@ -148,7 +148,7 @@ TEST(SiliconDriverGS, StaticTLB_RW) { std::set target_devices = {0}; uint32_t num_host_mem_ch_per_mmio_device = 
1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true); for(int i = 0; i < target_devices.size(); i++) { // Iterate over devices and only setup static TLBs for worker cores auto& sdesc = device.get_virtual_soc_descriptors().at(i); @@ -196,7 +196,7 @@ TEST(SiliconDriverGS, DynamicTLB_RW) { std::set target_devices = {0}; uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true); device.set_fallback_tlb_ordering_mode("SMALL_READ_WRITE_TLB", TLB_DATA::Posted); // Explicitly test API to set fallback tlb ordering mode tt_device_params default_params; device.start_device(default_params); @@ -238,7 +238,8 @@ TEST(SiliconDriverGS, MultiThreadedDevice) { std::set target_devices = {0}; uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true); + tt_device_params default_params; device.start_device(default_params); device.deassert_risc_reset(); @@ -315,8 +316,8 @@ TEST(SiliconDriverGS, MultiThreadedMemBar) { // this tests takes ~5 mins to run uint32_t base_addr = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), 
target_devices, num_host_mem_ch_per_mmio_device, false, true); - + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true); + for(int i = 0; i < target_devices.size(); i++) { // Iterate over devices and only setup static TLBs for functional worker cores auto& sdesc = device.get_virtual_soc_descriptors().at(i); diff --git a/tests/wormhole/test_silicon_driver_wh.cpp b/tests/wormhole/test_silicon_driver_wh.cpp index 0f8f90999..48834d483 100644 --- a/tests/wormhole/test_silicon_driver_wh.cpp +++ b/tests/wormhole/test_silicon_driver_wh.cpp @@ -92,7 +92,7 @@ TEST(SiliconDriverWH, Harvesting) { std::unordered_map simulated_harvesting_masks = {{0, 30}, {1, 60}}; uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true, simulated_harvesting_masks); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true, simulated_harvesting_masks); auto sdesc_per_chip = device.get_virtual_soc_descriptors(); ASSERT_EQ(device.using_harvested_soc_descriptors(), true) << "Expected Driver to have performed harvesting"; @@ -133,8 +133,8 @@ TEST(SiliconDriverWH, HarvestingRuntime) { std::unordered_map simulated_harvesting_masks = {{0, 30}, {1, 60}}; uint32_t num_host_mem_ch_per_mmio_device = 1; - - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true, simulated_harvesting_masks); + + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true, simulated_harvesting_masks); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); @@ -198,7 +198,7 @@ TEST(SiliconDriverWH, UnalignedStaticTLB_RW) { int num_devices = target_devices.size(); uint32_t 
num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); @@ -256,7 +256,7 @@ TEST(SiliconDriverWH, StaticTLB_RW) { std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); @@ -305,7 +305,7 @@ TEST(SiliconDriverWH, DynamicTLB_RW) { std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); @@ -343,7 +343,8 @@ TEST(SiliconDriverWH, MultiThreadedDevice) { std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); + set_params_for_remote_txn(device); tt_device_params default_params; @@ -401,7 
+402,7 @@ TEST(SiliconDriverWH, MultiThreadedMemBar) { uint32_t base_addr = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); @@ -507,7 +508,8 @@ TEST(SiliconDriverWH, BroadcastWrite) { std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); @@ -562,7 +564,8 @@ TEST(SiliconDriverWH, VirtualCoordinateBroadcast) { std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); @@ -644,13 +647,10 @@ TEST(SiliconDriverWH, VirtualCoordinateBroadcast) { TEST(SiliconDriverWH, SysmemTestWithPcie) { auto target_devices = get_target_devices(); - Cluster cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), - tt_ClusterDescriptor::get_cluster_descriptor_file_path(), - target_devices, - 1, // one 
"host memory channel", currently a 1G huge page - false, // skip driver allocs - no (don't skip) - true, // clean system resources - yes - true); // perform harvesting - yes + Cluster cluster(1, // one "host memory channel", currently a 1G huge page + false, // skip driver allocs - no (don't skip) + true, // clean system resources - yes + true); // perform harvesting - yes set_params_for_remote_txn(cluster); cluster.start_device(tt_device_params{}); // no special parameters diff --git a/tests/wormhole/test_wh_common.h b/tests/wormhole/test_wh_common.h index 98cdf8adf..812f8b98b 100644 --- a/tests/wormhole/test_wh_common.h +++ b/tests/wormhole/test_wh_common.h @@ -55,7 +55,7 @@ class WormholeTestFixture : public ::testing::Test { std::iota(devices.begin(), devices.end(), 0); std::set target_devices = {devices.begin(), devices.end()}; uint32_t num_host_mem_ch_per_mmio_device = 1; - device = std::make_unique(test_utils::GetAbsPath(SOC_DESC_PATH), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true); + device = std::make_unique(num_host_mem_ch_per_mmio_device, false, true, true); assert(device != nullptr); assert(device->get_cluster_description()->get_number_of_chips() == get_detected_num_chips()); From f749fc1dac448fe391f4adf7645389b267257a35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bojan=20Ro=C5=A1ko?= <156314064+broskoTT@users.noreply.github.com> Date: Mon, 18 Nov 2024 15:45:24 +0100 Subject: [PATCH 5/8] Enable clang tests/ device/ (#312) ### Issue Related to https://github.com/tenstorrent/tt-umd/issues/47 , a follow up from https://github.com/tenstorrent/tt-umd/pull/203 ### Description Enabling clang practically repo-wide ### List of the changes - Removed device/.clang-format and tests/.clang-format which were previously disabling it. - Left over src/.clang-format, since these are copied files from external repo. - Ran `pre-commit run --all-files` ### Testing Code builds. 
### API Changes There are no API changes in this PR. --- device/.clang-format | 2 - device/architecture_implementation.cpp | 12 +- device/architecture_implementation.h | 7 +- .../blackhole/blackhole_coordinate_manager.h | 8 +- device/blackhole/blackhole_implementation.cpp | 20 +- device/blackhole/blackhole_implementation.h | 96 +- device/cluster.cpp | 2273 +++++++++++------ device/cluster.h | 554 ++-- device/coordinate_manager.cpp | 26 +- device/coordinate_manager.h | 19 +- device/cpuset_lib.cpp | 489 ++-- device/cpuset_lib.hpp | 168 +- device/driver_atomics.h | 34 +- .../grayskull/grayskull_coordinate_manager.h | 6 +- device/grayskull/grayskull_implementation.cpp | 13 +- device/grayskull/grayskull_implementation.h | 43 +- device/hugepage.cpp | 146 +- device/hugepage.h | 11 +- device/ioctl.h | 4 + device/mockup/tt_mockup_device.hpp | 42 +- device/pcie/pci_device.cpp | 367 ++- device/pcie/pci_device.hpp | 82 +- .../deprecated/tt_emulation_device.cpp | 246 +- .../deprecated/tt_emulation_device.h | 124 +- .../deprecated/tt_emulation_stub.cpp | 107 +- .../deprecated/tt_versim_device.cpp | 437 ++-- .../simulation/deprecated/tt_versim_device.h | 101 +- .../simulation/deprecated/tt_versim_stub.cpp | 121 +- device/simulation/tt_simulation_device.cpp | 127 +- device/simulation/tt_simulation_device.h | 34 +- device/simulation/tt_simulation_host.cpp | 19 +- device/simulation/tt_simulation_host.hpp | 3 +- device/tlb.h | 8 +- device/tt_arch_types.h | 2 +- device/tt_cluster_descriptor.cpp | 511 ++-- device/tt_cluster_descriptor.h | 184 +- device/tt_cluster_descriptor_types.h | 30 +- device/tt_device.cpp | 20 +- device/tt_io.hpp | 28 +- device/tt_silicon_driver_common.cpp | 28 +- device/tt_silicon_driver_common.hpp | 55 +- device/tt_soc_descriptor.cpp | 79 +- device/tt_soc_descriptor.h | 67 +- device/tt_xy_pair.h | 12 + .../wormhole/wormhole_coordinate_manager.cpp | 6 +- device/wormhole/wormhole_coordinate_manager.h | 8 +- device/wormhole/wormhole_implementation.cpp | 13 +- 
device/wormhole/wormhole_implementation.h | 40 +- device/xy_pair.cpp | 1 + device/xy_pair.h | 3 + tests/.clang-format | 2 - tests/api/test_chip.cpp | 17 +- tests/api/test_cluster.cpp | 16 +- tests/api/test_cluster_descriptor.cpp | 43 +- tests/api/test_mockup_device.cpp | 27 +- tests/api/test_soc_descriptor_bh.cpp | 37 +- tests/api/test_soc_descriptor_gs.cpp | 19 +- tests/api/test_soc_descriptor_wh.cpp | 58 +- tests/blackhole/test_bh_common.h | 97 +- tests/blackhole/test_silicon_driver_bh.cpp | 633 +++-- tests/emulation/test_emulation_device.cpp | 24 +- tests/galaxy/test_galaxy_common.cpp | 20 +- tests/galaxy/test_galaxy_common.h | 10 +- tests/galaxy/test_umd_concurrent_threads.cpp | 83 +- tests/galaxy/test_umd_remote_api.cpp | 56 +- .../galaxy/test_umd_remote_api_stability.cpp | 190 +- tests/grayskull/test_silicon_driver.cpp | 342 ++- tests/microbenchmark/device_fixture.hpp | 27 +- tests/microbenchmark/test_rw_tensix.cpp | 98 +- tests/pcie/test_pcie_device.cpp | 3 +- tests/simulation/device_fixture.hpp | 15 +- tests/simulation/test_simulation_device.cpp | 61 +- tests/test_utils/device_test_utils.hpp | 19 +- tests/test_utils/generate_cluster_desc.hpp | 12 +- tests/test_utils/soc_desc_test_utils.hpp | 2 +- tests/test_utils/stimulus_generators.hpp | 329 ++- tests/unit_test_main.cpp | 5 +- tests/wormhole/test_silicon_driver_wh.cpp | 477 ++-- .../test_umd_remote_api_stability.cpp | 398 ++- tests/wormhole/test_wh_common.h | 95 +- 80 files changed, 6207 insertions(+), 3844 deletions(-) delete mode 100644 device/.clang-format delete mode 100644 tests/.clang-format diff --git a/device/.clang-format b/device/.clang-format deleted file mode 100644 index 9d159247d..000000000 --- a/device/.clang-format +++ /dev/null @@ -1,2 +0,0 @@ -DisableFormat: true -SortIncludes: false diff --git a/device/architecture_implementation.cpp b/device/architecture_implementation.cpp index 7cd1dac80..186c6c141 100644 --- a/device/architecture_implementation.cpp +++ 
b/device/architecture_implementation.cpp @@ -12,10 +12,14 @@ namespace tt::umd { std::unique_ptr architecture_implementation::create(tt::ARCH architecture) { switch (architecture) { - case tt::ARCH::BLACKHOLE: return std::make_unique(); - case tt::ARCH::GRAYSKULL: return std::make_unique(); - case tt::ARCH::WORMHOLE_B0: return std::make_unique(); - default: return nullptr; + case tt::ARCH::BLACKHOLE: + return std::make_unique(); + case tt::ARCH::GRAYSKULL: + return std::make_unique(); + case tt::ARCH::WORMHOLE_B0: + return std::make_unique(); + default: + return nullptr; } } diff --git a/device/architecture_implementation.h b/device/architecture_implementation.h index ffd92e4b3..f715fd3ec 100644 --- a/device/architecture_implementation.h +++ b/device/architecture_implementation.h @@ -12,8 +12,8 @@ #include #include "device/tlb.h" -#include "device/xy_pair.h" #include "device/tt_arch_types.h" +#include "device/xy_pair.h" struct tt_driver_host_address_params; struct tt_driver_eth_interface_params; @@ -22,7 +22,7 @@ struct tt_driver_noc_params; namespace tt::umd { class architecture_implementation { - public: +public: virtual ~architecture_implementation() = default; virtual tt::ARCH get_architecture() const = 0; @@ -65,7 +65,8 @@ class architecture_implementation { virtual std::tuple multicast_workaround(xy_pair start, xy_pair end) const = 0; virtual tlb_configuration get_tlb_configuration(uint32_t tlb_index) const = 0; virtual std::optional> describe_tlb(std::int32_t tlb_index) const = 0; - virtual std::pair get_tlb_data(std::uint32_t tlb_index, const tlb_data& data) const = 0; + virtual std::pair get_tlb_data( + std::uint32_t tlb_index, const tlb_data& data) const = 0; virtual tt_driver_host_address_params get_host_address_params() const = 0; virtual tt_driver_eth_interface_params get_eth_interface_params() const = 0; diff --git a/device/blackhole/blackhole_coordinate_manager.h b/device/blackhole/blackhole_coordinate_manager.h index 6eef92eb6..5a316f8c8 100644 --- 
a/device/blackhole/blackhole_coordinate_manager.h +++ b/device/blackhole/blackhole_coordinate_manager.h @@ -9,15 +9,15 @@ #include "device/coordinate_manager.h" class BlackholeCoordinateManager : public CoordinateManager { - public: - BlackholeCoordinateManager(const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) - : CoordinateManager(worker_grid_size, workers, harvesting_mask) {} + BlackholeCoordinateManager( + const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) : + CoordinateManager(worker_grid_size, workers, harvesting_mask) {} tt_translated_coords to_translated_coords(tt_logical_coords logical_coords) override; tt_logical_coords to_logical_coords(tt_translated_coords translated_coords) override; -protected: +protected: std::set get_x_coordinates_to_harvest(std::size_t harvesting_mask) override; }; diff --git a/device/blackhole/blackhole_implementation.cpp b/device/blackhole/blackhole_implementation.cpp index bf6ef5537..3d0b19fef 100644 --- a/device/blackhole/blackhole_implementation.cpp +++ b/device/blackhole/blackhole_implementation.cpp @@ -4,13 +4,12 @@ #include "blackhole_implementation.h" -#include "src/firmware/riscv/blackhole/host_mem_address_map.h" -#include "src/firmware/riscv/blackhole/eth_interface.h" - #include "device/cluster.h" +#include "src/firmware/riscv/blackhole/eth_interface.h" +#include "src/firmware/riscv/blackhole/host_mem_address_map.h" -constexpr std::uint32_t NOC_ADDR_LOCAL_BITS = 36; // source: noc_parameters.h, common for WH && BH -constexpr std::uint32_t NOC_ADDR_NODE_ID_BITS = 6; // source: noc_parameters.h, common for WH && BH +constexpr std::uint32_t NOC_ADDR_LOCAL_BITS = 36; // source: noc_parameters.h, common for WH && BH +constexpr std::uint32_t NOC_ADDR_NODE_ID_BITS = 6; // source: noc_parameters.h, common for WH && BH namespace tt::umd { @@ -26,10 +25,9 @@ std::tuple blackhole_implementation::multicast_workaround(xy_p } tlb_configuration 
blackhole_implementation::get_tlb_configuration(uint32_t tlb_index) const { - // If TLB index is in range for 4GB tlbs (8 TLBs after 202 TLBs for 2MB) if (tlb_index >= blackhole::TLB_COUNT_2M && tlb_index < blackhole::TLB_COUNT_2M + blackhole::TLB_COUNT_4G) { - return tlb_configuration { + return tlb_configuration{ .size = blackhole::DYNAMIC_TLB_4G_SIZE, .base = blackhole::DYNAMIC_TLB_4G_BASE, .cfg_addr = blackhole::DYNAMIC_TLB_4G_CFG_ADDR, @@ -37,7 +35,7 @@ tlb_configuration blackhole_implementation::get_tlb_configuration(uint32_t tlb_i .offset = blackhole::TLB_4G_OFFSET, }; } - + return tlb_configuration{ .size = blackhole::DYNAMIC_TLB_2M_SIZE, .base = blackhole::DYNAMIC_TLB_2M_BASE, @@ -73,17 +71,17 @@ std::optional> blackhole_implementation std::pair blackhole_implementation::get_tlb_data( std::uint32_t tlb_index, const tlb_data& data) const { - if (tlb_index < blackhole::TLB_COUNT_2M) { return data.apply_offset(blackhole::TLB_2M_OFFSET); } else { throw std::runtime_error("Invalid TLB index for Blackhole arch"); } - } tt_driver_host_address_params blackhole_implementation::get_host_address_params() const { - return {::blackhole::host_mem::address_map::ETH_ROUTING_BLOCK_SIZE, ::blackhole::host_mem::address_map::ETH_ROUTING_BUFFERS_START}; + return { + ::blackhole::host_mem::address_map::ETH_ROUTING_BLOCK_SIZE, + ::blackhole::host_mem::address_map::ETH_ROUTING_BUFFERS_START}; } tt_driver_eth_interface_params blackhole_implementation::get_eth_interface_params() const { diff --git a/device/blackhole/blackhole_implementation.h b/device/blackhole/blackhole_implementation.h index 2cd3ee9ee..74789233d 100644 --- a/device/blackhole/blackhole_implementation.h +++ b/device/blackhole/blackhole_implementation.h @@ -7,10 +7,10 @@ #pragma once #include +#include #include "device/architecture_implementation.h" #include "device/tlb.h" -#include namespace tt::umd { @@ -59,30 +59,8 @@ enum class arc_message_type { // DEVICE_DATA static constexpr std::array DRAM_LOCATIONS = { - 
{{0, 0}, - {0, 1}, - {0, 11}, - {0, 2}, - {0, 10}, - {0, 3}, - {0, 9}, - {0, 4}, - {0, 8}, - {0, 5}, - {0, 7}, - {0, 6}, - {9, 0}, - {9, 1}, - {9, 11}, - {9, 2}, - {9, 10}, - {9, 3}, - {9, 9}, - {9, 4}, - {9, 8}, - {9, 5}, - {9, 7}, - {9, 6}}}; + {{0, 0}, {0, 1}, {0, 11}, {0, 2}, {0, 10}, {0, 3}, {0, 9}, {0, 4}, {0, 8}, {0, 5}, {0, 7}, {0, 6}, + {9, 0}, {9, 1}, {9, 11}, {9, 2}, {9, 10}, {9, 3}, {9, 9}, {9, 4}, {9, 8}, {9, 5}, {9, 7}, {9, 6}}}; static constexpr std::array ARC_LOCATIONS = {{{8, 0}}}; static constexpr std::array PCI_LOCATIONS = {{{11, 0}}}; @@ -113,14 +91,14 @@ static constexpr uint32_t BROADCAST_TLB_INDEX = 0; // TODO: Copied from worm static constexpr uint32_t STATIC_TLB_CFG_ADDR = 0x1fc00000; static constexpr uint32_t TLB_COUNT_2M = 202; -static constexpr uint32_t TLB_BASE_2M = 0; // 0 in BAR0 +static constexpr uint32_t TLB_BASE_2M = 0; // 0 in BAR0 static constexpr uint32_t TLB_BASE_INDEX_2M = 0; static constexpr uint32_t TLB_2M_SIZE = 2 * 1024 * 1024; static constexpr uint32_t TLB_CFG_REG_SIZE_BYTES = 12; static constexpr uint32_t TLB_COUNT_4G = 8; -static constexpr uint32_t TLB_BASE_4G = 0; // 0 in BAR4 +static constexpr uint32_t TLB_BASE_4G = 0; // 0 in BAR4 static constexpr uint32_t TLB_BASE_INDEX_4G = TLB_COUNT_2M; static constexpr uint64_t TLB_4G_SIZE = 4ULL * 1024ULL * 1024ULL * 1024ULL; static constexpr uint64_t DYNAMIC_TLB_4G_SIZE = TLB_4G_SIZE; @@ -168,59 +146,108 @@ static constexpr uint32_t MSG_TYPE_SETUP_IATU_FOR_PEER_TO_PEER = 0x97; } // namespace blackhole class blackhole_implementation : public architecture_implementation { - public: +public: tt::ARCH get_architecture() const override { return tt::ARCH::BLACKHOLE; } + uint32_t get_arc_message_arc_get_harvesting() const override { return static_cast(blackhole::arc_message_type::ARC_GET_HARVESTING); } + uint32_t get_arc_message_arc_go_busy() const override { return static_cast(blackhole::arc_message_type::ARC_GO_BUSY); } + uint32_t get_arc_message_arc_go_long_idle() const override { 
return static_cast(blackhole::arc_message_type::ARC_GO_LONG_IDLE); } + uint32_t get_arc_message_arc_go_short_idle() const override { return static_cast(blackhole::arc_message_type::ARC_GO_SHORT_IDLE); } + uint32_t get_arc_message_deassert_riscv_reset() const override { return static_cast(blackhole::arc_message_type::DEASSERT_RISCV_RESET); } + uint32_t get_arc_message_get_aiclk() const override { return static_cast(blackhole::arc_message_type::GET_AICLK); } + uint32_t get_arc_message_setup_iatu_for_peer_to_peer() const override { return static_cast(blackhole::arc_message_type::SETUP_IATU_FOR_PEER_TO_PEER); } + uint32_t get_arc_message_test() const override { return static_cast(blackhole::arc_message_type::TEST); } - uint32_t get_arc_csm_mailbox_offset() const override { throw std::runtime_error("Not supported for Blackhole arch"); return 0; } + + uint32_t get_arc_csm_mailbox_offset() const override { + throw std::runtime_error("Not supported for Blackhole arch"); + return 0; + } + uint32_t get_arc_reset_arc_misc_cntl_offset() const override { return blackhole::ARC_RESET_ARC_MISC_CNTL_OFFSET; } + uint32_t get_arc_reset_scratch_offset() const override { return blackhole::ARC_RESET_SCRATCH_OFFSET; } + uint32_t get_dram_channel_0_peer2peer_region_start() const override { return blackhole::DRAM_CHANNEL_0_PEER2PEER_REGION_START; } + uint32_t get_dram_channel_0_x() const override { return blackhole::DRAM_CHANNEL_0_X; } + uint32_t get_dram_channel_0_y() const override { return blackhole::DRAM_CHANNEL_0_Y; } + uint32_t get_broadcast_tlb_index() const override { return blackhole::BROADCAST_TLB_INDEX; } + uint32_t get_dynamic_tlb_2m_base() const override { return blackhole::DYNAMIC_TLB_2M_BASE; } + uint32_t get_dynamic_tlb_2m_size() const override { return blackhole::DYNAMIC_TLB_2M_SIZE; } - uint32_t get_dynamic_tlb_16m_base() const override { throw std::runtime_error("No 16MB TLBs for Blackhole arch"); return 0; } - uint32_t get_dynamic_tlb_16m_size() const override { throw 
std::runtime_error("No 16MB TLBs for Blackhole arch"); return 0; } - uint32_t get_dynamic_tlb_16m_cfg_addr() const override { throw std::runtime_error("No 16MB TLBs for Blackhole arch"); return 0; } + + uint32_t get_dynamic_tlb_16m_base() const override { + throw std::runtime_error("No 16MB TLBs for Blackhole arch"); + return 0; + } + + uint32_t get_dynamic_tlb_16m_size() const override { + throw std::runtime_error("No 16MB TLBs for Blackhole arch"); + return 0; + } + + uint32_t get_dynamic_tlb_16m_cfg_addr() const override { + throw std::runtime_error("No 16MB TLBs for Blackhole arch"); + return 0; + } + uint32_t get_mem_large_read_tlb() const override { return blackhole::MEM_LARGE_READ_TLB; } + uint32_t get_mem_large_write_tlb() const override { return blackhole::MEM_LARGE_WRITE_TLB; } + uint32_t get_static_tlb_cfg_addr() const override { return blackhole::STATIC_TLB_CFG_ADDR; } - uint32_t get_static_tlb_size() const override { return blackhole::STATIC_TLB_SIZE; } + + uint32_t get_static_tlb_size() const override { return blackhole::STATIC_TLB_SIZE; } + uint32_t get_reg_tlb() const override { return blackhole::REG_TLB; } - uint32_t get_tlb_base_index_16m() const override { throw std::runtime_error("No 16MB TLBs for Blackhole arch"); return 0; } + + uint32_t get_tlb_base_index_16m() const override { + throw std::runtime_error("No 16MB TLBs for Blackhole arch"); + return 0; + } + uint32_t get_tensix_soft_reset_addr() const override { return blackhole::TENSIX_SOFT_RESET_ADDR; } + uint32_t get_grid_size_x() const override { return blackhole::GRID_SIZE_X; } + uint32_t get_grid_size_y() const override { return blackhole::GRID_SIZE_Y; } + uint32_t get_tlb_cfg_reg_size_bytes() const override { return blackhole::TLB_CFG_REG_SIZE_BYTES; } + uint32_t get_small_read_write_tlb() const override { return blackhole::MEM_SMALL_READ_WRITE_TLB; } + const std::vector& get_harvesting_noc_locations() const override { return blackhole::HARVESTING_NOC_LOCATIONS; } + const std::vector& 
get_t6_x_locations() const override { return blackhole::T6_X_LOCATIONS; } + const std::vector& get_t6_y_locations() const override { return blackhole::T6_Y_LOCATIONS; } std::tuple multicast_workaround(xy_pair start, xy_pair end) const override; @@ -231,7 +258,6 @@ class blackhole_implementation : public architecture_implementation { tt_driver_host_address_params get_host_address_params() const override; tt_driver_eth_interface_params get_eth_interface_params() const override; tt_driver_noc_params get_noc_params() const override; - }; } // namespace tt::umd diff --git a/device/cluster.cpp b/device/cluster.cpp index 72378b131..33c896683 100644 --- a/device/cluster.cpp +++ b/device/cluster.cpp @@ -3,63 +3,61 @@ // SPDX-License-Identifier: Apache-2.0 #include "cluster.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#include #include -#include #include - +#include +#include +#include +#include +#include +#include +#include #include #include #include #include -#include #include #include +#include +#include #include #include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include +#include -#include "tt_arch_types.h" -#include "tt_cluster_descriptor.h" -#include "yaml-cpp/yaml.h" #include "common/logger.hpp" - -#include "device/tt_cluster_descriptor.h" +#include "device/architecture_implementation.h" #include "device/driver_atomics.h" #include "device/hugepage.h" -#include "device/architecture_implementation.h" #include "device/tlb.h" #include "device/tt_arch_types.h" +#include "device/tt_cluster_descriptor.h" +#include "tt_arch_types.h" +#include "tt_cluster_descriptor.h" +#include "yaml-cpp/yaml.h" using namespace boost::interprocess; using namespace tt; using namespace tt::umd; - static const uint32_t MSG_ERROR_REPLY = 0xFFFFFFFF; // TLB size for DRAM on blackhole - 4GB const uint64_t BH_4GB_TLB_SIZE 
= 4ULL * 1024 * 1024 * 1024; -static constexpr uint32_t HUGEPAGE_CHANNEL_3_SIZE_LIMIT = 805306368; // Remove 256MB from full 1GB for channel 3 (iATU limitation) +// Remove 256MB from full 1GB for channel 3 (iATU limitation) +static constexpr uint32_t HUGEPAGE_CHANNEL_3_SIZE_LIMIT = 805306368; // TODO: Remove in favor of cluster descriptor method, when it becomes available. // Metal uses this function to determine the architecture of the first PCIe chip @@ -95,7 +93,7 @@ tt::ARCH detect_arch() { } template -void size_buffer_to_capacity(std::vector &data_buf, std::size_t size_in_bytes) { +void size_buffer_to_capacity(std::vector& data_buf, std::size_t size_in_bytes) { std::size_t target_size = 0; if (size_in_bytes > 0) { target_size = ((size_in_bytes - 1) / sizeof(T)) + 1; @@ -105,11 +103,9 @@ void size_buffer_to_capacity(std::vector &data_buf, std::size_t size_in_bytes // TODO: To be removed when tt_device is removed -tt_device::tt_device() : soc_descriptor_per_chip({}) { -} +tt_device::tt_device() : soc_descriptor_per_chip({}) {} -tt_device::~tt_device() { -} +tt_device::~tt_device() {} const tt_SocDescriptor& tt_device::get_soc_descriptor(chip_id_t chip_id) const { return soc_descriptor_per_chip.at(chip_id); @@ -119,12 +115,12 @@ const tt_SocDescriptor& tt_device::get_soc_descriptor(chip_id_t chip_id) const { // -------------------------------------------------------------------------------------------------------------- // -------------------------------------------------------------------------------------------------------------- -#include "tt_silicon_driver_common.hpp" -#include "tt_xy_pair.h" -#include #include #include +#include +#include "tt_silicon_driver_common.hpp" +#include "tt_xy_pair.h" struct routing_cmd_t { uint64_t sys_addr; @@ -133,49 +129,53 @@ struct routing_cmd_t { uint16_t rack; uint16_t src_resp_buf_index; uint32_t local_buf_index; - uint8_t src_resp_q_id; - uint8_t host_mem_txn_id; + uint8_t src_resp_q_id; + uint8_t host_mem_txn_id; uint16_t 
padding; - uint32_t src_addr_tag; //upper 32-bits of request source address. + uint32_t src_addr_tag; // upper 32-bits of request source address. }; -struct remote_update_ptr_t{ - uint32_t ptr; - uint32_t pad[3]; +struct remote_update_ptr_t { + uint32_t ptr; + uint32_t pad[3]; }; namespace { - struct tt_4_byte_aligned_buffer { - // Stores a 4 byte aligned buffer - // If the input buffer is already 4 byte aligned, this is a nop - std::uint32_t* local_storage = nullptr; - std::uint32_t input_size = 0; - std::uint32_t block_size = 0; - - tt_4_byte_aligned_buffer(const void* mem_ptr, uint32_t size_in_bytes) { - input_size = size_in_bytes; - local_storage = (uint32_t*)mem_ptr; - uint32_t alignment_mask = sizeof(uint32_t) - 1; - uint32_t aligned_size = (size_in_bytes + alignment_mask) & ~alignment_mask; +struct tt_4_byte_aligned_buffer { + // Stores a 4 byte aligned buffer + // If the input buffer is already 4 byte aligned, this is a nop + std::uint32_t* local_storage = nullptr; + std::uint32_t input_size = 0; + std::uint32_t block_size = 0; - if(size_in_bytes < aligned_size) { - local_storage = new uint32_t[aligned_size / sizeof(uint32_t)]; - } - block_size = aligned_size; + tt_4_byte_aligned_buffer(const void* mem_ptr, uint32_t size_in_bytes) { + input_size = size_in_bytes; + local_storage = (uint32_t*)mem_ptr; + uint32_t alignment_mask = sizeof(uint32_t) - 1; + uint32_t aligned_size = (size_in_bytes + alignment_mask) & ~alignment_mask; + + if (size_in_bytes < aligned_size) { + local_storage = new uint32_t[aligned_size / sizeof(uint32_t)]; } + block_size = aligned_size; + } - ~tt_4_byte_aligned_buffer() { - if(block_size > input_size) { - delete [] local_storage; - } + ~tt_4_byte_aligned_buffer() { + if (block_size > input_size) { + delete[] local_storage; } - }; -} + } +}; +} // namespace namespace tt::umd { -bool Cluster::address_in_tlb_space(uint32_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size, std::uint32_t chip) { - return 
((tlb_config_map.at(chip).find(tlb_index) != tlb_config_map.at(chip).end()) && address >= tlb_config_map.at(chip).at(tlb_index) && (address + size_in_bytes <= tlb_config_map.at(chip).at(tlb_index) + tlb_size)); +bool Cluster::address_in_tlb_space( + uint32_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size, std::uint32_t chip) { + return ( + (tlb_config_map.at(chip).find(tlb_index) != tlb_config_map.at(chip).end()) && + address >= tlb_config_map.at(chip).at(tlb_index) && + (address + size_in_bytes <= tlb_config_map.at(chip).at(tlb_index) + tlb_size)); } std::unordered_map& Cluster::get_virtual_soc_descriptors() { @@ -183,10 +183,10 @@ std::unordered_map& Cluster::get_virtual_soc_descri } void Cluster::initialize_interprocess_mutexes(int pci_interface_id, bool cleanup_mutexes_in_shm) { - // These mutexes are intended to be based on physical devices/pci-intf not logical. Set these up ahead of time here (during device init) - // since its unsafe to modify shared state during multithreaded runtime. - // cleanup_mutexes_in_shm is tied to clean_system_resources from the constructor. The main process is responsible for initializing the driver with this - // field set to cleanup after an aborted process. + // These mutexes are intended to be based on physical devices/pci-intf not logical. Set these up ahead of time here + // (during device init) since its unsafe to modify shared state during multithreaded runtime. cleanup_mutexes_in_shm + // is tied to clean_system_resources from the constructor. The main process is responsible for initializing the + // driver with this field set to cleanup after an aborted process. 
// Store old mask and clear processes umask auto old_umask = umask(0); @@ -195,218 +195,292 @@ void Cluster::initialize_interprocess_mutexes(int pci_interface_id, bool cleanup std::string mutex_name = ""; // Initialize Dynamic TLB mutexes - for(auto &tlb : dynamic_tlb_config) { + for (auto& tlb : dynamic_tlb_config) { mutex_name = tlb.first + std::to_string(pci_interface_id); - if (cleanup_mutexes_in_shm) named_mutex::remove(mutex_name.c_str()); - hardware_resource_mutex_map[mutex_name] = std::make_shared(open_or_create, mutex_name.c_str(), unrestricted_permissions); + if (cleanup_mutexes_in_shm) { + named_mutex::remove(mutex_name.c_str()); + } + hardware_resource_mutex_map[mutex_name] = + std::make_shared(open_or_create, mutex_name.c_str(), unrestricted_permissions); } // Initialize ARC core mutex mutex_name = fmt::format("ARC_MSG{}", pci_interface_id); - if (cleanup_mutexes_in_shm) named_mutex::remove(mutex_name.c_str()); - hardware_resource_mutex_map[mutex_name] = std::make_shared(open_or_create, mutex_name.c_str(), unrestricted_permissions); + if (cleanup_mutexes_in_shm) { + named_mutex::remove(mutex_name.c_str()); + } + hardware_resource_mutex_map[mutex_name] = + std::make_shared(open_or_create, mutex_name.c_str(), unrestricted_permissions); if (arch_name == tt::ARCH::WORMHOLE_B0) { mutex_name = NON_MMIO_MUTEX_NAME + std::to_string(pci_interface_id); - // Initialize non-MMIO mutexes for WH devices regardless of number of chips, since these may be used for ethernet broadcast - if (cleanup_mutexes_in_shm) named_mutex::remove(mutex_name.c_str()); - hardware_resource_mutex_map[mutex_name] = std::make_shared(open_or_create, mutex_name.c_str(), unrestricted_permissions); + // Initialize non-MMIO mutexes for WH devices regardless of number of chips, since these may be used for + // ethernet broadcast + if (cleanup_mutexes_in_shm) { + named_mutex::remove(mutex_name.c_str()); + } + hardware_resource_mutex_map[mutex_name] = + std::make_shared(open_or_create, 
mutex_name.c_str(), unrestricted_permissions); } // Initialize interprocess mutexes to make host -> device memory barriers atomic mutex_name = MEM_BARRIER_MUTEX_NAME + std::to_string(pci_interface_id); - if (cleanup_mutexes_in_shm) named_mutex::remove(mutex_name.c_str()); - hardware_resource_mutex_map[mutex_name] = std::make_shared(open_or_create, mutex_name.c_str(), unrestricted_permissions); - + if (cleanup_mutexes_in_shm) { + named_mutex::remove(mutex_name.c_str()); + } + hardware_resource_mutex_map[mutex_name] = + std::make_shared(open_or_create, mutex_name.c_str(), unrestricted_permissions); + // Restore old mask umask(old_umask); } -void Cluster::create_device(const std::unordered_set &target_mmio_device_ids, const uint32_t &num_host_mem_ch_per_mmio_device, const bool skip_driver_allocs, const bool clean_system_resources) { +void Cluster::create_device( + const std::unordered_set& target_mmio_device_ids, + const uint32_t& num_host_mem_ch_per_mmio_device, + const bool skip_driver_allocs, + const bool clean_system_resources) { log_debug(LogSiliconDriver, "Cluster::Cluster"); // Don't buffer stdout. setbuf(stdout, NULL); - // Just use PCI interface id from physical_device_id given by cluster desc mmio map. For GS, already virtualized to use available devices. + // Just use PCI interface id from physical_device_id given by cluster desc mmio map. For GS, already virtualized to + // use available devices. 
auto logical_to_physical_device_id_map = ndesc->get_chips_with_mmio(); - log_assert(target_mmio_device_ids.size() > 0, "Must provide set of target_mmio_device_ids to Cluster constructor now."); + log_assert( + target_mmio_device_ids.size() > 0, "Must provide set of target_mmio_device_ids to Cluster constructor now."); - for (const chip_id_t &logical_device_id : target_mmio_device_ids) { - log_assert(logical_to_physical_device_id_map.count(logical_device_id) != 0, "Cannot find logical mmio device_id: {} in cluster desc / logical-to-physical-map", logical_device_id); + for (const chip_id_t& logical_device_id : target_mmio_device_ids) { + log_assert( + logical_to_physical_device_id_map.count(logical_device_id) != 0, + "Cannot find logical mmio device_id: {} in cluster desc / logical-to-physical-map", + logical_device_id); int pci_interface_id = logical_to_physical_device_id_map.at(logical_device_id); if (!m_pci_device_map.count(logical_device_id)) { - log_debug(LogSiliconDriver, "Opening TT_PCI_INTERFACE_ID {} for netlist target_device_id: {}", pci_interface_id, logical_device_id); - m_pci_device_map.insert({logical_device_id, std::make_unique(pci_interface_id, logical_device_id)}); + log_debug( + LogSiliconDriver, + "Opening TT_PCI_INTERFACE_ID {} for netlist target_device_id: {}", + pci_interface_id, + logical_device_id); + m_pci_device_map.insert( + {logical_device_id, std::make_unique(pci_interface_id, logical_device_id)}); } auto dev = m_pci_device_map.at(logical_device_id).get(); uint16_t pcie_device_id = dev->get_pci_device_id(); uint32_t pcie_revision = dev->get_pci_revision(); // TODO: get rid of this, it doesn't make any sense. 
- int num_host_mem_channels = get_available_num_host_mem_channels(num_host_mem_ch_per_mmio_device, pcie_device_id, pcie_revision); + int num_host_mem_channels = + get_available_num_host_mem_channels(num_host_mem_ch_per_mmio_device, pcie_device_id, pcie_revision); if (dev->get_arch() == tt::ARCH::BLACKHOLE && num_host_mem_channels > 1) { // TODO: Implement support for multiple host channels on BLACKHOLE. - log_warning(LogSiliconDriver, "Forcing a single channel for Blackhole device. Multiple host channels not supported."); + log_warning( + LogSiliconDriver, + "Forcing a single channel for Blackhole device. Multiple host channels not supported."); num_host_mem_channels = 1; } - log_debug(LogSiliconDriver, "Using {} Hugepages/NumHostMemChannels for PCIDevice (logical_device_id: {} pci_interface_id: {} device_id: 0x{:x} revision: {})", - num_host_mem_channels, logical_device_id, pci_interface_id, pci_device->get_device_num(), pci_device->revision_id); + log_debug( + LogSiliconDriver, + "Using {} Hugepages/NumHostMemChannels for PCIDevice (logical_device_id: {} pci_interface_id: {} " + "device_id: 0x{:x} revision: {})", + num_host_mem_channels, + logical_device_id, + pci_interface_id, + pci_device->get_device_num(), + pci_device->revision_id); initialize_interprocess_mutexes(pci_interface_id, clean_system_resources); // MT: Initial BH - hugepages will fail init // For using silicon driver without workload to query mission mode params, no need for hugepage. - if (!skip_driver_allocs){ + if (!skip_driver_allocs) { // TODO: Implement support for multiple host channels on BLACKHOLE. 
- log_assert(!(arch_name == tt::ARCH::BLACKHOLE && num_host_mem_channels > 1), "More channels are not yet supported for Blackhole"); - bool hugepages_initialized = m_pci_device_map.at(logical_device_id)->init_hugepage(num_host_mem_channels); // Same number of host channels per device for now + log_assert( + !(arch_name == tt::ARCH::BLACKHOLE && num_host_mem_channels > 1), + "More channels are not yet supported for Blackhole"); + // Same number of host channels per device for now + bool hugepages_initialized = m_pci_device_map.at(logical_device_id)->init_hugepage(num_host_mem_channels); // Large writes to remote chips require hugepages to be initialized. - // Conservative assert - end workload if remote chips present but hugepages not initialized (failures caused if using remote only for small transactions) - if(target_remote_chips.size()) { - log_assert(hugepages_initialized, "Hugepages must be successfully initialized if workload contains remote chips!"); + // Conservative assert - end workload if remote chips present but hugepages not initialized (failures caused + // if using remote only for small transactions) + if (target_remote_chips.size()) { + log_assert( + hugepages_initialized, + "Hugepages must be successfully initialized if workload contains remote chips!"); } if (not m_pci_device_map.at(logical_device_id)->get_hugepage_mapping(0).mapping) { log_warning(LogSiliconDriver, "No hugepage mapping at device {}.", logical_device_id); } } - harvested_coord_translation.insert({logical_device_id, create_harvested_coord_translation(arch_name, true)}); //translation layer for harvested coords. Default is identity map + // translation layer for harvested coords. 
Default is identity map + harvested_coord_translation.insert({logical_device_id, create_harvested_coord_translation(arch_name, true)}); } - for(const chip_id_t& chip : target_devices_in_cluster) { + for (const chip_id_t& chip : target_devices_in_cluster) { // Initialize identity mapping for Non-MMIO chips as well - if(!ndesc -> is_chip_mmio_capable(chip)) { + if (!ndesc->is_chip_mmio_capable(chip)) { harvested_coord_translation.insert({chip, create_harvested_coord_translation(arch_name, true)}); flush_non_mmio_per_chip[chip] = false; } } } -bool Cluster::using_harvested_soc_descriptors() { - return perform_harvesting_on_sdesc && performed_harvesting; -} +bool Cluster::using_harvested_soc_descriptors() { return perform_harvesting_on_sdesc && performed_harvesting; } std::unordered_map Cluster::get_harvested_coord_translation_map(chip_id_t logical_device_id) { return harvested_coord_translation.at(logical_device_id); } std::unordered_map Cluster::get_harvesting_masks_for_soc_descriptors() { - if(using_harvested_soc_descriptors()) { + if (using_harvested_soc_descriptors()) { return harvested_rows_per_target; } std::unordered_map default_harvesting_masks = {}; - for(const auto chip : target_devices_in_cluster) default_harvesting_masks.insert({chip, 0}); + for (const auto chip : target_devices_in_cluster) { + default_harvesting_masks.insert({chip, 0}); + } return default_harvesting_masks; } -void Cluster::construct_cluster(const std::string& sdesc_path, const uint32_t &num_host_mem_ch_per_mmio_device, const bool skip_driver_allocs, - const bool clean_system_resources, bool perform_harvesting, std::unordered_map simulated_harvesting_masks) { - +void Cluster::construct_cluster( + const std::string& sdesc_path, + const uint32_t& num_host_mem_ch_per_mmio_device, + const bool skip_driver_allocs, + const bool clean_system_resources, + bool perform_harvesting, + std::unordered_map simulated_harvesting_masks) { std::unordered_set target_mmio_device_ids; - for (auto &d: 
target_devices_in_cluster){ - log_assert(ndesc->get_all_chips().find(d) != ndesc->get_all_chips().end(), "Target device {} not present in current cluster!", d); - if (ndesc->is_chip_mmio_capable(d)){ + for (auto& d : target_devices_in_cluster) { + log_assert( + ndesc->get_all_chips().find(d) != ndesc->get_all_chips().end(), + "Target device {} not present in current cluster!", + d); + if (ndesc->is_chip_mmio_capable(d)) { target_mmio_device_ids.insert(d); - } - else { + } else { target_remote_chips.insert(d); } } - // It is mandatory for all devices to have these TLBs set aside, as the driver needs them to issue remote reads and writes. + // It is mandatory for all devices to have these TLBs set aside, as the driver needs them to issue remote reads and + // writes. auto architecture_implementation = tt::umd::architecture_implementation::create(arch_name); - dynamic_tlb_config["LARGE_READ_TLB"] = architecture_implementation->get_mem_large_read_tlb(); + dynamic_tlb_config["LARGE_READ_TLB"] = architecture_implementation->get_mem_large_read_tlb(); dynamic_tlb_config["LARGE_WRITE_TLB"] = architecture_implementation->get_mem_large_write_tlb(); dynamic_tlb_config["REG_TLB"] = architecture_implementation->get_reg_tlb(); dynamic_tlb_config["SMALL_READ_WRITE_TLB"] = architecture_implementation->get_small_read_write_tlb(); - for(const auto& tlb : dynamic_tlb_config) { - dynamic_tlb_ordering_modes.insert({tlb.first, TLB_DATA::Relaxed}); // All dynamic TLBs use Relaxed Ordering by default; MT: Good for BH + // All dynamic TLBs use Relaxed Ordering by default + for (const auto& tlb : dynamic_tlb_config) { + dynamic_tlb_ordering_modes.insert({tlb.first, TLB_DATA::Relaxed}); } create_device(target_mmio_device_ids, num_host_mem_ch_per_mmio_device, skip_driver_allocs, clean_system_resources); // MT: Initial BH - Disable dependency to ethernet firmware - if(arch_name == tt::ARCH::BLACKHOLE) { + if (arch_name == tt::ARCH::BLACKHOLE) { use_ethernet_ordered_writes = false; 
use_ethernet_broadcast = false; use_virtual_coords_for_eth_broadcast = false; } - if(arch_name == tt::ARCH::WORMHOLE_B0) { - const auto& harvesting_masks = ndesc -> get_harvesting_info(); - const auto& noc_translation_enabled = ndesc -> get_noc_translation_table_en(); + if (arch_name == tt::ARCH::WORMHOLE_B0) { + const auto& harvesting_masks = ndesc->get_harvesting_info(); + const auto& noc_translation_enabled = ndesc->get_noc_translation_table_en(); translation_tables_en = false; - for(auto& masks : harvesting_masks) { - if(target_devices_in_cluster.find(masks.first) != target_devices_in_cluster.end()) { + for (auto& masks : harvesting_masks) { + if (target_devices_in_cluster.find(masks.first) != target_devices_in_cluster.end()) { harvested_rows_per_target[masks.first] = get_harvested_noc_rows(masks.second); noc_translation_enabled_for_chip[masks.first] = noc_translation_enabled.at(masks.first); num_rows_harvested.insert({masks.first, std::bitset<32>(masks.second).count()}); - if(harvested_rows_per_target[masks.first]) { + if (harvested_rows_per_target[masks.first]) { performed_harvesting = true; } } } - if(noc_translation_enabled_for_chip.size() > 0) { - auto const consistent_translation_table_state = [&] (std::pair const& i) { - return noc_translation_enabled_for_chip.begin() -> second == i.second; + if (noc_translation_enabled_for_chip.size() > 0) { + auto const consistent_translation_table_state = [&](std::pair const& i) { + return noc_translation_enabled_for_chip.begin()->second == i.second; }; - bool translation_tables_match_on_all_chips = std::all_of(noc_translation_enabled_for_chip.begin(), noc_translation_enabled_for_chip.end(), consistent_translation_table_state); - log_assert(translation_tables_match_on_all_chips, "Cluster uses NOC translation tables inconsistently across chips."); - translation_tables_en = noc_translation_enabled_for_chip.begin() -> second; + bool translation_tables_match_on_all_chips = std::all_of( + 
noc_translation_enabled_for_chip.begin(), + noc_translation_enabled_for_chip.end(), + consistent_translation_table_state); + log_assert( + translation_tables_match_on_all_chips, + "Cluster uses NOC translation tables inconsistently across chips."); + translation_tables_en = noc_translation_enabled_for_chip.begin()->second; } - if(translation_tables_en) { + if (translation_tables_en) { harvested_coord_translation.clear(); - for(const chip_id_t& chip : target_devices_in_cluster) { + for (const chip_id_t& chip : target_devices_in_cluster) { harvested_coord_translation.insert({chip, create_harvested_coord_translation(arch_name, false)}); } } - log_assert(performed_harvesting ? translation_tables_en : true, "Using a harvested WH cluster with NOC translation disabled."); - } - else if(arch_name == tt::ARCH::BLACKHOLE) { + log_assert( + performed_harvesting ? translation_tables_en : true, + "Using a harvested WH cluster with NOC translation disabled."); + } else if (arch_name == tt::ARCH::BLACKHOLE) { // Default harvesting info for Blackhole, describing no harvesting - for(auto chip_id = target_devices_in_cluster.begin(); chip_id != target_devices_in_cluster.end(); chip_id++){ - harvested_rows_per_target[*chip_id] = 0; //get_harvested_noc_rows_for_chip(*chip_id); - num_rows_harvested.insert({*chip_id, 0}); // Only set for broadcast TLB to get RISCS out of reset. We want all rows to have a reset signal sent. - if(harvested_rows_per_target[*chip_id]) { + for (auto chip_id = target_devices_in_cluster.begin(); chip_id != target_devices_in_cluster.end(); chip_id++) { + harvested_rows_per_target[*chip_id] = 0; // get_harvested_noc_rows_for_chip(*chip_id); + num_rows_harvested.insert({*chip_id, 0}); // Only set for broadcast TLB to get RISCS out of reset. We want + // all rows to have a reset signal sent. 
+ if (harvested_rows_per_target[*chip_id]) { performed_harvesting = true; } } - } - else if(arch_name == tt::ARCH::GRAYSKULL) { + } else if (arch_name == tt::ARCH::GRAYSKULL) { // Multichip harvesting is supported for GS. - for(auto chip_id = target_devices_in_cluster.begin(); chip_id != target_devices_in_cluster.end(); chip_id++){ - harvested_rows_per_target[*chip_id] = get_harvested_noc_rows_for_chip(*chip_id); - num_rows_harvested.insert({*chip_id, 0}); // Only set for broadcast TLB to get RISCS out of reset. We want all rows to have a reset signal sent. - if(harvested_rows_per_target[*chip_id]) { + for (auto chip_id = target_devices_in_cluster.begin(); chip_id != target_devices_in_cluster.end(); chip_id++) { + harvested_rows_per_target[*chip_id] = get_harvested_noc_rows_for_chip(*chip_id); + num_rows_harvested.insert({*chip_id, 0}); // Only set for broadcast TLB to get RISCS out of reset. We want + // all rows to have a reset signal sent. + if (harvested_rows_per_target[*chip_id]) { performed_harvesting = true; } } } - if(simulated_harvesting_masks.size()) { + if (simulated_harvesting_masks.size()) { performed_harvesting = true; - for (auto device_id = target_devices_in_cluster.begin(); device_id != target_devices_in_cluster.end(); device_id++) { - log_assert(simulated_harvesting_masks.find(*device_id) != simulated_harvesting_masks.end(), "Could not find harvesting mask for device_id {}", *device_id); - if(arch_name == tt::ARCH::GRAYSKULL) { - if ((simulated_harvesting_masks.at(*device_id) & harvested_rows_per_target[*device_id]) != harvested_rows_per_target[*device_id]) { - log_warning(LogSiliconDriver, - "Simulated harvesting config for device {} does not include the actual harvesting config. Simulated harvesting mask will be added to the real harvesting mask. 
Actual Harvested Rows : {} Simulated Harvested Rows : {}", - *device_id, harvested_rows_per_target[*device_id], simulated_harvesting_masks.at(*device_id)); + for (auto device_id = target_devices_in_cluster.begin(); device_id != target_devices_in_cluster.end(); + device_id++) { + log_assert( + simulated_harvesting_masks.find(*device_id) != simulated_harvesting_masks.end(), + "Could not find harvesting mask for device_id {}", + *device_id); + if (arch_name == tt::ARCH::GRAYSKULL) { + if ((simulated_harvesting_masks.at(*device_id) & harvested_rows_per_target[*device_id]) != + harvested_rows_per_target[*device_id]) { + log_warning( + LogSiliconDriver, + "Simulated harvesting config for device {} does not include the actual harvesting config. " + "Simulated harvesting mask will be added to the real harvesting mask. Actual Harvested Rows : " + "{} Simulated Harvested Rows : {}", + *device_id, + harvested_rows_per_target[*device_id], + simulated_harvesting_masks.at(*device_id)); } simulated_harvesting_masks.at(*device_id) |= harvested_rows_per_target[*device_id]; - } - else if(arch_name == tt::ARCH::WORMHOLE_B0) { - log_assert(std::bitset<32>(simulated_harvesting_masks.at(*device_id)).count() >= std::bitset<32>(harvested_rows_per_target[*device_id]).count(), - "Simulated Harvesting for WH must contain at least as many rows as the actual harvesting config. Actual Harvested Rows : {} Simulated Harvested Rows : {}", - harvested_rows_per_target[*device_id], simulated_harvesting_masks.at(*device_id)); - num_rows_harvested.at(*device_id) = std::bitset<32>(simulated_harvesting_masks.at(*device_id)).count(); - log_assert(performed_harvesting ? 
translation_tables_en : true, "Using a harvested WH cluster with NOC translation disabled."); + } else if (arch_name == tt::ARCH::WORMHOLE_B0) { + log_assert( + std::bitset<32>(simulated_harvesting_masks.at(*device_id)).count() >= + std::bitset<32>(harvested_rows_per_target[*device_id]).count(), + "Simulated Harvesting for WH must contain at least as many rows as the actual harvesting config. " + "Actual Harvested Rows : {} Simulated Harvested Rows : {}", + harvested_rows_per_target[*device_id], + simulated_harvesting_masks.at(*device_id)); + num_rows_harvested.at(*device_id) = std::bitset<32>(simulated_harvesting_masks.at(*device_id)).count(); + log_assert( + performed_harvesting ? translation_tables_en : true, + "Using a harvested WH cluster with NOC translation disabled."); } harvested_rows_per_target[*device_id] = simulated_harvesting_masks.at(*device_id); } @@ -416,18 +490,18 @@ void Cluster::construct_cluster(const std::string& sdesc_path, const uint32_t &n populate_cores(); // MT: Initial BH - skip this for BH - if(arch_name == tt::ARCH::WORMHOLE_B0) { + if (arch_name == tt::ARCH::WORMHOLE_B0) { remote_transfer_ethernet_cores.resize(target_mmio_device_ids.size()); - for (const auto &logical_mmio_chip_id : target_mmio_device_ids) { + for (const auto& logical_mmio_chip_id : target_mmio_device_ids) { const tt_SocDescriptor& soc_desc = get_soc_descriptor(logical_mmio_chip_id); // 4-5 is for send_epoch_commands, 0-3 are for everything else for (std::uint32_t i = 0; i < NUM_ETH_CORES_FOR_NON_MMIO_TRANSFERS; i++) { - if(remote_transfer_ethernet_cores.size() <= logical_mmio_chip_id) { + if (remote_transfer_ethernet_cores.size() <= logical_mmio_chip_id) { remote_transfer_ethernet_cores.resize(logical_mmio_chip_id + 1); } - remote_transfer_ethernet_cores.at(logical_mmio_chip_id).push_back( - tt_cxy_pair(logical_mmio_chip_id, soc_desc.ethernet_cores.at(i).x, soc_desc.ethernet_cores.at(i).y) - ); + remote_transfer_ethernet_cores.at(logical_mmio_chip_id) + 
.push_back(tt_cxy_pair( + logical_mmio_chip_id, soc_desc.ethernet_cores.at(i).x, soc_desc.ethernet_cores.at(i).y)); } } } @@ -442,15 +516,20 @@ void Cluster::construct_cluster(const std::string& sdesc_path, const uint32_t &n noc_params = architecture_implementation->get_noc_params(); } -Cluster::Cluster(const uint32_t &num_host_mem_ch_per_mmio_device, const bool skip_driver_allocs, - const bool clean_system_resources, bool perform_harvesting, std::unordered_map simulated_harvesting_masks) : tt_device() { +Cluster::Cluster( + const uint32_t& num_host_mem_ch_per_mmio_device, + const bool skip_driver_allocs, + const bool clean_system_resources, + bool perform_harvesting, + std::unordered_map simulated_harvesting_masks) : + tt_device() { // TODO: this should be fetched through ClusterDescriptor auto available_device_ids = detect_available_device_ids(); m_num_pci_devices = available_device_ids.size(); - + int physical_device_id = available_device_ids[0]; // TODO: remove logical_device_id - PCIDevice pci_device (physical_device_id, 0); + PCIDevice pci_device(physical_device_id, 0); tt::ARCH device_arch = pci_device.get_arch(); std::string sdesc_path = tt_SocDescriptor::get_soc_descriptor_path(device_arch); @@ -459,7 +538,12 @@ Cluster::Cluster(const uint32_t &num_host_mem_ch_per_mmio_device, const bool ski perform_harvesting_on_sdesc = perform_harvesting; if (!skip_driver_allocs) { - log_info(LogSiliconDriver, "Detected {} PCI device{} : {}", m_num_pci_devices, (m_num_pci_devices > 1) ? "s":"", available_device_ids); + log_info( + LogSiliconDriver, + "Detected {} PCI device{} : {}", + m_num_pci_devices, + (m_num_pci_devices > 1) ? 
"s" : "", + available_device_ids); log_debug(LogSiliconDriver, "Passed target devices: {}", target_devices); } @@ -467,23 +551,35 @@ Cluster::Cluster(const uint32_t &num_host_mem_ch_per_mmio_device, const bool ski ndesc = tt_ClusterDescriptor::create_from_yaml(ndesc_path); std::set target_devices; - for(const chip_id_t &d : ndesc->get_all_chips()) { + for (const chip_id_t& d : ndesc->get_all_chips()) { target_devices.insert(d); } target_devices_in_cluster = target_devices; - construct_cluster(sdesc_path, num_host_mem_ch_per_mmio_device, skip_driver_allocs, clean_system_resources, perform_harvesting, simulated_harvesting_masks); -} - -Cluster::Cluster(const std::set &target_devices, const uint32_t &num_host_mem_ch_per_mmio_device, const bool skip_driver_allocs, - const bool clean_system_resources, bool perform_harvesting, std::unordered_map simulated_harvesting_masks) : tt_device() { + construct_cluster( + sdesc_path, + num_host_mem_ch_per_mmio_device, + skip_driver_allocs, + clean_system_resources, + perform_harvesting, + simulated_harvesting_masks); +} + +Cluster::Cluster( + const std::set& target_devices, + const uint32_t& num_host_mem_ch_per_mmio_device, + const bool skip_driver_allocs, + const bool clean_system_resources, + bool perform_harvesting, + std::unordered_map simulated_harvesting_masks) : + tt_device() { // TODO: this should be fetched through ClusterDescriptor auto available_device_ids = detect_available_device_ids(); m_num_pci_devices = available_device_ids.size(); - + int physical_device_id = available_device_ids[0]; // TODO: remove logical_device_id - PCIDevice pci_device (physical_device_id, 0); + PCIDevice pci_device(physical_device_id, 0); tt::ARCH device_arch = pci_device.get_arch(); std::string sdesc_path = tt_SocDescriptor::get_soc_descriptor_path(device_arch); @@ -492,7 +588,12 @@ Cluster::Cluster(const std::set &target_devices, const uint32_t &num_ perform_harvesting_on_sdesc = perform_harvesting; if (!skip_driver_allocs) { - 
log_info(LogSiliconDriver, "Detected {} PCI device{} : {}", m_num_pci_devices, (m_num_pci_devices > 1) ? "s":"", available_device_ids); + log_info( + LogSiliconDriver, + "Detected {} PCI device{} : {}", + m_num_pci_devices, + (m_num_pci_devices > 1) ? "s" : "", + available_device_ids); log_debug(LogSiliconDriver, "Passed target devices: {}", target_devices); } @@ -501,12 +602,25 @@ Cluster::Cluster(const std::set &target_devices, const uint32_t &num_ target_devices_in_cluster = target_devices; - construct_cluster(sdesc_path, num_host_mem_ch_per_mmio_device, skip_driver_allocs, clean_system_resources, perform_harvesting, simulated_harvesting_masks); -} - -Cluster::Cluster(const std::string &sdesc_path, const std::string &ndesc_path, const std::set &target_devices, - const uint32_t &num_host_mem_ch_per_mmio_device, const bool skip_driver_allocs, - const bool clean_system_resources, bool perform_harvesting, std::unordered_map simulated_harvesting_masks) : tt_device() { + construct_cluster( + sdesc_path, + num_host_mem_ch_per_mmio_device, + skip_driver_allocs, + clean_system_resources, + perform_harvesting, + simulated_harvesting_masks); +} + +Cluster::Cluster( + const std::string& sdesc_path, + const std::string& ndesc_path, + const std::set& target_devices, + const uint32_t& num_host_mem_ch_per_mmio_device, + const bool skip_driver_allocs, + const bool clean_system_resources, + bool perform_harvesting, + std::unordered_map simulated_harvesting_masks) : + tt_device() { // TODO: this should be fetched through ClusterDescriptor auto available_device_ids = detect_available_device_ids(); m_num_pci_devices = available_device_ids.size(); @@ -516,7 +630,12 @@ Cluster::Cluster(const std::string &sdesc_path, const std::string &ndesc_path, c perform_harvesting_on_sdesc = perform_harvesting; if (!skip_driver_allocs) { - log_info(LogSiliconDriver, "Detected {} PCI device{} : {}", m_num_pci_devices, (m_num_pci_devices > 1) ? 
"s":"", available_device_ids); + log_info( + LogSiliconDriver, + "Detected {} PCI device{} : {}", + m_num_pci_devices, + (m_num_pci_devices > 1) ? "s" : "", + available_device_ids); log_debug(LogSiliconDriver, "Passed target devices: {}", target_devices); } @@ -527,20 +646,32 @@ Cluster::Cluster(const std::string &sdesc_path, const std::string &ndesc_path, c ndesc = tt_ClusterDescriptor::create_from_yaml(cluster_descriptor_path); - construct_cluster(sdesc_path, num_host_mem_ch_per_mmio_device, skip_driver_allocs, clean_system_resources, perform_harvesting, simulated_harvesting_masks); + construct_cluster( + sdesc_path, + num_host_mem_ch_per_mmio_device, + skip_driver_allocs, + clean_system_resources, + perform_harvesting, + simulated_harvesting_masks); } -void Cluster::configure_active_ethernet_cores_for_mmio_device(chip_id_t mmio_chip, const std::unordered_set& active_eth_cores_per_chip) { +void Cluster::configure_active_ethernet_cores_for_mmio_device( + chip_id_t mmio_chip, const std::unordered_set& active_eth_cores_per_chip) { // Makes UMD aware of which ethernet cores have active links. // Based on this information, UMD determines which ethernet cores can be used for host->cluster non-MMIO transfers. - // This overrides the default ethernet cores tagged for host to cluster routing in the constructor and must be called for all MMIO devices, if default behaviour - // is not desired. - log_assert(get_soc_descriptor(mmio_chip).arch == tt::ARCH::WORMHOLE_B0, "{} can only be called for Wormhole arch", __FUNCTION__); + // This overrides the default ethernet cores tagged for host to cluster routing in the constructor and must be + // called for all MMIO devices, if default behaviour is not desired. 
+ log_assert( + get_soc_descriptor(mmio_chip).arch == tt::ARCH::WORMHOLE_B0, + "{} can only be called for Wormhole arch", + __FUNCTION__); auto& eth_cores = get_soc_descriptor(mmio_chip).ethernet_cores; // Cores 0, 1, 6, 7 are only available if in the active set - static std::unordered_set eth_cores_available_if_active = {eth_cores.at(0), eth_cores.at(1), eth_cores.at(6), eth_cores.at(7)}; + static std::unordered_set eth_cores_available_if_active = { + eth_cores.at(0), eth_cores.at(1), eth_cores.at(6), eth_cores.at(7)}; // Eth cores 8 and 9 are always available - std::vector non_mmio_access_cores_for_chip = {tt_cxy_pair(mmio_chip, eth_cores.at(8)), tt_cxy_pair(mmio_chip, eth_cores.at(9))}; + std::vector non_mmio_access_cores_for_chip = { + tt_cxy_pair(mmio_chip, eth_cores.at(8)), tt_cxy_pair(mmio_chip, eth_cores.at(9))}; for (const auto& active_eth_core : active_eth_cores_per_chip) { if (eth_cores_available_if_active.find(active_eth_core) != eth_cores_available_if_active.end()) { non_mmio_access_cores_for_chip.push_back(tt_cxy_pair(mmio_chip, active_eth_core)); @@ -554,27 +685,33 @@ void Cluster::configure_active_ethernet_cores_for_mmio_device(chip_id_t mmio_chi void Cluster::populate_cores() { std::uint32_t count = 0; - for(const auto chip : soc_descriptor_per_chip) { - workers_per_chip.insert({chip.first, std::unordered_set(chip.second.workers.begin(), chip.second.workers.end())}); - if(count == 0) { - eth_cores = std::unordered_set(chip.second.ethernet_cores.begin(), chip.second.ethernet_cores.end()); - for(std::uint32_t dram_idx = 0; dram_idx < chip.second.get_num_dram_channels(); dram_idx++) { - dram_cores.insert(chip.second.get_core_for_dram_channel(dram_idx, 0)) ; + for (const auto chip : soc_descriptor_per_chip) { + workers_per_chip.insert( + {chip.first, std::unordered_set(chip.second.workers.begin(), chip.second.workers.end())}); + if (count == 0) { + eth_cores = + std::unordered_set(chip.second.ethernet_cores.begin(), chip.second.ethernet_cores.end()); + 
for (std::uint32_t dram_idx = 0; dram_idx < chip.second.get_num_dram_channels(); dram_idx++) { + dram_cores.insert(chip.second.get_core_for_dram_channel(dram_idx, 0)); } } count++; } } -std::vector Cluster::extract_rows_to_remove(const tt::ARCH &arch, const int worker_grid_rows, const int harvested_rows) { +std::vector Cluster::extract_rows_to_remove( + const tt::ARCH& arch, const int worker_grid_rows, const int harvested_rows) { // Check if harvesting config is legal for GS and WH - log_assert(!((harvested_rows & 1) || (harvested_rows & 64) || (harvested_rows & 0xFFFFF000)), "For grayskull and wormhole, only rows 1-5 and 7-11 can be harvested"); + log_assert( + !((harvested_rows & 1) || (harvested_rows & 64) || (harvested_rows & 0xFFFFF000)), + "For grayskull and wormhole, only rows 1-5 and 7-11 can be harvested"); std::vector row_coordinates_to_remove; int row_coordinate = 0; int tmp = harvested_rows; while (tmp) { - if (tmp & 1) + if (tmp & 1) { row_coordinates_to_remove.push_back(row_coordinate); + } tmp = tmp >> 1; row_coordinate++; @@ -588,13 +725,14 @@ std::vector Cluster::extract_rows_to_remove(const tt::ARCH &arch, const int return row_coordinates_to_remove; } -void Cluster::remove_worker_row_from_descriptor(tt_SocDescriptor& full_soc_descriptor, const std::vector& row_coordinates_to_remove) { +void Cluster::remove_worker_row_from_descriptor( + tt_SocDescriptor& full_soc_descriptor, const std::vector& row_coordinates_to_remove) { std::vector workers_to_keep; - for(auto worker = (full_soc_descriptor.workers).begin(); worker != (full_soc_descriptor.workers).end(); worker++){ - if(find(row_coordinates_to_remove.begin(), row_coordinates_to_remove.end(), (*worker).y) == row_coordinates_to_remove.end()){ + for (auto worker = (full_soc_descriptor.workers).begin(); worker != (full_soc_descriptor.workers).end(); worker++) { + if (find(row_coordinates_to_remove.begin(), row_coordinates_to_remove.end(), (*worker).y) == + row_coordinates_to_remove.end()) { 
workers_to_keep.push_back(*worker); - } - else{ + } else { (full_soc_descriptor.harvested_workers).push_back(*worker); full_soc_descriptor.cores.at(*worker).type = CoreType::HARVESTED; } @@ -606,28 +744,32 @@ void Cluster::remove_worker_row_from_descriptor(tt_SocDescriptor& full_soc_descr std::set modified_y_coords = {}; - for(const auto& core : full_soc_descriptor.workers) { + for (const auto& core : full_soc_descriptor.workers) { modified_y_coords.insert(core.y); } int logical_y_coord = 0; - for(const auto& y_coord : modified_y_coords) { + for (const auto& y_coord : modified_y_coords) { full_soc_descriptor.routing_y_to_worker_y.insert({y_coord, logical_y_coord}); - full_soc_descriptor.worker_log_to_routing_y.insert({logical_y_coord, y_coord}); + full_soc_descriptor.worker_log_to_routing_y.insert({logical_y_coord, y_coord}); logical_y_coord++; } } void Cluster::harvest_rows_in_soc_descriptor(tt::ARCH arch, tt_SocDescriptor& sdesc, uint32_t harvested_rows) { - std::uint32_t max_row_to_remove = (*std::max_element((sdesc.workers).begin(), (sdesc.workers).end(), [] (const auto& a, const auto& b) { return a.y < b.y; })).y; + std::uint32_t max_row_to_remove = + (*std::max_element((sdesc.workers).begin(), (sdesc.workers).end(), [](const auto& a, const auto& b) { + return a.y < b.y; + })).y; std::vector row_coordinates_to_remove = extract_rows_to_remove(arch, max_row_to_remove, harvested_rows); remove_worker_row_from_descriptor(sdesc, row_coordinates_to_remove); } -void Cluster::perform_harvesting_and_populate_soc_descriptors(const std::string& sdesc_path, const bool perform_harvesting) { +void Cluster::perform_harvesting_and_populate_soc_descriptors( + const std::string& sdesc_path, const bool perform_harvesting) { const auto default_sdesc = tt_SocDescriptor(sdesc_path); - for(const auto& chip : harvested_rows_per_target) { + for (const auto& chip : harvested_rows_per_target) { auto temp_sdesc = default_sdesc; - if(perform_harvesting) { + if (perform_harvesting) { 
harvest_rows_in_soc_descriptor(arch_name, temp_sdesc, chip.second); } soc_descriptor_per_chip.insert({chip.first, temp_sdesc}); @@ -635,25 +777,24 @@ void Cluster::perform_harvesting_and_populate_soc_descriptors(const std::string& } void Cluster::check_pcie_device_initialized(int device_id) { - - PCIDevice *pci_device = get_pci_device(device_id); + PCIDevice* pci_device = get_pci_device(device_id); tt::ARCH device_arch = pci_device->get_arch(); if (arch_name == tt::ARCH::GRAYSKULL) { if (device_arch != tt::ARCH::GRAYSKULL) { - throw std::runtime_error(fmt::format("Attempted to run grayskull configured tt_device on {}", get_arch_str(device_arch))); + throw std::runtime_error( + fmt::format("Attempted to run grayskull configured tt_device on {}", get_arch_str(device_arch))); } - } - else if (arch_name == tt::ARCH::WORMHOLE_B0) { + } else if (arch_name == tt::ARCH::WORMHOLE_B0) { if (device_arch != tt::ARCH::WORMHOLE_B0) { - throw std::runtime_error(fmt::format("Attempted to run wormhole configured tt_device on {}", get_arch_str(device_arch))); + throw std::runtime_error( + fmt::format("Attempted to run wormhole configured tt_device on {}", get_arch_str(device_arch))); } - } - else if (arch_name == tt::ARCH::BLACKHOLE) { + } else if (arch_name == tt::ARCH::BLACKHOLE) { if (device_arch != tt::ARCH::BLACKHOLE) { - throw std::runtime_error(fmt::format("Attempted to run blackhole configured tt_device on {}", get_arch_str(device_arch))); + throw std::runtime_error( + fmt::format("Attempted to run blackhole configured tt_device on {}", get_arch_str(device_arch))); } - } - else { + } else { throw std::runtime_error(fmt::format("Unsupported architecture: {}", get_arch_str(arch_name))); } auto architecture_implementation = pci_device->get_architecture_implementation(); @@ -661,29 +802,36 @@ void Cluster::check_pcie_device_initialized(int device_id) { // MT Initial BH - Add check for blackhole once access to ARC registers is setup through TLBs if (arch_name != 
tt::ARCH::BLACKHOLE) { log_debug(LogSiliconDriver, "== Check if device_id: {} is initialized", device_id); - uint32_t bar_read_initial = bar_read32(device_id, architecture_implementation->get_arc_reset_scratch_offset() + 3 * 4); + uint32_t bar_read_initial = + bar_read32(device_id, architecture_implementation->get_arc_reset_scratch_offset() + 3 * 4); uint32_t arg = bar_read_initial == 500 ? 325 : 500; uint32_t bar_read_again; - uint32_t arc_msg_return = arc_msg(device_id, 0xaa00 | architecture_implementation->get_arc_message_test(), true, arg, 0, 1, &bar_read_again); + uint32_t arc_msg_return = arc_msg( + device_id, 0xaa00 | architecture_implementation->get_arc_message_test(), true, arg, 0, 1, &bar_read_again); if (arc_msg_return != 0 || bar_read_again != arg + 1) { auto postcode = bar_read32(device_id, architecture_implementation->get_arc_reset_scratch_offset()); - throw std::runtime_error(fmt::format("Device is not initialized: arc_fw postcode: {} arc_msg_return: {} arg: {} bar_read_initial: {} bar_read_again: {}", - postcode, - arc_msg_return, - arg, - bar_read_initial, - bar_read_again)); + throw std::runtime_error(fmt::format( + "Device is not initialized: arc_fw postcode: {} arc_msg_return: {} arg: {} bar_read_initial: {} " + "bar_read_again: {}", + postcode, + arc_msg_return, + arg, + bar_read_initial, + bar_read_again)); } } - if (test_setup_interface()) { - throw std::runtime_error("Device is incorrectly initialized. If this is a harvested Wormhole machine, it is likely that NOC Translation Tables are not enabled on device. These need to be enabled for the silicon driver to run."); + throw std::runtime_error( + "Device is incorrectly initialized. If this is a harvested Wormhole machine, it is likely that NOC " + "Translation Tables are not enabled on device. These need to be enabled for the silicon driver to run."); } } -std::unordered_map Cluster::create_harvested_coord_translation(const tt::ARCH arch, bool identity_map) { - log_assert(identity_map ? 
true : (arch != tt::ARCH::GRAYSKULL), "NOC Translation can only be performed for WH devices"); +std::unordered_map Cluster::create_harvested_coord_translation( + const tt::ARCH arch, bool identity_map) { + log_assert( + identity_map ? true : (arch != tt::ARCH::GRAYSKULL), "NOC Translation can only be performed for WH devices"); std::unordered_map translation_table = {}; tt_xy_pair grid_size; @@ -691,29 +839,29 @@ std::unordered_map Cluster::create_harvested_coord_trans std::vector T6_y = {}; std::vector ethernet = {}; // Store device specific data for GS and WH depending on arch - if(arch == tt::ARCH::GRAYSKULL) { + if (arch == tt::ARCH::GRAYSKULL) { grid_size = tt_xy_pair(13, 12); T6_x = {12, 1, 11, 2, 10, 3, 9, 4, 8, 5, 7, 6}; T6_y = {11, 1, 10, 2, 9, 3, 8, 4, 7, 5}; - } - else if (arch == tt::ARCH::BLACKHOLE) { + } else if (arch == tt::ARCH::BLACKHOLE) { grid_size = tt_xy_pair(17, 12); T6_x = {16, 1, 15, 2, 14, 3, 13, 4, 12, 5, 11, 6, 10, 7}; T6_y = {11, 2, 10, 3, 9, 4, 8, 5, 7, 6}; - } - else { + } else { grid_size = tt_xy_pair(10, 12); T6_x = {1, 2, 3, 4, 6, 7, 8, 9}; T6_y = {1, 2, 3, 4, 5, 7, 8, 9, 10, 11}; - ethernet = {{1, 0}, {2, 0}, {3, 0}, {4, 0}, {6, 0}, {7, 0}, {8, 0}, {9, 0}, {1, 6}, {2, 6}, {3, 6}, {4, 6}, {6, 6}, {7, 6}, {8, 6}, {9, 6}}; + // clang-format off + ethernet = {{1, 0}, {2, 0}, {3, 0}, {4, 0}, {6, 0}, {7, 0}, {8, 0}, {9, 0}, + {1, 6}, {2, 6}, {3, 6}, {4, 6}, {6, 6}, {7, 6}, {8, 6}, {9, 6}}; + // clang-format on } - - if(identity_map) { + if (identity_map) { // When device is initialized, assume no harvesting and create an identity map for cores // This flow is always used for GS, since there is no hardware harvesting - for(int x = 0; x < grid_size.x; x++) { - for(int y = 0; y < grid_size.y; y++) { + for (int x = 0; x < grid_size.x; x++) { + for (int y = 0; y < grid_size.y; y++) { tt_xy_pair curr_core = tt_xy_pair(x, y); translation_table.insert({curr_core, curr_core}); } @@ -724,34 +872,50 @@ std::unordered_map 
Cluster::create_harvested_coord_trans // If this function is called with identity_map = false, we have perform NOC translation // This can only happen for WH devices // Setup coord translation for workers. Map all worker cores - for(int x = 0; x < grid_size.x; x++) { - for(int y = 0; y < grid_size.y; y++) { + for (int x = 0; x < grid_size.x; x++) { + for (int y = 0; y < grid_size.y; y++) { tt_xy_pair curr_core = tt_xy_pair(x, y); - if(std::find(T6_x.begin(), T6_x.end(), x) != T6_x.end() && - std::find(T6_y.begin(), T6_y.end(), y) != T6_y.end()) { + if (std::find(T6_x.begin(), T6_x.end(), x) != T6_x.end() && + std::find(T6_y.begin(), T6_y.end(), y) != T6_y.end()) { // This is a worker core. Apply translation for WH. tt_xy_pair harvested_worker; - if(x >= 1 && x <= 4) harvested_worker.x = x + 17; - else if(x <= 9 && x > 5) harvested_worker.x = x + 16; - else log_assert(false, "Invalid WH worker x coord {} when creating translation tables.", x); + if (x >= 1 && x <= 4) { + harvested_worker.x = x + 17; + } else if (x <= 9 && x > 5) { + harvested_worker.x = x + 16; + } else { + log_assert(false, "Invalid WH worker x coord {} when creating translation tables.", x); + } - if(y >= 1 && y <= 5) harvested_worker.y = y + 17; - else if(y <= 11 && y > 6) harvested_worker.y = y + 16; - else log_assert(false, "Invalid WH worker y coord {} when creating translation tables.", y); + if (y >= 1 && y <= 5) { + harvested_worker.y = y + 17; + } else if (y <= 11 && y > 6) { + harvested_worker.y = y + 16; + } else { + log_assert(false, "Invalid WH worker y coord {} when creating translation tables.", y); + } translation_table.insert({curr_core, harvested_worker}); } - else if(std::find(ethernet.begin(), ethernet.end(), curr_core) != ethernet.end()){ + else if (std::find(ethernet.begin(), ethernet.end(), curr_core) != ethernet.end()) { // This is an eth core. Apply translation for WH. 
tt_xy_pair harvested_eth_core; - if(x >= 1 && x <= 4) harvested_eth_core.x = x + 17; - else if(x <= 9 && x > 5) harvested_eth_core.x = x + 16; - else log_assert(false, "Invalid WH eth_core x coord {} when creating translation tables.", x); + if (x >= 1 && x <= 4) { + harvested_eth_core.x = x + 17; + } else if (x <= 9 && x > 5) { + harvested_eth_core.x = x + 16; + } else { + log_assert(false, "Invalid WH eth_core x coord {} when creating translation tables.", x); + } - if(y == 0) harvested_eth_core.y = y + 16; - else if(y == 6) harvested_eth_core.y = y + 11; - else log_assert(false, "Invalid WH eth_core y coord {} when creating translation tables.", y); + if (y == 0) { + harvested_eth_core.y = y + 16; + } else if (y == 6) { + harvested_eth_core.y = y + 11; + } else { + log_assert(false, "Invalid WH eth_core y coord {} when creating translation tables.", y); + } translation_table.insert({curr_core, harvested_eth_core}); } @@ -764,7 +928,7 @@ std::unordered_map Cluster::create_harvested_coord_trans return translation_table; } -void Cluster::translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c) { +void Cluster::translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c) { auto translated_coords = harvested_coord_translation[device_id].at(tt_xy_pair(c, r)); c = translated_coords.x; r = translated_coords.y; @@ -773,7 +937,7 @@ void Cluster::translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, void Cluster::initialize_pcie_devices() { log_debug(LogSiliconDriver, "Cluster::start"); - for (auto &device_it : m_pci_device_map){ + for (auto& device_it : m_pci_device_map) { check_pcie_device_initialized(device_it.first); } @@ -782,7 +946,7 @@ void Cluster::initialize_pcie_devices() { init_membars(); } -void Cluster::broadcast_pcie_tensix_risc_reset(chip_id_t chip_id, const TensixSoftResetOptions &soft_resets) { +void Cluster::broadcast_pcie_tensix_risc_reset(chip_id_t chip_id, const TensixSoftResetOptions& 
soft_resets) { log_debug(LogSiliconDriver, "Cluster::broadcast_tensix_risc_reset"); PCIDevice* device = get_pci_device(chip_id); @@ -790,7 +954,10 @@ void Cluster::broadcast_pcie_tensix_risc_reset(chip_id_t chip_id, const TensixSo auto valid = soft_resets & ALL_TENSIX_SOFT_RESET; auto logical_id = device->get_logical_id(); - log_debug(LogSiliconDriver, "== For all tensix set soft-reset for {} risc cores.", TensixSoftResetOptionsToString(valid).c_str()); + log_debug( + LogSiliconDriver, + "== For all tensix set soft-reset for {} risc cores.", + TensixSoftResetOptionsToString(valid).c_str()); auto architecture_implementation = device->get_architecture_implementation(); @@ -809,77 +976,87 @@ void Cluster::broadcast_pcie_tensix_risc_reset(chip_id_t chip_id, const TensixSo } std::set Cluster::get_target_mmio_device_ids() { - if(!all_target_mmio_devices.size()) { - for (const auto &it: m_pci_device_map) { + if (!all_target_mmio_devices.size()) { + for (const auto& it : m_pci_device_map) { all_target_mmio_devices.insert(it.first); } } return all_target_mmio_devices; } -void Cluster::assert_risc_reset() { - broadcast_tensix_risc_reset_to_cluster(TENSIX_ASSERT_SOFT_RESET); -} +void Cluster::assert_risc_reset() { broadcast_tensix_risc_reset_to_cluster(TENSIX_ASSERT_SOFT_RESET); } -void Cluster::deassert_risc_reset() { - broadcast_tensix_risc_reset_to_cluster(TENSIX_DEASSERT_SOFT_RESET); -} +void Cluster::deassert_risc_reset() { broadcast_tensix_risc_reset_to_cluster(TENSIX_DEASSERT_SOFT_RESET); } -void Cluster::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets) { - std::uint32_t target_device = core.chip; // Get Target Device to query soc descriptor and determine location in cluster - log_assert(std::find(get_soc_descriptor(target_device).workers.begin(), get_soc_descriptor(target_device).workers.end(), core) != get_soc_descriptor(target_device).workers.end() || - std::find(get_soc_descriptor(target_device).ethernet_cores.begin(), 
get_soc_descriptor(target_device).ethernet_cores.end(), core) != get_soc_descriptor(target_device).ethernet_cores.end(), - "Cannot deassert reset on a non-tensix or harvested core"); - bool target_is_mmio_capable = ndesc -> is_chip_mmio_capable(target_device); - if(target_is_mmio_capable) { - log_assert(m_pci_device_map.find(target_device) != m_pci_device_map.end(), "Could not find MMIO mapped device in devices connected over PCIe"); +void Cluster::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions& soft_resets) { + // Get Target Device to query soc descriptor and determine location in cluster + std::uint32_t target_device = core.chip; + log_assert( + std::find( + get_soc_descriptor(target_device).workers.begin(), get_soc_descriptor(target_device).workers.end(), core) != + get_soc_descriptor(target_device).workers.end() || + std::find( + get_soc_descriptor(target_device).ethernet_cores.begin(), + get_soc_descriptor(target_device).ethernet_cores.end(), + core) != get_soc_descriptor(target_device).ethernet_cores.end(), + "Cannot deassert reset on a non-tensix or harvested core"); + bool target_is_mmio_capable = ndesc->is_chip_mmio_capable(target_device); + if (target_is_mmio_capable) { + log_assert( + m_pci_device_map.find(target_device) != m_pci_device_map.end(), + "Could not find MMIO mapped device in devices connected over PCIe"); send_tensix_risc_reset_to_core(core, soft_resets); - } - else { + } else { log_assert(arch_name != tt::ARCH::BLACKHOLE, "Can't issue access to remote core in BH"); send_remote_tensix_risc_reset_to_core(core, soft_resets); } } void Cluster::assert_risc_reset_at_core(tt_cxy_pair core) { - std::uint32_t target_device = core.chip; // Get Target Device to query soc descriptor and determine location in cluster - log_assert(std::find(get_soc_descriptor(target_device).workers.begin(), get_soc_descriptor(target_device).workers.end(), core) != get_soc_descriptor(target_device).workers.end() || - 
std::find(get_soc_descriptor(target_device).ethernet_cores.begin(), get_soc_descriptor(target_device).ethernet_cores.end(), core) != get_soc_descriptor(target_device).ethernet_cores.end(), - "Cannot assert reset on a non-tensix or harvested core"); - bool target_is_mmio_capable = ndesc -> is_chip_mmio_capable(target_device); - if(target_is_mmio_capable) { - log_assert(m_pci_device_map.find(target_device) != m_pci_device_map.end(), "Could not find MMIO mapped device in devices connected over PCIe"); + // Get Target Device to query soc descriptor and determine location in cluster + std::uint32_t target_device = core.chip; + log_assert( + std::find( + get_soc_descriptor(target_device).workers.begin(), get_soc_descriptor(target_device).workers.end(), core) != + get_soc_descriptor(target_device).workers.end() || + std::find( + get_soc_descriptor(target_device).ethernet_cores.begin(), + get_soc_descriptor(target_device).ethernet_cores.end(), + core) != get_soc_descriptor(target_device).ethernet_cores.end(), + "Cannot assert reset on a non-tensix or harvested core"); + bool target_is_mmio_capable = ndesc->is_chip_mmio_capable(target_device); + if (target_is_mmio_capable) { + log_assert( + m_pci_device_map.find(target_device) != m_pci_device_map.end(), + "Could not find MMIO mapped device in devices connected over PCIe"); send_tensix_risc_reset_to_core(core, TENSIX_ASSERT_SOFT_RESET); - } - else { + } else { send_remote_tensix_risc_reset_to_core(core, TENSIX_ASSERT_SOFT_RESET); } } // Free memory during teardown, and remove (clean/unlock) from any leftover mutexes. 
void Cluster::cleanup_shared_host_state() { - for(auto &mutex : hardware_resource_mutex_map) { + for (auto& mutex : hardware_resource_mutex_map) { mutex.second.reset(); mutex.second = nullptr; named_mutex::remove(mutex.first.c_str()); } } -std::unordered_set Cluster::get_all_chips_in_cluster() { - return ndesc -> get_all_chips(); -} +std::unordered_set Cluster::get_all_chips_in_cluster() { return ndesc->get_all_chips(); } + int Cluster::get_number_of_chips_in_cluster() { // Returns the number of chips seen in the network descriptor - return ndesc -> get_all_chips().size(); + return ndesc->get_all_chips().size(); } -tt_ClusterDescriptor* Cluster::get_cluster_description() {return ndesc.get();} +tt_ClusterDescriptor* Cluster::get_cluster_description() { return ndesc.get(); } + // Can be used before instantiating a silicon device int Cluster::detect_number_of_chips() { - auto available_device_ids = detect_available_device_ids(); return available_device_ids.size(); - } // Can be used before instantiating a silicon device @@ -893,7 +1070,8 @@ std::vector Cluster::detect_available_device_ids() { return PCIDevice::enumerate_devices(); } -std::function Cluster::get_fast_pcie_static_tlb_write_callable(int device_id) { +std::function Cluster::get_fast_pcie_static_tlb_write_callable( + int device_id) { PCIDevice* dev = get_pci_device(device_id); const auto callable = [dev](uint32_t byte_addr, uint32_t num_bytes, const uint8_t* buffer_addr) { @@ -912,7 +1090,7 @@ tt::Writer Cluster::get_static_tlb_writer(tt_cxy_pair target) { throw std::runtime_error("TLBs not initialized"); } - auto *dev = get_pci_device(target.chip); + auto* dev = get_pci_device(target.chip); if (!dev->bar0_wc) { throw std::runtime_error("No write-combined mapping for BAR0"); @@ -926,26 +1104,39 @@ tt::Writer Cluster::get_static_tlb_writer(tt_cxy_pair target) { } auto [tlb_offset, tlb_size] = tlb_data.value(); - auto *base = reinterpret_cast(dev->bar0_wc); + auto* base = reinterpret_cast(dev->bar0_wc); 
return tt::Writer(base + tlb_offset, tlb_size); } -void Cluster::write_device_memory(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair target, std::uint32_t address, const std::string& fallback_tlb) { - PCIDevice *dev = get_pci_device(target.chip); +void Cluster::write_device_memory( + const void* mem_ptr, + uint32_t size_in_bytes, + tt_cxy_pair target, + std::uint32_t address, + const std::string& fallback_tlb) { + PCIDevice* dev = get_pci_device(target.chip); const uint8_t* buffer_addr = static_cast(mem_ptr); - log_debug(LogSiliconDriver, "Cluster::write_device_memory to chip:{} {}-{} at 0x{:x} size_in_bytes: {} small_access: {}", - target.chip, target.x, target.y, address, size_in_bytes, small_access); + log_debug( + LogSiliconDriver, + "Cluster::write_device_memory to chip:{} {}-{} at 0x{:x} size_in_bytes: {} small_access: {}", + target.chip, + target.x, + target.y, + address, + size_in_bytes, + small_access); std::int32_t tlb_index = 0; std::optional> tlb_data = std::nullopt; - if(tlbs_init_per_chip[target.chip]) { + if (tlbs_init_per_chip[target.chip]) { tlb_index = map_core_to_tlb_per_chip[target.chip](tt_xy_pair(target.x, target.y)); tlb_data = dev->get_architecture_implementation()->describe_tlb(tlb_index); } - if (tlb_data.has_value() && address_in_tlb_space(address, size_in_bytes, tlb_index, std::get<1>(tlb_data.value()), target.chip)) { + if (tlb_data.has_value() && + address_in_tlb_space(address, size_in_bytes, tlb_index, std::get<1>(tlb_data.value()), target.chip)) { auto [tlb_offset, tlb_size] = tlb_data.value(); if (dev->bar4_wc != nullptr && tlb_size == BH_4GB_TLB_SIZE) { // This is only for Blackhole. 
If we want to write to DRAM (BAR4 space), we add offset @@ -958,9 +1149,9 @@ void Cluster::write_device_memory(const void *mem_ptr, uint32_t size_in_bytes, t const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); const scoped_lock lock(*get_mutex(fallback_tlb, dev->get_device_num())); - while(size_in_bytes > 0) { - - auto [mapped_address, tlb_size] = dev->set_dynamic_tlb(tlb_index, target, address, harvested_coord_translation, dynamic_tlb_ordering_modes.at(fallback_tlb)); + while (size_in_bytes > 0) { + auto [mapped_address, tlb_size] = dev->set_dynamic_tlb( + tlb_index, target, address, harvested_coord_translation, dynamic_tlb_ordering_modes.at(fallback_tlb)); uint32_t transfer_size = std::min((uint64_t)size_in_bytes, tlb_size); dev->write_block(mapped_address, transfer_size, buffer_addr); @@ -972,22 +1163,36 @@ void Cluster::write_device_memory(const void *mem_ptr, uint32_t size_in_bytes, t } } -void Cluster::read_device_memory(void *mem_ptr, tt_cxy_pair target, std::uint32_t address, std::uint32_t size_in_bytes, const std::string& fallback_tlb) { - // Assume that mem_ptr has been allocated adequate memory on host when this function is called. Otherwise, this function will cause a segfault. - log_debug(LogSiliconDriver, "Cluster::read_device_memory to chip:{} {}-{} at 0x{:x} size_in_bytes: {}", target.chip, target.x, target.y, address, size_in_bytes); - PCIDevice *dev = get_pci_device(target.chip); +void Cluster::read_device_memory( + void* mem_ptr, + tt_cxy_pair target, + std::uint32_t address, + std::uint32_t size_in_bytes, + const std::string& fallback_tlb) { + // Assume that mem_ptr has been allocated adequate memory on host when this function is called. Otherwise, this + // function will cause a segfault. 
+ log_debug( + LogSiliconDriver, + "Cluster::read_device_memory to chip:{} {}-{} at 0x{:x} size_in_bytes: {}", + target.chip, + target.x, + target.y, + address, + size_in_bytes); + PCIDevice* dev = get_pci_device(target.chip); uint8_t* buffer_addr = static_cast(mem_ptr); std::int32_t tlb_index = 0; std::optional> tlb_data = std::nullopt; - if(tlbs_init_per_chip[target.chip]) { + if (tlbs_init_per_chip[target.chip]) { tlb_index = map_core_to_tlb_per_chip[target.chip](tt_xy_pair(target.x, target.y)); tlb_data = dev->get_architecture_implementation()->describe_tlb(tlb_index); } log_debug(LogSiliconDriver, " tlb_index: {}, tlb_data.has_value(): {}", tlb_index, tlb_data.has_value()); - if (tlb_data.has_value() && address_in_tlb_space(address, size_in_bytes, tlb_index, std::get<1>(tlb_data.value()), target.chip)) { + if (tlb_data.has_value() && + address_in_tlb_space(address, size_in_bytes, tlb_index, std::get<1>(tlb_data.value()), target.chip)) { auto [tlb_offset, tlb_size] = tlb_data.value(); if (dev->bar4_wc != nullptr && tlb_size == BH_4GB_TLB_SIZE) { // This is only for Blackhole. 
If we want to read from DRAM (BAR4 space), we add offset @@ -1001,9 +1206,9 @@ void Cluster::read_device_memory(void *mem_ptr, tt_cxy_pair target, std::uint32_ const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); const scoped_lock lock(*get_mutex(fallback_tlb, dev->get_device_num())); log_debug(LogSiliconDriver, " dynamic tlb_index: {}", tlb_index); - while(size_in_bytes > 0) { - - auto [mapped_address, tlb_size] = dev->set_dynamic_tlb(tlb_index, target, address, harvested_coord_translation, dynamic_tlb_ordering_modes.at(fallback_tlb)); + while (size_in_bytes > 0) { + auto [mapped_address, tlb_size] = dev->set_dynamic_tlb( + tlb_index, target, address, harvested_coord_translation, dynamic_tlb_ordering_modes.at(fallback_tlb)); uint32_t transfer_size = std::min((uint64_t)size_in_bytes, tlb_size); dev->read_block(mapped_address, transfer_size, buffer_addr); @@ -1016,55 +1221,61 @@ void Cluster::read_device_memory(void *mem_ptr, tt_cxy_pair target, std::uint32_ } void Cluster::read_buffer( - void* mem_ptr, - std::uint32_t address, - std::uint16_t channel, - std::uint32_t size_in_bytes, - chip_id_t src_device_id) { - + void* mem_ptr, std::uint32_t address, std::uint16_t channel, std::uint32_t size_in_bytes, chip_id_t src_device_id) { log_assert(src_device_id != -1, "Must provide src_device_id for host_resident read/write"); - log_assert(m_pci_device_map.find(src_device_id) != m_pci_device_map.end(), "read_buffer: Device id is not a MMIO device"); + log_assert( + m_pci_device_map.find(src_device_id) != m_pci_device_map.end(), "read_buffer: Device id is not a MMIO device"); hugepage_mapping hugepage_map = m_pci_device_map.at(src_device_id)->get_hugepage_mapping(channel); - log_assert(hugepage_map.mapping, "read_buffer: Hugepages are not allocated for src_device_id: {} ch: {}." 
- " - Ensure sufficient number of Hugepages installed per device (1 per host mem ch, per device)", - src_device_id, - channel); + log_assert( + hugepage_map.mapping, + "read_buffer: Hugepages are not allocated for src_device_id: {} ch: {}." + " - Ensure sufficient number of Hugepages installed per device (1 per host mem ch, per device)", + src_device_id, + channel); + + void* user_scratchspace = static_cast(hugepage_map.mapping) + (address % hugepage_map.mapping_size); - void * user_scratchspace = static_cast(hugepage_map.mapping) + (address % hugepage_map.mapping_size); + log_debug( + LogSiliconDriver, + "Cluster::read_buffer (src_device_id: {}, ch: {}) from 0x{:x}", + src_device_id, + channel, + user_scratchspace); - log_debug(LogSiliconDriver, "Cluster::read_buffer (src_device_id: {}, ch: {}) from 0x{:x}", src_device_id, channel, user_scratchspace); - memcpy(mem_ptr, user_scratchspace, size_in_bytes); } void Cluster::write_buffer( - const void *mem_ptr, - std::uint32_t size, - std::uint32_t address, - std::uint16_t channel, - chip_id_t src_device_id) { - - log_assert(m_pci_device_map.find(src_device_id) != m_pci_device_map.end(), "write_buffer: Device id is not a MMIO device"); + const void* mem_ptr, std::uint32_t size, std::uint32_t address, std::uint16_t channel, chip_id_t src_device_id) { + log_assert( + m_pci_device_map.find(src_device_id) != m_pci_device_map.end(), "write_buffer: Device id is not a MMIO device"); hugepage_mapping hugepage_map = m_pci_device_map.at(src_device_id)->get_hugepage_mapping(channel); - log_assert(hugepage_map.mapping, "write_buffer: Hugepages are not allocated for src_device_id: {} ch: {}." 
- " - Ensure sufficient number of Hugepages installed per device (1 per host mem ch, per device)", - src_device_id, - channel); - - log_assert(size <= hugepage_map.mapping_size, "write_buffer data has larger size {} than destination buffer {}", size, hugepage_map.mapping_size); - log_debug(LogSiliconDriver, "Using hugepage mapping at address {} offset {} chan {} size {}", + log_assert( + hugepage_map.mapping, + "write_buffer: Hugepages are not allocated for src_device_id: {} ch: {}." + " - Ensure sufficient number of Hugepages installed per device (1 per host mem ch, per device)", + src_device_id, + channel); + + log_assert( + size <= hugepage_map.mapping_size, + "write_buffer data has larger size {} than destination buffer {}", + size, + hugepage_map.mapping_size); + log_debug( + LogSiliconDriver, + "Using hugepage mapping at address {} offset {} chan {} size {}", hugepage_map.mapping, (address % hugepage_map.mapping_size), channel, size); - void * user_scratchspace = static_cast(hugepage_map.mapping) + (address % hugepage_map.mapping_size); + void* user_scratchspace = static_cast(hugepage_map.mapping) + (address % hugepage_map.mapping_size); memcpy(user_scratchspace, mem_ptr, size); } - uint32_t Cluster::get_power_state_arc_msg(chip_id_t chip_id, tt_DevicePowerState state) { PCIDevice* pci_device = get_pci_device(chip_id); uint32_t msg = 0xaa00; @@ -1081,34 +1292,37 @@ uint32_t Cluster::get_power_state_arc_msg(chip_id_t chip_id, tt_DevicePowerState msg |= pci_device->get_architecture_implementation()->get_arc_message_arc_go_short_idle(); break; } - default: throw std::runtime_error("Unrecognized power state."); + default: + throw std::runtime_error("Unrecognized power state."); } return msg; } void Cluster::set_pcie_power_state(tt_DevicePowerState state) { - - for (auto &device_it : m_pci_device_map){ + for (auto& device_it : m_pci_device_map) { int chip_id = device_it.first; uint32_t msg = get_power_state_arc_msg(chip_id, state); std::stringstream ss; ss << 
state; auto exit_code = arc_msg(chip_id, 0xaa00 | msg, true, 0, 0); if (exit_code != 0) { - throw std::runtime_error(fmt::format("Failed to set power state to {} with exit code {}", ss.str(), exit_code)); + throw std::runtime_error( + fmt::format("Failed to set power state to {} with exit code {}", ss.str(), exit_code)); } } } int Cluster::get_clock(int logical_device_id) { - // TODO: remove this once ARC messages work. // This is currently used only for testing and bringing up Blackhole on Buda. if (arch_name == tt::ARCH::BLACKHOLE) { char* clk_env_var = getenv("TT_SILICON_DRIVER_AICLK"); if (clk_env_var != nullptr) { - log_warning(LogSiliconDriver, "ARC messages are not enabled on Blackhole. " - "Using AICLK value from environment variable TT_SILICON_DRIVER_AICLK: {}" , clk_env_var); + log_warning( + LogSiliconDriver, + "ARC messages are not enabled on Blackhole. " + "Using AICLK value from environment variable TT_SILICON_DRIVER_AICLK: {}", + clk_env_var); return std::stoi(clk_env_var); } } @@ -1116,7 +1330,14 @@ int Cluster::get_clock(int logical_device_id) { uint32_t clock; auto mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(logical_device_id); PCIDevice* pci_device = get_pci_device(mmio_capable_chip_logical); - auto exit_code = arc_msg(logical_device_id, 0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_get_aiclk(), true, 0xFFFF, 0xFFFF, 1, &clock); + auto exit_code = arc_msg( + logical_device_id, + 0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_get_aiclk(), + true, + 0xFFFF, + 0xFFFF, + 1, + &clock); if (exit_code != 0) { throw std::runtime_error(fmt::format("Failed to get aiclk value with exit code {}", exit_code)); } @@ -1124,16 +1345,15 @@ int Cluster::get_clock(int logical_device_id) { } std::map Cluster::get_clocks() { - std::map clock_freq_map; - for (auto &device_it : m_pci_device_map){ + std::map clock_freq_map; + for (auto& device_it : m_pci_device_map) { int d = device_it.first; 
clock_freq_map.insert({d, get_clock(d)}); } return clock_freq_map; } -Cluster::~Cluster () { - +Cluster::~Cluster() { log_debug(LogSiliconDriver, "Cluster::~Cluster"); cleanup_shared_host_state(); @@ -1154,23 +1374,34 @@ std::optional> Cluster::get_tlb_data_from_target( tlb_index = map_core_to_tlb_per_chip[target.chip](tt_xy_pair(target.x, target.y)); auto architecture_implementation = tt::umd::architecture_implementation::create(arch_name); tlb_data = architecture_implementation->describe_tlb(tlb_index); - } + } return tlb_data; } -void Cluster::configure_tlb(chip_id_t logical_device_id, tt_xy_pair core, std::int32_t tlb_index, std::int32_t address, uint64_t ordering) { - log_assert(ordering == TLB_DATA::Strict || ordering == TLB_DATA::Posted || ordering == TLB_DATA::Relaxed, "Invalid ordering specified in Cluster::configure_tlb"); - PCIDevice *pci_device = get_pci_device(logical_device_id); +void Cluster::configure_tlb( + chip_id_t logical_device_id, tt_xy_pair core, std::int32_t tlb_index, std::int32_t address, uint64_t ordering) { + log_assert( + ordering == TLB_DATA::Strict || ordering == TLB_DATA::Posted || ordering == TLB_DATA::Relaxed, + "Invalid ordering specified in Cluster::configure_tlb"); + PCIDevice* pci_device = get_pci_device(logical_device_id); pci_device->set_dynamic_tlb(tlb_index, core, address, harvested_coord_translation, ordering); auto tlb_size = std::get<1>(pci_device->get_architecture_implementation()->describe_tlb(tlb_index).value()); - if(tlb_config_map.find(logical_device_id) == tlb_config_map.end()) tlb_config_map.insert({logical_device_id, {}}); + if (tlb_config_map.find(logical_device_id) == tlb_config_map.end()) { + tlb_config_map.insert({logical_device_id, {}}); + } tlb_config_map[logical_device_id].insert({tlb_index, (address / tlb_size) * tlb_size}); } void Cluster::set_fallback_tlb_ordering_mode(const std::string& fallback_tlb, uint64_t ordering) { - log_assert(ordering == TLB_DATA::Strict || ordering == TLB_DATA::Posted || 
ordering == TLB_DATA::Relaxed, "Invalid ordering specified in Cluster::configure_tlb."); - log_assert(dynamic_tlb_ordering_modes.find(fallback_tlb) != dynamic_tlb_ordering_modes.end(), "Invalid TLB specified in Cluster::set_fallback_tlb_ordering_mode."); - log_assert(fallback_tlb != "LARGE_READ_TLB" && fallback_tlb != "LARGE_WRITE_TLB", "Ordering modes for LARGE_READ_TLB and LARGE_WRITE_TLB cannot be modified."); + log_assert( + ordering == TLB_DATA::Strict || ordering == TLB_DATA::Posted || ordering == TLB_DATA::Relaxed, + "Invalid ordering specified in Cluster::configure_tlb."); + log_assert( + dynamic_tlb_ordering_modes.find(fallback_tlb) != dynamic_tlb_ordering_modes.end(), + "Invalid TLB specified in Cluster::set_fallback_tlb_ordering_mode."); + log_assert( + fallback_tlb != "LARGE_READ_TLB" && fallback_tlb != "LARGE_WRITE_TLB", + "Ordering modes for LARGE_READ_TLB and LARGE_WRITE_TLB cannot be modified."); dynamic_tlb_ordering_modes.at(fallback_tlb) = ordering; } @@ -1180,7 +1411,7 @@ void Cluster::init_pcie_iatus() { int num_enabled_devices = m_pci_device_map.size(); log_debug(LogSiliconDriver, "Cluster::init_pcie_iatus() num_enabled_devices: {}", num_enabled_devices); - for (auto &src_device_it : m_pci_device_map){ + for (auto& src_device_it : m_pci_device_map) { int logical_id = src_device_it.first; PCIDevice* src_pci_device = src_device_it.second.get(); @@ -1190,72 +1421,86 @@ void Cluster::init_pcie_iatus() { if (hugepage_map.mapping) { std::uint32_t region_size = hugepage_map.mapping_size; if (channel_id == 3) { - region_size = HUGEPAGE_CHANNEL_3_SIZE_LIMIT; + region_size = HUGEPAGE_CHANNEL_3_SIZE_LIMIT; } // This log message doesn't look right. 
- log_debug(LogSiliconDriver, "Configuring ATU channel {} to point to hugepage {}.", channel_id, logical_id); + log_debug( + LogSiliconDriver, "Configuring ATU channel {} to point to hugepage {}.", channel_id, logical_id); iatu_configure_peer_region(logical_id, channel_id, hugepage_map.physical_address, region_size); } else { - throw std::runtime_error(fmt::format("init_pcie_iatus: Hugepages are not allocated for logical device id: {} ch: {}", logical_id, channel_id)); + throw std::runtime_error(fmt::format( + "init_pcie_iatus: Hugepages are not allocated for logical device id: {} ch: {}", + logical_id, + channel_id)); } } } } -int Cluster::test_setup_interface () { +int Cluster::test_setup_interface() { if (arch_name == tt::ARCH::GRAYSKULL) { int ret_val = 0; - PCIDevice *dev = m_pci_device_map.begin()->second.get(); + PCIDevice* dev = m_pci_device_map.begin()->second.get(); - uint32_t mapped_reg = dev->set_dynamic_tlb(dev->get_architecture_implementation()->get_reg_tlb(), tt_xy_pair(0, 0), 0xffb20108, harvested_coord_translation).bar_offset; + uint32_t mapped_reg = dev->set_dynamic_tlb( + dev->get_architecture_implementation()->get_reg_tlb(), + tt_xy_pair(0, 0), + 0xffb20108, + harvested_coord_translation) + .bar_offset; uint32_t regval = 0; dev->read_regs(mapped_reg, 1, ®val); ret_val = (regval != 0xffffffff && ((regval & 0x1) == 1)) ? 
0 : 1; return ret_val; - } - else if (arch_name == tt::ARCH::WORMHOLE_B0) { + } else if (arch_name == tt::ARCH::WORMHOLE_B0) { int ret_val = 0; - PCIDevice *dev = m_pci_device_map.begin()->second.get(); + PCIDevice* dev = m_pci_device_map.begin()->second.get(); - uint32_t mapped_reg = dev->set_dynamic_tlb(dev->get_architecture_implementation()->get_reg_tlb(), tt_xy_pair(1, 0), 0xffb20108, harvested_coord_translation).bar_offset; + uint32_t mapped_reg = dev->set_dynamic_tlb( + dev->get_architecture_implementation()->get_reg_tlb(), + tt_xy_pair(1, 0), + 0xffb20108, + harvested_coord_translation) + .bar_offset; uint32_t regval = 0; dev->read_regs(mapped_reg, 1, ®val); ret_val = (regval != 0xffffffff && (regval == 33)) ? 0 : 1; return ret_val; - } - else if (arch_name == tt::ARCH::BLACKHOLE) { + } else if (arch_name == tt::ARCH::BLACKHOLE) { // MT Inital BH - Try to enable this, but double check "regval == 33" // int ret_val = 0; // PCIDevice *dev = m_pci_device_map.begin()->second->hdev; - // uint32_t mapped_reg = dev->set_dynamic_tlb(m_pci_device_map.begin()->second, dev->get_architecture_implementation()->get_reg_tlb(), tt_xy_pair(1, 0), 0xffb20108, harvested_coord_translation).bar_offset; + // uint32_t mapped_reg = dev->set_dynamic_tlb(m_pci_device_map.begin()->second, + // dev->get_architecture_implementation()->get_reg_tlb(), tt_xy_pair(1, 0), 0xffb20108, + // harvested_coord_translation).bar_offset; // uint32_t regval = 0; // read_regs(dev, mapped_reg, 1, ®val); // ret_val = (regval != 0xffffffff && (regval == 33)) ? 
0 : 1; // return ret_val; return 0; - } - else { + } else { throw std::runtime_error(fmt::format("Unsupported architecture: {}", get_arch_str(arch_name))); } } -void Cluster::bar_write32 (int logical_device_id, uint32_t addr, uint32_t data) { - PCIDevice *dev = get_pci_device(logical_device_id); +void Cluster::bar_write32(int logical_device_id, uint32_t addr, uint32_t data) { + PCIDevice* dev = get_pci_device(logical_device_id); if (addr < dev->bar0_uc_offset) { - dev->write_block(addr, sizeof(data), reinterpret_cast(&data)); // do we have to reinterpret_cast? + dev->write_block( + addr, sizeof(data), reinterpret_cast(&data)); // do we have to reinterpret_cast? } else { dev->write_regs(addr, 1, &data); } } -uint32_t Cluster::bar_read32 (int logical_device_id, uint32_t addr) { +uint32_t Cluster::bar_read32(int logical_device_id, uint32_t addr) { PCIDevice* dev = get_pci_device(logical_device_id); uint32_t data; @@ -1268,32 +1513,39 @@ uint32_t Cluster::bar_read32 (int logical_device_id, uint32_t addr) { } // Returns 0 if everything was OK -int Cluster::pcie_arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done, uint32_t arg0, uint32_t arg1, int timeout, uint32_t *return_3, uint32_t *return_4) { - - +int Cluster::pcie_arc_msg( + int logical_device_id, + uint32_t msg_code, + bool wait_for_done, + uint32_t arg0, + uint32_t arg1, + int timeout, + uint32_t* return_3, + uint32_t* return_4) { if ((msg_code & 0xff00) != 0xaa00) { log_error("Malformed message. 
msg_code is 0x{:x} but should be 0xaa..", msg_code); } - log_assert(arg0 <= 0xffff and arg1 <= 0xffff, "Only 16 bits allowed in arc_msg args"); // Only 16 bits are allowed + log_assert(arg0 <= 0xffff and arg1 <= 0xffff, "Only 16 bits allowed in arc_msg args"); // Only 16 bits are allowed - PCIDevice *pci_device = get_pci_device(logical_device_id); + PCIDevice* pci_device = get_pci_device(logical_device_id); auto architecture_implementation = pci_device->get_architecture_implementation(); // Exclusive access for a single process at a time. Based on physical pci interface id. std::string msg_type = "ARC_MSG"; const scoped_lock lock(*get_mutex(msg_type, pci_device->get_device_num())); - uint32_t fw_arg = arg0 | (arg1<<16); + uint32_t fw_arg = arg0 | (arg1 << 16); int exit_code = 0; - bar_write32 (logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 3 * 4, fw_arg); - bar_write32 (logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 5 * 4, msg_code); + bar_write32(logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 3 * 4, fw_arg); + bar_write32(logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 5 * 4, msg_code); - uint32_t misc = bar_read32 (logical_device_id, architecture_implementation->get_arc_reset_arc_misc_cntl_offset()); + uint32_t misc = bar_read32(logical_device_id, architecture_implementation->get_arc_reset_arc_misc_cntl_offset()); if (misc & (1 << 16)) { log_error("trigger_fw_int failed on device {}", logical_device_id); return 1; } else { - bar_write32(logical_device_id, architecture_implementation->get_arc_reset_arc_misc_cntl_offset(), misc | (1 << 16)); + bar_write32( + logical_device_id, architecture_implementation->get_arc_reset_arc_misc_cntl_offset(), misc | (1 << 16)); } if (wait_for_done) { @@ -1302,24 +1554,31 @@ int Cluster::pcie_arc_msg(int logical_device_id, uint32_t msg_code, bool wait_fo auto start = std::chrono::system_clock::now(); 
while (true) { if (std::chrono::system_clock::now() - start > timeout_seconds) { - throw std::runtime_error(fmt::format("Timed out after waiting {} seconds for device {} ARC to respond", timeout, logical_device_id)); + throw std::runtime_error(fmt::format( + "Timed out after waiting {} seconds for device {} ARC to respond", timeout, logical_device_id)); } status = bar_read32(logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 5 * 4); if ((status & 0xffff) == (msg_code & 0xff)) { if (return_3 != nullptr) { - *return_3 = bar_read32(logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 3 * 4); + *return_3 = bar_read32( + logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 3 * 4); } if (return_4 != nullptr) { - *return_4 = bar_read32(logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 4 * 4); + *return_4 = bar_read32( + logical_device_id, architecture_implementation->get_arc_reset_scratch_offset() + 4 * 4); } exit_code = (status & 0xffff0000) >> 16; break; } else if (status == MSG_ERROR_REPLY) { - log_warning(LogSiliconDriver, "On device {}, message code 0x{:x} not recognized by FW", logical_device_id, msg_code); + log_warning( + LogSiliconDriver, + "On device {}, message code 0x{:x} not recognized by FW", + logical_device_id, + msg_code); exit_code = MSG_ERROR_REPLY; break; } @@ -1330,12 +1589,16 @@ int Cluster::pcie_arc_msg(int logical_device_id, uint32_t msg_code, bool wait_fo return exit_code; } -int Cluster::iatu_configure_peer_region (int logical_device_id, uint32_t peer_region_id, uint64_t bar_addr_64, uint32_t region_size) { +int Cluster::iatu_configure_peer_region( + int logical_device_id, uint32_t peer_region_id, uint64_t bar_addr_64, uint32_t region_size) { uint32_t dest_bar_lo = bar_addr_64 & 0xffffffff; uint32_t dest_bar_hi = (bar_addr_64 >> 32) & 0xffffffff; std::uint32_t region_id_to_use = peer_region_id; - if(peer_region_id == 3) 
region_id_to_use = 4; // Hack use region 4 for channel 3..this ensures that we have a smaller chan 3 address space with the correct start offset - PCIDevice *pci_device = get_pci_device(logical_device_id); + if (peer_region_id == 3) { + region_id_to_use = 4; // Hack use region 4 for channel 3..this ensures that we have a smaller chan 3 address + // space with the correct start offset + } + PCIDevice* pci_device = get_pci_device(logical_device_id); auto architecture_implementation = pci_device->get_architecture_implementation(); // BR: ARC doesn't work yet on Blackhole, so programming ATU directly. Should be removed when arc starts working. @@ -1345,8 +1608,8 @@ int Cluster::iatu_configure_peer_region (int logical_device_id, uint32_t peer_re uint64_t base_size = (region_id_to_use + 1) * region_size; uint64_t limit_address = base_addr + base_size - 1; - uint32_t region_ctrl_1 = 1 << 13; // INCREASE_REGION_SIZE = 1 - uint32_t region_ctrl_2 = 1 << 31; // REGION_EN = 1 + uint32_t region_ctrl_1 = 1 << 13; // INCREASE_REGION_SIZE = 1 + uint32_t region_ctrl_2 = 1 << 31; // REGION_EN = 1 uint32_t region_ctrl_3 = 0; uint32_t base_addr_lo = base_addr & 0xffffffff; uint32_t base_addr_hi = (base_addr >> 32) & 0xffffffff; @@ -1356,43 +1619,83 @@ int Cluster::iatu_configure_peer_region (int logical_device_id, uint32_t peer_re uint64_t iatu_index = 0; uint64_t iatu_base = UNROLL_ATU_OFFSET_BAR + iatu_index * 0x200; - pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x00), ®ion_ctrl_1, 1); - pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x04), ®ion_ctrl_2, 1); - pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x08), &base_addr_lo, 1); - pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x0c), &base_addr_hi, 1); - pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x10), &limit_address_lo, 1); - 
pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x14), &dest_bar_lo, 1); - pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x18), &dest_bar_hi, 1); - pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x1c), ®ion_ctrl_3, 1); - pci_device->write_regs(reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x20), &limit_address_hi, 1); - } - else { - bar_write32(logical_device_id, architecture_implementation->get_arc_csm_mailbox_offset() + 0 * 4, region_id_to_use); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x00), + ®ion_ctrl_1, + 1); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x04), + ®ion_ctrl_2, + 1); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x08), + &base_addr_lo, + 1); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x0c), + &base_addr_hi, + 1); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x10), + &limit_address_lo, + 1); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x14), + &dest_bar_lo, + 1); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x18), + &dest_bar_hi, + 1); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x1c), + ®ion_ctrl_3, + 1); + pci_device->write_regs( + reinterpret_cast(static_cast(pci_device->bar2_uc) + iatu_base + 0x20), + &limit_address_hi, + 1); + } else { + bar_write32( + logical_device_id, architecture_implementation->get_arc_csm_mailbox_offset() + 0 * 4, region_id_to_use); bar_write32(logical_device_id, architecture_implementation->get_arc_csm_mailbox_offset() + 1 * 4, dest_bar_lo); bar_write32(logical_device_id, 
architecture_implementation->get_arc_csm_mailbox_offset() + 2 * 4, dest_bar_hi); bar_write32(logical_device_id, architecture_implementation->get_arc_csm_mailbox_offset() + 3 * 4, region_size); - arc_msg(logical_device_id, 0xaa00 | architecture_implementation->get_arc_message_setup_iatu_for_peer_to_peer(), true, 0, 0); + arc_msg( + logical_device_id, + 0xaa00 | architecture_implementation->get_arc_message_setup_iatu_for_peer_to_peer(), + true, + 0, + 0); } // Print what just happened - uint32_t peer_region_start = region_id_to_use*region_size; - uint32_t peer_region_end = (region_id_to_use+1)*region_size - 1; - log_debug(LogSiliconDriver, " [region id {}] NOC to PCI address range 0x{:x}-0x{:x} mapped to addr 0x{:x}", peer_region_id, peer_region_start, peer_region_end, bar_addr_64); + uint32_t peer_region_start = region_id_to_use * region_size; + uint32_t peer_region_end = (region_id_to_use + 1) * region_size - 1; + log_debug( + LogSiliconDriver, + " [region id {}] NOC to PCI address range 0x{:x}-0x{:x} mapped to addr 0x{:x}", + peer_region_id, + peer_region_start, + peer_region_end, + bar_addr_64); return 0; } // Returns broken rows as bits set to 1 in 'memory' and 'logic' uint32_t Cluster::get_harvested_noc_rows(uint32_t harvesting_mask) { auto architecture_implementation = tt::umd::architecture_implementation::create(arch_name); - const std::vector &harv_to_noc_loc = architecture_implementation->get_harvesting_noc_locations(); + const std::vector& harv_to_noc_loc = architecture_implementation->get_harvesting_noc_locations(); uint32_t harv_noc_rows = 0; std::string harv_noc_rows_str = ""; - for (int pos=0; pos> 1; @@ -1403,36 +1706,45 @@ uint32_t Cluster::get_harvested_noc_rows(uint32_t harvesting_mask) { return harv_noc_rows; } -uint32_t Cluster::get_harvested_rows (int logical_device_id) { +uint32_t Cluster::get_harvested_rows(int logical_device_id) { const char* harv_override = std::getenv("T6PY_HARVESTING_OVERRIDE"); uint32_t harv = 0xffffffff; if 
(harv_override) { harv = std::stoul(harv_override, nullptr, 16); } else { auto mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(logical_device_id); - PCIDevice *pci_device = get_pci_device(mmio_capable_chip_logical); - int harvesting_msg_code = arc_msg(logical_device_id, 0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_arc_get_harvesting(), true, 0, 0, 1, &harv); - log_assert(harvesting_msg_code != MSG_ERROR_REPLY, "Failed to read harvested rows from device {}", logical_device_id); + PCIDevice* pci_device = get_pci_device(mmio_capable_chip_logical); + int harvesting_msg_code = arc_msg( + logical_device_id, + 0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_arc_get_harvesting(), + true, + 0, + 0, + 1, + &harv); + log_assert( + harvesting_msg_code != MSG_ERROR_REPLY, "Failed to read harvested rows from device {}", logical_device_id); } log_assert(harv != 0xffffffff, "Readback 0xffffffff for harvesting info. Chip is fused incorrectly!"); - log_debug(LogSiliconDriver, "HARVESTING {}, 0x{:x}", (harv==0) ? "DISABLED":"ENABLED", harv); - + log_debug(LogSiliconDriver, "HARVESTING {}, 0x{:x}", (harv == 0) ? 
"DISABLED" : "ENABLED", harv); + uint32_t memory = harv & 0x3ff; uint32_t logic = (harv >> 10) & 0x3ff; - return (memory|logic); + return (memory | logic); } -uint32_t Cluster::get_harvested_noc_rows_for_chip (int logical_device_id) { +uint32_t Cluster::get_harvested_noc_rows_for_chip(int logical_device_id) { return get_harvested_noc_rows(get_harvested_rows(logical_device_id)); } -void Cluster::enable_local_ethernet_queue(const chip_id_t &device_id, int timeout) { +void Cluster::enable_local_ethernet_queue(const chip_id_t& device_id, int timeout) { uint32_t msg_success = 0x0; auto timeout_seconds = std::chrono::seconds(timeout); auto start = std::chrono::system_clock::now(); while (msg_success != 1) { if (std::chrono::system_clock::now() - start > timeout_seconds) { - throw std::runtime_error(fmt::format("Timed out after waiting {} seconds for for DRAM to finish training", timeout)); + throw std::runtime_error( + fmt::format("Timed out after waiting {} seconds for for DRAM to finish training", timeout)); } if (arc_msg(device_id, 0xaa58, true, 0xFFFF, 0xFFFF, 1, &msg_success) == MSG_ERROR_REPLY) { @@ -1441,7 +1753,7 @@ void Cluster::enable_local_ethernet_queue(const chip_id_t &device_id, int timeou } } -void *Cluster::host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const { +void* Cluster::host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const { hugepage_mapping hugepage_map = m_pci_device_map.at(src_device_id)->get_hugepage_mapping(channel); if (hugepage_map.mapping != nullptr) { return static_cast(hugepage_map.mapping) + offset; @@ -1452,13 +1764,14 @@ void *Cluster::host_dma_address(std::uint64_t offset, chip_id_t src_device_id, u // Wrapper for throwing more helpful exception when not-enabled pci intf is accessed. 
inline PCIDevice* Cluster::get_pci_device(int device_id) const { - if (!m_pci_device_map.count(device_id)){ + if (!m_pci_device_map.count(device_id)) { throw std::runtime_error(fmt::format("device_id: {} attempted to be accessed, but is not enabled.", device_id)); } return m_pci_device_map.at(device_id).get(); } -std::shared_ptr Cluster::get_mutex(const std::string& tlb_name, int pci_interface_id) { +std::shared_ptr Cluster::get_mutex( + const std::string& tlb_name, int pci_interface_id) { std::string mutex_name = tlb_name + std::to_string(pci_interface_id); return hardware_resource_mutex_map.at(mutex_name); } @@ -1486,7 +1799,8 @@ uint16_t Cluster::get_sys_rack(uint32_t rack_x, uint32_t rack_y) { } bool Cluster::is_non_mmio_cmd_q_full(uint32_t curr_wptr, uint32_t curr_rptr) { - return (curr_wptr != curr_rptr) && ((curr_wptr & eth_interface_params.cmd_buf_size_mask) == (curr_rptr & eth_interface_params.cmd_buf_size_mask)); + return (curr_wptr != curr_rptr) && ((curr_wptr & eth_interface_params.cmd_buf_size_mask) == + (curr_rptr & eth_interface_params.cmd_buf_size_mask)); } /* @@ -1535,35 +1849,37 @@ bool Cluster::is_non_mmio_cmd_q_full(uint32_t curr_wptr, uint32_t curr_rptr) { * Other schemes may be more performant. */ - /* * Note that this function is required to acquire the `NON_MMIO_MUTEX_NAME` mutex for interacting with the * ethernet core (host) command queue DO NOT issue any pcie reads/writes to the ethernet core prior to acquiring the * mutex. 
For extra information, see the "NON_MMIO_MUTEX Usage" above */ - void Cluster::write_to_non_mmio_device( - const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t address, - bool broadcast, std::vector broadcast_header) { - + const void* mem_ptr, + uint32_t size_in_bytes, + tt_cxy_pair core, + uint64_t address, + bool broadcast, + std::vector broadcast_header) { chip_id_t mmio_capable_chip_logical; - - if(broadcast) { + + if (broadcast) { mmio_capable_chip_logical = core.chip; - } - else { + } else { mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(core.chip); } flush_non_mmio_per_chip[ndesc->get_closest_mmio_capable_chip(core.chip)] = true; if (non_mmio_transfer_cores_customized) { - log_assert(active_eth_core_idx_per_chip.find(mmio_capable_chip_logical) != active_eth_core_idx_per_chip.end(), "Ethernet Cores for Host to Cluster communication were not initialized for all MMIO devices."); + log_assert( + active_eth_core_idx_per_chip.find(mmio_capable_chip_logical) != active_eth_core_idx_per_chip.end(), + "Ethernet Cores for Host to Cluster communication were not initialized for all MMIO devices."); } using data_word_t = uint32_t; constexpr int DATA_WORD_SIZE = sizeof(data_word_t); - constexpr int BROADCAST_HEADER_SIZE = sizeof(data_word_t) * 8; // Broadcast header is 8 words + constexpr int BROADCAST_HEADER_SIZE = sizeof(data_word_t) * 8; // Broadcast header is 8 words const auto target_chip = ndesc->get_chip_locations().at(core.chip); std::string write_tlb = "LARGE_WRITE_TLB"; @@ -1572,14 +1888,15 @@ void Cluster::write_to_non_mmio_device( translate_to_noc_table_coords(core.chip, core.y, core.x); std::vector erisc_command; std::vector erisc_q_rptr = std::vector(1); - std::vector erisc_q_ptrs = std::vector(eth_interface_params.remote_update_ptr_size_bytes*2 / sizeof(uint32_t)); + std::vector erisc_q_ptrs = + std::vector(eth_interface_params.remote_update_ptr_size_bytes * 2 / sizeof(uint32_t)); std::vector data_block; - routing_cmd_t 
*new_cmd; + routing_cmd_t* new_cmd; uint32_t buffer_id = 0; - uint32_t timestamp = 0; //CMD_TIMESTAMP; + uint32_t timestamp = 0; // CMD_TIMESTAMP; bool use_dram; uint32_t max_block_size; @@ -1591,14 +1908,22 @@ void Cluster::write_to_non_mmio_device( // MUTEX ACQUIRE (NON-MMIO) // do not locate any ethernet core reads/writes before this acquire // - const scoped_lock lock(*get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->get_device_num())); - - int& active_core_for_txn = non_mmio_transfer_cores_customized ? active_eth_core_idx_per_chip.at(mmio_capable_chip_logical) : active_core; - tt_cxy_pair remote_transfer_ethernet_core = remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn]; - - erisc_command.resize(sizeof(routing_cmd_t)/DATA_WORD_SIZE); - new_cmd = (routing_cmd_t *)&erisc_command[0]; - read_device_memory(erisc_q_ptrs.data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb); + const scoped_lock lock( + *get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->get_device_num())); + + int& active_core_for_txn = + non_mmio_transfer_cores_customized ? 
active_eth_core_idx_per_chip.at(mmio_capable_chip_logical) : active_core; + tt_cxy_pair remote_transfer_ethernet_core = + remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn]; + + erisc_command.resize(sizeof(routing_cmd_t) / DATA_WORD_SIZE); + new_cmd = (routing_cmd_t*)&erisc_command[0]; + read_device_memory( + erisc_q_ptrs.data(), + remote_transfer_ethernet_core, + eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + eth_interface_params.remote_update_ptr_size_bytes * 2, + read_tlb); uint32_t full_count = 0; uint32_t offset = 0; uint32_t block_size; @@ -1608,40 +1933,55 @@ void Cluster::write_to_non_mmio_device( erisc_q_rptr[0] = erisc_q_ptrs[4]; while (offset < size_in_bytes) { while (full) { - read_device_memory(erisc_q_rptr.data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes + eth_interface_params.remote_update_ptr_size_bytes, DATA_WORD_SIZE, read_tlb); - full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0],erisc_q_rptr[0]); + read_device_memory( + erisc_q_rptr.data(), + remote_transfer_ethernet_core, + eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes + + eth_interface_params.remote_update_ptr_size_bytes, + DATA_WORD_SIZE, + read_tlb); + full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0], erisc_q_rptr[0]); full_count++; } - //full = true; - // set full only if this command will make the q full. - // otherwise full stays false so that we do not poll the rd pointer in next iteration. - // As long as current command push does not fill up the queue completely, we do not want - // to poll rd pointer in every iteration. - //full = is_non_mmio_cmd_q_full((erisc_q_ptrs[0] + 1) & CMD_BUF_PTR_MASK, erisc_q_rptr[0]); + // full = true; + // set full only if this command will make the q full. + // otherwise full stays false so that we do not poll the rd pointer in next iteration. 
+ // As long as current command push does not fill up the queue completely, we do not want + // to poll rd pointer in every iteration. + // full = is_non_mmio_cmd_q_full((erisc_q_ptrs[0] + 1) & CMD_BUF_PTR_MASK, erisc_q_rptr[0]); uint32_t req_wr_ptr = erisc_q_ptrs[0] & eth_interface_params.cmd_buf_size_mask; - if ((address + offset) & 0x1F) { // address not 32-byte aligned - block_size = DATA_WORD_SIZE; // 4 byte aligned + if ((address + offset) & 0x1F) { // address not 32-byte aligned + block_size = DATA_WORD_SIZE; // 4 byte aligned } else { // For broadcast we prepend a 32byte header. Decrease block size (size of payload) by this amount. - block_size = offset + max_block_size > size_in_bytes + 32 * broadcast ? size_in_bytes - offset : max_block_size - 32 * broadcast; + block_size = offset + max_block_size > size_in_bytes + 32 * broadcast ? size_in_bytes - offset + : max_block_size - 32 * broadcast; // Explictly align block_size to 4 bytes, in case the input buffer is not uint32_t aligned uint32_t alignment_mask = sizeof(uint32_t) - 1; block_size = (block_size + alignment_mask) & ~alignment_mask; } - // For 4 byte aligned data, transfer_size always == block_size. For unaligned data, transfer_size < block_size in the last block - uint64_t transfer_size = std::min(block_size, size_in_bytes - offset); // Host side data size that needs to be copied + // For 4 byte aligned data, transfer_size always == block_size. For unaligned data, transfer_size < block_size + // in the last block + uint64_t transfer_size = + std::min(block_size, size_in_bytes - offset); // Host side data size that needs to be copied // Use block mode for broadcast - uint32_t req_flags = (broadcast || (block_size > DATA_WORD_SIZE)) ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_wr_req | timestamp) : eth_interface_params.cmd_wr_req; - uint32_t resp_flags = block_size > DATA_WORD_SIZE ? 
(eth_interface_params.cmd_data_block | eth_interface_params.cmd_wr_ack) : eth_interface_params.cmd_wr_ack; + uint32_t req_flags = (broadcast || (block_size > DATA_WORD_SIZE)) + ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_wr_req | timestamp) + : eth_interface_params.cmd_wr_req; + uint32_t resp_flags = block_size > DATA_WORD_SIZE + ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_wr_ack) + : eth_interface_params.cmd_wr_ack; timestamp = 0; - - if(broadcast) { + + if (broadcast) { req_flags |= eth_interface_params.cmd_broadcast; } - uint32_t host_dram_block_addr = host_address_params.eth_routing_buffers_start + (active_core_for_txn * eth_interface_params.cmd_buf_size + req_wr_ptr) * max_block_size; - uint16_t host_dram_channel = 0; // This needs to be 0, since WH can only map ETH buffers to chan 0. + uint32_t host_dram_block_addr = + host_address_params.eth_routing_buffers_start + + (active_core_for_txn * eth_interface_params.cmd_buf_size + req_wr_ptr) * max_block_size; + uint16_t host_dram_channel = 0; // This needs to be 0, since WH can only map ETH buffers to chan 0. 
if (req_flags & eth_interface_params.cmd_data_block) { // Copy data to sysmem or device DRAM for Block mode @@ -1650,46 +1990,60 @@ void Cluster::write_to_non_mmio_device( resp_flags |= eth_interface_params.cmd_data_block_dram; size_buffer_to_capacity(data_block, block_size); memcpy(&data_block[0], (uint8_t*)mem_ptr + offset, transfer_size); - if(broadcast) { + if (broadcast) { // Write broadcast header to sysmem - write_to_sysmem(broadcast_header.data(), broadcast_header.size() * sizeof(uint32_t), host_dram_block_addr, host_dram_channel, mmio_capable_chip_logical); + write_to_sysmem( + broadcast_header.data(), + broadcast_header.size() * sizeof(uint32_t), + host_dram_block_addr, + host_dram_channel, + mmio_capable_chip_logical); } // Write payload to sysmem - write_to_sysmem(data_block.data(), data_block.size() * DATA_WORD_SIZE, host_dram_block_addr + BROADCAST_HEADER_SIZE * broadcast, host_dram_channel, mmio_capable_chip_logical); + write_to_sysmem( + data_block.data(), + data_block.size() * DATA_WORD_SIZE, + host_dram_block_addr + BROADCAST_HEADER_SIZE * broadcast, + host_dram_channel, + mmio_capable_chip_logical); } else { uint32_t buf_address = eth_interface_params.eth_routing_data_buffer_addr + req_wr_ptr * max_block_size; size_buffer_to_capacity(data_block, block_size); memcpy(&data_block[0], (uint8_t*)mem_ptr + offset, transfer_size); - write_device_memory(data_block.data(), data_block.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, buf_address, write_tlb); + write_device_memory( + data_block.data(), + data_block.size() * DATA_WORD_SIZE, + remote_transfer_ethernet_core, + buf_address, + write_tlb); } tt_driver_atomics::sfence(); } // Send the read request - log_assert(broadcast || (req_flags == eth_interface_params.cmd_wr_req) || (((address + offset) % 32) == 0), "Block mode address must be 32-byte aligned."); // Block mode address must be 32-byte aligned. 
- - if(broadcast) { + log_assert( + broadcast || (req_flags == eth_interface_params.cmd_wr_req) || (((address + offset) % 32) == 0), + "Block mode address must be 32-byte aligned."); // Block mode address must be 32-byte aligned. + + if (broadcast) { // Only specify endpoint local address for broadcast new_cmd->sys_addr = address + offset; - } - else { + } else { new_cmd->sys_addr = get_sys_addr(target_chip.x, target_chip.y, core.x, core.y, address + offset); new_cmd->rack = get_sys_rack(target_chip.rack, target_chip.shelf); } - - if(req_flags & eth_interface_params.cmd_data_block) { + + if (req_flags & eth_interface_params.cmd_data_block) { // Block mode new_cmd->data = block_size + BROADCAST_HEADER_SIZE * broadcast; - } - else { - if(size_in_bytes - offset < sizeof(uint32_t)) { + } else { + if (size_in_bytes - offset < sizeof(uint32_t)) { // Handle misalignment at the end of the buffer: // Assemble a padded uint32_t from single bytes, in case we have less than 4 bytes remaining memcpy(&new_cmd->data, static_cast(mem_ptr) + offset, size_in_bytes - offset); - } - else { - new_cmd->data = *((uint32_t*)mem_ptr + offset/DATA_WORD_SIZE); + } else { + new_cmd->data = *((uint32_t*)mem_ptr + offset / DATA_WORD_SIZE); } } @@ -1697,14 +2051,24 @@ void Cluster::write_to_non_mmio_device( if (use_dram) { new_cmd->src_addr_tag = host_dram_block_addr; } - write_device_memory(erisc_command.data(), erisc_command.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_routing_cmd_queue_base + (sizeof(routing_cmd_t) * req_wr_ptr), write_tlb); + write_device_memory( + erisc_command.data(), + erisc_command.size() * DATA_WORD_SIZE, + remote_transfer_ethernet_core, + eth_interface_params.request_routing_cmd_queue_base + (sizeof(routing_cmd_t) * req_wr_ptr), + write_tlb); tt_driver_atomics::sfence(); erisc_q_ptrs[0] = (erisc_q_ptrs[0] + 1) & eth_interface_params.cmd_buf_ptr_mask; std::vector erisc_q_wptr; erisc_q_wptr.resize(1); erisc_q_wptr[0] = 
erisc_q_ptrs[0]; - write_device_memory(erisc_q_wptr.data(), erisc_q_wptr.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, write_tlb); + write_device_memory( + erisc_q_wptr.data(), + erisc_q_wptr.size() * DATA_WORD_SIZE, + remote_transfer_ethernet_core, + eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + write_tlb); tt_driver_atomics::sfence(); offset += transfer_size; @@ -1717,10 +2081,19 @@ void Cluster::write_to_non_mmio_device( if (is_non_mmio_cmd_q_full((erisc_q_ptrs[0]) & eth_interface_params.cmd_buf_ptr_mask, erisc_q_rptr[0])) { active_core_for_txn++; uint32_t update_mask_for_chip = remote_transfer_ethernet_cores[mmio_capable_chip_logical].size() - 1; - active_core_for_txn = non_mmio_transfer_cores_customized ? (active_core_for_txn & update_mask_for_chip) : ((active_core_for_txn & NON_EPOCH_ETH_CORES_MASK) + NON_EPOCH_ETH_CORES_START_ID); + active_core_for_txn = + non_mmio_transfer_cores_customized + ? 
(active_core_for_txn & update_mask_for_chip) + : ((active_core_for_txn & NON_EPOCH_ETH_CORES_MASK) + NON_EPOCH_ETH_CORES_START_ID); // active_core = (active_core & NON_EPOCH_ETH_CORES_MASK) + NON_EPOCH_ETH_CORES_START_ID; - remote_transfer_ethernet_core = remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn]; - read_device_memory(erisc_q_ptrs.data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb); + remote_transfer_ethernet_core = + remote_transfer_ethernet_cores.at(mmio_capable_chip_logical)[active_core_for_txn]; + read_device_memory( + erisc_q_ptrs.data(), + remote_transfer_ethernet_core, + eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + eth_interface_params.remote_update_ptr_size_bytes * 2, + read_tlb); full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0], erisc_q_ptrs[4]); erisc_q_rptr[0] = erisc_q_ptrs[4]; } @@ -1728,11 +2101,11 @@ void Cluster::write_to_non_mmio_device( } /* - * Note that this function is required to acquire the `NON_MMIO_MUTEX_NAME` mutex for interacting with the ethernet core (host) command queue - * DO NOT use `active_core` or issue any pcie reads/writes to the ethernet core prior to acquiring the mutex. For extra information, see the "NON_MMIO_MUTEX Usage" above + * Note that this function is required to acquire the `NON_MMIO_MUTEX_NAME` mutex for interacting with the ethernet core + * (host) command queue DO NOT use `active_core` or issue any pcie reads/writes to the ethernet core prior to acquiring + * the mutex. 
For extra information, see the "NON_MMIO_MUTEX Usage" above */ void Cluster::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_t address, uint32_t size_in_bytes) { - using data_word_t = uint32_t; constexpr int DATA_WORD_SIZE = sizeof(data_word_t); std::string write_tlb = "LARGE_WRITE_TLB"; @@ -1740,33 +2113,50 @@ void Cluster::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_ std::string empty_tlb = ""; translate_to_noc_table_coords(core.chip, core.y, core.x); - const auto &mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(core.chip); + const auto& mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(core.chip); const eth_coord_t target_chip = ndesc->get_chip_locations().at(core.chip); std::vector erisc_command; std::vector erisc_q_rptr; - std::vector erisc_q_ptrs = std::vector(eth_interface_params.remote_update_ptr_size_bytes*2 / DATA_WORD_SIZE); + std::vector erisc_q_ptrs = + std::vector(eth_interface_params.remote_update_ptr_size_bytes * 2 / DATA_WORD_SIZE); std::vector erisc_resp_q_wptr = std::vector(1); std::vector erisc_resp_q_rptr = std::vector(1); - std::vector data_block; - routing_cmd_t *new_cmd; + routing_cmd_t* new_cmd; - erisc_command.resize(sizeof(routing_cmd_t)/DATA_WORD_SIZE); - new_cmd = (routing_cmd_t *)&erisc_command[0]; + erisc_command.resize(sizeof(routing_cmd_t) / DATA_WORD_SIZE); + new_cmd = (routing_cmd_t*)&erisc_command[0]; // // MUTEX ACQUIRE (NON-MMIO) // do not locate any ethernet core reads/writes before this acquire // - const scoped_lock lock(*get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->get_device_num())); + const scoped_lock lock( + *get_mutex(NON_MMIO_MUTEX_NAME, this->get_pci_device(mmio_capable_chip_logical)->get_device_num())); const tt_cxy_pair remote_transfer_ethernet_core = remote_transfer_ethernet_cores[mmio_capable_chip_logical].at(0); - read_device_memory(erisc_q_ptrs.data(), remote_transfer_ethernet_core, 
eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb); - read_device_memory(erisc_resp_q_wptr.data(), remote_transfer_ethernet_core, eth_interface_params.response_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, DATA_WORD_SIZE, read_tlb); - read_device_memory(erisc_resp_q_rptr.data(), remote_transfer_ethernet_core, eth_interface_params.response_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes + eth_interface_params.remote_update_ptr_size_bytes, DATA_WORD_SIZE, read_tlb); + read_device_memory( + erisc_q_ptrs.data(), + remote_transfer_ethernet_core, + eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + eth_interface_params.remote_update_ptr_size_bytes * 2, + read_tlb); + read_device_memory( + erisc_resp_q_wptr.data(), + remote_transfer_ethernet_core, + eth_interface_params.response_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + DATA_WORD_SIZE, + read_tlb); + read_device_memory( + erisc_resp_q_rptr.data(), + remote_transfer_ethernet_core, + eth_interface_params.response_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes + + eth_interface_params.remote_update_ptr_size_bytes, + DATA_WORD_SIZE, + read_tlb); bool full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0], erisc_q_ptrs[4]); erisc_q_rptr.resize(1); @@ -1784,25 +2174,34 @@ void Cluster::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_ while (offset < size_in_bytes) { while (full) { - read_device_memory(erisc_q_rptr.data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes + eth_interface_params.remote_update_ptr_size_bytes, DATA_WORD_SIZE, read_tlb); - full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0],erisc_q_rptr[0]); + read_device_memory( + erisc_q_rptr.data(), + remote_transfer_ethernet_core, + eth_interface_params.request_cmd_queue_base 
+ eth_interface_params.cmd_counters_size_bytes + + eth_interface_params.remote_update_ptr_size_bytes, + DATA_WORD_SIZE, + read_tlb); + full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0], erisc_q_rptr[0]); } uint32_t req_wr_ptr = erisc_q_ptrs[0] & eth_interface_params.cmd_buf_size_mask; - if ((address + offset) & 0x1F) { // address not 32-byte aligned - block_size = DATA_WORD_SIZE; // 4 byte aligned block + if ((address + offset) & 0x1F) { // address not 32-byte aligned + block_size = DATA_WORD_SIZE; // 4 byte aligned block } else { block_size = offset + max_block_size > size_in_bytes ? size_in_bytes - offset : max_block_size; // Align up to 4 bytes. uint32_t alignment_mask = sizeof(uint32_t) - 1; block_size = (block_size + alignment_mask) & ~alignment_mask; - } - uint32_t req_flags = block_size > DATA_WORD_SIZE ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_rd_req) : eth_interface_params.cmd_rd_req; - uint32_t resp_flags = block_size > DATA_WORD_SIZE ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_rd_data) : eth_interface_params.cmd_rd_data; + uint32_t req_flags = block_size > DATA_WORD_SIZE + ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_rd_req) + : eth_interface_params.cmd_rd_req; + uint32_t resp_flags = block_size > DATA_WORD_SIZE + ? (eth_interface_params.cmd_data_block | eth_interface_params.cmd_rd_data) + : eth_interface_params.cmd_rd_data; uint32_t resp_rd_ptr = erisc_resp_q_rptr[0] & eth_interface_params.cmd_buf_size_mask; uint32_t host_dram_block_addr = host_address_params.eth_routing_buffers_start + resp_rd_ptr * max_block_size; - uint16_t host_dram_channel = 0; // This needs to be 0, since WH can only map ETH buffers to chan 0. + uint16_t host_dram_channel = 0; // This needs to be 0, since WH can only map ETH buffers to chan 0. 
if (use_dram && block_size > DATA_WORD_SIZE) { req_flags |= eth_interface_params.cmd_data_block_dram; @@ -1810,7 +2209,9 @@ void Cluster::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_ } // Send the read request - log_assert((req_flags == eth_interface_params.cmd_rd_req) || (((address + offset) & 0x1F) == 0), "Block mode offset must be 32-byte aligned."); // Block mode offset must be 32-byte aligned. + log_assert( + (req_flags == eth_interface_params.cmd_rd_req) || (((address + offset) & 0x1F) == 0), + "Block mode offset must be 32-byte aligned."); // Block mode offset must be 32-byte aligned. new_cmd->sys_addr = get_sys_addr(target_chip.x, target_chip.y, core.x, core.y, address + offset); new_cmd->rack = get_sys_rack(target_chip.rack, target_chip.shelf); new_cmd->data = block_size; @@ -1818,14 +2219,25 @@ void Cluster::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_ if (use_dram) { new_cmd->src_addr_tag = host_dram_block_addr; } - write_device_memory(erisc_command.data(), erisc_command.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_routing_cmd_queue_base + (sizeof(routing_cmd_t) * req_wr_ptr), write_tlb);; + write_device_memory( + erisc_command.data(), + erisc_command.size() * DATA_WORD_SIZE, + remote_transfer_ethernet_core, + eth_interface_params.request_routing_cmd_queue_base + (sizeof(routing_cmd_t) * req_wr_ptr), + write_tlb); + ; tt_driver_atomics::sfence(); erisc_q_ptrs[0] = (erisc_q_ptrs[0] + 1) & eth_interface_params.cmd_buf_ptr_mask; std::vector erisc_q_wptr; erisc_q_wptr.resize(1); erisc_q_wptr[0] = erisc_q_ptrs[0]; - write_device_memory(erisc_q_wptr.data(), erisc_q_wptr.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, write_tlb); + write_device_memory( + erisc_q_wptr.data(), + erisc_q_wptr.size() * DATA_WORD_SIZE, + remote_transfer_ethernet_core, + 
eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + write_tlb); tt_driver_atomics::sfence(); // If there is more data to read and this command will make the q full, set full to 1. // otherwise full stays false so that we do not poll the rd pointer in next iteration. @@ -1833,7 +2245,12 @@ void Cluster::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_ // to poll rd pointer in every iteration. if (is_non_mmio_cmd_q_full((erisc_q_ptrs[0]), erisc_q_rptr[0])) { - read_device_memory(erisc_q_ptrs.data(), remote_transfer_ethernet_core, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb); + read_device_memory( + erisc_q_ptrs.data(), + remote_transfer_ethernet_core, + eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + eth_interface_params.remote_update_ptr_size_bytes * 2, + read_tlb); full = is_non_mmio_cmd_q_full(erisc_q_ptrs[0], erisc_q_ptrs[4]); erisc_q_rptr[0] = erisc_q_ptrs[4]; } @@ -1849,13 +2266,23 @@ void Cluster::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_ // So we have to wait for wrptr to advance, then wait for flags to be nonzero, then read data. 
do { - read_device_memory(erisc_resp_q_wptr.data(), remote_transfer_ethernet_core, eth_interface_params.response_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, DATA_WORD_SIZE, read_tlb); + read_device_memory( + erisc_resp_q_wptr.data(), + remote_transfer_ethernet_core, + eth_interface_params.response_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + DATA_WORD_SIZE, + read_tlb); } while (erisc_resp_q_rptr[0] == erisc_resp_q_wptr[0]); tt_driver_atomics::lfence(); uint32_t flags_offset = 12 + sizeof(routing_cmd_t) * resp_rd_ptr; std::vector erisc_resp_flags = std::vector(1); do { - read_device_memory(erisc_resp_flags.data(), remote_transfer_ethernet_core, eth_interface_params.response_routing_cmd_queue_base + flags_offset, DATA_WORD_SIZE, read_tlb); + read_device_memory( + erisc_resp_flags.data(), + remote_transfer_ethernet_core, + eth_interface_params.response_routing_cmd_queue_base + flags_offset, + DATA_WORD_SIZE, + read_tlb); } while (erisc_resp_flags[0] == 0); if (erisc_resp_flags[0] == resp_flags) { @@ -1863,27 +2290,40 @@ void Cluster::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_ uint32_t data_offset = 8 + sizeof(routing_cmd_t) * resp_rd_ptr; if (block_size == DATA_WORD_SIZE) { std::vector erisc_resp_data = std::vector(1); - read_device_memory(erisc_resp_data.data(), remote_transfer_ethernet_core, eth_interface_params.response_routing_cmd_queue_base + data_offset, DATA_WORD_SIZE, read_tlb); - if(size_in_bytes - offset < 4) { + read_device_memory( + erisc_resp_data.data(), + remote_transfer_ethernet_core, + eth_interface_params.response_routing_cmd_queue_base + data_offset, + DATA_WORD_SIZE, + read_tlb); + if (size_in_bytes - offset < 4) { // Handle misaligned (4 bytes) data at the end of the block. 
// Only read remaining bytes into the host buffer, instead of reading the full uint32_t std::memcpy((uint8_t*)mem_ptr + offset, erisc_resp_data.data(), size_in_bytes - offset); - } - else { - *((uint32_t*)mem_ptr + offset/DATA_WORD_SIZE) = erisc_resp_data[0]; + } else { + *((uint32_t*)mem_ptr + offset / DATA_WORD_SIZE) = erisc_resp_data[0]; } } else { // Read 4 byte aligned block from device/sysmem if (use_dram) { size_buffer_to_capacity(data_block, block_size); - read_from_sysmem(data_block.data(), host_dram_block_addr, host_dram_channel, block_size, mmio_capable_chip_logical); + read_from_sysmem( + data_block.data(), + host_dram_block_addr, + host_dram_channel, + block_size, + mmio_capable_chip_logical); } else { - uint32_t buf_address = eth_interface_params.eth_routing_data_buffer_addr + resp_rd_ptr * max_block_size; + uint32_t buf_address = + eth_interface_params.eth_routing_data_buffer_addr + resp_rd_ptr * max_block_size; size_buffer_to_capacity(data_block, block_size); - read_device_memory(data_block.data(), remote_transfer_ethernet_core, buf_address, block_size, read_tlb); + read_device_memory( + data_block.data(), remote_transfer_ethernet_core, buf_address, block_size, read_tlb); } // assert(mem_ptr.size() - (offset/DATA_WORD_SIZE) >= (block_size * DATA_WORD_SIZE)); - log_assert((data_block.size() * DATA_WORD_SIZE) >= block_size, "Incorrect data size read back from sysmem/device"); + log_assert( + (data_block.size() * DATA_WORD_SIZE) >= block_size, + "Incorrect data size read back from sysmem/device"); // Account for misalignment by skipping any padding bytes in the copied data_block memcpy((uint8_t*)mem_ptr + offset, data_block.data(), std::min(block_size, size_in_bytes - offset)); } @@ -1891,40 +2331,53 @@ void Cluster::read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_ // Finally increment the rdptr for the response command q erisc_resp_q_rptr[0] = (erisc_resp_q_rptr[0] + 1) & eth_interface_params.cmd_buf_ptr_mask; - 
write_device_memory(erisc_resp_q_rptr.data(), erisc_resp_q_rptr.size() * DATA_WORD_SIZE, remote_transfer_ethernet_core, eth_interface_params.response_cmd_queue_base + sizeof(remote_update_ptr_t) + eth_interface_params.cmd_counters_size_bytes, write_tlb); + write_device_memory( + erisc_resp_q_rptr.data(), + erisc_resp_q_rptr.size() * DATA_WORD_SIZE, + remote_transfer_ethernet_core, + eth_interface_params.response_cmd_queue_base + sizeof(remote_update_ptr_t) + + eth_interface_params.cmd_counters_size_bytes, + write_tlb); tt_driver_atomics::sfence(); log_assert(erisc_resp_flags[0] == resp_flags, "Unexpected ERISC Response Flags."); offset += block_size; } - } void Cluster::wait_for_connected_non_mmio_flush(const chip_id_t chip_id) { - if(flush_non_mmio_per_chip[chip_id]) { + if (flush_non_mmio_per_chip[chip_id]) { log_assert(arch_name != tt::ARCH::BLACKHOLE, "Non-MMIO flush not supported in Blackhole"); std::string read_tlb = "LARGE_READ_TLB"; auto chips_with_mmio = this->get_target_mmio_device_ids(); if (chips_with_mmio.find(chip_id) == chips_with_mmio.end()) { - log_debug(LogSiliconDriver, "Chip {} is not an MMIO chip, skipping wait_for_connected_non_mmio_flush", chip_id); + log_debug( + LogSiliconDriver, "Chip {} is not an MMIO chip, skipping wait_for_connected_non_mmio_flush", chip_id); return; } if (arch_name == tt::ARCH::WORMHOLE_B0) { std::vector erisc_txn_counters = std::vector(2); - std::vector erisc_q_ptrs = std::vector(eth_interface_params.remote_update_ptr_size_bytes*2 / sizeof(uint32_t)); + std::vector erisc_q_ptrs = + std::vector(eth_interface_params.remote_update_ptr_size_bytes * 2 / sizeof(uint32_t)); - //wait for all queues to be empty. - for (tt_cxy_pair &cxy : remote_transfer_ethernet_cores.at(chip_id)) { + // wait for all queues to be empty. 
+ for (tt_cxy_pair& cxy : remote_transfer_ethernet_cores.at(chip_id)) { do { - read_device_memory(erisc_q_ptrs.data(), cxy, eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, eth_interface_params.remote_update_ptr_size_bytes*2, read_tlb); + read_device_memory( + erisc_q_ptrs.data(), + cxy, + eth_interface_params.request_cmd_queue_base + eth_interface_params.cmd_counters_size_bytes, + eth_interface_params.remote_update_ptr_size_bytes * 2, + read_tlb); } while (erisc_q_ptrs[0] != erisc_q_ptrs[4]); } - //wait for all write responses to come back. - for (tt_cxy_pair &cxy : remote_transfer_ethernet_cores.at(chip_id)) { + // wait for all write responses to come back. + for (tt_cxy_pair& cxy : remote_transfer_ethernet_cores.at(chip_id)) { do { - read_device_memory(erisc_txn_counters.data(), cxy, eth_interface_params.request_cmd_queue_base, 8, read_tlb); + read_device_memory( + erisc_txn_counters.data(), cxy, eth_interface_params.request_cmd_queue_base, 8, read_tlb); } while (erisc_txn_counters[0] != erisc_txn_counters[1]); } } @@ -1932,7 +2385,6 @@ void Cluster::wait_for_connected_non_mmio_flush(const chip_id_t chip_id) { } } - void Cluster::wait_for_non_mmio_flush(const chip_id_t chip_id) { log_assert(arch_name != tt::ARCH::BLACKHOLE, "Non-MMIO flush not supported in Blackhole"); std::string read_tlb = "LARGE_READ_TLB"; @@ -1953,39 +2405,48 @@ void Cluster::wait_for_non_mmio_flush() { } // Broadcast Functions -void Cluster::generate_tensix_broadcast_grids_for_grayskull(std::set>& broadcast_grids, std::set& rows_to_exclude, std::set& cols_to_exclude) { +void Cluster::generate_tensix_broadcast_grids_for_grayskull( + std::set>& broadcast_grids, + std::set& rows_to_exclude, + std::set& cols_to_exclude) { // If row 0 is not explicitly excluded, exclude it here since its non-tensix rows_to_exclude.insert(0); // If row 11 is excluded, we can close the SOC grid. If not, exclude row 12 to close grid. 
- if(rows_to_exclude.find(11) == rows_to_exclude.end()) { + if (rows_to_exclude.find(11) == rows_to_exclude.end()) { rows_to_exclude.insert(12); } // If col 0 is not explicitly excluded, exclude it here since its non-tensix cols_to_exclude.insert(0); // If col 12 is excluded, we can close the SOC grid. If not, exclude col 13 to close grid. - if(cols_to_exclude.find(12) == cols_to_exclude.end()) { + if (cols_to_exclude.find(12) == cols_to_exclude.end()) { cols_to_exclude.insert(13); } std::vector> bb_x_coords = {}; std::vector> bb_y_coords = {}; // Generate starting and ending x coordinates of each bounding box/grid - for(auto x_it = cols_to_exclude.begin(); x_it != cols_to_exclude.end(); x_it++) { - if(x_it == std::prev(cols_to_exclude.end(), 1)) continue; - if(cols_to_exclude.find(*(x_it) + 1) == cols_to_exclude.end() and cols_to_exclude.find(*(std::next(x_it, 1)) - 1) == cols_to_exclude.end()) { + for (auto x_it = cols_to_exclude.begin(); x_it != cols_to_exclude.end(); x_it++) { + if (x_it == std::prev(cols_to_exclude.end(), 1)) { + continue; + } + if (cols_to_exclude.find(*(x_it) + 1) == cols_to_exclude.end() and + cols_to_exclude.find(*(std::next(x_it, 1)) - 1) == cols_to_exclude.end()) { bb_x_coords.push_back({*(x_it) + 1, *(std::next(x_it, 1)) - 1}); } } - for(auto y_it = rows_to_exclude.begin(); y_it != rows_to_exclude.end(); y_it++) { - if(y_it == std::prev(rows_to_exclude.end(), 1)) continue; - if(rows_to_exclude.find((*y_it) + 1) == rows_to_exclude.end() and rows_to_exclude.find(*std::next(y_it, 1) - 1) == rows_to_exclude.end()) { + for (auto y_it = rows_to_exclude.begin(); y_it != rows_to_exclude.end(); y_it++) { + if (y_it == std::prev(rows_to_exclude.end(), 1)) { + continue; + } + if (rows_to_exclude.find((*y_it) + 1) == rows_to_exclude.end() and + rows_to_exclude.find(*std::next(y_it, 1) - 1) == rows_to_exclude.end()) { bb_y_coords.push_back({*(y_it) + 1, *(std::next(y_it, 1)) - 1}); } } // Assemble x and y coordinates into bounding box vertices - 
for(const auto& x_pair : bb_x_coords) { - for(const auto& y_pair : bb_y_coords) { + for (const auto& x_pair : bb_x_coords) { + for (const auto& y_pair : bb_y_coords) { tt_xy_pair top_left = tt_xy_pair(x_pair.first, y_pair.first); tt_xy_pair bot_right = tt_xy_pair(x_pair.second, y_pair.second); broadcast_grids.insert({top_left, bot_right}); @@ -1993,81 +2454,94 @@ void Cluster::generate_tensix_broadcast_grids_for_grayskull(std::set>>& Cluster::get_ethernet_broadcast_headers(const std::set& chips_to_exclude) { +std::unordered_map>>& Cluster::get_ethernet_broadcast_headers( + const std::set& chips_to_exclude) { // Generate headers for Ethernet Broadcast (WH) only. Each header corresponds to a unique broadcast "grid". - if(bcast_header_cache.find(chips_to_exclude) == bcast_header_cache.end()) { + if (bcast_header_cache.find(chips_to_exclude) == bcast_header_cache.end()) { bcast_header_cache[chips_to_exclude] = {}; - std::unordered_map>> broadcast_mask_for_target_chips_per_group = {}; + std::unordered_map>> + broadcast_mask_for_target_chips_per_group = {}; std::map, std::tuple>> broadcast_header_union_per_group = {}; chip_id_t first_mmio_chip = *(get_target_mmio_device_ids().begin()); - for(const auto& chip : target_devices_in_cluster) { - if(chips_to_exclude.find(chip) == chips_to_exclude.end()) { + for (const auto& chip : target_devices_in_cluster) { + if (chips_to_exclude.find(chip) == chips_to_exclude.end()) { // Get shelf local physical chip id included in broadcast - chip_id_t physical_chip_id = ndesc -> get_shelf_local_physical_chip_coords(chip); - eth_coord_t eth_coords = ndesc -> get_chip_locations().at(chip); + chip_id_t physical_chip_id = ndesc->get_shelf_local_physical_chip_coords(chip); + eth_coord_t eth_coords = ndesc->get_chip_locations().at(chip); // Rack word to be set in header uint32_t rack_word = eth_coords.rack >> 2; // Rack byte to be set in header uint32_t rack_byte = eth_coords.rack % 4; // 1st level grouping: Group broadcasts based on the MMIO 
chip they must go through - // Nebula + Galaxy Topology assumption: Disjoint sets can only be present in the first shelf, with each set connected to host through its closest MMIO chip - // For the first shelf, pass broadcasts to specific chips through their closest MMIO chip - // All other shelves are fully connected galaxy grids. These are connected to all MMIO devices. Use any (or the first) MMIO device in the list. + // Nebula + Galaxy Topology assumption: Disjoint sets can only be present in the first shelf, with each + // set connected to host through its closest MMIO chip For the first shelf, pass broadcasts to specific + // chips through their closest MMIO chip All other shelves are fully connected galaxy grids. These are + // connected to all MMIO devices. Use any (or the first) MMIO device in the list. chip_id_t closest_mmio_chip = 0; if (eth_coords.rack == 0 && eth_coords.shelf == 0) { - // Shelf 0 + Rack 0: Either an MMIO chip or a remote chip potentially connected to host through its own MMIO counterpart. - closest_mmio_chip = ndesc -> get_closest_mmio_capable_chip(chip); - } - else { - // All other shelves: Group these under the same/first MMIO chip, since all MMIO chips are connected. + // Shelf 0 + Rack 0: Either an MMIO chip or a remote chip potentially connected to host through its + // own MMIO counterpart. + closest_mmio_chip = ndesc->get_closest_mmio_capable_chip(chip); + } else { + // All other shelves: Group these under the same/first MMIO chip, since all MMIO chips are + // connected. 
closest_mmio_chip = first_mmio_chip; } - if(broadcast_mask_for_target_chips_per_group.find(closest_mmio_chip) == broadcast_mask_for_target_chips_per_group.end()) { + if (broadcast_mask_for_target_chips_per_group.find(closest_mmio_chip) == + broadcast_mask_for_target_chips_per_group.end()) { broadcast_mask_for_target_chips_per_group.insert({closest_mmio_chip, {}}); } - // For each target physical chip id (local to a shelf), generate headers based on all racks and shelves that contain this physical id. - if(broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).find(physical_chip_id) == broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).end()) { + // For each target physical chip id (local to a shelf), generate headers based on all racks and shelves + // that contain this physical id. + if (broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).find(physical_chip_id) == + broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).end()) { // Target seen for the first time. std::vector broadcast_mask(8, 0); broadcast_mask.at(rack_word) |= (1 << eth_coords.shelf) << rack_byte; broadcast_mask.at(3) |= 1 << physical_chip_id; - broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).insert({physical_chip_id, broadcast_mask}); + broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip) + .insert({physical_chip_id, broadcast_mask}); - } - else { + } else { // Target was seen before -> include curr rack and shelf in header - broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip).at(physical_chip_id).at(rack_word) |= static_cast(1 << eth_coords.shelf) << rack_byte; + broadcast_mask_for_target_chips_per_group.at(closest_mmio_chip) + .at(physical_chip_id) + .at(rack_word) |= static_cast(1 << eth_coords.shelf) << rack_byte; } } } - // 2nd level grouping: For each MMIO group, further group the chips based on their rack and shelf headers. The number of groups after this step represent the final set of broadcast grids. 
- for(auto& mmio_group : broadcast_mask_for_target_chips_per_group) { - for(auto& chip : mmio_group.second) { + // 2nd level grouping: For each MMIO group, further group the chips based on their rack and shelf headers. The + // number of groups after this step represent the final set of broadcast grids. + for (auto& mmio_group : broadcast_mask_for_target_chips_per_group) { + for (auto& chip : mmio_group.second) { // Generate a hash for this MMIO Chip + Rack + Shelf group - std::vector header_hash = {mmio_group.first, chip.second.at(0), chip.second.at(1), chip.second.at(2)}; - if(broadcast_header_union_per_group.find(header_hash) == broadcast_header_union_per_group.end()) { - broadcast_header_union_per_group.insert({header_hash, std::make_tuple(mmio_group.first, chip.second)}); - } - else { + std::vector header_hash = { + mmio_group.first, chip.second.at(0), chip.second.at(1), chip.second.at(2)}; + if (broadcast_header_union_per_group.find(header_hash) == broadcast_header_union_per_group.end()) { + broadcast_header_union_per_group.insert( + {header_hash, std::make_tuple(mmio_group.first, chip.second)}); + } else { // If group found, update chip header entry std::get<1>(broadcast_header_union_per_group.at(header_hash)).at(3) |= chip.second.at(3); } } } // Get all broadcast headers per MMIO group - for(const auto& header : broadcast_header_union_per_group) { + for (const auto& header : broadcast_header_union_per_group) { chip_id_t mmio_chip = std::get<0>(header.second); - if(bcast_header_cache[chips_to_exclude].find(mmio_chip) == bcast_header_cache[chips_to_exclude].end()) { + if (bcast_header_cache[chips_to_exclude].find(mmio_chip) == bcast_header_cache[chips_to_exclude].end()) { bcast_header_cache[chips_to_exclude].insert({mmio_chip, {}}); } bcast_header_cache[chips_to_exclude].at(mmio_chip).push_back(std::get<1>(header.second)); } // Invert headers (FW convention) - for(auto& bcast_group : bcast_header_cache[chips_to_exclude]) { - for(auto& header : 
bcast_group.second) { + for (auto& bcast_group : bcast_header_cache[chips_to_exclude]) { + for (auto& header : bcast_group.second) { int header_idx = 0; - for(auto& header_entry : header) { - if(header_idx == 4) break; + for (auto& header_entry : header) { + if (header_idx == 4) { + break; + } header_entry = ~header_entry; header_idx++; } @@ -2077,14 +2551,23 @@ std::unordered_map>>& Cluster::get_ether return bcast_header_cache[chips_to_exclude]; } -void Cluster::pcie_broadcast_write(chip_id_t chip, const void* mem_ptr, uint32_t size_in_bytes, std::uint32_t addr, const tt_xy_pair& start, const tt_xy_pair& end, const std::string& fallback_tlb) { - // Use the specified TLB to broadcast data to all cores included in the [start, end] grid -> GS Only. Use Ethernet Broadcast for WH. - PCIDevice *pci_device = get_pci_device(chip); +void Cluster::pcie_broadcast_write( + chip_id_t chip, + const void* mem_ptr, + uint32_t size_in_bytes, + std::uint32_t addr, + const tt_xy_pair& start, + const tt_xy_pair& end, + const std::string& fallback_tlb) { + // Use the specified TLB to broadcast data to all cores included in the [start, end] grid -> GS Only. Use Ethernet + // Broadcast for WH. 
+ PCIDevice* pci_device = get_pci_device(chip); const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); const uint8_t* buffer_addr = static_cast(mem_ptr); const scoped_lock lock(*get_mutex(fallback_tlb, pci_device->get_device_num())); - while(size_in_bytes > 0) { - auto [mapped_address, tlb_size] = pci_device->set_dynamic_tlb_broadcast(tlb_index, addr, harvested_coord_translation, start, end, dynamic_tlb_ordering_modes.at(fallback_tlb)); + while (size_in_bytes > 0) { + auto [mapped_address, tlb_size] = pci_device->set_dynamic_tlb_broadcast( + tlb_index, addr, harvested_coord_translation, start, end, dynamic_tlb_ordering_modes.at(fallback_tlb)); uint64_t transfer_size = std::min((uint64_t)size_in_bytes, tlb_size); pci_device->write_block(mapped_address, transfer_size, buffer_addr); @@ -2094,155 +2577,235 @@ void Cluster::pcie_broadcast_write(chip_id_t chip, const void* mem_ptr, uint32_t } } -inline bool tensix_or_eth_in_broadcast(const std::set& cols_to_exclude, const tt::umd::architecture_implementation* architecture_implementation) { +inline bool tensix_or_eth_in_broadcast( + const std::set& cols_to_exclude, + const tt::umd::architecture_implementation* architecture_implementation) { bool found_tensix_or_eth = false; - for(const auto& col : architecture_implementation->get_t6_x_locations()) { + for (const auto& col : architecture_implementation->get_t6_x_locations()) { found_tensix_or_eth |= (cols_to_exclude.find(col) == cols_to_exclude.end()); } return found_tensix_or_eth; } -inline bool valid_tensix_broadcast_grid(const std::set& rows_to_exclude, const std::set& cols_to_exclude, const tt::umd::architecture_implementation* architecture_implementation) { +inline bool valid_tensix_broadcast_grid( + const std::set& rows_to_exclude, + const std::set& cols_to_exclude, + const tt::umd::architecture_implementation* architecture_implementation) { bool t6_bcast_rows_complete = true; bool t6_bcast_rows_empty = true; - - for(const auto& row : 
architecture_implementation->get_t6_y_locations()) { + + for (const auto& row : architecture_implementation->get_t6_y_locations()) { t6_bcast_rows_complete &= (rows_to_exclude.find(row) == rows_to_exclude.end()); t6_bcast_rows_empty &= (rows_to_exclude.find(row) != rows_to_exclude.end()); } return t6_bcast_rows_complete || t6_bcast_rows_empty; } - -void Cluster::ethernet_broadcast_write(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, - const std::set& chips_to_exclude, const std::set& rows_to_exclude, - std::set& cols_to_exclude, const std::string& fallback_tlb, bool use_virtual_coords) { - if(use_ethernet_broadcast) { +void Cluster::ethernet_broadcast_write( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + const std::set& rows_to_exclude, + std::set& cols_to_exclude, + const std::string& fallback_tlb, + bool use_virtual_coords) { + if (use_ethernet_broadcast) { // Broadcast through ERISC core supported - std::unordered_map>>& broadcast_headers = get_ethernet_broadcast_headers(chips_to_exclude); - // Apply row and column exclusion mask explictly. Placing this here if we want to cache the higher level broadcast headers on future/ + std::unordered_map>>& broadcast_headers = + get_ethernet_broadcast_headers(chips_to_exclude); + // Apply row and column exclusion mask explictly. Placing this here if we want to cache the higher level + // broadcast headers on future/ std::uint32_t row_exclusion_mask = 0; std::uint32_t col_exclusion_mask = 0; - for(const auto& row : rows_to_exclude) { + for (const auto& row : rows_to_exclude) { row_exclusion_mask |= 1 << row; } - for(const auto& col : cols_to_exclude) { + for (const auto& col : cols_to_exclude) { col_exclusion_mask |= 1 << (16 + col); } // Write broadcast block to device. 
- for(auto& mmio_group : broadcast_headers) { - for(auto& header : mmio_group.second) { - header.at(4) = use_virtual_coords * 0x8000; // Reset row/col exclusion masks + for (auto& mmio_group : broadcast_headers) { + for (auto& header : mmio_group.second) { + header.at(4) = use_virtual_coords * 0x8000; // Reset row/col exclusion masks header.at(4) |= row_exclusion_mask; header.at(4) |= col_exclusion_mask; // Write Target: x-y endpoint is a don't care. Initialize to tt_xy_pair(1, 1) - write_to_non_mmio_device(mem_ptr, size_in_bytes, tt_cxy_pair(mmio_group.first, tt_xy_pair(1, 1)), address, true, header); + write_to_non_mmio_device( + mem_ptr, size_in_bytes, tt_cxy_pair(mmio_group.first, tt_xy_pair(1, 1)), address, true, header); } } - } - else { + } else { // Broadcast not supported. Implement this at the software level as a for loop std::vector cores_to_write = {}; - for(const auto& chip : target_devices_in_cluster) { - if(chips_to_exclude.find(chip) != chips_to_exclude.end()) continue; - for(const auto& core : get_soc_descriptor(chip).cores) { - if(cols_to_exclude.find(core.first.x) == cols_to_exclude.end() and rows_to_exclude.find(core.first.y) == rows_to_exclude.end() and core.second.type != CoreType::HARVESTED) { - write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(chip, core.first.x, core.first.y), address, fallback_tlb); + for (const auto& chip : target_devices_in_cluster) { + if (chips_to_exclude.find(chip) != chips_to_exclude.end()) { + continue; + } + for (const auto& core : get_soc_descriptor(chip).cores) { + if (cols_to_exclude.find(core.first.x) == cols_to_exclude.end() and + rows_to_exclude.find(core.first.y) == rows_to_exclude.end() and + core.second.type != CoreType::HARVESTED) { + write_to_device( + mem_ptr, size_in_bytes, tt_cxy_pair(chip, core.first.x, core.first.y), address, fallback_tlb); } } } } } -void Cluster::broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, - const std::set& chips_to_exclude, 
std::set& rows_to_exclude, std::set& cols_to_exclude, const std::string& fallback_tlb) { +void Cluster::broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& rows_to_exclude, + std::set& cols_to_exclude, + const std::string& fallback_tlb) { if (arch_name == tt::ARCH::GRAYSKULL) { // Device FW disables broadcasts to all non tensix cores. std::vector dram_cores_to_write = {}; std::vector dram_rows = {0, 6}; std::vector dram_cols = {1, 4, 7, 10}; - for(const auto& row : dram_rows) { - for(const auto& col : dram_cols) { - if(rows_to_exclude.find(row) == rows_to_exclude.end() and cols_to_exclude.find(col) == cols_to_exclude.end()) { + for (const auto& row : dram_rows) { + for (const auto& col : dram_cols) { + if (rows_to_exclude.find(row) == rows_to_exclude.end() and + cols_to_exclude.find(col) == cols_to_exclude.end()) { dram_cores_to_write.push_back(tt_xy_pair(col, row)); } } } - + std::set> broadcast_grids = {}; generate_tensix_broadcast_grids_for_grayskull(broadcast_grids, rows_to_exclude, cols_to_exclude); - for(const auto& chip : target_devices_in_cluster) { - if(chips_to_exclude.find(chip) != chips_to_exclude.end()) continue; - for(const auto& dram : dram_cores_to_write) { + for (const auto& chip : target_devices_in_cluster) { + if (chips_to_exclude.find(chip) != chips_to_exclude.end()) { + continue; + } + for (const auto& dram : dram_cores_to_write) { write_device_memory(mem_ptr, size_in_bytes, tt_cxy_pair(chip, dram), address, fallback_tlb); } - for(const auto& grid : broadcast_grids) { + for (const auto& grid : broadcast_grids) { pcie_broadcast_write(chip, mem_ptr, size_in_bytes, address, grid.first, grid.second, fallback_tlb); } - } - } - else if (arch_name == tt::ARCH::BLACKHOLE) { + } + } else if (arch_name == tt::ARCH::BLACKHOLE) { auto architecture_implementation = tt::umd::architecture_implementation::create(arch_name); - if(cols_to_exclude.find(0) == 
cols_to_exclude.end() or cols_to_exclude.find(9) == cols_to_exclude.end()) { - log_assert(!tensix_or_eth_in_broadcast(cols_to_exclude, architecture_implementation.get()), "Cannot broadcast to tensix/ethernet and DRAM simultaneously on Blackhole."); - if(cols_to_exclude.find(0) == cols_to_exclude.end()) { + if (cols_to_exclude.find(0) == cols_to_exclude.end() or cols_to_exclude.find(9) == cols_to_exclude.end()) { + log_assert( + !tensix_or_eth_in_broadcast(cols_to_exclude, architecture_implementation.get()), + "Cannot broadcast to tensix/ethernet and DRAM simultaneously on Blackhole."); + if (cols_to_exclude.find(0) == cols_to_exclude.end()) { // When broadcast includes column zero do not exclude anything std::set unsafe_rows = {}; std::set cols_to_exclude_for_col_0_bcast = cols_to_exclude; std::set rows_to_exclude_for_col_0_bcast = rows_to_exclude; cols_to_exclude_for_col_0_bcast.insert(9); rows_to_exclude_for_col_0_bcast.insert(unsafe_rows.begin(), unsafe_rows.end()); - ethernet_broadcast_write(mem_ptr, size_in_bytes, address, chips_to_exclude, - rows_to_exclude_for_col_0_bcast, cols_to_exclude_for_col_0_bcast, fallback_tlb, false); + ethernet_broadcast_write( + mem_ptr, + size_in_bytes, + address, + chips_to_exclude, + rows_to_exclude_for_col_0_bcast, + cols_to_exclude_for_col_0_bcast, + fallback_tlb, + false); } - if(cols_to_exclude.find(9) == cols_to_exclude.end()) { + if (cols_to_exclude.find(9) == cols_to_exclude.end()) { std::set cols_to_exclude_for_col_9_bcast = cols_to_exclude; cols_to_exclude_for_col_9_bcast.insert(0); - ethernet_broadcast_write(mem_ptr, size_in_bytes, address, chips_to_exclude, - rows_to_exclude, cols_to_exclude_for_col_9_bcast, fallback_tlb, false); + ethernet_broadcast_write( + mem_ptr, + size_in_bytes, + address, + chips_to_exclude, + rows_to_exclude, + cols_to_exclude_for_col_9_bcast, + fallback_tlb, + false); } + } else { + log_assert( + use_virtual_coords_for_eth_broadcast or + valid_tensix_broadcast_grid(rows_to_exclude, 
cols_to_exclude, architecture_implementation.get()), + "Must broadcast to all tensix rows when ERISC FW is < 6.8.0."); + ethernet_broadcast_write( + mem_ptr, + size_in_bytes, + address, + chips_to_exclude, + rows_to_exclude, + cols_to_exclude, + fallback_tlb, + use_virtual_coords_for_eth_broadcast); } - else { - log_assert(use_virtual_coords_for_eth_broadcast or valid_tensix_broadcast_grid(rows_to_exclude, cols_to_exclude, architecture_implementation.get()), - "Must broadcast to all tensix rows when ERISC FW is < 6.8.0."); - ethernet_broadcast_write(mem_ptr, size_in_bytes, address, chips_to_exclude, - rows_to_exclude, cols_to_exclude, fallback_tlb, use_virtual_coords_for_eth_broadcast); - } - } - else { + } else { auto architecture_implementation = tt::umd::architecture_implementation::create(arch_name); - if(cols_to_exclude.find(0) == cols_to_exclude.end() or cols_to_exclude.find(5) == cols_to_exclude.end()) { - log_assert(!tensix_or_eth_in_broadcast(cols_to_exclude, architecture_implementation.get()), "Cannot broadcast to tensix/ethernet and DRAM simultaneously on Wormhole."); - if(cols_to_exclude.find(0) == cols_to_exclude.end()) { - // When broadcast includes column zero Exclude PCIe, ARC and router cores from broadcast explictly, since writing to these is unsafe - // ERISC FW does not exclude these. + if (cols_to_exclude.find(0) == cols_to_exclude.end() or cols_to_exclude.find(5) == cols_to_exclude.end()) { + log_assert( + !tensix_or_eth_in_broadcast(cols_to_exclude, architecture_implementation.get()), + "Cannot broadcast to tensix/ethernet and DRAM simultaneously on Wormhole."); + if (cols_to_exclude.find(0) == cols_to_exclude.end()) { + // When broadcast includes column zero Exclude PCIe, ARC and router cores from broadcast explictly, + // since writing to these is unsafe ERISC FW does not exclude these. 
std::set unsafe_rows = {2, 3, 4, 8, 9, 10}; std::set cols_to_exclude_for_col_0_bcast = cols_to_exclude; std::set rows_to_exclude_for_col_0_bcast = rows_to_exclude; cols_to_exclude_for_col_0_bcast.insert(5); rows_to_exclude_for_col_0_bcast.insert(unsafe_rows.begin(), unsafe_rows.end()); - ethernet_broadcast_write(mem_ptr, size_in_bytes, address, chips_to_exclude, - rows_to_exclude_for_col_0_bcast, cols_to_exclude_for_col_0_bcast, fallback_tlb, false); + ethernet_broadcast_write( + mem_ptr, + size_in_bytes, + address, + chips_to_exclude, + rows_to_exclude_for_col_0_bcast, + cols_to_exclude_for_col_0_bcast, + fallback_tlb, + false); } - if(cols_to_exclude.find(5) == cols_to_exclude.end()) { + if (cols_to_exclude.find(5) == cols_to_exclude.end()) { std::set cols_to_exclude_for_col_5_bcast = cols_to_exclude; cols_to_exclude_for_col_5_bcast.insert(0); - ethernet_broadcast_write(mem_ptr, size_in_bytes, address, chips_to_exclude, - rows_to_exclude, cols_to_exclude_for_col_5_bcast, fallback_tlb, false); + ethernet_broadcast_write( + mem_ptr, + size_in_bytes, + address, + chips_to_exclude, + rows_to_exclude, + cols_to_exclude_for_col_5_bcast, + fallback_tlb, + false); } - } - else { - log_assert(use_virtual_coords_for_eth_broadcast or valid_tensix_broadcast_grid(rows_to_exclude, cols_to_exclude, architecture_implementation.get()), - "Must broadcast to all tensix rows when ERISC FW is < 6.8.0."); - ethernet_broadcast_write(mem_ptr, size_in_bytes, address, chips_to_exclude, - rows_to_exclude, cols_to_exclude, fallback_tlb, use_virtual_coords_for_eth_broadcast); - } - } -} - -int Cluster::remote_arc_msg(int chip, uint32_t msg_code, bool wait_for_done, uint32_t arg0, uint32_t arg1, int timeout, uint32_t *return_3, uint32_t *return_4) { + } else { + log_assert( + use_virtual_coords_for_eth_broadcast or + valid_tensix_broadcast_grid(rows_to_exclude, cols_to_exclude, architecture_implementation.get()), + "Must broadcast to all tensix rows when ERISC FW is < 6.8.0."); + 
ethernet_broadcast_write( + mem_ptr, + size_in_bytes, + address, + chips_to_exclude, + rows_to_exclude, + cols_to_exclude, + fallback_tlb, + use_virtual_coords_for_eth_broadcast); + } + } +} + +int Cluster::remote_arc_msg( + int chip, + uint32_t msg_code, + bool wait_for_done, + uint32_t arg0, + uint32_t arg1, + int timeout, + uint32_t* return_3, + uint32_t* return_4) { constexpr uint64_t ARC_RESET_SCRATCH_ADDR = 0x880030060; constexpr uint64_t ARC_RESET_MISC_CNTL_ADDR = 0x880030100; @@ -2251,18 +2814,14 @@ int Cluster::remote_arc_msg(int chip, uint32_t msg_code, bool wait_for_done, uin if ((msg_code & 0xff00) != 0xaa00) { log_error("Malformed message. msg_code is 0x{:x} but should be 0xaa..", msg_code); } - log_assert (arg0 <= 0xffff and arg1 <= 0xffff, "Only 16 bits allowed in arc_msg args"); // Only 16 bits are allowed + log_assert(arg0 <= 0xffff and arg1 <= 0xffff, "Only 16 bits allowed in arc_msg args"); // Only 16 bits are allowed - uint32_t fw_arg = arg0 | (arg1<<16); + uint32_t fw_arg = arg0 | (arg1 << 16); int exit_code = 0; - { - write_to_non_mmio_device(&fw_arg, sizeof(fw_arg), core, ARC_RESET_SCRATCH_ADDR + 3 * 4); - } + { write_to_non_mmio_device(&fw_arg, sizeof(fw_arg), core, ARC_RESET_SCRATCH_ADDR + 3 * 4); } - { - write_to_non_mmio_device(&msg_code, sizeof(fw_arg), core, ARC_RESET_SCRATCH_ADDR + 5 * 4); - } + { write_to_non_mmio_device(&msg_code, sizeof(fw_arg), core, ARC_RESET_SCRATCH_ADDR + 5 * 4); } wait_for_non_mmio_flush(); uint32_t misc = 0; @@ -2284,7 +2843,11 @@ int Cluster::remote_arc_msg(int chip, uint32_t msg_code, bool wait_for_done, uin if (std::chrono::system_clock::now() - start > timeout_seconds) { std::stringstream ss; ss << std::hex << msg_code; - throw std::runtime_error(fmt::format("Timed out after waiting {} seconds for device {} ARC to respond to message 0x{}", timeout, chip, ss.str())); + throw std::runtime_error(fmt::format( + "Timed out after waiting {} seconds for device {} ARC to respond to message 0x{}", + timeout, + 
chip, + ss.str())); } uint32_t status = 0; @@ -2310,7 +2873,8 @@ int Cluster::remote_arc_msg(int chip, uint32_t msg_code, bool wait_for_done, uin return exit_code; } -void Cluster::write_to_sysmem(const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id) { +void Cluster::write_to_sysmem( + const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id) { write_buffer(mem_ptr, size, addr, channel, src_device_id); } @@ -2318,58 +2882,86 @@ void Cluster::read_from_sysmem(void* mem_ptr, uint64_t addr, uint16_t channel, u read_buffer(mem_ptr, addr, channel, size, src_device_id); } -void Cluster::set_membar_flag(const chip_id_t chip, const std::unordered_set& cores, const uint32_t barrier_value, const uint32_t barrier_addr, const std::string& fallback_tlb) { - tt_driver_atomics::sfence(); // Ensure that writes before this do not get reordered +void Cluster::set_membar_flag( + const chip_id_t chip, + const std::unordered_set& cores, + const uint32_t barrier_value, + const uint32_t barrier_addr, + const std::string& fallback_tlb) { + tt_driver_atomics::sfence(); // Ensure that writes before this do not get reordered std::unordered_set cores_synced = {}; std::vector barrier_val_vec = {barrier_value}; for (const auto& core : cores) { - write_to_device(barrier_val_vec.data(), barrier_val_vec.size() * sizeof(uint32_t), tt_cxy_pair(chip, core), barrier_addr, fallback_tlb); - } - tt_driver_atomics::sfence(); // Ensure that all writes in the Host WC buffer are flushed + write_to_device( + barrier_val_vec.data(), + barrier_val_vec.size() * sizeof(uint32_t), + tt_cxy_pair(chip, core), + barrier_addr, + fallback_tlb); + } + tt_driver_atomics::sfence(); // Ensure that all writes in the Host WC buffer are flushed while (cores_synced.size() != cores.size()) { - for(const auto& core : cores) { + for (const auto& core : cores) { if (cores_synced.find(core) == cores_synced.end()) { uint32_t readback_val; - 
read_from_device(&readback_val, tt_cxy_pair(chip, core), barrier_addr, sizeof(std::uint32_t), fallback_tlb); + read_from_device( + &readback_val, tt_cxy_pair(chip, core), barrier_addr, sizeof(std::uint32_t), fallback_tlb); if (readback_val == barrier_value) { cores_synced.insert(core); - } - else { - log_trace(LogSiliconDriver, "Waiting for core {} to recieve mem bar flag {} in function", core.str(), barrier_value); + } else { + log_trace( + LogSiliconDriver, + "Waiting for core {} to recieve mem bar flag {} in function", + core.str(), + barrier_value); } } } } // Ensure that reads or writes after this do not get reordered. // Reordering can cause races where data gets transferred before the barrier has returned - tt_driver_atomics::mfence(); + tt_driver_atomics::mfence(); } -void Cluster::insert_host_to_device_barrier(const chip_id_t chip, const std::unordered_set& cores, const uint32_t barrier_addr, const std::string& fallback_tlb) { +void Cluster::insert_host_to_device_barrier( + const chip_id_t chip, + const std::unordered_set& cores, + const uint32_t barrier_addr, + const std::string& fallback_tlb) { // Ensure that this memory barrier is atomic across processes/threads - const scoped_lock lock(*get_mutex(MEM_BARRIER_MUTEX_NAME, this->get_pci_device(chip)->get_device_num())); + const scoped_lock lock( + *get_mutex(MEM_BARRIER_MUTEX_NAME, this->get_pci_device(chip)->get_device_num())); set_membar_flag(chip, cores, tt_MemBarFlag::SET, barrier_addr, fallback_tlb); set_membar_flag(chip, cores, tt_MemBarFlag::RESET, barrier_addr, fallback_tlb); } void Cluster::init_membars() { - for(const auto& chip : target_devices_in_cluster) { - if (ndesc -> is_chip_mmio_capable(chip)) { - set_membar_flag(chip, workers_per_chip.at(chip), tt_MemBarFlag::RESET, l1_address_params.tensix_l1_barrier_base, "LARGE_WRITE_TLB"); - set_membar_flag(chip, eth_cores, tt_MemBarFlag::RESET, l1_address_params.eth_l1_barrier_base, "LARGE_WRITE_TLB"); - set_membar_flag(chip, dram_cores, 
tt_MemBarFlag::RESET, dram_address_params.DRAM_BARRIER_BASE, "LARGE_WRITE_TLB"); + for (const auto& chip : target_devices_in_cluster) { + if (ndesc->is_chip_mmio_capable(chip)) { + set_membar_flag( + chip, + workers_per_chip.at(chip), + tt_MemBarFlag::RESET, + l1_address_params.tensix_l1_barrier_base, + "LARGE_WRITE_TLB"); + set_membar_flag( + chip, eth_cores, tt_MemBarFlag::RESET, l1_address_params.eth_l1_barrier_base, "LARGE_WRITE_TLB"); + set_membar_flag( + chip, dram_cores, tt_MemBarFlag::RESET, dram_address_params.DRAM_BARRIER_BASE, "LARGE_WRITE_TLB"); } } } -void Cluster::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { - if (ndesc -> is_chip_mmio_capable(chip)) { + +void Cluster::l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { + if (ndesc->is_chip_mmio_capable(chip)) { const auto& all_workers = workers_per_chip.at(chip); const auto& all_eth = eth_cores; if (cores.size()) { // Insert barrier on specific cores with L1 std::unordered_set workers_to_sync = {}; std::unordered_set eth_to_sync = {}; - + for (const auto& core : cores) { if (all_workers.find(core) != all_workers.end()) { workers_to_sync.insert(core); @@ -2379,59 +2971,60 @@ void Cluster::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, c log_fatal("Can only insert an L1 Memory barrier on Tensix or Ethernet cores."); } } - insert_host_to_device_barrier(chip, workers_to_sync, l1_address_params.tensix_l1_barrier_base, fallback_tlb); + insert_host_to_device_barrier( + chip, workers_to_sync, l1_address_params.tensix_l1_barrier_base, fallback_tlb); insert_host_to_device_barrier(chip, eth_to_sync, l1_address_params.eth_l1_barrier_base, fallback_tlb); } else { // Insert barrier on all cores with L1 insert_host_to_device_barrier(chip, all_workers, l1_address_params.tensix_l1_barrier_base, fallback_tlb); insert_host_to_device_barrier(chip, all_eth, l1_address_params.eth_l1_barrier_base, 
fallback_tlb); } - } - else { + } else { wait_for_non_mmio_flush(); } } -void Cluster::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { - if (ndesc -> is_chip_mmio_capable(chip)) { +void Cluster::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { + if (ndesc->is_chip_mmio_capable(chip)) { if (cores.size()) { - for(const auto& core : cores) { - log_assert(dram_cores.find(core) != dram_cores.end(), "Can only insert a DRAM Memory barrier on DRAM cores."); + for (const auto& core : cores) { + log_assert( + dram_cores.find(core) != dram_cores.end(), "Can only insert a DRAM Memory barrier on DRAM cores."); } insert_host_to_device_barrier(chip, cores, dram_address_params.DRAM_BARRIER_BASE, fallback_tlb); - } - else { + } else { // Insert Barrier on all DRAM Cores insert_host_to_device_barrier(chip, dram_cores, dram_address_params.DRAM_BARRIER_BASE, fallback_tlb); } - } - else { + } else { wait_for_non_mmio_flush(); } } -void Cluster::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) { - if (ndesc -> is_chip_mmio_capable(chip)) { +void Cluster::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) { + if (ndesc->is_chip_mmio_capable(chip)) { if (channels.size()) { std::unordered_set dram_cores_to_sync = {}; - for(const auto& chan : channels) { + for (const auto& chan : channels) { dram_cores_to_sync.insert(get_soc_descriptor(chip).get_core_for_dram_channel(chan, 0)); } - insert_host_to_device_barrier(chip, dram_cores_to_sync, dram_address_params.DRAM_BARRIER_BASE, fallback_tlb); - } - else { + insert_host_to_device_barrier( + chip, dram_cores_to_sync, dram_address_params.DRAM_BARRIER_BASE, fallback_tlb); + } else { // Insert Barrier on all DRAM Cores insert_host_to_device_barrier(chip, dram_cores, dram_address_params.DRAM_BARRIER_BASE, fallback_tlb); } - } - else { 
+ } else { wait_for_non_mmio_flush(); } } -void Cluster::write_to_device(const void *mem_ptr, uint32_t size, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) { - bool target_is_mmio_capable = ndesc -> is_chip_mmio_capable(core.chip); - if(target_is_mmio_capable) { +void Cluster::write_to_device( + const void* mem_ptr, uint32_t size, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) { + bool target_is_mmio_capable = ndesc->is_chip_mmio_capable(core.chip); + if (target_is_mmio_capable) { if (fallback_tlb == "REG_TLB") { write_mmio_device_register(mem_ptr, core, addr, size, fallback_tlb); } else { @@ -2439,100 +3032,118 @@ void Cluster::write_to_device(const void *mem_ptr, uint32_t size, tt_cxy_pair co } } else { log_assert(arch_name != tt::ARCH::BLACKHOLE, "Non-MMIO targets not supported in Blackhole"); - log_assert((get_soc_descriptor(core.chip).ethernet_cores).size() > 0 && get_number_of_chips_in_cluster() > 1, "Cannot issue ethernet writes to a single chip cluster!"); + log_assert( + (get_soc_descriptor(core.chip).ethernet_cores).size() > 0 && get_number_of_chips_in_cluster() > 1, + "Cannot issue ethernet writes to a single chip cluster!"); write_to_non_mmio_device(mem_ptr, size, core, addr); } } -void Cluster::read_mmio_device_register(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { - PCIDevice *pci_device = get_pci_device(core.chip); +void Cluster::read_mmio_device_register( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { + PCIDevice* pci_device = get_pci_device(core.chip); const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); const scoped_lock lock(*get_mutex(fallback_tlb, pci_device->get_device_num())); log_debug(LogSiliconDriver, " dynamic tlb_index: {}", tlb_index); - auto [mapped_address, tlb_size] = pci_device->set_dynamic_tlb(tlb_index, core, addr, harvested_coord_translation, TLB_DATA::Strict); - // Align block to 
4bytes if needed. + auto [mapped_address, tlb_size] = + pci_device->set_dynamic_tlb(tlb_index, core, addr, harvested_coord_translation, TLB_DATA::Strict); + // Align block to 4bytes if needed. auto aligned_buf = tt_4_byte_aligned_buffer(mem_ptr, size); pci_device->read_regs(mapped_address, aligned_buf.block_size / sizeof(std::uint32_t), aligned_buf.local_storage); - if(aligned_buf.input_size != aligned_buf.block_size) { + if (aligned_buf.input_size != aligned_buf.block_size) { // Copy value from aligned buffer to main buffer. std::memcpy(mem_ptr, aligned_buf.local_storage, size); } } - -void Cluster::write_mmio_device_register(const void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { - PCIDevice *pci_device = get_pci_device(core.chip); +void Cluster::write_mmio_device_register( + const void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { + PCIDevice* pci_device = get_pci_device(core.chip); const auto tlb_index = dynamic_tlb_config.at(fallback_tlb); const scoped_lock lock(*get_mutex(fallback_tlb, pci_device->get_device_num())); log_debug(LogSiliconDriver, " dynamic tlb_index: {}", tlb_index); - auto [mapped_address, tlb_size] = pci_device->set_dynamic_tlb(tlb_index, core, addr, harvested_coord_translation, TLB_DATA::Strict); - // Align block to 4bytes if needed. + auto [mapped_address, tlb_size] = + pci_device->set_dynamic_tlb(tlb_index, core, addr, harvested_coord_translation, TLB_DATA::Strict); + // Align block to 4bytes if needed. 
auto aligned_buf = tt_4_byte_aligned_buffer(mem_ptr, size); - if(aligned_buf.input_size != aligned_buf.block_size) { + if (aligned_buf.input_size != aligned_buf.block_size) { // Copy value from main buffer to aligned buffer std::memcpy(aligned_buf.local_storage, mem_ptr, size); } pci_device->write_regs(mapped_address, aligned_buf.block_size / sizeof(uint32_t), aligned_buf.local_storage); } -void Cluster::read_from_device(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { - bool target_is_mmio_capable = ndesc -> is_chip_mmio_capable(core.chip); +void Cluster::read_from_device( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { + bool target_is_mmio_capable = ndesc->is_chip_mmio_capable(core.chip); if (target_is_mmio_capable) { if (fallback_tlb == "REG_TLB") { read_mmio_device_register(mem_ptr, core, addr, size, fallback_tlb); } else { read_device_memory(mem_ptr, core, addr, size, fallback_tlb); } - } - else { - log_assert(arch_name != tt::ARCH::BLACKHOLE, "Non-MMIO targets not supported in Blackhole"); // MT: Use only dynamic TLBs and never program static - log_assert((get_soc_descriptor(core.chip).ethernet_cores).size() > 0 && get_number_of_chips_in_cluster() > 1, "Cannot issue ethernet reads from a single chip cluster!"); + } else { + log_assert( + arch_name != tt::ARCH::BLACKHOLE, + "Non-MMIO targets not supported in Blackhole"); // MT: Use only dynamic TLBs and never program static + log_assert( + (get_soc_descriptor(core.chip).ethernet_cores).size() > 0 && get_number_of_chips_in_cluster() > 1, + "Cannot issue ethernet reads from a single chip cluster!"); read_from_non_mmio_device(mem_ptr, core, addr, size); } } -int Cluster::arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done, uint32_t arg0, uint32_t arg1, int timeout, uint32_t *return_3, uint32_t *return_4) { +int Cluster::arc_msg( + int logical_device_id, + uint32_t msg_code, + bool wait_for_done, 
+ uint32_t arg0, + uint32_t arg1, + int timeout, + uint32_t* return_3, + uint32_t* return_4) { log_assert(arch_name != tt::ARCH::BLACKHOLE, "ARC messages not supported in Blackhole"); - if(ndesc -> is_chip_mmio_capable(logical_device_id)) { + if (ndesc->is_chip_mmio_capable(logical_device_id)) { return pcie_arc_msg(logical_device_id, msg_code, wait_for_done, arg0, arg1, timeout, return_3, return_4); - } - else { + } else { return remote_arc_msg(logical_device_id, msg_code, wait_for_done, arg0, arg1, timeout, return_3, return_4); } } -void Cluster::send_tensix_risc_reset_to_core(const tt_cxy_pair &core, const TensixSoftResetOptions &soft_resets) { +void Cluster::send_tensix_risc_reset_to_core(const tt_cxy_pair& core, const TensixSoftResetOptions& soft_resets) { auto valid = soft_resets & ALL_TENSIX_SOFT_RESET; - uint32_t valid_val = (std::underlying_type::type) valid; + uint32_t valid_val = (std::underlying_type::type)valid; write_to_device(&valid_val, sizeof(uint32_t), core, 0xFFB121B0, "REG_TLB"); tt_driver_atomics::sfence(); } -void Cluster::send_remote_tensix_risc_reset_to_core(const tt_cxy_pair &core, const TensixSoftResetOptions &soft_resets) { +void Cluster::send_remote_tensix_risc_reset_to_core( + const tt_cxy_pair& core, const TensixSoftResetOptions& soft_resets) { auto valid = soft_resets & ALL_TENSIX_SOFT_RESET; - uint32_t valid_val = (std::underlying_type::type) valid; + uint32_t valid_val = (std::underlying_type::type)valid; write_to_non_mmio_device(&valid_val, sizeof(uint32_t), core, 0xFFB121B0); tt_driver_atomics::sfence(); } -int Cluster::set_remote_power_state(const chip_id_t &chip, tt_DevicePowerState device_state) { +int Cluster::set_remote_power_state(const chip_id_t& chip, tt_DevicePowerState device_state) { auto mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(chip); - return remote_arc_msg(chip, get_power_state_arc_msg(mmio_capable_chip_logical, device_state), true, 0, 0, 1, NULL, NULL); + return remote_arc_msg( + chip, 
get_power_state_arc_msg(mmio_capable_chip_logical, device_state), true, 0, 0, 1, NULL, NULL); } - void Cluster::enable_remote_ethernet_queue(const chip_id_t& chip, int timeout) { uint32_t msg_success = 0x0; auto timeout_seconds = std::chrono::seconds(timeout); auto start = std::chrono::system_clock::now(); while (msg_success != 1) { if (std::chrono::system_clock::now() - start > timeout_seconds) { - throw std::runtime_error(fmt::format("Timed out after waiting {} seconds for DRAM to finish training", timeout)); + throw std::runtime_error( + fmt::format("Timed out after waiting {} seconds for DRAM to finish training", timeout)); } int msg_rt = remote_arc_msg(chip, 0xaa58, true, 0xFFFF, 0xFFFF, 1, &msg_success, NULL); if (msg_rt == MSG_ERROR_REPLY) { @@ -2541,16 +3152,14 @@ void Cluster::enable_remote_ethernet_queue(const chip_id_t& chip, int timeout) { } } - -void Cluster::broadcast_tensix_risc_reset_to_cluster(const TensixSoftResetOptions &soft_resets) { - if(arch_name == tt::ARCH::GRAYSKULL) { - for (auto &device_it : m_pci_device_map) { +void Cluster::broadcast_tensix_risc_reset_to_cluster(const TensixSoftResetOptions& soft_resets) { + if (arch_name == tt::ARCH::GRAYSKULL) { + for (auto& device_it : m_pci_device_map) { broadcast_pcie_tensix_risc_reset(device_it.first, soft_resets); } - } - else { + } else { auto valid = soft_resets & ALL_TENSIX_SOFT_RESET; - uint32_t valid_val = (std::underlying_type::type) valid; + uint32_t valid_val = (std::underlying_type::type)valid; std::set chips_to_exclude = {}; std::set rows_to_exclude; std::set columns_to_exclude; @@ -2562,7 +3171,14 @@ void Cluster::broadcast_tensix_risc_reset_to_cluster(const TensixSoftResetOption columns_to_exclude = {0, 5}; } std::string fallback_tlb = "LARGE_WRITE_TLB"; - broadcast_write_to_cluster(&valid_val, sizeof(uint32_t), 0xFFB121B0, chips_to_exclude, rows_to_exclude, columns_to_exclude, fallback_tlb); + broadcast_write_to_cluster( + &valid_val, + sizeof(uint32_t), + 0xFFB121B0, + 
chips_to_exclude, + rows_to_exclude, + columns_to_exclude, + fallback_tlb); // Ensure that reset signal is globally visible wait_for_non_mmio_flush(); } @@ -2571,22 +3187,23 @@ void Cluster::broadcast_tensix_risc_reset_to_cluster(const TensixSoftResetOption void Cluster::set_power_state(tt_DevicePowerState device_state) { // MT Initial BH - ARC messages not supported in Blackhole if (arch_name != tt::ARCH::BLACKHOLE) { - for(auto& chip : target_devices_in_cluster) { - if(ndesc -> is_chip_mmio_capable(chip)) { + for (auto& chip : target_devices_in_cluster) { + if (ndesc->is_chip_mmio_capable(chip)) { set_pcie_power_state(device_state); } else { int exit_code = set_remote_power_state(chip, device_state); - log_assert(exit_code == 0, "Failed to set power state to {} with exit code: {}", (int)device_state, exit_code); + log_assert( + exit_code == 0, "Failed to set power state to {} with exit code: {}", (int)device_state, exit_code); } } } } void Cluster::enable_ethernet_queue(int timeout) { - for (const chip_id_t &chip : target_devices_in_cluster) { + for (const chip_id_t& chip : target_devices_in_cluster) { auto arch = get_soc_descriptor(chip).arch; - switch (arch) { + switch (arch) { case tt::ARCH::WORMHOLE_B0: { if (ndesc->is_chip_mmio_capable(chip)) { enable_local_ethernet_queue(chip, timeout); @@ -2595,20 +3212,17 @@ void Cluster::enable_ethernet_queue(int timeout) { } break; - case tt::ARCH::BLACKHOLE: - log_assert(false, "Arch BLACKHOLE doesn't support ethernet queues yet"); + case tt::ARCH::BLACKHOLE: + log_assert(false, "Arch BLACKHOLE doesn't support ethernet queues yet"); } default: { break; } } - } } -std::set Cluster::get_target_remote_device_ids() { - return target_remote_chips; -} +std::set Cluster::get_target_remote_device_ids() { return target_remote_chips; } void Cluster::deassert_resets_and_set_power_state() { // Assert tensix resets on all chips in cluster @@ -2617,15 +3231,29 @@ void Cluster::deassert_resets_and_set_power_state() { // MT Initial BH 
- ARC messages not supported in Blackhole if (arch_name != tt::ARCH::BLACKHOLE) { // Send ARC Messages to deassert RISCV resets - for (auto &device_it : m_pci_device_map){ - arc_msg(device_it.first, 0xaa00 | device_it.second.get()->get_architecture_implementation()->get_arc_message_deassert_riscv_reset(), true, 0, 0); - } - if(ndesc != nullptr) { - for(const chip_id_t& chip : target_devices_in_cluster) { - if(!ndesc -> is_chip_mmio_capable(chip)) { + for (auto& device_it : m_pci_device_map) { + arc_msg( + device_it.first, + 0xaa00 | + device_it.second.get()->get_architecture_implementation()->get_arc_message_deassert_riscv_reset(), + true, + 0, + 0); + } + if (ndesc != nullptr) { + for (const chip_id_t& chip : target_devices_in_cluster) { + if (!ndesc->is_chip_mmio_capable(chip)) { auto mmio_capable_chip_logical = ndesc->get_closest_mmio_capable_chip(chip); auto pci_device = get_pci_device(mmio_capable_chip_logical); - remote_arc_msg(chip, 0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_deassert_riscv_reset(), true, 0x0, 0x0, 1, NULL, NULL); + remote_arc_msg( + chip, + 0xaa00 | pci_device->get_architecture_implementation()->get_arc_message_deassert_riscv_reset(), + true, + 0x0, + 0x0, + 1, + NULL, + NULL); } } enable_ethernet_queue(30); @@ -2636,11 +3264,16 @@ void Cluster::deassert_resets_and_set_power_state() { } void Cluster::verify_eth_fw() { - for(const auto& chip : target_devices_in_cluster) { + for (const auto& chip : target_devices_in_cluster) { uint32_t fw_version; std::vector fw_versions; - for (const tt_xy_pair ð_core : get_soc_descriptor(chip).ethernet_cores) { - read_from_device(&fw_version, tt_cxy_pair(chip, eth_core), l1_address_params.fw_version_addr, sizeof(uint32_t), "LARGE_READ_TLB"); + for (const tt_xy_pair& eth_core : get_soc_descriptor(chip).ethernet_cores) { + read_from_device( + &fw_version, + tt_cxy_pair(chip, eth_core), + l1_address_params.fw_version_addr, + sizeof(uint32_t), + "LARGE_READ_TLB"); 
fw_versions.push_back(fw_version); } verify_sw_fw_versions(chip, SW_VERSION, fw_versions); @@ -2648,7 +3281,7 @@ void Cluster::verify_eth_fw() { } } -void Cluster::verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std::vector &fw_versions) { +void Cluster::verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std::vector& fw_versions) { tt_version sw(sw_version), fw_first_eth_core(fw_versions.at(0)); log_info( LogSiliconDriver, @@ -2656,7 +3289,7 @@ void Cluster::verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std sw.str(), fw_first_eth_core.str(), device_id); - for (std::uint32_t &fw_version : fw_versions) { + for (std::uint32_t& fw_version : fw_versions) { tt_version fw(fw_version); log_assert(fw == fw_first_eth_core, "FW versions are not the same across different ethernet cores"); log_assert(sw.major == fw.major, "SW/FW major version number out of sync"); @@ -2669,14 +3302,16 @@ void Cluster::verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std use_ethernet_broadcast &= fw_first_eth_core >= tt_version(6, 5, 0); // Virtual coordinates can be used for broadcast headers if ERISC FW >= 6.8.0 and NOC translation is enabled // Temporarily enable this feature for 6.7.241 as well for testing. 
- use_virtual_coords_for_eth_broadcast &= (fw_first_eth_core >= tt_version(6, 8, 0) || fw_first_eth_core == tt_version(6, 7, 241)) && translation_tables_en; + use_virtual_coords_for_eth_broadcast &= + (fw_first_eth_core >= tt_version(6, 8, 0) || fw_first_eth_core == tt_version(6, 7, 241)) && + translation_tables_en; } -void Cluster::start_device(const tt_device_params &device_params) { - if(device_params.init_device) { +void Cluster::start_device(const tt_device_params& device_params) { + if (device_params.init_device) { initialize_pcie_devices(); // MT Initial BH - Ethernet firmware not present in Blackhole - if(arch_name == tt::ARCH::WORMHOLE_B0) { + if (arch_name == tt::ARCH::WORMHOLE_B0) { verify_eth_fw(); } deassert_resets_and_set_power_state(); @@ -2688,7 +3323,6 @@ void Cluster::close_device() { broadcast_tensix_risc_reset_to_cluster(TENSIX_ASSERT_SOFT_RESET); } - void Cluster::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) { l1_address_params = l1_address_params_; } @@ -2705,24 +3339,29 @@ void Cluster::set_driver_eth_interface_params(const tt_driver_eth_interface_para eth_interface_params = eth_interface_params_; } -void Cluster::setup_core_to_tlb_map(const chip_id_t logical_device_id, std::function mapping_function) { +void Cluster::setup_core_to_tlb_map( + const chip_id_t logical_device_id, std::function mapping_function) { map_core_to_tlb_per_chip[logical_device_id] = mapping_function; tlbs_init_per_chip[logical_device_id] = true; } std::uint32_t Cluster::get_num_dram_channels(std::uint32_t device_id) { - log_assert(target_devices_in_cluster.find(device_id) != target_devices_in_cluster.end(), "Querying DRAM parameters for a device that does not exist."); + log_assert( + target_devices_in_cluster.find(device_id) != target_devices_in_cluster.end(), + "Querying DRAM parameters for a device that does not exist."); return get_soc_descriptor(device_id).get_num_dram_channels(); } std::uint64_t 
Cluster::get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) { log_assert(channel < get_num_dram_channels(device_id), "Querying size for a device channel that does not exist."); - return get_soc_descriptor(device_id).dram_bank_size; // Space per channel is identical for now + return get_soc_descriptor(device_id).dram_bank_size; // Space per channel is identical for now } std::uint32_t Cluster::get_num_host_channels(std::uint32_t device_id) { auto devices = get_target_mmio_device_ids(); - log_assert(devices.find(device_id) != devices.end(), "Querying Host Address parameters for a non-mmio device or a device does not exist."); + log_assert( + devices.find(device_id) != devices.end(), + "Querying Host Address parameters for a non-mmio device or a device does not exist."); return m_pci_device_map.at(device_id)->get_num_host_mem_channels(); } @@ -2740,22 +3379,22 @@ std::uint32_t Cluster::get_numa_node_for_pcie_device(std::uint32_t device_id) { std::uint64_t Cluster::get_pcie_base_addr_from_device(const chip_id_t chip_id) const { // TODO: Should probably be lowered to TTDevice. tt::ARCH arch = get_soc_descriptor(chip_id).arch; - if(arch == tt::ARCH::WORMHOLE_B0) { + if (arch == tt::ARCH::WORMHOLE_B0) { return 0x800000000; - } - else if (arch == tt::ARCH::BLACKHOLE) { + } else if (arch == tt::ARCH::BLACKHOLE) { // Enable 4th ATU window. 
return 1ULL << 60; - } - else { + } else { return 0; } } tt_version Cluster::get_ethernet_fw_version() const { log_assert(arch_name == tt::ARCH::WORMHOLE_B0, "Can only get Ethernet FW version for Wormhole architectures."); - log_assert(eth_fw_version.major != 0xffff and eth_fw_version.minor != 0xff and eth_fw_version.patch != 0xff, "Device must be started before querying Ethernet FW version."); + log_assert( + eth_fw_version.major != 0xffff and eth_fw_version.minor != 0xff and eth_fw_version.patch != 0xff, + "Device must be started before querying Ethernet FW version."); return eth_fw_version; } -} +} // namespace tt::umd diff --git a/device/cluster.h b/device/cluster.h index b5caaa85f..8fde9f782 100644 --- a/device/cluster.h +++ b/device/cluster.h @@ -8,20 +8,19 @@ #include #include #include +#include #include #include #include -#include -#include "tt_soc_descriptor.h" -#include "tt_xy_pair.h" -#include "tt_silicon_driver_common.hpp" -#include "device/tt_cluster_descriptor_types.h" #include "device/tlb.h" +#include "device/tt_cluster_descriptor_types.h" #include "device/tt_io.hpp" - -#include "pcie/pci_device.hpp" #include "fmt/core.h" +#include "pcie/pci_device.hpp" +#include "tt_silicon_driver_common.hpp" +#include "tt_soc_descriptor.h" +#include "tt_xy_pair.h" using TLB_DATA = tt::umd::tlb_data; @@ -30,29 +29,32 @@ using TLB_DATA = tt::umd::tlb_data; tt::ARCH detect_arch(int pci_device_num); tt::ARCH detect_arch(); -namespace boost::interprocess{ - class named_mutex; +namespace boost::interprocess { +class named_mutex; } class tt_ClusterDescriptor; -enum tt_DevicePowerState { - BUSY, - SHORT_IDLE, - LONG_IDLE -}; +enum tt_DevicePowerState { BUSY, SHORT_IDLE, LONG_IDLE }; enum tt_MemBarFlag { SET = 0xaa, RESET = 0xbb, }; -inline std::ostream &operator <<(std::ostream &os, const tt_DevicePowerState power_state) { +inline std::ostream& operator<<(std::ostream& os, const tt_DevicePowerState power_state) { switch (power_state) { - case tt_DevicePowerState::BUSY: os 
<< "Busy"; break; - case tt_DevicePowerState::SHORT_IDLE: os << "SHORT_IDLE"; break; - case tt_DevicePowerState::LONG_IDLE: os << "LONG_IDLE"; break; - default: throw ("Unknown DevicePowerState"); + case tt_DevicePowerState::BUSY: + os << "Busy"; + break; + case tt_DevicePowerState::SHORT_IDLE: + os << "SHORT_IDLE"; + break; + case tt_DevicePowerState::LONG_IDLE: + os << "LONG_IDLE"; + break; + default: + throw("Unknown DevicePowerState"); } return os; } @@ -116,20 +118,22 @@ struct tt_version { std::uint16_t major = 0xffff; std::uint8_t minor = 0xff; std::uint8_t patch = 0xff; + tt_version() {} + tt_version(std::uint16_t major_, std::uint8_t minor_, std::uint8_t patch_) { major = major_; minor = minor_; patch = patch_; } + tt_version(std::uint32_t version) { major = (version >> 16) & 0xff; minor = (version >> 12) & 0xf; patch = version & 0xfff; } - std::string str() const { - return fmt::format("{}.{}.{}", major, minor, patch); - } + + std::string str() const { return fmt::format("{}.{}.{}", major, minor, patch); } }; struct tt_device_params { @@ -140,29 +144,32 @@ struct tt_device_params { bool init_device = true; bool early_open_device = false; int aiclk = 0; + // The command-line input for vcd_dump_cores can have the following format: // {"*-2", "1-*", "*-*", "1-2"} // '*' indicates we must dump all the cores in that dimension. // This function takes the vector above and unrolles the coords with '*' in one or both dimensions. std::vector unroll_vcd_dump_cores(tt_xy_pair grid_size) const { std::vector unrolled_dump_core; - for (auto &dump_core: vcd_dump_cores) { + for (auto& dump_core : vcd_dump_cores) { // If the input is a single *, then dump all cores. 
if (dump_core == "*") { for (size_t x = 0; x < grid_size.x; x++) { - for (size_t y = 0; y < grid_size.y; y++) { - std::string current_core_coord = fmt::format("{}-{}", x, y); - if (std::find(std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) == std::end(unrolled_dump_core)) { - unrolled_dump_core.push_back(current_core_coord); + for (size_t y = 0; y < grid_size.y; y++) { + std::string current_core_coord = fmt::format("{}-{}", x, y); + if (std::find( + std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) == + std::end(unrolled_dump_core)) { + unrolled_dump_core.push_back(current_core_coord); + } } } - } continue; } // Each core coordinate must contain three characters: "core.x-core.y". assert(dump_core.size() <= 5); size_t delimiter_pos = dump_core.find('-'); - assert (delimiter_pos != std::string::npos); // y-dim should exist in core coord. + assert(delimiter_pos != std::string::npos); // y-dim should exist in core coord. std::string core_dim_x = dump_core.substr(0, delimiter_pos); size_t core_dim_y_start = delimiter_pos + 1; @@ -172,7 +179,9 @@ struct tt_device_params { for (size_t x = 0; x < grid_size.x; x++) { for (size_t y = 0; y < grid_size.y; y++) { std::string current_core_coord = fmt::format("{}-{}", x, y); - if (std::find(std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) == std::end(unrolled_dump_core)) { + if (std::find( + std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) == + std::end(unrolled_dump_core)) { unrolled_dump_core.push_back(current_core_coord); } } @@ -180,14 +189,16 @@ struct tt_device_params { } else if (core_dim_x == "*") { for (size_t x = 0; x < grid_size.x; x++) { std::string current_core_coord = fmt::format("{}-{}", x, core_dim_y); - if (std::find(std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) == std::end(unrolled_dump_core)) { + if (std::find(std::begin(unrolled_dump_core), 
std::end(unrolled_dump_core), current_core_coord) == + std::end(unrolled_dump_core)) { unrolled_dump_core.push_back(current_core_coord); } } } else if (core_dim_y == "*") { for (size_t y = 0; y < grid_size.y; y++) { std::string current_core_coord = fmt::format("{}-{}", core_dim_x, y); - if (std::find(std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) == std::end(unrolled_dump_core)) { + if (std::find(std::begin(unrolled_dump_core), std::end(unrolled_dump_core), current_core_coord) == + std::end(unrolled_dump_core)) { unrolled_dump_core.push_back(current_core_coord); } } @@ -199,10 +210,9 @@ struct tt_device_params { } std::vector expand_plusargs() const { - std::vector all_plusargs { + std::vector all_plusargs{ fmt::format("+enable_perf_scoreboard={}", enable_perf_scoreboard), - fmt::format("+register_monitor={}", register_monitor) - }; + fmt::format("+register_monitor={}", register_monitor)}; all_plusargs.insert(all_plusargs.end(), plusargs.begin(), plusargs.end()); @@ -216,18 +226,18 @@ struct tt_device_params { * Exposes a generic interface to callers, providing declarations for virtual functions defined differently for Silicon. * Valid usage consists of declaring a tt_device object and initializing it to Silicon backend. * Using tt_device itself will throw errors, since its APIs are undefined. - */ -class tt_device -{ - public: + */ +class tt_device { +public: tt_device(); virtual ~tt_device(); + // Setup/Teardown Functions /** * Set L1 Address Map parameters used by UMD to communicate with the TT Device. * * @param l1_address_params_ All the L1 parameters required by UMD - */ + */ virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) { throw std::runtime_error("---- tt_device::set_device_l1_address_params is not implemented\n"); } @@ -240,9 +250,9 @@ class tt_device * Set Host Address Map parameters used by UMD to communicate with the TT Device (used for remote transactions). 
* * @param host_address_params_ All the Host Address space parameters required by UMD. - */ - [[deprecated("Using unnecessary function.")]] - virtual void set_driver_host_address_params(const tt_driver_host_address_params& host_address_params_) { + */ + [[deprecated("Using unnecessary function.")]] virtual void set_driver_host_address_params( + const tt_driver_host_address_params& host_address_params_) { throw std::runtime_error("---- tt_device::set_driver_host_address_params is not implemented\n"); } @@ -250,9 +260,9 @@ class tt_device * Set ERISC Firmware parameters used by UMD to communicate with the TT Device (used for remote transactions). * * @param eth_interface_params_ All the Ethernet Firmware parameters required by UMD. - */ - [[deprecated("Using unnecessary function.")]] - virtual void set_driver_eth_interface_params(const tt_driver_eth_interface_params& eth_interface_params_) { + */ + [[deprecated("Using unnecessary function.")]] virtual void set_driver_eth_interface_params( + const tt_driver_eth_interface_params& eth_interface_params_) { throw std::runtime_error("---- tt_device::set_driver_eth_interface_params is not implemented\n"); } @@ -264,8 +274,13 @@ class tt_device * @param tlb_index TLB id that will be programmed. * @param address Start address TLB is mapped to. * @param ordering Ordering mode for the TLB. - */ - virtual void configure_tlb(chip_id_t logical_device_id, tt_xy_pair core, std::int32_t tlb_index, std::int32_t address, uint64_t ordering = TLB_DATA::Relaxed) { + */ + virtual void configure_tlb( + chip_id_t logical_device_id, + tt_xy_pair core, + std::int32_t tlb_index, + std::int32_t address, + uint64_t ordering = TLB_DATA::Relaxed) { throw std::runtime_error("---- tt_device::configure_tlb is not implemented\n"); } @@ -274,45 +289,51 @@ class tt_device * * @param fallback_tlb Dynamic TLB being targeted. * @param ordering Ordering mode for the TLB. 
- */ + */ virtual void set_fallback_tlb_ordering_mode(const std::string& fallback_tlb, uint64_t ordering = TLB_DATA::Posted) { throw std::runtime_error("---- tt_device::set_fallback_tlb_ordering_mode is not implemented\n"); } - + /** - * Give UMD a 1:1 function mapping a core to its appropriate static TLB (currently only support a single TLB per core). + * Give UMD a 1:1 function mapping a core to its appropriate static TLB (currently only support a single TLB per + * core). * * @param logical_device_id MMIO chip being targeted. * @param mapping_function Function which maps core to TLB index. */ - virtual void setup_core_to_tlb_map(const chip_id_t logical_device_id, std::function mapping_function) { + virtual void setup_core_to_tlb_map( + const chip_id_t logical_device_id, std::function mapping_function) { throw std::runtime_error("---- tt_device::setup_core_to_tlb_map is not implemented\n"); } /** - * Pass in ethernet cores with active links for a specific MMIO chip. When called, this function will force UMD to use a subset of cores from the active_eth_cores_per_chip set for all host->cluster - * non-MMIO transfers. If this function is not called, UMD will use a default set of ethernet core indices for these transfers (0 through 5). - * If default behaviour is not desired, this function must be called for all MMIO devices. + * Pass in ethernet cores with active links for a specific MMIO chip. When called, this function will force UMD to + * use a subset of cores from the active_eth_cores_per_chip set for all host->cluster non-MMIO transfers. If this + * function is not called, UMD will use a default set of ethernet core indices for these transfers (0 through 5). If + * default behaviour is not desired, this function must be called for all MMIO devices. * * @param mmio_chip Device being targeted. * @param active_eth_cores_per_chip The active ethernet cores for this chip. 
*/ - virtual void configure_active_ethernet_cores_for_mmio_device(chip_id_t mmio_chip, const std::unordered_set& active_eth_cores_per_chip) { - throw std::runtime_error("---- tt_device::configure_active_ethernet_cores_for_mmio_device is not implemented\n"); + virtual void configure_active_ethernet_cores_for_mmio_device( + chip_id_t mmio_chip, const std::unordered_set& active_eth_cores_per_chip) { + throw std::runtime_error( + "---- tt_device::configure_active_ethernet_cores_for_mmio_device is not implemented\n"); } /** - * On Silicon: Assert soft Tensix reset, deassert RiscV reset, set power state to busy (ramp up AICLK), initialize iATUs for PCIe devices and ethernet queues for remote chips. + * On Silicon: Assert soft Tensix reset, deassert RiscV reset, set power state to busy (ramp up AICLK), initialize + * iATUs for PCIe devices and ethernet queues for remote chips. * * @param device_params Object specifying initialization configuration. */ - virtual void start_device(const tt_device_params &device_params) { + virtual void start_device(const tt_device_params& device_params) { throw std::runtime_error("---- tt_device::start_device is not implemented\n"); } /** * Broadcast deassert soft Tensix Reset to the entire device (to be done after start_device is called). - */ + */ virtual void deassert_risc_reset() { throw std::runtime_error("---- tt_device::deassert_risc_reset is not implemented\n"); } @@ -321,14 +342,15 @@ class tt_device * Send a soft deassert reset signal to a single tensix core. * * @param core Chip and core being targeted. 
- */ - virtual void deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets = TENSIX_DEASSERT_SOFT_RESET) { + */ + virtual void deassert_risc_reset_at_core( + tt_cxy_pair core, const TensixSoftResetOptions& soft_resets = TENSIX_DEASSERT_SOFT_RESET) { throw std::runtime_error("---- tt_device::deassert_risc_reset_at_core is not implemented\n"); } /** * Broadcast assert soft Tensix Reset to the entire device. - */ + */ virtual void assert_risc_reset() { throw std::runtime_error("---- tt_device::assert_risc_reset is not implemented\n"); } @@ -337,7 +359,7 @@ class tt_device * Send a soft assert reset signal to a single tensix core. * * @param core Chip and core being targeted. - */ + */ virtual void assert_risc_reset_at_core(tt_cxy_pair core) { throw std::runtime_error("---- tt_device::assert_risc_reset_at_core is not implemented\n"); } @@ -345,17 +367,15 @@ class tt_device /** * To be called at the end of a run. * Set power state to idle, assert tensix reset at all cores. - */ - virtual void close_device() { - throw std::runtime_error("---- tt_device::close_device is not implemented\n"); - } + */ + virtual void close_device() { throw std::runtime_error("---- tt_device::close_device is not implemented\n"); } // Runtime functions /** * Non-MMIO (ethernet) barrier. - * Similar to an mfence for host -> host transfers. Will flush all in-flight ethernet transactions before proceeding with the next one. - * This will be applied to all chips in the cluster. - */ + * Similar to an mfence for host -> host transfers. Will flush all in-flight ethernet transactions before proceeding + * with the next one. This will be applied to all chips in the cluster. + */ virtual void wait_for_non_mmio_flush() { throw std::runtime_error("---- tt_device::wait_for_non_mmio_flush is not implemented\n"); } @@ -377,12 +397,20 @@ class tt_device * @param addr Address to write to. * @param tlb_to_use Specifies fallback/dynamic TLB to use. 
*/ - virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { + virtual void write_to_device( + const void* mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { // Only implement this for Silicon Backend throw std::runtime_error("---- tt_device::write_to_device is not implemented\n"); } - virtual void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& columns_to_exclude, const std::string& fallback_tlb) { + virtual void broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& rows_to_exclude, + std::set& columns_to_exclude, + const std::string& fallback_tlb) { throw std::runtime_error("---- tt_device::broadcast_write_to_cluster is not implemented\n"); } @@ -395,44 +423,54 @@ class tt_device * @param size Number of bytes to read. * @param fallback_tlb Specifies fallback/dynamic TLB to use. */ - virtual void read_from_device(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { + virtual void read_from_device( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { // Only implement this for Silicon Backend throw std::runtime_error("---- tt_device::read_from_device is not implemented\n"); } /** * Write uint32_t vector to specified address and channel on host (defined for Silicon). - * + * * @param vec Data to write. * @param addr Address to write to. * @param channel Host channel to target. * @param src_device_id Chip to target. 
*/ - virtual void write_to_sysmem(const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id) { + virtual void write_to_sysmem( + const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id) { throw std::runtime_error("---- tt_device::write_to_sysmem is not implemented\n"); } - virtual void read_from_sysmem(void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id) { + + virtual void read_from_sysmem( + void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id) { throw std::runtime_error("---- tt_device::read_from_sysmem is not implemented\n"); } - virtual void l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}) { + + virtual void l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}) { throw std::runtime_error("---- tt_device::l1_membar is not implemented\n"); } - virtual void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels = {}) { + + virtual void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels = {}) { throw std::runtime_error("---- tt_device::dram_membar is not implemented\n"); } - virtual void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}) { + + virtual void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}) { throw std::runtime_error("---- tt_device::dram_membar is not implemented\n"); } // Misc. Functions to Query/Set Device State /** - * Query post harvesting SOC descriptors from UMD in virtual coordinates. + * Query post harvesting SOC descriptors from UMD in virtual coordinates. * These descriptors should be used for looking up cores that are passed into UMD APIs. 
*/ virtual std::unordered_map& get_virtual_soc_descriptors() { throw std::runtime_error("---- tt_device:get_virtual_soc_descriptors is not implemented\n"); } - + /** * Determine if UMD performed harvesting on SOC descriptors. */ @@ -440,18 +478,18 @@ class tt_device throw std::runtime_error("---- tt_device:using_harvested_soc_descriptors is not implemented\n"); return 0; } - + /** * Get harvesting masks for all chips/SOC Descriptors in the cluster. * Each mask represents a map of enabled (0) and disabled (1) rows on a specific chip (in NOC0 Coordinateds). - */ + */ virtual std::unordered_map get_harvesting_masks_for_soc_descriptors() { throw std::runtime_error("---- tt_device:get_harvesting_masks_for_soc_descriptors is not implemented\n"); } /** * Issue message to device, meant to be picked up by ARC firmware. - * + * * @param logical_device_id Chip to target. * @param msg_code Specifies type of ARC message. * @param wait_for_done Block until ARC responds. @@ -460,8 +498,16 @@ class tt_device * @param timeout Timeout on ARC. * @param return3 Return value from ARC. * @param return4 Return value from ARC. - */ - virtual int arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done = true, uint32_t arg0 = 0, uint32_t arg1 = 0, int timeout=1, uint32_t *return_3 = nullptr, uint32_t *return_4 = nullptr) { + */ + virtual int arc_msg( + int logical_device_id, + uint32_t msg_code, + bool wait_for_done = true, + uint32_t arg0 = 0, + uint32_t arg1 = 0, + int timeout = 1, + uint32_t* return_3 = nullptr, + uint32_t* return_4 = nullptr) { throw std::runtime_error("---- tt_device::arc_msg is not implemented\n"); } @@ -471,28 +517,28 @@ class tt_device * @param device_id Chip to target. * @param r Row coordinate. * @param c Column coordinate. 
- */ - virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c) { + */ + virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c) { throw std::runtime_error("---- tt_device::translate_to_noc_table_coords is not implemented\n"); } /** * Get the total number of chips in the cluster based on the network descriptor. - */ + */ virtual int get_number_of_chips_in_cluster() { throw std::runtime_error("---- tt_device::get_number_of_chips_in_cluster is not implemented\n"); } /** * Get the logical ids for all chips in the cluster - */ + */ virtual std::unordered_set get_all_chips_in_cluster() { throw std::runtime_error("---- tt_device::get_all_chips_in_cluster is not implemented\n"); } /** * Get cluster descriptor object being used in UMD instance. - */ + */ virtual tt_ClusterDescriptor* get_cluster_description() { throw std::runtime_error("---- tt_device::get_cluster_description is not implemented\n"); } @@ -514,9 +560,9 @@ class tt_device /** * Get clock frequencies for all MMIO devices targeted by UMD. */ - virtual std::map get_clocks() { + virtual std::map get_clocks() { throw std::runtime_error("---- tt_device::get_clocks is not implemented\n"); - return std::map(); + return std::map(); } virtual std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id) { @@ -534,7 +580,7 @@ class tt_device * Query number of DRAM channels on a specific device. * * @param device_id Logical device id to query. - */ + */ virtual std::uint32_t get_num_dram_channels(std::uint32_t device_id) { throw std::runtime_error("---- tt_device::get_num_dram_channels is not implemented\n"); return 0; @@ -542,10 +588,10 @@ class tt_device /** * Get size for a specific DRAM channel on a device. - * + * * @param device_id Device to target. * @param channel DRAM channel to target. 
- */ + */ virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) { throw std::runtime_error("---- tt_device::get_dram_channel_size is not implemented\n"); return 0; @@ -555,7 +601,7 @@ class tt_device * Query number of Host channels (hugepages) allocated for a specific device. * * @param device_id Logical device id to target. - */ + */ virtual std::uint32_t get_num_host_channels(std::uint32_t device_id) { throw std::runtime_error("---- tt_device::get_num_host_channels is not implemented\n"); return 0; @@ -566,20 +612,21 @@ class tt_device * * @param device_id Logical device id to target. * @param channel Logical host channel to target. - */ + */ virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) { throw std::runtime_error("---- tt_device::get_host_channel_size is not implemented\n"); return 0; } /** - * Get absolute address corresponding to a zero based offset into a specific host memory channel for a specific device. - * + * Get absolute address corresponding to a zero based offset into a specific host memory channel for a specific + * device. + * * @param offset Offset wrt the start of the channel's address space. - * @param src_device_id Device to target. + * @param src_device_id Device to target. * @param channel Host memory channel. 
*/ - virtual void *host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const { + virtual void* host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const { throw std::runtime_error("---- tt_device::host_dma_address is not implemented\n"); return nullptr; } @@ -588,24 +635,24 @@ class tt_device throw std::runtime_error("---- tt_device::get_pcie_base_addr_from_device is not implemented\n"); return 0; } + const tt_SocDescriptor& get_soc_descriptor(chip_id_t chip_id) const; bool performed_harvesting = false; std::unordered_map harvested_rows_per_target = {}; bool translation_tables_en = false; - protected: +protected: std::unordered_map soc_descriptor_per_chip = {}; }; namespace tt::umd { /** -* Silicon Driver Class, derived from the tt_device class + * Silicon Driver Class, derived from the tt_device class * Implements APIs to communicate with a physical Tenstorrent Device. -*/ -class Cluster: public tt_device -{ + */ +class Cluster : public tt_device { public: // Constructor /** @@ -619,11 +666,17 @@ class Cluster: public tt_device * @param clean_system_resource Specifies if host state from previous runs needs to be cleaned up. * @param perform_harvesting Allow the driver to modify the SOC descriptors per chip. 
* @param simulated_harvesting_masks - */ - Cluster(const std::string &sdesc_path, const std::string &ndesc_path, const std::set &target_devices, - const uint32_t &num_host_mem_ch_per_mmio_device = 1, const bool skip_driver_allocs = false, - const bool clean_system_resources = false, bool perform_harvesting = true, std::unordered_map simulated_harvesting_masks = {}); - + */ + Cluster( + const std::string& sdesc_path, + const std::string& ndesc_path, + const std::set& target_devices, + const uint32_t& num_host_mem_ch_per_mmio_device = 1, + const bool skip_driver_allocs = false, + const bool clean_system_resources = false, + bool perform_harvesting = true, + std::unordered_map simulated_harvesting_masks = {}); + /** * Cluster constructor. This constructor should be used to work towards removing all * of the params from the constructor of tt_SiliconDevice (to become Cluster). @@ -633,9 +686,13 @@ class Cluster: public tt_device * @param clean_system_resource Specifies if host state from previous runs needs to be cleaned up. * @param perform_harvesting Allow the driver to modify the SOC descriptors per chip. * @param simulated_harvesting_masks - */ - Cluster(const uint32_t &num_host_mem_ch_per_mmio_device = 1, const bool skip_driver_allocs = false, - const bool clean_system_resources = false, bool perform_harvesting = true, std::unordered_map simulated_harvesting_masks = {}); + */ + Cluster( + const uint32_t& num_host_mem_ch_per_mmio_device = 1, + const bool skip_driver_allocs = false, + const bool clean_system_resources = false, + bool perform_harvesting = true, + std::unordered_map simulated_harvesting_masks = {}); /** * Cluster constructor. This constructor should be used to target specific devices in a cluster. @@ -646,42 +703,69 @@ class Cluster: public tt_device * @param clean_system_resource Specifies if host state from previous runs needs to be cleaned up. * @param perform_harvesting Allow the driver to modify the SOC descriptors per chip. 
* @param simulated_harvesting_masks - */ - Cluster(const std::set &target_devices, const uint32_t &num_host_mem_ch_per_mmio_device = 1, const bool skip_driver_allocs = false, - const bool clean_system_resources = false, bool perform_harvesting = true, std::unordered_map simulated_harvesting_masks = {}); + */ + Cluster( + const std::set& target_devices, + const uint32_t& num_host_mem_ch_per_mmio_device = 1, + const bool skip_driver_allocs = false, + const bool clean_system_resources = false, + bool perform_harvesting = true, + std::unordered_map simulated_harvesting_masks = {}); - //Setup/Teardown Functions + // Setup/Teardown Functions virtual std::unordered_map& get_virtual_soc_descriptors(); virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_); virtual void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_); virtual void set_driver_host_address_params(const tt_driver_host_address_params& host_address_params_); virtual void set_driver_eth_interface_params(const tt_driver_eth_interface_params& eth_interface_params_); - virtual void configure_tlb(chip_id_t logical_device_id, tt_xy_pair core, std::int32_t tlb_index, std::int32_t address, uint64_t ordering = TLB_DATA::Posted); + virtual void configure_tlb( + chip_id_t logical_device_id, + tt_xy_pair core, + std::int32_t tlb_index, + std::int32_t address, + uint64_t ordering = TLB_DATA::Posted); virtual void set_fallback_tlb_ordering_mode(const std::string& fallback_tlb, uint64_t ordering = TLB_DATA::Posted); - virtual void setup_core_to_tlb_map(const chip_id_t logical_device_id, std::function mapping_function); - virtual void configure_active_ethernet_cores_for_mmio_device(chip_id_t mmio_chip, const std::unordered_set& active_eth_cores_per_chip); - virtual void start_device(const tt_device_params &device_params); + virtual void setup_core_to_tlb_map( + const chip_id_t logical_device_id, std::function mapping_function); + virtual void 
configure_active_ethernet_cores_for_mmio_device( + chip_id_t mmio_chip, const std::unordered_set& active_eth_cores_per_chip); + virtual void start_device(const tt_device_params& device_params); virtual void assert_risc_reset(); virtual void deassert_risc_reset(); - virtual void deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets = TENSIX_DEASSERT_SOFT_RESET); + virtual void deassert_risc_reset_at_core( + tt_cxy_pair core, const TensixSoftResetOptions& soft_resets = TENSIX_DEASSERT_SOFT_RESET); virtual void assert_risc_reset_at_core(tt_cxy_pair core); virtual void close_device(); // Runtime Functions - virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); - void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& columns_to_exclude, const std::string& fallback_tlb); - - virtual void read_from_device(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); - virtual void write_to_sysmem(const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id); - virtual void read_from_sysmem(void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id); + virtual void write_to_device( + const void* mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); + void broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& rows_to_exclude, + std::set& columns_to_exclude, + const std::string& fallback_tlb); + + virtual void read_from_device( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); + virtual void write_to_sysmem( + const void* mem_ptr, std::uint32_t size, uint64_t addr, 
uint16_t channel, chip_id_t src_device_id); + virtual void read_from_sysmem( + void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id); virtual void wait_for_non_mmio_flush(); virtual void wait_for_non_mmio_flush(const chip_id_t chip_id); - void l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); + void l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); + void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); + void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); // These functions are used by Debuda, so make them public - void bar_write32 (int logical_device_id, uint32_t addr, uint32_t data); - uint32_t bar_read32 (int logical_device_id, uint32_t addr); + void bar_write32(int logical_device_id, uint32_t addr, uint32_t data); + uint32_t bar_read32(int logical_device_id, uint32_t addr); /** * If the tlbs are initialized, returns a tuple with the TLB base address and its size @@ -699,16 +783,24 @@ class Cluster: public tt_device * - the mapping is unchanged during the lifetime of the returned object. * - the Cluster instance outlives the returned object. * - use of the returned object is congruent with the target's TLB setup. - * + * * @param target The target chip and core to write to. */ tt::Writer get_static_tlb_writer(tt_cxy_pair target); // Misc. 
Functions to Query/Set Device State - virtual int arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done = true, uint32_t arg0 = 0, uint32_t arg1 = 0, int timeout=1, uint32_t *return_3 = nullptr, uint32_t *return_4 = nullptr); + virtual int arc_msg( + int logical_device_id, + uint32_t msg_code, + bool wait_for_done = true, + uint32_t arg0 = 0, + uint32_t arg1 = 0, + int timeout = 1, + uint32_t* return_3 = nullptr, + uint32_t* return_4 = nullptr); virtual bool using_harvested_soc_descriptors(); virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); - virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c); + virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c); virtual int get_number_of_chips_in_cluster(); virtual std::unordered_set get_all_chips_in_cluster(); virtual tt_ClusterDescriptor* get_cluster_description(); @@ -716,13 +808,16 @@ class Cluster: public tt_device static std::vector detect_available_device_ids(); virtual std::set get_target_mmio_device_ids(); virtual std::set get_target_remote_device_ids(); - virtual std::map get_clocks(); - virtual void *host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const; + virtual std::map get_clocks(); + virtual void* host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const; virtual std::uint64_t get_pcie_base_addr_from_device(const chip_id_t chip_id) const; - static std::vector extract_rows_to_remove(const tt::ARCH &arch, const int worker_grid_rows, const int harvested_rows); - static void remove_worker_row_from_descriptor(tt_SocDescriptor& full_soc_descriptor, const std::vector& row_coordinates_to_remove); + static std::vector extract_rows_to_remove( + const tt::ARCH& arch, const int worker_grid_rows, const int harvested_rows); + static void remove_worker_row_from_descriptor( + tt_SocDescriptor& full_soc_descriptor, const std::vector& 
row_coordinates_to_remove); static void harvest_rows_in_soc_descriptor(tt::ARCH arch, tt_SocDescriptor& sdesc, uint32_t harvested_rows); - static std::unordered_map create_harvested_coord_translation(const tt::ARCH arch, bool identity_map); + static std::unordered_map create_harvested_coord_translation( + const tt::ARCH arch, bool identity_map); std::unordered_map get_harvested_coord_translation_map(chip_id_t logical_device_id); virtual std::uint32_t get_num_dram_channels(std::uint32_t device_id); virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel); @@ -731,74 +826,153 @@ class Cluster: public tt_device virtual std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id); virtual tt_version get_ethernet_fw_version() const; // TODO: This should be accessible through public API, probably to be moved to tt_device. - PCIDevice *get_pci_device(int device_id) const; + PCIDevice* get_pci_device(int device_id) const; // Destructor - virtual ~Cluster (); + virtual ~Cluster(); private: // Helper functions // Startup + teardown - void create_device(const std::unordered_set &target_mmio_device_ids, const uint32_t &num_host_mem_ch_per_mmio_device, const bool skip_driver_allocs, const bool clean_system_resources); + void create_device( + const std::unordered_set& target_mmio_device_ids, + const uint32_t& num_host_mem_ch_per_mmio_device, + const bool skip_driver_allocs, + const bool clean_system_resources); void initialize_interprocess_mutexes(int pci_interface_id, bool cleanup_mutexes_in_shm); void cleanup_shared_host_state(); void initialize_pcie_devices(); - void broadcast_pcie_tensix_risc_reset(chip_id_t chip_id, const TensixSoftResetOptions &cores); - void broadcast_tensix_risc_reset_to_cluster(const TensixSoftResetOptions &soft_resets); - void send_remote_tensix_risc_reset_to_core(const tt_cxy_pair &core, const TensixSoftResetOptions &soft_resets); - void send_tensix_risc_reset_to_core(const tt_cxy_pair &core, const 
TensixSoftResetOptions &soft_resets); + void broadcast_pcie_tensix_risc_reset(chip_id_t chip_id, const TensixSoftResetOptions& cores); + void broadcast_tensix_risc_reset_to_cluster(const TensixSoftResetOptions& soft_resets); + void send_remote_tensix_risc_reset_to_core(const tt_cxy_pair& core, const TensixSoftResetOptions& soft_resets); + void send_tensix_risc_reset_to_core(const tt_cxy_pair& core, const TensixSoftResetOptions& soft_resets); void perform_harvesting_and_populate_soc_descriptors(const std::string& sdesc_path, const bool perform_harvesting); void populate_cores(); - void init_pcie_iatus(); // No more p2p support. + void init_pcie_iatus(); // No more p2p support. void check_pcie_device_initialized(int device_id); void set_pcie_power_state(tt_DevicePowerState state); - int set_remote_power_state(const chip_id_t &chip, tt_DevicePowerState device_state); + int set_remote_power_state(const chip_id_t& chip, tt_DevicePowerState device_state); void set_power_state(tt_DevicePowerState state); uint32_t get_power_state_arc_msg(chip_id_t chip_id, tt_DevicePowerState state); void enable_local_ethernet_queue(const chip_id_t& chip, int timeout); void enable_ethernet_queue(int timeout); void enable_remote_ethernet_queue(const chip_id_t& chip, int timeout); void deassert_resets_and_set_power_state(); - int iatu_configure_peer_region (int logical_device_id, uint32_t peer_region_id, uint64_t bar_addr_64, uint32_t region_size); - uint32_t get_harvested_noc_rows (uint32_t harvesting_mask); - uint32_t get_harvested_rows (int logical_device_id); + int iatu_configure_peer_region( + int logical_device_id, uint32_t peer_region_id, uint64_t bar_addr_64, uint32_t region_size); + uint32_t get_harvested_noc_rows(uint32_t harvesting_mask); + uint32_t get_harvested_rows(int logical_device_id); int get_clock(int logical_device_id); // Communication Functions - void read_buffer(void* mem_ptr, std::uint32_t address, std::uint16_t channel, std::uint32_t size_in_bytes, chip_id_t 
src_device_id); - void write_buffer(const void *mem_ptr, std::uint32_t size, std::uint32_t address, std::uint16_t channel, chip_id_t src_device_id); - void write_device_memory(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair target, std::uint32_t address, const std::string& fallback_tlb); - void write_to_non_mmio_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t address, bool broadcast = false, std::vector broadcast_header = {}); - void read_device_memory(void *mem_ptr, tt_cxy_pair target, std::uint32_t address, std::uint32_t size_in_bytes, const std::string& fallback_tlb); + void read_buffer( + void* mem_ptr, + std::uint32_t address, + std::uint16_t channel, + std::uint32_t size_in_bytes, + chip_id_t src_device_id); + void write_buffer( + const void* mem_ptr, std::uint32_t size, std::uint32_t address, std::uint16_t channel, chip_id_t src_device_id); + void write_device_memory( + const void* mem_ptr, + uint32_t size_in_bytes, + tt_cxy_pair target, + std::uint32_t address, + const std::string& fallback_tlb); + void write_to_non_mmio_device( + const void* mem_ptr, + uint32_t size_in_bytes, + tt_cxy_pair core, + uint64_t address, + bool broadcast = false, + std::vector broadcast_header = {}); + void read_device_memory( + void* mem_ptr, + tt_cxy_pair target, + std::uint32_t address, + std::uint32_t size_in_bytes, + const std::string& fallback_tlb); void read_from_non_mmio_device(void* mem_ptr, tt_cxy_pair core, uint64_t address, uint32_t size_in_bytes); - void read_mmio_device_register(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); - void write_mmio_device_register(const void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); - void pcie_broadcast_write(chip_id_t chip, const void* mem_ptr, uint32_t size_in_bytes, std::uint32_t addr, const tt_xy_pair& start, const tt_xy_pair& end, const std::string& fallback_tlb); - void 
ethernet_broadcast_write(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, const std::set& rows_to_exclude, - std::set& cols_to_exclude, const std::string& fallback_tlb, bool use_virtual_coords); - void set_membar_flag(const chip_id_t chip, const std::unordered_set& cores, const uint32_t barrier_value, const uint32_t barrier_addr, const std::string& fallback_tlb); - void insert_host_to_device_barrier(const chip_id_t chip, const std::unordered_set& cores, const uint32_t barrier_addr, const std::string& fallback_tlb); + void read_mmio_device_register( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); + void write_mmio_device_register( + const void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); + void pcie_broadcast_write( + chip_id_t chip, + const void* mem_ptr, + uint32_t size_in_bytes, + std::uint32_t addr, + const tt_xy_pair& start, + const tt_xy_pair& end, + const std::string& fallback_tlb); + void ethernet_broadcast_write( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + const std::set& rows_to_exclude, + std::set& cols_to_exclude, + const std::string& fallback_tlb, + bool use_virtual_coords); + void set_membar_flag( + const chip_id_t chip, + const std::unordered_set& cores, + const uint32_t barrier_value, + const uint32_t barrier_addr, + const std::string& fallback_tlb); + void insert_host_to_device_barrier( + const chip_id_t chip, + const std::unordered_set& cores, + const uint32_t barrier_addr, + const std::string& fallback_tlb); void init_membars(); uint64_t get_sys_addr(uint32_t chip_x, uint32_t chip_y, uint32_t noc_x, uint32_t noc_y, uint64_t offset); uint16_t get_sys_rack(uint32_t rack_x, uint32_t rack_y); bool is_non_mmio_cmd_q_full(uint32_t curr_wptr, uint32_t curr_rptr); - int pcie_arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done = true, 
uint32_t arg0 = 0, uint32_t arg1 = 0, int timeout=1, uint32_t *return_3 = nullptr, uint32_t *return_4 = nullptr); - int remote_arc_msg(int logical_device_id, uint32_t msg_code, bool wait_for_done = true, uint32_t arg0 = 0, uint32_t arg1 = 0, int timeout=1, uint32_t *return_3 = nullptr, uint32_t *return_4 = nullptr); - bool address_in_tlb_space(uint32_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size, uint32_t chip); + int pcie_arc_msg( + int logical_device_id, + uint32_t msg_code, + bool wait_for_done = true, + uint32_t arg0 = 0, + uint32_t arg1 = 0, + int timeout = 1, + uint32_t* return_3 = nullptr, + uint32_t* return_4 = nullptr); + int remote_arc_msg( + int logical_device_id, + uint32_t msg_code, + bool wait_for_done = true, + uint32_t arg0 = 0, + uint32_t arg1 = 0, + int timeout = 1, + uint32_t* return_3 = nullptr, + uint32_t* return_4 = nullptr); + bool address_in_tlb_space( + uint32_t address, uint32_t size_in_bytes, int32_t tlb_index, uint64_t tlb_size, uint32_t chip); std::shared_ptr get_mutex(const std::string& tlb_name, int pci_interface_id); - virtual uint32_t get_harvested_noc_rows_for_chip(int logical_device_id); // Returns one-hot encoded harvesting mask for PCIe mapped chips - void generate_tensix_broadcast_grids_for_grayskull( std::set>& broadcast_grids, std::set& rows_to_exclude, std::set& cols_to_exclude); - std::unordered_map>>& get_ethernet_broadcast_headers(const std::set& chips_to_exclude); + virtual uint32_t get_harvested_noc_rows_for_chip( + int logical_device_id); // Returns one-hot encoded harvesting mask for PCIe mapped chips + void generate_tensix_broadcast_grids_for_grayskull( + std::set>& broadcast_grids, + std::set& rows_to_exclude, + std::set& cols_to_exclude); + std::unordered_map>>& get_ethernet_broadcast_headers( + const std::set& chips_to_exclude); // Test functions void verify_eth_fw(); - void verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std::vector &fw_versions); - int 
test_setup_interface (); + void verify_sw_fw_versions(int device_id, std::uint32_t sw_version, std::vector& fw_versions); + int test_setup_interface(); // This functions has to be called for local chip, and then it will wait for all connected remote chips to flush. void wait_for_connected_non_mmio_flush(chip_id_t chip_id); - void construct_cluster(const std::string& sdesc_path, const uint32_t &num_host_mem_ch_per_mmio_device, const bool skip_driver_allocs, - const bool clean_system_resources, bool perform_harvesting, std::unordered_map simulated_harvesting_masks); + void construct_cluster( + const std::string& sdesc_path, + const uint32_t& num_host_mem_ch_per_mmio_device, + const bool skip_driver_allocs, + const bool clean_system_resources, + bool perform_harvesting, + std::unordered_map simulated_harvesting_masks); // State variables tt_device_dram_address_params dram_address_params; @@ -810,22 +984,24 @@ class Cluster: public tt_device std::set target_devices_in_cluster = {}; std::set target_remote_chips = {}; tt::ARCH arch_name; - std::unordered_map> m_pci_device_map; // Map of enabled pci devices - int m_num_pci_devices; // Number of pci devices in system (enabled or disabled) + std::unordered_map> m_pci_device_map; // Map of enabled pci devices + int m_num_pci_devices; // Number of pci devices in system (enabled or disabled) std::shared_ptr ndesc; // remote eth transfer setup static constexpr std::uint32_t NUM_ETH_CORES_FOR_NON_MMIO_TRANSFERS = 6; static constexpr std::uint32_t NON_EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS = 4; static constexpr std::uint32_t NON_EPOCH_ETH_CORES_START_ID = 0; - static constexpr std::uint32_t NON_EPOCH_ETH_CORES_MASK = (NON_EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS-1); + static constexpr std::uint32_t NON_EPOCH_ETH_CORES_MASK = (NON_EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS - 1); - static constexpr std::uint32_t EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS = NUM_ETH_CORES_FOR_NON_MMIO_TRANSFERS - NON_EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS; - 
static constexpr std::uint32_t EPOCH_ETH_CORES_START_ID = NON_EPOCH_ETH_CORES_START_ID + NON_EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS; - static constexpr std::uint32_t EPOCH_ETH_CORES_MASK = (EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS-1); + static constexpr std::uint32_t EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS = + NUM_ETH_CORES_FOR_NON_MMIO_TRANSFERS - NON_EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS; + static constexpr std::uint32_t EPOCH_ETH_CORES_START_ID = + NON_EPOCH_ETH_CORES_START_ID + NON_EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS; + static constexpr std::uint32_t EPOCH_ETH_CORES_MASK = (EPOCH_ETH_CORES_FOR_NON_MMIO_TRANSFERS - 1); int active_core = NON_EPOCH_ETH_CORES_START_ID; - std::vector< std::vector > remote_transfer_ethernet_cores; + std::vector> remote_transfer_ethernet_cores; std::unordered_map flush_non_mmio_per_chip = {}; bool non_mmio_transfer_cores_customized = false; std::unordered_map active_eth_core_idx_per_chip = {}; @@ -850,7 +1026,7 @@ class Cluster: public tt_device bool use_ethernet_ordered_writes = true; bool use_ethernet_broadcast = true; bool use_virtual_coords_for_eth_broadcast = true; - tt_version eth_fw_version; // Ethernet FW the driver is interfacing with + tt_version eth_fw_version; // Ethernet FW the driver is interfacing with // Named Mutexes static constexpr char NON_MMIO_MUTEX_NAME[] = "NON_MMIO"; static constexpr char ARC_MSG_MUTEX_NAME[] = "ARC_MSG"; @@ -859,13 +1035,13 @@ class Cluster: public tt_device static constexpr std::uint32_t SW_VERSION = 0x06060000; }; -} +} // namespace tt::umd -constexpr inline bool operator==(const tt_version &a, const tt_version &b) { +constexpr inline bool operator==(const tt_version& a, const tt_version& b) { return a.major == b.major && a.minor == b.minor && a.patch == b.patch; } -constexpr inline bool operator>=(const tt_version &a, const tt_version &b) { +constexpr inline bool operator>=(const tt_version& a, const tt_version& b) { bool fw_major_greater = a.major > b.major; bool fw_minor_greater = 
(a.major == b.major) && (a.minor > b.minor); bool patch_greater_or_equal = (a.major == b.major) && (a.minor == b.minor) && (a.patch >= b.patch); diff --git a/device/coordinate_manager.cpp b/device/coordinate_manager.cpp index 324066933..b2eac485f 100644 --- a/device/coordinate_manager.cpp +++ b/device/coordinate_manager.cpp @@ -4,7 +4,9 @@ * SPDX-License-Identifier: Apache-2.0 */ #include "device/coordinate_manager.h" + #include + #include "coordinate_manager.h" #include "grayskull/grayskull_coordinate_manager.h" @@ -71,13 +73,9 @@ void CoordinateManager::clear_harvesting_structures() { virtual_y_to_logical_y.clear(); } -std::set CoordinateManager::get_x_coordinates_to_harvest(std::size_t harvesting_mask) { - return {}; -} +std::set CoordinateManager::get_x_coordinates_to_harvest(std::size_t harvesting_mask) { return {}; } -std::set CoordinateManager::get_y_coordinates_to_harvest(std::size_t harvesting_mask) { - return {}; -} +std::set CoordinateManager::get_y_coordinates_to_harvest(std::size_t harvesting_mask) { return {}; } void CoordinateManager::perform_harvesting(std::size_t harvesting_mask) { clear_harvesting_structures(); @@ -104,14 +102,16 @@ void CoordinateManager::perform_harvesting(std::size_t harvesting_mask) { logical_x_to_virtual_x.resize(grid_size_x - num_harvested_x); logical_y_to_virtual_y.resize(grid_size_y - num_harvested_y); - fill_logical_to_physical_mapping(x_coordinates_to_harvest, y_coordinates_to_harvest, physical_x_unharvested, physical_y_unharvested); + fill_logical_to_physical_mapping( + x_coordinates_to_harvest, y_coordinates_to_harvest, physical_x_unharvested, physical_y_unharvested); fill_logical_to_virtual_mapping(physical_x_unharvested, physical_y_unharvested); } void CoordinateManager::fill_logical_to_physical_mapping( - const std::set& x_to_harvest, const std::set& y_to_harvest, - const std::set& physical_x_unharvested, const std::set& physical_y_unharvested) { - + const std::set& x_to_harvest, + const std::set& y_to_harvest, + 
const std::set& physical_x_unharvested, + const std::set& physical_y_unharvested) { auto physical_y_it = physical_y_unharvested.begin(); std::size_t logical_y = 0; for (size_t y = 0; y < worker_grid_size.y; y++) { @@ -130,7 +130,7 @@ void CoordinateManager::fill_logical_to_physical_mapping( auto physical_x_it = physical_x_unharvested.begin(); std::size_t logical_x = 0; - for(std::size_t x = 0; x < worker_grid_size.x; x++) { + for (std::size_t x = 0; x < worker_grid_size.x; x++) { if (x_to_harvest.find(x) == x_to_harvest.end()) { logical_x_to_physical_x[logical_x] = *physical_x_it; if (physical_x_to_logical_x.find(*physical_x_it) != physical_x_to_logical_x.end()) { @@ -145,7 +145,8 @@ void CoordinateManager::fill_logical_to_physical_mapping( } } -void CoordinateManager::fill_logical_to_virtual_mapping(const std::set& physical_x_unharvested, const std::set& physical_y_unharvested) { +void CoordinateManager::fill_logical_to_virtual_mapping( + const std::set& physical_x_unharvested, const std::set& physical_y_unharvested) { auto physical_y_it = physical_y_unharvested.begin(); for (std::size_t y = 0; y < logical_y_to_virtual_y.size(); y++) { logical_y_to_virtual_y[y] = *physical_y_it; @@ -176,7 +177,6 @@ std::unique_ptr CoordinateManager::get_coordinate_manager( const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) { - switch (arch) { case tt::ARCH::GRAYSKULL: return std::make_unique(worker_grid_size, workers, harvesting_mask); diff --git a/device/coordinate_manager.h b/device/coordinate_manager.h index a4786b661..53ad266f9 100644 --- a/device/coordinate_manager.h +++ b/device/coordinate_manager.h @@ -7,17 +7,17 @@ #pragma once #include -#include #include +#include -#include "device/tt_xy_pair.h" #include "device/tt_arch_types.h" +#include "device/tt_xy_pair.h" class CoordinateManager { - public: - CoordinateManager(const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) - : 
worker_grid_size(worker_grid_size), workers(workers), harvesting_mask(harvesting_mask) {} + CoordinateManager( + const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) : + worker_grid_size(worker_grid_size), workers(workers), harvesting_mask(harvesting_mask) {} virtual void perform_harvesting(std::size_t harvesting_mask); @@ -49,14 +49,17 @@ class CoordinateManager { protected: virtual void clear_harvesting_structures(); - + virtual std::set get_x_coordinates_to_harvest(std::size_t harvesting_mask); virtual std::set get_y_coordinates_to_harvest(std::size_t harvesting_mask); virtual void fill_logical_to_physical_mapping( - const std::set& x_to_harvest, const std::set& y_to_harvest, + const std::set& x_to_harvest, + const std::set& y_to_harvest, + const std::set& physical_x_unharvested, + const std::set& physical_y_unharvested); + virtual void fill_logical_to_virtual_mapping( const std::set& physical_x_unharvested, const std::set& physical_y_unharvested); - virtual void fill_logical_to_virtual_mapping(const std::set& physical_x_unharvested, const std::set& physical_y_unharvested); std::map physical_y_to_logical_y; std::map physical_x_to_logical_x; diff --git a/device/cpuset_lib.cpp b/device/cpuset_lib.cpp index 0f1c4e5d3..e09591d3d 100644 --- a/device/cpuset_lib.cpp +++ b/device/cpuset_lib.cpp @@ -2,17 +2,20 @@ // // SPDX-License-Identifier: Apache-2.0 +#include "cpuset_lib.hpp" + #include +#include +#include -#include "cpuset_lib.hpp" #include "common/logger.hpp" -#include #include "device/cluster.h" -#include #include "fmt/core.h" + namespace tt { namespace fs = std::filesystem; + namespace cpuset { ///////////////////////////////////////////////////////////////////////// @@ -21,15 +24,18 @@ namespace cpuset { // Constructor for singleton class cpu id allocator tt_cpuset_allocator::tt_cpuset_allocator() { - - m_pid = getpid(); - m_debug = std::getenv("TT_BACKEND_CPUSET_ALLOCATOR_DEBUG") ? 
true : false; + m_pid = getpid(); + m_debug = std::getenv("TT_BACKEND_CPUSET_ALLOCATOR_DEBUG") ? true : false; // Chicken bit to disable this entire feature for debug/comparison. bool cpuset_allocator_enable_env = std::getenv("TT_BACKEND_CPUSET_ALLOCATOR_ENABLE") ? true : false; auto system_tid = std::this_thread::get_id(); - log_debug(LogSiliconDriver, "Starting tt_cpuset_allocator constructor now for process_id: {} thread_id: {}", m_pid, system_tid); + log_debug( + LogSiliconDriver, + "Starting tt_cpuset_allocator constructor now for process_id: {} thread_id: {}", + m_pid, + system_tid); m_enable_cpuset_allocator = true; @@ -38,86 +44,102 @@ tt_cpuset_allocator::tt_cpuset_allocator() { m_enable_cpuset_allocator &= init_get_number_of_packages(); m_enable_cpuset_allocator &= init_find_tt_pci_devices_packages_numanodes(); - if (!cpuset_allocator_enable_env){ + if (!cpuset_allocator_enable_env) { m_enable_cpuset_allocator = false; - }else{ - - bool is_cpu_supported = init_is_cpu_model_supported(); + } else { + bool is_cpu_supported = init_is_cpu_model_supported(); - if (is_cpu_supported){ + if (is_cpu_supported) { m_enable_cpuset_allocator &= init_determine_cpuset_allocations(); - }else{ + } else { m_enable_cpuset_allocator = false; } - log_debug(LogSiliconDriver,"Finished tt_cpuset_allocator constructor now with m_enable_cpuset_allocator: {} for process_id: {} thread_id: {} ", m_enable_cpuset_allocator, m_pid, system_tid); + log_debug( + LogSiliconDriver, + "Finished tt_cpuset_allocator constructor now with m_enable_cpuset_allocator: {} for process_id: {} " + "thread_id: {} ", + m_enable_cpuset_allocator, + m_pid, + system_tid); } } // Step 1 : Initialize and perform m_topology detection -bool tt_cpuset_allocator::init_topology_init_and_load(){ - log_debug(LogSiliconDriver,"Inside tt_cpuset_allocator::topology_init_and_load()"); +bool tt_cpuset_allocator::init_topology_init_and_load() { + log_debug(LogSiliconDriver, "Inside 
tt_cpuset_allocator::topology_init_and_load()"); - if (!m_enable_cpuset_allocator){ + if (!m_enable_cpuset_allocator) { return false; } - if (hwloc_topology_init(&m_topology)){ + if (hwloc_topology_init(&m_topology)) { log_warning(LogSiliconDriver, "Problem initializing topology"); return false; } - hwloc_topology_set_type_filter(m_topology, HWLOC_OBJ_PCI_DEVICE, HWLOC_TYPE_FILTER_KEEP_ALL); // Need to find PCI devices. + hwloc_topology_set_type_filter( + m_topology, HWLOC_OBJ_PCI_DEVICE, HWLOC_TYPE_FILTER_KEEP_ALL); // Need to find PCI devices. - if (hwloc_topology_load(m_topology)){ + if (hwloc_topology_load(m_topology)) { log_warning(LogSiliconDriver, "Problem loading topology"); return false; } - return true; // Success + return true; // Success } -// Step 2 - Find TT PCI devices in topology by vendor_id to get their PCI bus_id and physical device_id, and package and numamode. -bool tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes(){ - - if (!m_enable_cpuset_allocator){ +// Step 2 - Find TT PCI devices in topology by vendor_id to get their PCI bus_id and physical device_id, and package and +// numamode. 
+bool tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes() { + if (!m_enable_cpuset_allocator) { return false; } - log_debug(LogSiliconDriver,"Starting tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes()"); + log_debug(LogSiliconDriver, "Starting tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes()"); m_num_tt_device_by_pci_device_id_map.clear(); hwloc_obj_t pci_device_obj = NULL; const std::regex tt_device_re("tenstorrent!([0-9]+)"); - while ((pci_device_obj = hwloc_get_next_pcidev(m_topology, pci_device_obj))){ - - if (hwloc_obj_type_is_io(pci_device_obj->type) && (pci_device_obj->attr->pcidev.vendor_id == TENSTORRENT_VENDOR_ID)) { - - std::pair device_id_revision = std::make_pair(pci_device_obj->attr->pcidev.device_id, pci_device_obj->attr->pcidev.revision); + while ((pci_device_obj = hwloc_get_next_pcidev(m_topology, pci_device_obj))) { + if (hwloc_obj_type_is_io(pci_device_obj->type) && + (pci_device_obj->attr->pcidev.vendor_id == TENSTORRENT_VENDOR_ID)) { + std::pair device_id_revision = + std::make_pair(pci_device_obj->attr->pcidev.device_id, pci_device_obj->attr->pcidev.revision); m_num_tt_device_by_pci_device_id_map[device_id_revision] += 1; - std::string pci_bus_id_str = get_pci_bus_id(pci_device_obj); + std::string pci_bus_id_str = get_pci_bus_id(pci_device_obj); std::string pci_device_dir = fmt::format("/sys/bus/pci/devices/{}/tenstorrent/", pci_bus_id_str); int physical_device_id = -1; - log_trace(LogSiliconDriver, "Found TT device with pci_bus_id_str: {} num_devices_by_pci_device_id: {}", pci_bus_id_str, m_num_tt_device_by_pci_device_id_map[device_id_revision]); + log_trace( + LogSiliconDriver, + "Found TT device with pci_bus_id_str: {} num_devices_by_pci_device_id: {}", + pci_bus_id_str, + m_num_tt_device_by_pci_device_id_map[device_id_revision]); // First, get the physical_device_id of the device. 
- if (fs::exists(pci_device_dir)){ - for (const auto &entry : fs::directory_iterator(pci_device_dir)){ + if (fs::exists(pci_device_dir)) { + for (const auto &entry : fs::directory_iterator(pci_device_dir)) { auto entry_str = entry.path().string(); - if (std::smatch device_match; std::regex_search(entry_str, device_match, tt_device_re) and (stoi(device_match[1]) >= 0)){ + if (std::smatch device_match; + std::regex_search(entry_str, device_match, tt_device_re) and (stoi(device_match[1]) >= 0)) { physical_device_id = stoi(device_match[1]); m_all_tt_devices.push_back(physical_device_id); - log_debug(LogSiliconDriver, "Found physical_device_id: {} from file: {}", physical_device_id, entry_str); + log_debug( + LogSiliconDriver, + "Found physical_device_id: {} from file: {}", + physical_device_id, + entry_str); break; } } - if (physical_device_id == -1){ - log_warning(LogSiliconDriver, "Did not find file containing physical_device_id in {}", pci_device_dir); + if (physical_device_id == -1) { + log_warning( + LogSiliconDriver, "Did not find file containing physical_device_id in {}", pci_device_dir); return false; } @@ -125,19 +147,23 @@ bool tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes(){ // Next, get the PackageID of the device and update maps. auto package_id = get_package_id_from_device(pci_device_obj, physical_device_id); - - // This package was not previously seen. Initialize structures tracking the TT Devices mapped to this + + // This package was not previously seen. Initialize structures tracking the TT Devices mapped to this // package and structures storing the CPU characteristics per package. 
if (m_package_id_to_devices_map.find(package_id) == m_package_id_to_devices_map.end()) { m_package_id_to_devices_map.insert({package_id, {}}); m_package_id_to_num_l3_per_ccx_map.insert({package_id, 0}); m_package_id_to_num_ccx_per_ccd_map.insert({package_id, 0}); } - if (package_id != -1){ + if (package_id != -1) { m_package_id_to_devices_map.at(package_id).push_back(physical_device_id); m_physical_device_id_to_package_id_map.insert({physical_device_id, package_id}); } else { - log_warning(LogSiliconDriver, "Could not find package_id for TT Device (physical_device_id: {} pci_bus_id: {})", physical_device_id, pci_bus_id_str); + log_warning( + LogSiliconDriver, + "Could not find package_id for TT Device (physical_device_id: {} pci_bus_id: {})", + physical_device_id, + pci_bus_id_str); return false; } @@ -145,378 +171,479 @@ bool tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes(){ auto numa_nodeset = get_numa_nodeset_from_device(pci_device_obj, physical_device_id); m_physical_device_id_to_numa_nodeset_map.insert({physical_device_id, numa_nodeset}); - if (numa_nodeset == 0x0){ - log_warning(LogSiliconDriver, "Could not find NumaNodeSet for TT Device (physical_device_id: {} pci_bus_id: {})", physical_device_id, pci_bus_id_str); + if (numa_nodeset == 0x0) { + log_warning( + LogSiliconDriver, + "Could not find NumaNodeSet for TT Device (physical_device_id: {} pci_bus_id: {})", + physical_device_id, + pci_bus_id_str); return false; } - m_physical_device_id_to_cpusets_map.insert({physical_device_id, {}}); // Empty vector. + m_physical_device_id_to_cpusets_map.insert({physical_device_id, {}}); // Empty vector. 
m_num_cpu_cores_allocated_per_tt_device.insert({physical_device_id, 0}); } } } - if (m_all_tt_devices.size() == 0){ - log_warning(LogSiliconDriver, "Did not find any PCI devices matching Tenstorrent vendor_id 0x{:x}", TENSTORRENT_VENDOR_ID); + if (m_all_tt_devices.size() == 0) { + log_warning( + LogSiliconDriver, + "Did not find any PCI devices matching Tenstorrent vendor_id 0x{:x}", + TENSTORRENT_VENDOR_ID); return false; } - log_debug(LogSiliconDriver,"Finshed tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes() found {} devices", m_all_tt_devices.size()); - + log_debug( + LogSiliconDriver, + "Finshed tt_cpuset_allocator::init_find_tt_pci_devices_packages_numanodes() found {} devices", + m_all_tt_devices.size()); // Sort these 2 vectors of device_ids before we are done, since discovery can be in any order. - for (auto &p: m_package_id_to_devices_map){ + for (auto &p : m_package_id_to_devices_map) { std::sort(p.second.begin(), p.second.end()); } std::sort(m_all_tt_devices.begin(), m_all_tt_devices.end()); - return true; // Success + return true; // Success } - // Step 3 : Detect the number of packages. -bool tt_cpuset_allocator::init_get_number_of_packages(){ - - if (!m_enable_cpuset_allocator){ +bool tt_cpuset_allocator::init_get_number_of_packages() { + if (!m_enable_cpuset_allocator) { return false; } m_num_packages = hwloc_get_nbobjs_by_type(m_topology, HWLOC_OBJ_PACKAGE); - log_debug(LogSiliconDriver,"Found {} CPU packages", m_num_packages); - return m_num_packages > 0; // Success + log_debug(LogSiliconDriver, "Found {} CPU packages", m_num_packages); + return m_num_packages > 0; // Success } // Step 4 : Return true if all packages are models we want to support. Env-var can be used to ignore this check. 
-bool tt_cpuset_allocator::init_is_cpu_model_supported(){ - - if (!m_enable_cpuset_allocator){ +bool tt_cpuset_allocator::init_is_cpu_model_supported() { + if (!m_enable_cpuset_allocator) { return false; } - if (m_num_packages == 0){ - log_debug(LogSiliconDriver,"init_is_cpu_model_supported(): Found 0 packages, functions run out of order?"); + if (m_num_packages == 0) { + log_debug(LogSiliconDriver, "init_is_cpu_model_supported(): Found 0 packages, functions run out of order?"); return false; } bool use_any_cpu = std::getenv("TT_BACKEND_CPUSET_ALLOCATOR_SUPPORT_ANY_CPU") ? true : false; - log_debug(LogSiliconDriver,"Inside tt_cpuset_allocator::check_if_cpu_model_supported()"); + log_debug(LogSiliconDriver, "Inside tt_cpuset_allocator::check_if_cpu_model_supported()"); // Supported CPU Models for enabling CPUSET Allocator. Keep the list small to production machines to start. - std::vector supported_cpu_models = { "AMD EPYC 7352 24-Core Processor", - "AMD EPYC 7532 32-Core Processor"}; + std::vector supported_cpu_models = { + "AMD EPYC 7352 24-Core Processor", "AMD EPYC 7532 32-Core Processor"}; // CPU Models that have L3 per CCX and 2 CCX per CCD - std::vector opt_2ccx_per_ccd_cpu_models = { "AMD EPYC 7352 24-Core Processor", - "AMD EPYC 7532 32-Core Processor"}; - for(const auto& package: m_package_id_to_devices_map) { + std::vector opt_2ccx_per_ccd_cpu_models = { + "AMD EPYC 7352 24-Core Processor", "AMD EPYC 7532 32-Core Processor"}; + for (const auto &package : m_package_id_to_devices_map) { int package_id = package.first; auto package_obj = hwloc_get_obj_by_type(m_topology, HWLOC_OBJ_PACKAGE, package_id); - if (m_debug) print_hwloc_object(package_obj, 0, true, true); + if (m_debug) { + print_hwloc_object(package_obj, 0, true, true); + } std::string pkg_cpu_model = hwloc_obj_get_info_by_name(package_obj, "CPUModel"); // First find out if this CPU is supported by CPUSET Allocator at all. bool has_supported_cpu = use_any_cpu ? 
true : false; - for (auto &supported_cpu_model : supported_cpu_models){ + for (auto &supported_cpu_model : supported_cpu_models) { has_supported_cpu |= (pkg_cpu_model.find(supported_cpu_model) != std::string::npos); } - log_debug(LogSiliconDriver,"Detected package-id: {} has_supported_cpu: {} for CpuModel: {}", package_id, has_supported_cpu, pkg_cpu_model); + log_debug( + LogSiliconDriver, + "Detected package-id: {} has_supported_cpu: {} for CpuModel: {}", + package_id, + has_supported_cpu, + pkg_cpu_model); - if (!has_supported_cpu){ + if (!has_supported_cpu) { return false; } // Then, determine if the 2CCX-PER-CCD optimization can be enabled for this CPU Model in the package. - for (auto &opt_cpu_model : opt_2ccx_per_ccd_cpu_models){ - if (pkg_cpu_model.find(opt_cpu_model) != std::string::npos){ + for (auto &opt_cpu_model : opt_2ccx_per_ccd_cpu_models) { + if (pkg_cpu_model.find(opt_cpu_model) != std::string::npos) { m_package_id_to_num_l3_per_ccx_map.at(package_id) = 1; m_package_id_to_num_ccx_per_ccd_map.at(package_id) = 2; } } } - return true; // Successhwloc + return true; // Successhwloc } - -// Step 5: Get all target allocation objects (ie. L3Cache if IO thread to be allocated per L3Cache cpuset) for a given socket/package. -bool tt_cpuset_allocator::init_determine_cpuset_allocations(){ - - if (!m_enable_cpuset_allocator){ +// Step 5: Get all target allocation objects (ie. L3Cache if IO thread to be allocated per L3Cache cpuset) for a given +// socket/package. 
+bool tt_cpuset_allocator::init_determine_cpuset_allocations() { + if (!m_enable_cpuset_allocator) { return false; } - log_debug(LogSiliconDriver,"Inside tt_cpuset_allocator::init_determine_cpuset_allocations()"); - for (const auto& package : m_package_id_to_devices_map) { + log_debug(LogSiliconDriver, "Inside tt_cpuset_allocator::init_determine_cpuset_allocations()"); + for (const auto &package : m_package_id_to_devices_map) { int package_id = package.first; auto num_tt_devices_for_cpu_package = package.second.size(); - if (num_tt_devices_for_cpu_package == 0){ - log_debug(LogSiliconDriver, "init_determine_cpuset_allocations() -- no TT devices for package_id: {}, skipping.", package_id); + if (num_tt_devices_for_cpu_package == 0) { + log_debug( + LogSiliconDriver, + "init_determine_cpuset_allocations() -- no TT devices for package_id: {}, skipping.", + package_id); continue; } - log_debug(LogSiliconDriver, "init_determine_cpuset_allocations(). starting to detect allocation slots for package_id: {} ", package_id); + log_debug( + LogSiliconDriver, + "init_determine_cpuset_allocations(). 
starting to detect allocation slots for package_id: {} ", + package_id); auto package_obj = hwloc_get_obj_by_type(m_topology, HWLOC_OBJ_PACKAGE, package_id); - if (m_debug) print_hwloc_object(package_obj, 0, true, true); + if (m_debug) { + print_hwloc_object(package_obj, 0, true, true); + } - auto num_alloc_slots_in_package = hwloc_get_nbobjs_inside_cpuset_by_type(m_topology, package_obj->cpuset, m_object_per_alloc_slot); - if (num_alloc_slots_in_package == 0){ - log_warning(LogSiliconDriver, "Could not find any of the alloc objects in package_id: {} for this cpu arc", package_id); + auto num_alloc_slots_in_package = + hwloc_get_nbobjs_inside_cpuset_by_type(m_topology, package_obj->cpuset, m_object_per_alloc_slot); + if (num_alloc_slots_in_package == 0) { + log_warning( + LogSiliconDriver, + "Could not find any of the alloc objects in package_id: {} for this cpu arc", + package_id); return false; } auto num_alloc_slots_per_tt_device = num_alloc_slots_in_package / num_tt_devices_for_cpu_package; // Above splits evenly by devices, leaves remainder unused in the example case of 3 devices but 8 slots. - log_debug(LogSiliconDriver, "init_determine_cpuset_allocations(). package_id: {} num_alloc_slots_in_package: {} num_tt_devices_for_cpu_package: {} num_alloc_slots_per_tt_device: {}", - package_id, num_alloc_slots_in_package, num_tt_devices_for_cpu_package, num_alloc_slots_per_tt_device); + log_debug( + LogSiliconDriver, + "init_determine_cpuset_allocations(). 
package_id: {} num_alloc_slots_in_package: {} " + "num_tt_devices_for_cpu_package: {} num_alloc_slots_per_tt_device: {}", + package_id, + num_alloc_slots_in_package, + num_tt_devices_for_cpu_package, + num_alloc_slots_per_tt_device); int device_idx = 0; - for (int obj_idx = 0; obj_idx < num_alloc_slots_in_package; obj_idx++){ + for (int obj_idx = 0; obj_idx < num_alloc_slots_in_package; obj_idx++) { + auto obj = hwloc_get_obj_below_by_type( + m_topology, HWLOC_OBJ_PACKAGE, package_id, m_object_per_alloc_slot, obj_idx); - auto obj = hwloc_get_obj_below_by_type(m_topology, HWLOC_OBJ_PACKAGE, package_id, m_object_per_alloc_slot, obj_idx); - - if (obj){ - if (m_debug) print_hwloc_object(obj, 1, true); + if (obj) { + if (m_debug) { + print_hwloc_object(obj, 1, true); + } auto physical_device_id = m_package_id_to_devices_map.at(package_id).at(device_idx); // Hack for maximum number of slots per device. // if (m_physical_device_id_to_cpusets_map.at(physical_device_id).size() < 2){ m_physical_device_id_to_cpusets_map.at(physical_device_id).push_back(obj->cpuset); - int num_cpus = hwloc_get_nbobjs_inside_cpuset_by_type(m_topology,obj->cpuset,HWLOC_OBJ_CORE); + int num_cpus = hwloc_get_nbobjs_inside_cpuset_by_type(m_topology, obj->cpuset, HWLOC_OBJ_CORE); m_num_cpu_cores_allocated_per_tt_device.at(physical_device_id) += num_cpus; // } // We're distributing allocation objects per package across TT devices, so switch to next one. - if (((obj_idx + 1) % num_alloc_slots_per_tt_device) == 0){ - device_idx = (device_idx + 1) % num_tt_devices_for_cpu_package; // Loop around if extra slots remain. Assigned to first device for that package. + if (((obj_idx + 1) % num_alloc_slots_per_tt_device) == 0) { + device_idx = (device_idx + 1) % + num_tt_devices_for_cpu_package; // Loop around if extra slots remain. Assigned to + // first device for that package. } - }else{ - log_warning(LogSiliconDriver, "init_determine_cpuset_allocations(). 
Something went wrong looking for cpuset alloc object under package"); + } else { + log_warning( + LogSiliconDriver, + "init_determine_cpuset_allocations(). Something went wrong looking for cpuset alloc object under " + "package"); return false; } } - log_debug(LogSiliconDriver, "init_determine_cpuset_allocations(). Done detecting allocation slots for package_id: {} ", package_id); + log_debug( + LogSiliconDriver, + "init_determine_cpuset_allocations(). Done detecting allocation slots for package_id: {} ", + package_id); } - // Summary for Debug purposes. - for (auto &physical_device_id : m_all_tt_devices){ - for (size_t device_alloc_idx=0; device_alloc_idx < m_physical_device_id_to_cpusets_map.at(physical_device_id).size(); device_alloc_idx++){ + for (auto &physical_device_id : m_all_tt_devices) { + for (size_t device_alloc_idx = 0; + device_alloc_idx < m_physical_device_id_to_cpusets_map.at(physical_device_id).size(); + device_alloc_idx++) { auto cpuset = m_physical_device_id_to_cpusets_map.at(physical_device_id).at(device_alloc_idx); auto pu_ids_vector = get_hwloc_bitmap_vector(cpuset); auto num_pu_ids = pu_ids_vector.size(); auto package_id = m_physical_device_id_to_package_id_map.at(physical_device_id); - log_debug(LogSiliconDriver, "Done init_determine_cpuset_allocations(). Summary => for mmio physical_device_id: {} package_id: {} device_alloc_idx: {} picked {} PU's {}", physical_device_id, package_id, device_alloc_idx, num_pu_ids, pu_ids_vector); + log_debug( + LogSiliconDriver, + "Done init_determine_cpuset_allocations(). 
Summary => for mmio physical_device_id: {} package_id: {} " + "device_alloc_idx: {} picked {} PU's {}", + physical_device_id, + package_id, + device_alloc_idx, + num_pu_ids, + pu_ids_vector); } } - return true; // Success - + return true; // Success } ///////////////////////////////////////////////////////////////////////// // Runtime Functions //////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////// -// Given a physical device_id, determine the right numa nodes associated with it and attempt to membind a previously allocated memory region to it. -bool tt_cpuset_allocator::bind_area_memory_nodeset(chip_id_t physical_device_id, const void * addr, size_t len){ - +// Given a physical device_id, determine the right numa nodes associated with it and attempt to membind a previously +// allocated memory region to it. +bool tt_cpuset_allocator::bind_area_memory_nodeset(chip_id_t physical_device_id, const void *addr, size_t len) { auto tid = std::this_thread::get_id(); - log_debug(LogSiliconDriver,"bind_area_memory_nodeset(): Going to attempt memory binding of addr/len to NumaNode for physical_device_id: {} (pid: {} tid: {})", physical_device_id, m_pid, tid); - - if (m_physical_device_id_to_numa_nodeset_map.count(physical_device_id) == 0){ - log_fatal("bind_area_memory_nodeset(): Did not find physical_device_id: {} in numanode_mask map, this is not expected.", physical_device_id); + log_debug( + LogSiliconDriver, + "bind_area_memory_nodeset(): Going to attempt memory binding of addr/len to NumaNode for physical_device_id: " + "{} (pid: {} tid: {})", + physical_device_id, + m_pid, + tid); + + if (m_physical_device_id_to_numa_nodeset_map.count(physical_device_id) == 0) { + log_fatal( + "bind_area_memory_nodeset(): Did not find physical_device_id: {} in numanode_mask map, this is not " + "expected.", + physical_device_id); return false; } auto target_nodeset = 
m_physical_device_id_to_numa_nodeset_map.at(physical_device_id); - if (target_nodeset != 0){ - if (hwloc_set_area_membind(m_topology, addr, len, target_nodeset, HWLOC_MEMBIND_BIND, HWLOC_MEMBIND_BYNODESET | HWLOC_MEMBIND_STRICT | HWLOC_MEMBIND_MIGRATE) ){ - log_warning(LogSiliconDriver,"hwloc_set_area_membind(): failed for physical_device_id: {} on NodeSet: {} with errno: {} (pid: {} tid: {})", - physical_device_id, get_hwloc_bitmap_vector(target_nodeset), strerror(errno), m_pid, tid); + if (target_nodeset != 0) { + if (hwloc_set_area_membind( + m_topology, + addr, + len, + target_nodeset, + HWLOC_MEMBIND_BIND, + HWLOC_MEMBIND_BYNODESET | HWLOC_MEMBIND_STRICT | HWLOC_MEMBIND_MIGRATE)) { + log_warning( + LogSiliconDriver, + "hwloc_set_area_membind(): failed for physical_device_id: {} on NodeSet: {} with errno: {} (pid: {} " + "tid: {})", + physical_device_id, + get_hwloc_bitmap_vector(target_nodeset), + strerror(errno), + m_pid, + tid); return false; - }else{ - log_debug(LogSiliconDriver,"hwloc_set_area_membind(): success for physical_device_id: {} on NodeSet: {} (pid: {} tid: {})", physical_device_id, get_hwloc_bitmap_vector(target_nodeset), m_pid, tid); + } else { + log_debug( + LogSiliconDriver, + "hwloc_set_area_membind(): success for physical_device_id: {} on NodeSet: {} (pid: {} tid: {})", + physical_device_id, + get_hwloc_bitmap_vector(target_nodeset), + m_pid, + tid); } - }else{ - log_warning(LogSiliconDriver,"bind_area_memory_nodeset(): Unable to determine TT Device to NumaNode mapping for physical_device_id: {}. Skipping membind.", physical_device_id); + } else { + log_warning( + LogSiliconDriver, + "bind_area_memory_nodeset(): Unable to determine TT Device to NumaNode mapping for physical_device_id: {}. 
" + "Skipping membind.", + physical_device_id); return false; } - return true; // Success + return true; // Success } int tt_cpuset_allocator::_get_num_tt_pci_devices() { - for (auto &d : m_physical_device_id_to_package_id_map) { log_trace(LogSiliconDriver, "Found physical_device_id: {} ", d.first); } return m_physical_device_id_to_package_id_map.size(); } - - - ///////////////////////////////////////////////////////////////////////// -//Helper Functions ////////////////////////////////////////////////////// +// Helper Functions ////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////// - -std::string tt_cpuset_allocator::get_pci_bus_id(hwloc_obj_t pci_device_obj){ - +std::string tt_cpuset_allocator::get_pci_bus_id(hwloc_obj_t pci_device_obj) { std::string pci_bus_id_str = ""; - if (hwloc_obj_type_is_io(pci_device_obj->type)) { + if (hwloc_obj_type_is_io(pci_device_obj->type)) { auto attrs = pci_device_obj->attr->pcidev; pci_bus_id_str = fmt::format("{:04x}:{:02x}:{:02x}.{:01x}", attrs.domain, attrs.bus, attrs.dev, attrs.func); } return pci_bus_id_str; - } -int tt_cpuset_allocator::get_package_id_from_device(hwloc_obj_t pci_device_obj, chip_id_t physical_device_id){ - +int tt_cpuset_allocator::get_package_id_from_device(hwloc_obj_t pci_device_obj, chip_id_t physical_device_id) { auto pci_bus_id_str = m_physical_device_id_to_pci_bus_id_map.at(physical_device_id); - log_debug(LogSiliconDriver, "Checking TT device (physical_device_id: {} pci_bus_id: {}) to find it's corresponding CPU package", physical_device_id, pci_bus_id_str); + log_debug( + LogSiliconDriver, + "Checking TT device (physical_device_id: {} pci_bus_id: {}) to find it's corresponding CPU package", + physical_device_id, + pci_bus_id_str); hwloc_obj_t tmp_obj = hwloc_get_non_io_ancestor_obj(m_topology, pci_device_obj); int package_id = -1; // Keep going up until package/machine hierarchy is found, in case we don't find it right away. 
- while (package_id == -1){ - - if ((hwloc_compare_types(tmp_obj->type, HWLOC_OBJ_PACKAGE) == 0) || (hwloc_compare_types(tmp_obj->type, HWLOC_OBJ_MACHINE) == 0)){ - if (tmp_obj->os_index != (unsigned) -1){ + while (package_id == -1) { + if ((hwloc_compare_types(tmp_obj->type, HWLOC_OBJ_PACKAGE) == 0) || + (hwloc_compare_types(tmp_obj->type, HWLOC_OBJ_MACHINE) == 0)) { + if (tmp_obj->os_index != (unsigned)-1) { package_id = tmp_obj->os_index; - }else{ - log_warning(LogSiliconDriver, "Could not find os_index of package or machine object for TT device (physical_device_id: {} pci_bus_id: {})", physical_device_id, pci_bus_id_str); + } else { + log_warning( + LogSiliconDriver, + "Could not find os_index of package or machine object for TT device (physical_device_id: {} " + "pci_bus_id: {})", + physical_device_id, + pci_bus_id_str); break; } - }else{ - if (tmp_obj->parent){ + } else { + if (tmp_obj->parent) { tmp_obj = tmp_obj->parent; - }else{ + } else { break; } } } - if (m_debug) print_hwloc_object(pci_device_obj, 1, true, true); - if (m_debug) print_hwloc_object(tmp_obj, 1, true, true); + if (m_debug) { + print_hwloc_object(pci_device_obj, 1, true, true); + } + if (m_debug) { + print_hwloc_object(tmp_obj, 1, true, true); + } return package_id; } -hwloc_nodeset_t tt_cpuset_allocator::get_numa_nodeset_from_device(hwloc_obj_t pci_device_obj, chip_id_t physical_device_id){ - +hwloc_nodeset_t tt_cpuset_allocator::get_numa_nodeset_from_device( + hwloc_obj_t pci_device_obj, chip_id_t physical_device_id) { hwloc_nodeset_t nodeset = 0x0; // Currently an issue in non-EPYC machines where PCI devices are directly under Machine, and not any NumaNodes. // As quick workaround, skip this if there is only single numanode since returning 1 seems fine. 
- if (hwloc_get_nbobjs_by_type(m_topology, HWLOC_OBJ_NUMANODE) == 1){ + if (hwloc_get_nbobjs_by_type(m_topology, HWLOC_OBJ_NUMANODE) == 1) { auto numanode = hwloc_get_obj_by_type(m_topology, HWLOC_OBJ_NUMANODE, 0); return numanode->nodeset; } auto pci_bus_id_str = m_physical_device_id_to_pci_bus_id_map.at(physical_device_id); - log_debug(LogSiliconDriver, "init_detect_tt_device_numanodes(): Checking TT device (physical_device_id: {} pci_bus_id: {}) to find it's corresponding NumaNode.", physical_device_id, pci_bus_id_str); + log_debug( + LogSiliconDriver, + "init_detect_tt_device_numanodes(): Checking TT device (physical_device_id: {} pci_bus_id: {}) to find it's " + "corresponding NumaNode.", + physical_device_id, + pci_bus_id_str); hwloc_obj_t tmp_obj = pci_device_obj->parent; - while (tmp_obj && !tmp_obj->memory_arity){ + while (tmp_obj && !tmp_obj->memory_arity) { tmp_obj = tmp_obj->parent; /* no memory child, walk up */ } - if (tmp_obj && tmp_obj->nodeset){ - log_debug(LogSiliconDriver, "init_detect_tt_device_numanodes(): For TT device (physical_device_id: {} pci_bus_id: {}) found NumaNodeSet: {}", physical_device_id, pci_bus_id_str, get_hwloc_bitmap_vector(tmp_obj->nodeset)); + if (tmp_obj && tmp_obj->nodeset) { + log_debug( + LogSiliconDriver, + "init_detect_tt_device_numanodes(): For TT device (physical_device_id: {} pci_bus_id: {}) found " + "NumaNodeSet: {}", + physical_device_id, + pci_bus_id_str, + get_hwloc_bitmap_vector(tmp_obj->nodeset)); nodeset = tmp_obj->nodeset; - }else{ - log_warning(LogSiliconDriver, "init_detect_tt_device_numanodes(): Could not determine NumaNodeSet for TT device (physical_device_id: {} pci_bus_id: {})", physical_device_id, pci_bus_id_str); + } else { + log_warning( + LogSiliconDriver, + "init_detect_tt_device_numanodes(): Could not determine NumaNodeSet for TT device (physical_device_id: {} " + "pci_bus_id: {})", + physical_device_id, + pci_bus_id_str); } return nodeset; - } int 
tt_cpuset_allocator::_get_num_tt_pci_devices_by_pci_device_id(uint16_t device_id, uint16_t revision) { - std::pair device_id_revision = std::make_pair(device_id, revision); if (m_num_tt_device_by_pci_device_id_map.find(device_id_revision) != m_num_tt_device_by_pci_device_id_map.end()) { return m_num_tt_device_by_pci_device_id_map.at(device_id_revision); } else { - log_warning(LogSiliconDriver, "Cannot find any TT device with PCI device_id: 0x{:x} and revision: {} in topology.", device_id, revision); + log_warning( + LogSiliconDriver, + "Cannot find any TT device with PCI device_id: 0x{:x} and revision: {} in topology.", + device_id, + revision); return 0; } } ///////////////////////////////////////////////////////////////////////// -//Debug Functions /////////////////////////////////////////////////////// +// Debug Functions /////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////// // Get all PU ids (or numa nodes) in a vector, for legacy/back-compat/debug purposes. -std::vector tt_cpuset_allocator::get_hwloc_bitmap_vector(hwloc_bitmap_t &bitmap){ - +std::vector tt_cpuset_allocator::get_hwloc_bitmap_vector(hwloc_bitmap_t &bitmap) { std::vector indices; int index; - if (bitmap){ - hwloc_bitmap_foreach_begin(index, bitmap) - indices.push_back(index); + if (bitmap) { + hwloc_bitmap_foreach_begin(index, bitmap) indices.push_back(index); hwloc_bitmap_foreach_end(); } return indices; } -std::vector tt_cpuset_allocator::get_hwloc_cpuset_vector(hwloc_obj_t &obj){ +std::vector tt_cpuset_allocator::get_hwloc_cpuset_vector(hwloc_obj_t &obj) { return get_hwloc_bitmap_vector(obj->cpuset); } -std::vector tt_cpuset_allocator::get_hwloc_nodeset_vector(hwloc_obj_t &obj){ +std::vector tt_cpuset_allocator::get_hwloc_nodeset_vector(hwloc_obj_t &obj) { return get_hwloc_bitmap_vector(obj->nodeset); } - // Nicer way to print pu ids as a vector on single line. 
-void tt_cpuset_allocator::print_hwloc_cpuset(hwloc_obj_t &obj){ +void tt_cpuset_allocator::print_hwloc_cpuset(hwloc_obj_t &obj) { std::cout << " Number: " << hwloc_bitmap_weight(obj->cpuset) << " cpuset_pu_ids: " << get_hwloc_cpuset_vector(obj); } -void tt_cpuset_allocator::print_hwloc_nodeset(hwloc_obj_t &obj){ - std::cout << " Number: " << hwloc_bitmap_weight(obj->nodeset) << " nodeset node_ids: " << get_hwloc_nodeset_vector(obj); +void tt_cpuset_allocator::print_hwloc_nodeset(hwloc_obj_t &obj) { + std::cout << " Number: " << hwloc_bitmap_weight(obj->nodeset) + << " nodeset node_ids: " << get_hwloc_nodeset_vector(obj); } -void tt_cpuset_allocator::print_hwloc_object(hwloc_obj_t &obj, int depth, bool verbose, bool show_cpuids){ - +void tt_cpuset_allocator::print_hwloc_object(hwloc_obj_t &obj, int depth, bool verbose, bool show_cpuids) { char type[32], attr[1024]; hwloc_obj_type_snprintf(type, sizeof(type), obj, verbose); - printf("%*s%s", 2*depth, "", type); - if (obj->os_index != (unsigned) -1) + printf("%*s%s", 2 * depth, "", type); + if (obj->os_index != (unsigned)-1) { printf("#%u", obj->os_index); + } hwloc_obj_attr_snprintf(attr, sizeof(attr), obj, " ", verbose); - if (*attr) + if (*attr) { printf("(%s)", attr); - if (show_cpuids && obj->cpuset) + } + if (show_cpuids && obj->cpuset) { print_hwloc_cpuset(obj); + } printf("\n"); } - } // namespace cpuset } // namespace tt - diff --git a/device/cpuset_lib.hpp b/device/cpuset_lib.hpp index a14a4f334..46994833c 100644 --- a/device/cpuset_lib.hpp +++ b/device/cpuset_lib.hpp @@ -4,18 +4,17 @@ * SPDX-License-Identifier: Apache-2.0 */ - #pragma once +#include + #include -#include -#include #include +#include #include -#include - -#include "device/tt_cluster_descriptor.h" // For chip_id_t +#include +#include "device/tt_cluster_descriptor.h" // For chip_id_t #include "hwloc.h" using tt_cluster_description = tt_ClusterDescriptor; @@ -27,90 +26,87 @@ namespace cpuset { // CPU ID allocator for pinning threads to cpu_ids 
// It's a singleton that should be retrieved via get() struct tt_cpuset_allocator { - public: - - tt_cpuset_allocator(tt_cpuset_allocator const&) = delete; - void operator=(tt_cpuset_allocator const&) = delete; - - // Bind an already allocated memory region to particular numa nodes - static bool bind_area_to_memory_nodeset(chip_id_t physical_device_id, const void * addr, size_t len){ - auto& instance = tt_cpuset_allocator::get(); - return instance.bind_area_memory_nodeset(physical_device_id, addr, len); - } - - static int get_num_tt_pci_devices(){ - auto& instance = tt_cpuset_allocator::get(); - return instance._get_num_tt_pci_devices(); - } - - static int get_num_tt_pci_devices_by_pci_device_id(uint16_t device_id, uint16_t revision_id){ - auto& instance = tt_cpuset_allocator::get(); - return instance._get_num_tt_pci_devices_by_pci_device_id(device_id, revision_id); - } - - private: - - static tt_cpuset_allocator& get() { - static tt_cpuset_allocator instance; - return instance; - } - - tt_cpuset_allocator(); - - int TENSTORRENT_VENDOR_ID = 0x1e52; - - bool bind_area_memory_nodeset(chip_id_t physical_device_id, const void * addr, size_t len); - int _get_num_tt_pci_devices(); - int _get_num_tt_pci_devices_by_pci_device_id(uint16_t device_id, uint16_t revision_id); - - // Series of init functions, must be called in this order. Seperated out to support - // early exit in case of errors. 
- bool init_topology_init_and_load(); - bool init_find_tt_pci_devices_packages_numanodes(); - bool init_get_number_of_packages(); - bool init_is_cpu_model_supported(); - bool init_determine_cpuset_allocations(); - - // Helper Functions - std::string get_pci_bus_id(hwloc_obj_t pci_device_obj); - int get_package_id_from_device(hwloc_obj_t pci_device_obj, chip_id_t physical_device_id); - hwloc_nodeset_t get_numa_nodeset_from_device(hwloc_obj_t pci_device_obj, chip_id_t physical_device_id); - - // Debug Functions - void print_hwloc_cpuset(hwloc_obj_t &obj); - void print_hwloc_nodeset(hwloc_obj_t &obj); - void print_hwloc_object(hwloc_obj_t &obj, int depth = 0, bool verbose = false, bool show_cpuids = true); - std::vector get_hwloc_bitmap_vector(hwloc_bitmap_t &bitmap); - std::vector get_hwloc_cpuset_vector(hwloc_obj_t &obj); - std::vector get_hwloc_nodeset_vector(hwloc_obj_t &obj); - hwloc_topology_t m_topology; - bool m_debug; - pid_t m_pid; - - // Items calculated by parsing system info, used by allocation algorithm: - std::map> m_package_id_to_devices_map; - std::map m_physical_device_id_to_pci_bus_id_map; // Debug/Info - std::map, int> m_num_tt_device_by_pci_device_id_map; - - std::map> m_physical_device_id_to_cpusets_map; - std::map m_physical_device_id_to_package_id_map; - - bool m_enable_cpuset_allocator = true; // Enable feature, otherwise do nothing. - int m_num_packages = 0; - std::vector m_all_tt_devices = {}; - - hwloc_obj_type_t m_object_per_alloc_slot = HWLOC_OBJ_L3CACHE; // Default +public: + tt_cpuset_allocator(tt_cpuset_allocator const &) = delete; + void operator=(tt_cpuset_allocator const &) = delete; + + // Bind an already allocated memory region to particular numa nodes + static bool bind_area_to_memory_nodeset(chip_id_t physical_device_id, const void *addr, size_t len) { + auto &instance = tt_cpuset_allocator::get(); + return instance.bind_area_memory_nodeset(physical_device_id, addr, len); + } - // For 2CCX-PER-CCD Optimization detection. 
- std::map m_package_id_to_num_l3_per_ccx_map; - std::map m_package_id_to_num_ccx_per_ccd_map; + static int get_num_tt_pci_devices() { + auto &instance = tt_cpuset_allocator::get(); + return instance._get_num_tt_pci_devices(); + } - // Memory Binding - std::map m_physical_device_id_to_numa_nodeset_map; + static int get_num_tt_pci_devices_by_pci_device_id(uint16_t device_id, uint16_t revision_id) { + auto &instance = tt_cpuset_allocator::get(); + return instance._get_num_tt_pci_devices_by_pci_device_id(device_id, revision_id); + } - // Helper for some dynamic multi-threading. - std::map m_num_cpu_cores_allocated_per_tt_device; +private: + static tt_cpuset_allocator &get() { + static tt_cpuset_allocator instance; + return instance; + } + tt_cpuset_allocator(); + + int TENSTORRENT_VENDOR_ID = 0x1e52; + + bool bind_area_memory_nodeset(chip_id_t physical_device_id, const void *addr, size_t len); + int _get_num_tt_pci_devices(); + int _get_num_tt_pci_devices_by_pci_device_id(uint16_t device_id, uint16_t revision_id); + + // Series of init functions, must be called in this order. Seperated out to support + // early exit in case of errors. 
+ bool init_topology_init_and_load(); + bool init_find_tt_pci_devices_packages_numanodes(); + bool init_get_number_of_packages(); + bool init_is_cpu_model_supported(); + bool init_determine_cpuset_allocations(); + + // Helper Functions + std::string get_pci_bus_id(hwloc_obj_t pci_device_obj); + int get_package_id_from_device(hwloc_obj_t pci_device_obj, chip_id_t physical_device_id); + hwloc_nodeset_t get_numa_nodeset_from_device(hwloc_obj_t pci_device_obj, chip_id_t physical_device_id); + + // Debug Functions + void print_hwloc_cpuset(hwloc_obj_t &obj); + void print_hwloc_nodeset(hwloc_obj_t &obj); + void print_hwloc_object(hwloc_obj_t &obj, int depth = 0, bool verbose = false, bool show_cpuids = true); + std::vector get_hwloc_bitmap_vector(hwloc_bitmap_t &bitmap); + std::vector get_hwloc_cpuset_vector(hwloc_obj_t &obj); + std::vector get_hwloc_nodeset_vector(hwloc_obj_t &obj); + hwloc_topology_t m_topology; + bool m_debug; + pid_t m_pid; + + // Items calculated by parsing system info, used by allocation algorithm: + std::map> m_package_id_to_devices_map; + std::map m_physical_device_id_to_pci_bus_id_map; // Debug/Info + std::map, int> m_num_tt_device_by_pci_device_id_map; + + std::map> m_physical_device_id_to_cpusets_map; + std::map m_physical_device_id_to_package_id_map; + + bool m_enable_cpuset_allocator = true; // Enable feature, otherwise do nothing. + int m_num_packages = 0; + std::vector m_all_tt_devices = {}; + + hwloc_obj_type_t m_object_per_alloc_slot = HWLOC_OBJ_L3CACHE; // Default + + // For 2CCX-PER-CCD Optimization detection. + std::map m_package_id_to_num_l3_per_ccx_map; + std::map m_package_id_to_num_ccx_per_ccd_map; + + // Memory Binding + std::map m_physical_device_id_to_numa_nodeset_map; + + // Helper for some dynamic multi-threading. 
+ std::map m_num_cpu_cores_allocated_per_tt_device; }; template diff --git a/device/driver_atomics.h b/device/driver_atomics.h index ec2134388..4ed3e7a63 100644 --- a/device/driver_atomics.h +++ b/device/driver_atomics.h @@ -12,54 +12,44 @@ namespace tt_driver_atomics { #if defined(__x86_64__) || defined(__i386__) // Store-Any barrier. -static inline __attribute__((always_inline)) void sfence() { - _mm_sfence(); -} +static inline __attribute__((always_inline)) void sfence() { _mm_sfence(); } + // Load-Any barrier. -static inline __attribute__((always_inline)) void lfence() { - _mm_lfence(); -} +static inline __attribute__((always_inline)) void lfence() { _mm_lfence(); } + // Any-Any barrier. -static inline __attribute__((always_inline)) void mfence() { - _mm_mfence(); -} +static inline __attribute__((always_inline)) void mfence() { _mm_mfence(); } #elif defined(__ARM_ARCH) static inline __attribute__((always_inline)) void sfence() { // Full memory barrier (full system). ARM does not have a Store-Any barrier. // https://developer.arm.com/documentation/100941/0101/Barriers - asm volatile ("DMB SY" : : : "memory"); + asm volatile("DMB SY" : : : "memory"); } static inline __attribute__((always_inline)) void lfence() { // Load-Any barrier (full system) // https://developer.arm.com/documentation/100941/0101/Barriers - asm volatile ("DMB LD" : : : "memory"); + asm volatile("DMB LD" : : : "memory"); } static inline __attribute__((always_inline)) void mfence() { // Full memory barrier (full system). 
// https://developer.arm.com/documentation/100941/0101/Barriers - asm volatile ("DMB SY" : : : "memory"); + asm volatile("DMB SY" : : : "memory"); } #elif defined(__riscv) -static inline __attribute__((always_inline)) void sfence() { - asm volatile ("fence ow, ow" : : : "memory"); -} +static inline __attribute__((always_inline)) void sfence() { asm volatile("fence ow, ow" : : : "memory"); } -static inline __attribute__((always_inline)) void lfence() { - asm volatile ("fence ir, ir" : : : "memory"); -} +static inline __attribute__((always_inline)) void lfence() { asm volatile("fence ir, ir" : : : "memory"); } -static inline __attribute__((always_inline)) void mfence() { - asm volatile ("fence iorw, iorw" : : : "memory"); -} +static inline __attribute__((always_inline)) void mfence() { asm volatile("fence iorw, iorw" : : : "memory"); } #else #error "Unsupported architecture" #endif -} // namespace tt_driver_atomics +} // namespace tt_driver_atomics diff --git a/device/grayskull/grayskull_coordinate_manager.h b/device/grayskull/grayskull_coordinate_manager.h index ac6ee60d0..ba7eebe28 100644 --- a/device/grayskull/grayskull_coordinate_manager.h +++ b/device/grayskull/grayskull_coordinate_manager.h @@ -9,8 +9,8 @@ #include "device/coordinate_manager.h" class GrayskullCoordinateManager : public CoordinateManager { - public: - GrayskullCoordinateManager(const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) - : CoordinateManager(worker_grid_size, workers, harvesting_mask) {} + GrayskullCoordinateManager( + const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) : + CoordinateManager(worker_grid_size, workers, harvesting_mask) {} }; diff --git a/device/grayskull/grayskull_implementation.cpp b/device/grayskull/grayskull_implementation.cpp index 7cdb72e57..415de2298 100644 --- a/device/grayskull/grayskull_implementation.cpp +++ b/device/grayskull/grayskull_implementation.cpp @@ -4,13 +4,12 @@ 
#include "grayskull_implementation.h" -#include "src/firmware/riscv/grayskull/host_mem_address_map.h" -#include "src/firmware/riscv/grayskull/eth_interface.h" - #include "device/cluster.h" +#include "src/firmware/riscv/grayskull/eth_interface.h" +#include "src/firmware/riscv/grayskull/host_mem_address_map.h" -constexpr std::uint32_t NOC_ADDR_LOCAL_BITS = 32; // source: noc_parameters.h, unique for GS -constexpr std::uint32_t NOC_ADDR_NODE_ID_BITS = 6; // source: noc_parameters.h, common for GS && WH && BH +constexpr std::uint32_t NOC_ADDR_LOCAL_BITS = 32; // source: noc_parameters.h, unique for GS +constexpr std::uint32_t NOC_ADDR_NODE_ID_BITS = 6; // source: noc_parameters.h, common for GS && WH && BH namespace tt::umd { @@ -90,7 +89,9 @@ std::pair grayskull_implementation::get_tlb_data( } tt_driver_host_address_params grayskull_implementation::get_host_address_params() const { - return {::grayskull::host_mem::address_map::ETH_ROUTING_BLOCK_SIZE, ::grayskull::host_mem::address_map::ETH_ROUTING_BUFFERS_START}; + return { + ::grayskull::host_mem::address_map::ETH_ROUTING_BLOCK_SIZE, + ::grayskull::host_mem::address_map::ETH_ROUTING_BUFFERS_START}; } tt_driver_eth_interface_params grayskull_implementation::get_eth_interface_params() const { diff --git a/device/grayskull/grayskull_implementation.h b/device/grayskull/grayskull_implementation.h index 3f385801a..5984710c7 100644 --- a/device/grayskull/grayskull_implementation.h +++ b/device/grayskull/grayskull_implementation.h @@ -104,7 +104,8 @@ enum class arc_message_type { }; // DEVICE_DATA -static const std::array DRAM_LOCATIONS = {{{1, 6}, {4, 6}, {7, 6}, {10, 6}, {1, 0}, {4, 0}, {7, 0}, {10, 0}}}; +static const std::array DRAM_LOCATIONS = { + {{1, 6}, {4, 6}, {7, 6}, {10, 6}, {1, 0}, {4, 0}, {7, 0}, {10, 0}}}; static const std::array ARC_LOCATIONS = {{{0, 2}}}; static const std::array PCI_LOCATIONS = {{{0, 4}}}; static const std::array ETH_LOCATIONS = {}; @@ -134,7 +135,8 @@ static constexpr uint32_t 
STATIC_TLB_CFG_ADDR = 0x1fc00000; static constexpr uint32_t TLB_CFG_REG_SIZE_BYTES = 8; static constexpr uint32_t DYNAMIC_TLB_16M_SIZE = 16 * 1024 * 1024; -static constexpr uint32_t DYNAMIC_TLB_16M_CFG_ADDR = STATIC_TLB_CFG_ADDR + (TLB_BASE_INDEX_16M * TLB_CFG_REG_SIZE_BYTES); +static constexpr uint32_t DYNAMIC_TLB_16M_CFG_ADDR = + STATIC_TLB_CFG_ADDR + (TLB_BASE_INDEX_16M * TLB_CFG_REG_SIZE_BYTES); static constexpr uint32_t DYNAMIC_TLB_16M_BASE = TLB_BASE_16M; static constexpr uint32_t DYNAMIC_TLB_2M_SIZE = 2 * 1024 * 1024; @@ -171,59 +173,93 @@ static constexpr uint32_t TENSIX_SOFT_RESET_ADDR = 0xFFB121B0; } // namespace grayskull class grayskull_implementation : public architecture_implementation { - public: +public: tt::ARCH get_architecture() const override { return tt::ARCH::GRAYSKULL; } + uint32_t get_arc_message_arc_get_harvesting() const override { return static_cast(grayskull::arc_message_type::ARC_GET_HARVESTING); } + uint32_t get_arc_message_arc_go_busy() const override { return static_cast(grayskull::arc_message_type::ARC_GO_BUSY); } + uint32_t get_arc_message_arc_go_long_idle() const override { return static_cast(grayskull::arc_message_type::ARC_GO_LONG_IDLE); } + uint32_t get_arc_message_arc_go_short_idle() const override { return static_cast(grayskull::arc_message_type::ARC_GO_SHORT_IDLE); } + uint32_t get_arc_message_deassert_riscv_reset() const override { return static_cast(grayskull::arc_message_type::DEASSERT_RISCV_RESET); } + uint32_t get_arc_message_get_aiclk() const override { return static_cast(grayskull::arc_message_type::GET_AICLK); } + uint32_t get_arc_message_setup_iatu_for_peer_to_peer() const override { return static_cast(grayskull::arc_message_type::SETUP_IATU_FOR_PEER_TO_PEER); } + uint32_t get_arc_message_test() const override { return static_cast(grayskull::arc_message_type::TEST); } + uint32_t get_arc_csm_mailbox_offset() const override { return grayskull::ARC_CSM_MAILBOX_OFFSET; } + uint32_t get_arc_reset_arc_misc_cntl_offset() 
const override { return grayskull::ARC_RESET_ARC_MISC_CNTL_OFFSET; } + uint32_t get_arc_reset_scratch_offset() const override { return grayskull::ARC_RESET_SCRATCH_OFFSET; } + uint32_t get_dram_channel_0_peer2peer_region_start() const override { return grayskull::DRAM_CHANNEL_0_PEER2PEER_REGION_START; } + uint32_t get_dram_channel_0_x() const override { return grayskull::DRAM_CHANNEL_0_X; } + uint32_t get_dram_channel_0_y() const override { return grayskull::DRAM_CHANNEL_0_Y; } + uint32_t get_broadcast_tlb_index() const override { return grayskull::BROADCAST_TLB_INDEX; } + uint32_t get_dynamic_tlb_2m_base() const override { return grayskull::DYNAMIC_TLB_2M_BASE; } + uint32_t get_dynamic_tlb_2m_size() const override { return grayskull::DYNAMIC_TLB_2M_SIZE; } + uint32_t get_dynamic_tlb_16m_base() const override { return grayskull::DYNAMIC_TLB_16M_BASE; } + uint32_t get_dynamic_tlb_16m_size() const override { return grayskull::DYNAMIC_TLB_16M_SIZE; } + uint32_t get_dynamic_tlb_16m_cfg_addr() const override { return grayskull::DYNAMIC_TLB_16M_CFG_ADDR; } + uint32_t get_mem_large_read_tlb() const override { return grayskull::MEM_LARGE_READ_TLB; } + uint32_t get_mem_large_write_tlb() const override { return grayskull::MEM_LARGE_WRITE_TLB; } + uint32_t get_static_tlb_cfg_addr() const override { return grayskull::STATIC_TLB_CFG_ADDR; } + uint32_t get_static_tlb_size() const override { return grayskull::STATIC_TLB_SIZE; } + uint32_t get_reg_tlb() const override { return grayskull::REG_TLB; } + uint32_t get_tlb_base_index_16m() const override { return grayskull::TLB_BASE_INDEX_16M; } + uint32_t get_tensix_soft_reset_addr() const override { return grayskull::TENSIX_SOFT_RESET_ADDR; } + uint32_t get_grid_size_x() const override { return grayskull::GRID_SIZE_X; } + uint32_t get_grid_size_y() const override { return grayskull::GRID_SIZE_Y; } + uint32_t get_tlb_cfg_reg_size_bytes() const override { return grayskull::TLB_CFG_REG_SIZE_BYTES; } + uint32_t get_small_read_write_tlb() 
const override { return grayskull::MEM_SMALL_READ_WRITE_TLB; } + const std::vector& get_harvesting_noc_locations() const override { return grayskull::HARVESTING_NOC_LOCATIONS; } + const std::vector& get_t6_x_locations() const override { return grayskull::T6_X_LOCATIONS; } + const std::vector& get_t6_y_locations() const override { return grayskull::T6_Y_LOCATIONS; } std::tuple multicast_workaround(xy_pair start, xy_pair end) const override; @@ -234,7 +270,6 @@ class grayskull_implementation : public architecture_implementation { tt_driver_host_address_params get_host_address_params() const override; tt_driver_eth_interface_params get_eth_interface_params() const override; tt_driver_noc_params get_noc_params() const override; - }; } // namespace tt::umd diff --git a/device/hugepage.cpp b/device/hugepage.cpp index b4695c217..9010ed37d 100644 --- a/device/hugepage.cpp +++ b/device/hugepage.cpp @@ -6,8 +6,8 @@ #include "hugepage.h" -#include // for umask -#include // for O_RDWR and other constants +#include // for O_RDWR and other constants +#include // for umask #include "common/logger.hpp" #include "device/cpuset_lib.hpp" @@ -20,13 +20,12 @@ std::string hugepage_dir = hugepage_dir_env ? 
hugepage_dir_env : "/dev/hugepages namespace tt::umd { -uint32_t get_num_hugepages(){ - +uint32_t get_num_hugepages() { std::string nr_hugepages_path = "/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages"; std::ifstream hugepages_file(nr_hugepages_path); uint32_t num_hugepages = 0; - if(hugepages_file.is_open()) { + if (hugepages_file.is_open()) { std::string value; std::getline(hugepages_file, value); num_hugepages = std::stoi(value); @@ -36,100 +35,121 @@ uint32_t get_num_hugepages(){ } return num_hugepages; - } -uint32_t get_available_num_host_mem_channels(const uint32_t num_channels_per_device_target, const uint16_t device_id, const uint16_t revision_id) { - +uint32_t get_available_num_host_mem_channels( + const uint32_t num_channels_per_device_target, const uint16_t device_id, const uint16_t revision_id) { // To minimally support hybrid dev systems with mix of ARCH, get only devices matching current ARCH's device_id. - uint32_t total_num_tt_mmio_devices = tt::cpuset::tt_cpuset_allocator::get_num_tt_pci_devices(); - uint32_t num_tt_mmio_devices_for_arch = tt::cpuset::tt_cpuset_allocator::get_num_tt_pci_devices_by_pci_device_id(device_id, revision_id); - uint32_t total_hugepages = get_num_hugepages(); + uint32_t total_num_tt_mmio_devices = tt::cpuset::tt_cpuset_allocator::get_num_tt_pci_devices(); + uint32_t num_tt_mmio_devices_for_arch = + tt::cpuset::tt_cpuset_allocator::get_num_tt_pci_devices_by_pci_device_id(device_id, revision_id); + uint32_t total_hugepages = get_num_hugepages(); // This shouldn't happen on silicon machines. if (num_tt_mmio_devices_for_arch == 0) { - log_warning(LogSiliconDriver, + log_warning( + LogSiliconDriver, "No TT devices found that match PCI device_id: 0x{:x} revision: {}, returning NumHostMemChannels:0", - device_id, revision_id); + device_id, + revision_id); return 0; } - // GS will use P2P + 1 channel, others may support 4 host channels. 
Apply min of 1 to not completely break setups that were incomplete - // ie fewer hugepages than devices, which would partially work previously for some devices. - uint32_t num_channels_per_device_available = std::min(num_channels_per_device_target, std::max((uint32_t) 1, total_hugepages / num_tt_mmio_devices_for_arch)); + // GS will use P2P + 1 channel, others may support 4 host channels. Apply min of 1 to not completely break setups + // that were incomplete ie fewer hugepages than devices, which would partially work previously for some devices. + uint32_t num_channels_per_device_available = + std::min(num_channels_per_device_target, std::max((uint32_t)1, total_hugepages / num_tt_mmio_devices_for_arch)); - // Perform some helpful assertion checks to guard against common pitfalls that would show up as runtime issues later on. + // Perform some helpful assertion checks to guard against common pitfalls that would show up as runtime issues later + // on. if (total_num_tt_mmio_devices > num_tt_mmio_devices_for_arch) { - log_warning(LogSiliconDriver, - "Hybrid system mixing different TTDevices - this is not well supported. Ensure sufficient Hugepages/HostMemChannels per device."); + log_warning( + LogSiliconDriver, + "Hybrid system mixing different TTDevices - this is not well supported. Ensure sufficient " + "Hugepages/HostMemChannels per device."); } if (total_hugepages < num_tt_mmio_devices_for_arch) { - log_warning(LogSiliconDriver, - "Insufficient NumHugepages: {} should be at least NumMMIODevices: {} for device_id: 0x{:x} revision: {}. NumHostMemChannels would be 0, bumping to 1.", - total_hugepages, num_tt_mmio_devices_for_arch, device_id, revision_id); + log_warning( + LogSiliconDriver, + "Insufficient NumHugepages: {} should be at least NumMMIODevices: {} for device_id: 0x{:x} revision: {}. 
" + "NumHostMemChannels would be 0, bumping to 1.", + total_hugepages, + num_tt_mmio_devices_for_arch, + device_id, + revision_id); } if (num_channels_per_device_available < num_channels_per_device_target) { - log_warning(LogSiliconDriver, - "NumHostMemChannels: {} used for device_id: 0x{:x} less than target: {}. Workload will fail if it exceeds NumHostMemChannels. Increase Number of Hugepages.", - num_channels_per_device_available, device_id, num_channels_per_device_target); + log_warning( + LogSiliconDriver, + "NumHostMemChannels: {} used for device_id: 0x{:x} less than target: {}. Workload will fail if it exceeds " + "NumHostMemChannels. Increase Number of Hugepages.", + num_channels_per_device_available, + device_id, + num_channels_per_device_target); } - log_assert(num_channels_per_device_available <= g_MAX_HOST_MEM_CHANNELS, + log_assert( + num_channels_per_device_available <= g_MAX_HOST_MEM_CHANNELS, "NumHostMemChannels: {} exceeds supported maximum: {}, this is unexpected.", - num_channels_per_device_available, g_MAX_HOST_MEM_CHANNELS); + num_channels_per_device_available, + g_MAX_HOST_MEM_CHANNELS); return num_channels_per_device_available; - } -std::string find_hugepage_dir(std::size_t pagesize) -{ - - static const std::regex hugetlbfs_mount_re(fmt::format("^(nodev|hugetlbfs) ({}) hugetlbfs ([^ ]+) 0 0$", hugepage_dir)); +std::string find_hugepage_dir(std::size_t pagesize) { + static const std::regex hugetlbfs_mount_re( + fmt::format("^(nodev|hugetlbfs) ({}) hugetlbfs ([^ ]+) 0 0$", hugepage_dir)); static const std::regex pagesize_re("(?:^|,)pagesize=([0-9]+)([KMGT])(?:,|$)"); std::ifstream proc_mounts("/proc/mounts"); - for (std::string line; std::getline(proc_mounts, line); ) - { - if (std::smatch mount_match; std::regex_match(line, mount_match, hugetlbfs_mount_re)) - { + for (std::string line; std::getline(proc_mounts, line);) { + if (std::smatch mount_match; std::regex_match(line, mount_match, hugetlbfs_mount_re)) { std::string options = 
mount_match[3]; - if (std::smatch pagesize_match; std::regex_search(options, pagesize_match, pagesize_re)) - { + if (std::smatch pagesize_match; std::regex_search(options, pagesize_match, pagesize_re)) { std::size_t mount_page_size = std::stoull(pagesize_match[1]); - switch (pagesize_match[2].str()[0]) - { - case 'T': mount_page_size <<= 10; - case 'G': mount_page_size <<= 10; - case 'M': mount_page_size <<= 10; - case 'K': mount_page_size <<= 10; + switch (pagesize_match[2].str()[0]) { + case 'T': + mount_page_size <<= 10; + case 'G': + mount_page_size <<= 10; + case 'M': + mount_page_size <<= 10; + case 'K': + mount_page_size <<= 10; } - if (mount_page_size == pagesize) - { + if (mount_page_size == pagesize) { return mount_match[2]; } } } } - log_warning(LogSiliconDriver, "ttSiliconDevice::find_hugepage_dir: no huge page mount found in /proc/mounts for path: {} with hugepage_size: {}.", hugepage_dir, pagesize); + log_warning( + LogSiliconDriver, + "ttSiliconDevice::find_hugepage_dir: no huge page mount found in /proc/mounts for path: {} with hugepage_size: " + "{}.", + hugepage_dir, + pagesize); return std::string(); } -int open_hugepage_file(const std::string &dir, chip_id_t physical_device_id, uint16_t channel) { +int open_hugepage_file(const std::string& dir, chip_id_t physical_device_id, uint16_t channel) { std::vector filename; static const char pipeline_name[] = "tenstorrent"; filename.insert(filename.end(), dir.begin(), dir.end()); - if (filename.back() != '/') filename.push_back('/'); + if (filename.back() != '/') { + filename.push_back('/'); + } // In order to limit number of hugepages while transition from shared hugepage (1 per system) to unique // hugepage per device, will share original/shared hugepage filename with physical device 0. 
- if (physical_device_id != 0 || channel != 0){ + if (physical_device_id != 0 || channel != 0) { std::string device_id_str = fmt::format("device_{}_", physical_device_id); filename.insert(filename.end(), device_id_str.begin(), device_id_str.end()); } @@ -139,20 +159,32 @@ int open_hugepage_file(const std::string &dir, chip_id_t physical_device_id, uin filename.insert(filename.end(), channel_id_str.begin(), channel_id_str.end()); } - filename.insert(filename.end(), std::begin(pipeline_name), std::end(pipeline_name)); // includes NUL terminator + filename.insert(filename.end(), std::begin(pipeline_name), std::end(pipeline_name)); // includes NUL terminator std::string filename_str(filename.begin(), filename.end()); - filename_str.erase(std::find(filename_str.begin(), filename_str.end(), '\0'), filename_str.end()); // Erase NULL terminator for printing. - log_debug(LogSiliconDriver, "ttSiliconDevice::open_hugepage_file: using filename: {} for physical_device_id: {} channel: {}", filename_str.c_str(), physical_device_id, channel); + filename_str.erase( + std::find(filename_str.begin(), filename_str.end(), '\0'), + filename_str.end()); // Erase NULL terminator for printing. + log_debug( + LogSiliconDriver, + "ttSiliconDevice::open_hugepage_file: using filename: {} for physical_device_id: {} channel: {}", + filename_str.c_str(), + physical_device_id, + channel); // Save original and set umask to unrestricted. 
auto old_umask = umask(0); - int fd = open(filename.data(), O_RDWR | O_CREAT | O_CLOEXEC, S_IWUSR | S_IRUSR | S_IWGRP | S_IRGRP | S_IWOTH | S_IROTH ); + int fd = + open(filename.data(), O_RDWR | O_CREAT | O_CLOEXEC, S_IWUSR | S_IRUSR | S_IWGRP | S_IRGRP | S_IWOTH | S_IROTH); if (fd == -1 && errno == EACCES) { - log_warning(LogSiliconDriver, "ttSiliconDevice::open_hugepage_file could not open filename: {} on first try, unlinking it and retrying.", filename_str); + log_warning( + LogSiliconDriver, + "ttSiliconDevice::open_hugepage_file could not open filename: {} on first try, unlinking it and retrying.", + filename_str); unlink(filename.data()); - fd = open(filename.data(), O_RDWR | O_CREAT | O_CLOEXEC, S_IWUSR | S_IRUSR | S_IWGRP | S_IRGRP | S_IWOTH | S_IROTH ); + fd = open( + filename.data(), O_RDWR | O_CREAT | O_CLOEXEC, S_IWUSR | S_IRUSR | S_IWGRP | S_IRGRP | S_IWOTH | S_IROTH); } // Restore original mask @@ -166,4 +198,4 @@ int open_hugepage_file(const std::string &dir, chip_id_t physical_device_id, uin return fd; } -} // namespace tt::umd +} // namespace tt::umd diff --git a/device/hugepage.h b/device/hugepage.h index d12b1396c..6aa1838ba 100644 --- a/device/hugepage.h +++ b/device/hugepage.h @@ -6,10 +6,10 @@ #pragma once -#include "device/tt_cluster_descriptor_types.h" - -#include #include +#include + +#include "device/tt_cluster_descriptor_types.h" namespace tt::umd { @@ -17,7 +17,8 @@ namespace tt::umd { uint32_t get_num_hugepages(); // Dynamically figure out how many host memory channels (based on hugepages installed) for each device, based on arch. 
-uint32_t get_available_num_host_mem_channels(const uint32_t num_channels_per_device_target, const uint16_t device_id, const uint16_t revision_id); +uint32_t get_available_num_host_mem_channels( + const uint32_t num_channels_per_device_target, const uint16_t device_id, const uint16_t revision_id); // Looks for hugetlbfs inside /proc/mounts matching desired pagesize (typically 1G) std::string find_hugepage_dir(std::size_t pagesize); @@ -27,4 +28,4 @@ std::string find_hugepage_dir(std::size_t pagesize); // Today we assume there's only one pipeline running within the system. // One hugepage per device such that each device gets unique memory. int open_hugepage_file(const std::string &dir, chip_id_t physical_device_id, uint16_t channel); -} +} // namespace tt::umd diff --git a/device/ioctl.h b/device/ioctl.h index 60ec7b2f3..1f732cfce 100644 --- a/device/ioctl.h +++ b/device/ioctl.h @@ -4,6 +4,9 @@ * SPDX-License-Identifier: Apache-2.0 */ +// clang-format off +// This file is copied from KMD, so we don't want clang formatting diff. 
+ #ifndef TTDRIVER_IOCTL_H_INCLUDED #define TTDRIVER_IOCTL_H_INCLUDED @@ -155,3 +158,4 @@ struct tenstorrent_pin_pages { }; #endif +// clang-format on diff --git a/device/mockup/tt_mockup_device.hpp b/device/mockup/tt_mockup_device.hpp index e6085b396..ef1e7e5af 100644 --- a/device/mockup/tt_mockup_device.hpp +++ b/device/mockup/tt_mockup_device.hpp @@ -9,31 +9,42 @@ #include #include -#include "device/tt_cluster_descriptor.h" #include "device/cluster.h" +#include "device/tt_cluster_descriptor.h" class tt_MockupDevice : public tt_device { - public: +public: tt_MockupDevice(const std::string& sdesc_path) : tt_device() { soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path)); std::set target_devices = {0}; } + virtual ~tt_MockupDevice() {} // Setup/Teardown Functions virtual std::unordered_map& get_virtual_soc_descriptors() override { return soc_descriptor_per_chip; } + void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) override {} + void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_) override {} + void set_driver_host_address_params(const tt_driver_host_address_params& host_address_params_) override {} - void set_driver_eth_interface_params( - const tt_driver_eth_interface_params& eth_interface_params_) override {} + + void set_driver_eth_interface_params(const tt_driver_eth_interface_params& eth_interface_params_) override {} + void start_device(const tt_device_params& device_params) override {} + void assert_risc_reset() override {} + void deassert_risc_reset() override {} - void deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets = TENSIX_DEASSERT_SOFT_RESET) override {} + + void deassert_risc_reset_at_core( + tt_cxy_pair core, const TensixSoftResetOptions& soft_resets = TENSIX_DEASSERT_SOFT_RESET) override {} + void assert_risc_reset_at_core(tt_cxy_pair core) override {} + void close_device() override {} // Runtime Functions @@ -43,10 
+54,13 @@ class tt_MockupDevice : public tt_device { tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) override {} + void read_from_device( void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) override {} + void write_to_sysmem( const void* mem_ptr, std::uint32_t size, uint64_t addr, uint16_t channel, chip_id_t src_device_id) override {} + void read_from_sysmem( void* mem_ptr, uint64_t addr, uint16_t channel, uint32_t size, chip_id_t src_device_id) override {} @@ -54,10 +68,12 @@ class tt_MockupDevice : public tt_device { const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}) override {} + void dram_membar( const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels = {}) override {} + void dram_membar( const chip_id_t chip, const std::string& fallback_tlb, @@ -66,27 +82,35 @@ class tt_MockupDevice : public tt_device { void wait_for_non_mmio_flush() override {} // Misc. 
Functions to Query/Set Device State - std::unordered_map get_harvesting_masks_for_soc_descriptors() override { - return {{0, 0}}; - } + std::unordered_map get_harvesting_masks_for_soc_descriptors() override { return {{0, 0}}; } + static std::vector detect_available_device_ids() { return {0}; }; + std::set get_target_remote_device_ids() override { return target_remote_chips; } + std::map get_clocks() override { return {{0, 0}}; } + void* host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const override { return nullptr; } + std::uint64_t get_pcie_base_addr_from_device(const chip_id_t chip_id) const override { return 0; } + std::uint32_t get_num_dram_channels(std::uint32_t device_id) override { return get_soc_descriptor(device_id).get_num_dram_channels(); }; + std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) override { return get_soc_descriptor(device_id).dram_bank_size; } + std::uint32_t get_num_host_channels(std::uint32_t device_id) override { return 1; } + std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) override { return 0; } + std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id) override { return 0; } - private: +private: std::vector archs_in_cluster = {}; std::set target_devices_in_cluster = {}; std::set target_remote_chips = {}; diff --git a/device/pcie/pci_device.cpp b/device/pcie/pci_device.cpp index 95baa27c1..23528e5a8 100644 --- a/device/pcie/pci_device.cpp +++ b/device/pcie/pci_device.cpp @@ -4,27 +4,27 @@ * SPDX-License-Identifier: Apache-2.0 */ +#include "pci_device.hpp" + +#include // for ::open +#include // for PCI_SLOT, PCI_FUNC +#include // for ioctl +#include // for mmap, munmap +#include // for fstat +#include // for ::close + #include -#include // for memcpy +#include // for memcpy #include -#include // for ::open -#include // for ::close -#include // for ioctl -#include // for mmap, munmap -#include // for fstat -#include // for 
PCI_SLOT, PCI_FUNC -#include "pci_device.hpp" -#include "ioctl.h" - -#include "ioctl.h" -#include "device/tt_arch_types.h" -#include "device/driver_atomics.h" +#include "common/assert.hpp" +#include "common/logger.hpp" #include "device/architecture_implementation.h" #include "device/cpuset_lib.hpp" +#include "device/driver_atomics.h" #include "device/hugepage.h" -#include "common/assert.hpp" -#include "common/logger.hpp" +#include "device/tt_arch_types.h" +#include "ioctl.h" static const uint16_t GS_PCIE_DEVICE_ID = 0xfaca; static const uint16_t WH_PCIE_DEVICE_ID = 0x401e; @@ -32,25 +32,29 @@ static const uint16_t BH_PCIE_DEVICE_ID = 0xb140; // TODO: we'll have to rethink this when KMD takes control of the inbound PCIe // TLB windows and there is no longer a pre-defined WC/UC split. -static const uint32_t GS_BAR0_WC_MAPPING_SIZE = (156<<20) + (10<<21) + (18<<24); +static const uint32_t GS_BAR0_WC_MAPPING_SIZE = (156 << 20) + (10 << 21) + (18 << 24); // Defines the address for WC region. 
addresses 0 to BH_BAR0_WC_MAPPING_SIZE are in WC, above that are UC -static const uint32_t BH_BAR0_WC_MAPPING_SIZE = 188<<21; +static const uint32_t BH_BAR0_WC_MAPPING_SIZE = 188 << 21; static const uint32_t BH_NOC_NODE_ID_OFFSET = 0x1FD04044; static const uint32_t GS_WH_ARC_SCRATCH_6_OFFSET = 0x1FF30078; // Hugepages must be 1GB in size -const uint32_t HUGEPAGE_REGION_SIZE = 1 << 30; // 1GB +const uint32_t HUGEPAGE_REGION_SIZE = 1 << 30; // 1GB using namespace tt; using namespace tt::umd; template static T read_sysfs(const PciDeviceInfo &device_info, const std::string &attribute_name) { - const auto sysfs_path = fmt::format("/sys/bus/pci/devices/{:04x}:{:02x}:{:02x}.{:x}/{}", - device_info.pci_domain, device_info.pci_bus, - device_info.pci_device, device_info.pci_function, attribute_name); + const auto sysfs_path = fmt::format( + "/sys/bus/pci/devices/{:04x}:{:02x}:{:02x}.{:x}/{}", + device_info.pci_domain, + device_info.pci_bus, + device_info.pci_device, + device_info.pci_function, + attribute_name); std::ifstream attribute_file(sysfs_path); std::string value_str; T value; @@ -75,8 +79,7 @@ static T read_sysfs(const PciDeviceInfo &device_info, const std::string &attribu return value; } -static PciDeviceInfo read_device_info(int fd) -{ +static PciDeviceInfo read_device_info(int fd) { tenstorrent_get_device_info info{}; info.in.output_size_bytes = sizeof(info.out); @@ -92,11 +95,11 @@ static PciDeviceInfo read_device_info(int fd) } static tt::ARCH detect_arch(uint32_t pcie_device_id, uint32_t pcie_revision_id) { - if (pcie_device_id == GS_PCIE_DEVICE_ID){ + if (pcie_device_id == GS_PCIE_DEVICE_ID) { return tt::ARCH::GRAYSKULL; - } else if (pcie_device_id == WH_PCIE_DEVICE_ID && pcie_revision_id == 0x01){ + } else if (pcie_device_id == WH_PCIE_DEVICE_ID && pcie_revision_id == 0x01) { return tt::ARCH::WORMHOLE_B0; - } else if (pcie_device_id == BH_PCIE_DEVICE_ID){ + } else if (pcie_device_id == BH_PCIE_DEVICE_ID) { return tt::ARCH::BLACKHOLE; } else { 
TT_THROW("Unknown pcie device id that does not match any known architecture: ", pcie_device_id); @@ -122,28 +125,29 @@ inline void memcpy_to_device(void *dest, const void *src, std::size_t num_bytes) if (dest_misalignment != 0) { // Read-modify-write for the first dest element. - dp = reinterpret_cast(dest_addr - dest_misalignment); + dp = reinterpret_cast(dest_addr - dest_misalignment); copy_t tmp = *dp; auto leading_len = std::min(sizeof(tmp) - dest_misalignment, num_bytes); - std::memcpy(reinterpret_cast(&tmp) + dest_misalignment, src, leading_len); + std::memcpy(reinterpret_cast(&tmp) + dest_misalignment, src, leading_len); num_bytes -= leading_len; src = static_cast(src) + leading_len; *dp++ = tmp; } else { - dp = static_cast(dest); + dp = static_cast(dest); } // Copy the destination-aligned middle. - const copy_t *sp = static_cast(src); + const copy_t *sp = static_cast(src); std::size_t num_words = num_bytes / sizeof(copy_t); - for (std::size_t i = 0; i < num_words; i++) + for (std::size_t i = 0; i < num_words; i++) { *dp++ = *sp++; + } // Finally copy any sub-word trailer, again RMW on the destination. auto trailing_len = num_bytes % sizeof(copy_t); @@ -166,7 +170,7 @@ inline void memcpy_from_device(void *dest, const void *src, std::size_t num_byte unsigned int src_misalignment = src_addr % sizeof(copy_t); if (src_misalignment != 0) { - sp = reinterpret_cast(src_addr - src_misalignment); + sp = reinterpret_cast(src_addr - src_misalignment); copy_t tmp = *sp++; @@ -176,15 +180,16 @@ inline void memcpy_from_device(void *dest, const void *src, std::size_t num_byte dest = static_cast(dest) + leading_len; } else { - sp = static_cast(src); + sp = static_cast(src); } // Copy the source-aligned middle. copy_t *dp = static_cast(dest); std::size_t num_words = num_bytes / sizeof(copy_t); - for (std::size_t i = 0; i < num_words; i++) + for (std::size_t i = 0; i < num_words; i++) { *dp++ = *sp++; + } // Finally copy any sub-word trailer. 
auto trailing_len = num_bytes % sizeof(copy_t); @@ -195,17 +200,16 @@ inline void memcpy_from_device(void *dest, const void *src, std::size_t num_byte } tt::ARCH PciDeviceInfo::get_arch() const { - if (this->device_id == GS_PCIE_DEVICE_ID){ + if (this->device_id == GS_PCIE_DEVICE_ID) { return tt::ARCH::GRAYSKULL; } else if (this->device_id == WH_PCIE_DEVICE_ID) { return tt::ARCH::WORMHOLE_B0; - } else if (this->device_id == BH_PCIE_DEVICE_ID){ + } else if (this->device_id == BH_PCIE_DEVICE_ID) { return tt::ARCH::BLACKHOLE; } return tt::ARCH::Invalid; } - /* static */ std::vector PCIDevice::enumerate_devices() { std::vector device_ids; std::string path = "/dev/tenstorrent/"; @@ -213,7 +217,7 @@ tt::ARCH PciDeviceInfo::get_arch() const { if (!std::filesystem::exists(path)) { return device_ids; } - for (const auto& entry : std::filesystem::directory_iterator(path)) { + for (const auto &entry : std::filesystem::directory_iterator(path)) { std::string filename = entry.path().filename().string(); // TODO: this will skip any device that has a non-numeric name, which @@ -237,28 +241,29 @@ tt::ARCH PciDeviceInfo::get_arch() const { try { infos[n] = read_device_info(fd); - } catch (...) {} + } catch (...) 
{ + } close(fd); } return infos; } -PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) - : device_path(fmt::format("/dev/tenstorrent/{}", pci_device_number)) - , pci_device_num(pci_device_number) - , logical_id(logical_device_id) - , pci_device_file_desc(open(device_path.c_str(), O_RDWR | O_CLOEXEC)) - , info(read_device_info(pci_device_file_desc)) - , numa_node(read_sysfs(info, "numa_node")) - , revision(read_sysfs(info, "revision")) - , arch(detect_arch(info.device_id, revision)) - , architecture_implementation(tt::umd::architecture_implementation::create(arch)) -{ +PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) : + device_path(fmt::format("/dev/tenstorrent/{}", pci_device_number)), + pci_device_num(pci_device_number), + logical_id(logical_device_id), + pci_device_file_desc(open(device_path.c_str(), O_RDWR | O_CLOEXEC)), + info(read_device_info(pci_device_file_desc)), + numa_node(read_sysfs(info, "numa_node")), + revision(read_sysfs(info, "revision")), + arch(detect_arch(info.device_id, revision)), + architecture_implementation(tt::umd::architecture_implementation::create(arch)) { struct { tenstorrent_query_mappings query_mappings; tenstorrent_mapping mapping_array[8]; } mappings; + memset(&mappings, 0, sizeof(mappings)); mappings.query_mappings.in.output_mapping_count = 8; @@ -302,7 +307,9 @@ PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) bar4_wc_mapping = mappings.mapping_array[i]; } - log_debug(LogSiliconDriver, "BAR mapping id {} base {} size {}", + log_debug( + LogSiliconDriver, + "BAR mapping id {} base {} size {}", mappings.mapping_array[i].mapping_id, (void *)mappings.mapping_array[i].mapping_base, mappings.mapping_array[i].mapping_size); @@ -317,7 +324,8 @@ PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) // Attempt WC mapping first so we can fall back to all-UC if it fails. 
if (bar0_wc_mapping.mapping_id == TENSTORRENT_MAPPING_RESOURCE0_WC) { bar0_wc_size = std::min(bar0_wc_mapping.mapping_size, wc_mapping_size); - bar0_wc = mmap(NULL, bar0_wc_size, PROT_READ | PROT_WRITE, MAP_SHARED, pci_device_file_desc, bar0_wc_mapping.mapping_base); + bar0_wc = mmap( + NULL, bar0_wc_size, PROT_READ | PROT_WRITE, MAP_SHARED, pci_device_file_desc, bar0_wc_mapping.mapping_base); if (bar0_wc == MAP_FAILED) { bar0_wc_size = 0; bar0_wc = nullptr; @@ -334,7 +342,13 @@ PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) bar0_uc_offset = 0; } - bar0_uc = mmap(NULL, bar0_uc_size, PROT_READ | PROT_WRITE, MAP_SHARED, pci_device_file_desc, bar0_uc_mapping.mapping_base + bar0_uc_offset); + bar0_uc = mmap( + NULL, + bar0_uc_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, + pci_device_file_desc, + bar0_uc_mapping.mapping_base + bar0_uc_offset); if (bar0_uc == MAP_FAILED) { throw std::runtime_error(fmt::format("BAR0 UC mapping failed for device {}.", pci_device_num)); @@ -351,22 +365,34 @@ PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) system_reg_mapping_size = bar4_uc_mapping.mapping_size; - system_reg_mapping = mmap(NULL, bar4_uc_mapping.mapping_size, PROT_READ | PROT_WRITE, MAP_SHARED, pci_device_file_desc, bar4_uc_mapping.mapping_base); + system_reg_mapping = mmap( + NULL, + bar4_uc_mapping.mapping_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, + pci_device_file_desc, + bar4_uc_mapping.mapping_base); if (system_reg_mapping == MAP_FAILED) { throw std::runtime_error(fmt::format("BAR4 UC mapping failed for device {}.", pci_device_num)); } - system_reg_start_offset = (512 - 16) * 1024*1024; - system_reg_offset_adjust = (512 - 32) * 1024*1024; - } else if(arch == tt::ARCH::BLACKHOLE) { + system_reg_start_offset = (512 - 16) * 1024 * 1024; + system_reg_offset_adjust = (512 - 32) * 1024 * 1024; + } else if (arch == tt::ARCH::BLACKHOLE) { if (bar2_uc_mapping.mapping_id != TENSTORRENT_MAPPING_RESOURCE1_UC) { throw 
std::runtime_error(fmt::format("Device {} has no BAR2 UC mapping.", pci_device_num)); } // Using UnCachable memory mode. This is used for accessing registers on Blackhole. bar2_uc_size = bar2_uc_mapping.mapping_size; - bar2_uc = mmap(NULL, bar2_uc_mapping.mapping_size, PROT_READ | PROT_WRITE, MAP_SHARED, pci_device_file_desc, bar2_uc_mapping.mapping_base); + bar2_uc = mmap( + NULL, + bar2_uc_mapping.mapping_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, + pci_device_file_desc, + bar2_uc_mapping.mapping_base); if (bar2_uc == MAP_FAILED) { throw std::runtime_error(fmt::format("BAR2 UC mapping failed for device {}.", pci_device_num)); @@ -379,7 +405,13 @@ PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) // Using Write-Combine memory mode. This is used for accessing DRAM on Blackhole. // WC doesn't guarantee write ordering but has better performance. bar4_wc_size = bar4_wc_mapping.mapping_size; - bar4_wc = mmap(NULL, bar4_wc_mapping.mapping_size, PROT_READ | PROT_WRITE, MAP_SHARED, pci_device_file_desc, bar4_wc_mapping.mapping_base); + bar4_wc = mmap( + NULL, + bar4_wc_mapping.mapping_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, + pci_device_file_desc, + bar4_wc_mapping.mapping_base); if (bar4_wc == MAP_FAILED) { throw std::runtime_error(fmt::format("BAR4 WC mapping failed for device {}.", pci_device_num)); @@ -391,7 +423,7 @@ PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) } PCIDevice::~PCIDevice() { - for (const auto& hugepage_mapping : hugepage_mapping_per_channel) { + for (const auto &hugepage_mapping : hugepage_mapping_per_channel) { if (hugepage_mapping.mapping) { munmap(hugepage_mapping.mapping, hugepage_mapping.mapping_size); } @@ -405,8 +437,8 @@ PCIDevice::~PCIDevice() { // essential for correctness then it needs to move to the driver. 
uint64_t iatu_index = 0; uint64_t iatu_base = UNROLL_ATU_OFFSET_BAR + iatu_index * 0x200; - uint32_t region_ctrl_2 = 0 << 31; // REGION_EN = 0 - write_regs(reinterpret_cast(static_cast(bar2_uc) + iatu_base + 0x04), ®ion_ctrl_2, 1); + uint32_t region_ctrl_2 = 0 << 31; // REGION_EN = 0 + write_regs(reinterpret_cast(static_cast(bar2_uc) + iatu_base + 0x04), ®ion_ctrl_2, 1); } close(pci_device_file_desc); @@ -432,8 +464,8 @@ PCIDevice::~PCIDevice() { } } -template -T* PCIDevice::get_register_address(uint32_t register_offset) { +template +T *PCIDevice::get_register_address(uint32_t register_offset) { // Right now, address can either be exposed register in BAR, or TLB window in BAR0 (BAR4 for Blackhole). // Should clarify this interface void *reg_mapping; @@ -446,10 +478,10 @@ T* PCIDevice::get_register_address(uint32_t register_offset) { register_offset -= bar0_uc_offset; reg_mapping = bar0_uc; } - return reinterpret_cast(static_cast(reg_mapping) + register_offset); + return reinterpret_cast(static_cast(reg_mapping) + register_offset); } -void PCIDevice::write_block(uint64_t byte_addr, uint64_t num_bytes, const uint8_t* buffer_addr) { +void PCIDevice::write_block(uint64_t byte_addr, uint64_t num_bytes, const uint8_t *buffer_addr) { void *dest = nullptr; if (bar4_wc != nullptr && byte_addr >= BAR0_BH_SIZE) { byte_addr -= BAR0_BH_SIZE; @@ -466,7 +498,7 @@ void PCIDevice::write_block(uint64_t byte_addr, uint64_t num_bytes, const uint8_ } } -void PCIDevice::read_block(uint64_t byte_addr, uint64_t num_bytes, uint8_t* buffer_addr) { +void PCIDevice::read_block(uint64_t byte_addr, uint64_t num_bytes, uint8_t *buffer_addr) { void *src = nullptr; if (bar4_wc != nullptr && byte_addr >= BAR0_BH_SIZE) { byte_addr -= BAR0_BH_SIZE; @@ -483,7 +515,7 @@ void PCIDevice::read_block(uint64_t byte_addr, uint64_t num_bytes, uint8_t* buff } if (num_bytes >= sizeof(std::uint32_t)) { - detect_hang_read(*reinterpret_cast(dest)); + detect_hang_read(*reinterpret_cast(dest)); } } @@ -496,14 
+528,14 @@ void PCIDevice::write_regs(volatile uint32_t *dest, const uint32_t *src, uint32_ void PCIDevice::write_regs(uint32_t byte_addr, uint32_t word_len, const void *data) { volatile uint32_t *dest = get_register_address(byte_addr); - const uint32_t *src = reinterpret_cast(data); + const uint32_t *src = reinterpret_cast(data); write_regs(dest, src, word_len); } void PCIDevice::read_regs(uint32_t byte_addr, uint32_t word_len, void *data) { const volatile uint32_t *src = get_register_address(byte_addr); - uint32_t *dest = reinterpret_cast(data); + uint32_t *dest = reinterpret_cast(data); while (word_len-- != 0) { uint32_t temp = *src++; @@ -511,29 +543,34 @@ void PCIDevice::read_regs(uint32_t byte_addr, uint32_t word_len, void *data) { } } -void PCIDevice::write_tlb_reg(uint32_t byte_addr, uint64_t value_lower, uint64_t value_upper, uint32_t tlb_cfg_reg_size){ - log_assert((tlb_cfg_reg_size == 8) or (tlb_cfg_reg_size == 12), "Tenstorrent hardware supports only 64bit or 96bit TLB config regs"); +void PCIDevice::write_tlb_reg( + uint32_t byte_addr, uint64_t value_lower, uint64_t value_upper, uint32_t tlb_cfg_reg_size) { + log_assert( + (tlb_cfg_reg_size == 8) or (tlb_cfg_reg_size == 12), + "Tenstorrent hardware supports only 64bit or 96bit TLB config regs"); volatile uint64_t *dest_qw = get_register_address(byte_addr); - volatile uint32_t *dest_extra_dw = get_register_address(byte_addr+8); + volatile uint32_t *dest_extra_dw = get_register_address(byte_addr + 8); #if defined(__ARM_ARCH) || defined(__riscv) // The store below goes through UC memory on x86, which has implicit ordering constraints with WC accesses. - // ARM has no concept of UC memory. This will not allow for implicit ordering of this store wrt other memory accesses. - // Insert an explicit full memory barrier for ARM. - // Do the same for RISC-V. + // ARM has no concept of UC memory. This will not allow for implicit ordering of this store wrt other memory + // accesses. 
Insert an explicit full memory barrier for ARM. Do the same for RISC-V. tt_driver_atomics::mfence(); #endif *dest_qw = value_lower; if (tlb_cfg_reg_size > 8) { - uint32_t* p_value_upper = reinterpret_cast(&value_upper); + uint32_t *p_value_upper = reinterpret_cast(&value_upper); *dest_extra_dw = p_value_upper[0]; } - tt_driver_atomics::mfence(); // Otherwise subsequent WC loads move earlier than the above UC store to the TLB register. + tt_driver_atomics::mfence(); // Otherwise subsequent WC loads move earlier than the above UC store to the TLB + // register. } bool PCIDevice::is_hardware_hung() { - volatile const void *addr = reinterpret_cast(bar0_uc) + (get_architecture_implementation()->get_arc_reset_scratch_offset() + 6 * 4) - bar0_uc_offset; - std::uint32_t scratch_data = *reinterpret_cast(addr); + volatile const void *addr = reinterpret_cast(bar0_uc) + + (get_architecture_implementation()->get_arc_reset_scratch_offset() + 6 * 4) - + bar0_uc_offset; + std::uint32_t scratch_data = *reinterpret_cast(addr); return (scratch_data == c_hang_read_value); } @@ -547,55 +584,94 @@ void PCIDevice::detect_hang_read(std::uint32_t data_read) { } // Get TLB index (from zero), check if it's in 16MB, 2MB or 1MB TLB range, and dynamically program it. 
-dynamic_tlb PCIDevice::set_dynamic_tlb(unsigned int tlb_index, tt_xy_pair start, tt_xy_pair end, - std::uint64_t address, bool multicast, std::unordered_map>& harvested_coord_translation, std::uint64_t ordering) { +dynamic_tlb PCIDevice::set_dynamic_tlb( + unsigned int tlb_index, + tt_xy_pair start, + tt_xy_pair end, + std::uint64_t address, + bool multicast, + std::unordered_map> &harvested_coord_translation, + std::uint64_t ordering) { auto architecture_implementation = get_architecture_implementation(); if (multicast) { std::tie(start, end) = architecture_implementation->multicast_workaround(start, end); } - log_trace(LogSiliconDriver, "set_dynamic_tlb with arguments: tlb_index = {}, start = ({}, {}), end = ({}, {}), address = 0x{:x}, multicast = {}, ordering = {}", - tlb_index, start.x, start.y, end.x, end.y, address, multicast, (int)ordering); + log_trace( + LogSiliconDriver, + "set_dynamic_tlb with arguments: tlb_index = {}, start = ({}, {}), end = ({}, {}), address = 0x{:x}, multicast " + "= {}, ordering = {}", + tlb_index, + start.x, + start.y, + end.x, + end.y, + address, + multicast, + (int)ordering); tt::umd::tlb_configuration tlb_config = architecture_implementation->get_tlb_configuration(tlb_index); std::uint32_t TLB_CFG_REG_SIZE_BYTES = architecture_implementation->get_tlb_cfg_reg_size_bytes(); auto translated_start_coords = harvested_coord_translation.at(logical_id).at(start); auto translated_end_coords = harvested_coord_translation.at(logical_id).at(end); - uint32_t tlb_address = address / tlb_config.size; - uint32_t local_address = address % tlb_config.size; - uint64_t tlb_base = tlb_config.base + (tlb_config.size * tlb_config.index_offset); - uint32_t tlb_cfg_reg = tlb_config.cfg_addr + (TLB_CFG_REG_SIZE_BYTES * tlb_config.index_offset); - - std::pair tlb_data = tt::umd::tlb_data { - .local_offset = tlb_address, - .x_end = static_cast(translated_end_coords.x), - .y_end = static_cast(translated_end_coords.y), - .x_start = 
static_cast(translated_start_coords.x), - .y_start = static_cast(translated_start_coords.y), - .mcast = multicast, - .ordering = ordering, - // TODO #2715: hack for Blackhole A0, will potentially be fixed in B0. - // Using the same static vc for reads and writes through TLBs can hang the card. It doesn't even have to be the same TLB. - // Dynamic vc should not have this issue. There might be a perf impact with using dynamic vc. - .static_vc = (get_arch() == tt::ARCH::BLACKHOLE) ? false : true, - }.apply_offset(tlb_config.offset); - - log_debug(LogSiliconDriver, "set_dynamic_tlb() with tlb_index: {} tlb_index_offset: {} dynamic_tlb_size: {}MB tlb_base: 0x{:x} tlb_cfg_reg: 0x{:x}", tlb_index, tlb_config.index_offset, tlb_config.size/(1024*1024), tlb_base, tlb_cfg_reg); + uint32_t tlb_address = address / tlb_config.size; + uint32_t local_address = address % tlb_config.size; + uint64_t tlb_base = tlb_config.base + (tlb_config.size * tlb_config.index_offset); + uint32_t tlb_cfg_reg = tlb_config.cfg_addr + (TLB_CFG_REG_SIZE_BYTES * tlb_config.index_offset); + + std::pair tlb_data = + tt::umd::tlb_data{ + .local_offset = tlb_address, + .x_end = static_cast(translated_end_coords.x), + .y_end = static_cast(translated_end_coords.y), + .x_start = static_cast(translated_start_coords.x), + .y_start = static_cast(translated_start_coords.y), + .mcast = multicast, + .ordering = ordering, + // TODO #2715: hack for Blackhole A0, will potentially be fixed in B0. + // Using the same static vc for reads and writes through TLBs can hang the card. It doesn't even have to be + // the same TLB. Dynamic vc should not have this issue. There might be a perf impact with using dynamic vc. + .static_vc = (get_arch() == tt::ARCH::BLACKHOLE) ? 
false : true, + } + .apply_offset(tlb_config.offset); + + log_debug( + LogSiliconDriver, + "set_dynamic_tlb() with tlb_index: {} tlb_index_offset: {} dynamic_tlb_size: {}MB tlb_base: 0x{:x} " + "tlb_cfg_reg: 0x{:x}", + tlb_index, + tlb_config.index_offset, + tlb_config.size / (1024 * 1024), + tlb_base, + tlb_cfg_reg); write_tlb_reg(tlb_cfg_reg, tlb_data.first, tlb_data.second, TLB_CFG_REG_SIZE_BYTES); - return { tlb_base + local_address, tlb_config.size - local_address }; + return {tlb_base + local_address, tlb_config.size - local_address}; } -dynamic_tlb PCIDevice::set_dynamic_tlb(unsigned int tlb_index, tt_xy_pair target, std::uint64_t address, std::unordered_map>& harvested_coord_translation, std::uint64_t ordering) { +dynamic_tlb PCIDevice::set_dynamic_tlb( + unsigned int tlb_index, + tt_xy_pair target, + std::uint64_t address, + std::unordered_map> &harvested_coord_translation, + std::uint64_t ordering) { return set_dynamic_tlb(tlb_index, tt_xy_pair(0, 0), target, address, false, harvested_coord_translation, ordering); } -dynamic_tlb PCIDevice::set_dynamic_tlb_broadcast(unsigned int tlb_index, std::uint64_t address, std::unordered_map>& harvested_coord_translation, tt_xy_pair start, tt_xy_pair end, std::uint64_t ordering) { +dynamic_tlb PCIDevice::set_dynamic_tlb_broadcast( + unsigned int tlb_index, + std::uint64_t address, + std::unordered_map> &harvested_coord_translation, + tt_xy_pair start, + tt_xy_pair end, + std::uint64_t ordering) { // Issue a broadcast to cores included in the start (top left) and end (bottom right) grid return set_dynamic_tlb(tlb_index, start, end, address, true, harvested_coord_translation, ordering); } -tt::umd::architecture_implementation* PCIDevice::get_architecture_implementation() const {return architecture_implementation.get();} +tt::umd::architecture_implementation *PCIDevice::get_architecture_implementation() const { + return architecture_implementation.get(); +} bool PCIDevice::init_hugepage(uint32_t num_host_mem_channels) { 
const size_t hugepage_size = HUGEPAGE_REGION_SIZE; @@ -605,7 +681,10 @@ bool PCIDevice::init_hugepage(uint32_t num_host_mem_channels) { std::string hugepage_dir = find_hugepage_dir(hugepage_size); if (hugepage_dir.empty()) { - log_warning(LogSiliconDriver, "ttSiliconDevice::init_hugepage: no huge page mount found for hugepage_size: {}.", hugepage_size); + log_warning( + LogSiliconDriver, + "ttSiliconDevice::init_hugepage: no huge page mount found for hugepage_size: {}.", + hugepage_size); return false; } @@ -615,11 +694,14 @@ bool PCIDevice::init_hugepage(uint32_t num_host_mem_channels) { // Support for more than 1GB host memory accessible per device, via channels. for (int ch = 0; ch < num_host_mem_channels; ch++) { - int hugepage_fd = open_hugepage_file(hugepage_dir, physical_device_id, ch); if (hugepage_fd == -1) { // Probably a permissions problem. - log_warning(LogSiliconDriver, "ttSiliconDevice::init_hugepage: physical_device_id: {} ch: {} creating hugepage mapping file failed.", physical_device_id, ch); + log_warning( + LogSiliconDriver, + "ttSiliconDevice::init_hugepage: physical_device_id: {} ch: {} creating hugepage mapping file failed.", + physical_device_id, + ch); success = false; continue; } @@ -630,26 +712,43 @@ bool PCIDevice::init_hugepage(uint32_t num_host_mem_channels) { log_warning(LogSiliconDriver, "Error reading hugepage file size after opening."); } - std::byte *mapping = static_cast(mmap(nullptr, hugepage_size, PROT_READ|PROT_WRITE, MAP_SHARED | MAP_POPULATE, hugepage_fd, 0)); + std::byte *mapping = static_cast( + mmap(nullptr, hugepage_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, hugepage_fd, 0)); close(hugepage_fd); if (mapping == MAP_FAILED) { - log_warning(LogSiliconDriver, "UMD: Mapping a hugepage failed. (device: {}, {}/{} errno: {}).", physical_device_id, ch, num_host_mem_channels, strerror(errno)); + log_warning( + LogSiliconDriver, + "UMD: Mapping a hugepage failed. 
(device: {}, {}/{} errno: {}).", + physical_device_id, + ch, + num_host_mem_channels, + strerror(errno)); if (hugepage_st.st_size == 0) { - log_warning(LogSiliconDriver, "Opened hugepage file has zero size, mapping might've failed due to that. Verify that enough hugepages are provided."); + log_warning( + LogSiliconDriver, + "Opened hugepage file has zero size, mapping might've failed due to that. Verify that enough " + "hugepages are provided."); } - print_file_contents("/proc/cmdline");\ - print_file_contents("/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages"); // Hardcoded for 1GB hugepage. + print_file_contents("/proc/cmdline"); + print_file_contents( + "/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages"); // Hardcoded for 1GB hugepage. success = false; continue; } - // Beter performance if hugepage just allocated (populate flag to prevent lazy alloc) is migrated to same numanode as TT device. - if (!tt::cpuset::tt_cpuset_allocator::bind_area_to_memory_nodeset(physical_device_id, mapping, hugepage_size)){ - log_warning(LogSiliconDriver, "---- ttSiliconDevice::init_hugepage: bind_area_to_memory_nodeset() failed (physical_device_id: {} ch: {}). " - "Hugepage allocation is not on NumaNode matching TT Device. Side-Effect is decreased Device->Host perf (Issue #893).", - physical_device_id, ch); + // Beter performance if hugepage just allocated (populate flag to prevent lazy alloc) is migrated to same + // numanode as TT device. + if (!tt::cpuset::tt_cpuset_allocator::bind_area_to_memory_nodeset(physical_device_id, mapping, hugepage_size)) { + log_warning( + LogSiliconDriver, + "---- ttSiliconDevice::init_hugepage: bind_area_to_memory_nodeset() failed (physical_device_id: {} ch: " + "{}). " + "Hugepage allocation is not on NumaNode matching TT Device. 
Side-Effect is decreased Device->Host perf " + "(Issue #893).", + physical_device_id, + ch); } tenstorrent_pin_pages pin_pages; @@ -662,7 +761,13 @@ bool PCIDevice::init_hugepage(uint32_t num_host_mem_channels) { auto fd = get_fd(); if (ioctl(fd, TENSTORRENT_IOCTL_PIN_PAGES, &pin_pages) == -1) { - log_warning(LogSiliconDriver, "---- ttSiliconDevice::init_hugepage: physical_device_id: {} ch: {} TENSTORRENT_IOCTL_PIN_PAGES failed (errno: {}). Common Issue: Requires TTMKD >= 1.11, see following file contents...", physical_device_id, ch, strerror(errno)); + log_warning( + LogSiliconDriver, + "---- ttSiliconDevice::init_hugepage: physical_device_id: {} ch: {} TENSTORRENT_IOCTL_PIN_PAGES failed " + "(errno: {}). Common Issue: Requires TTMKD >= 1.11, see following file contents...", + physical_device_id, + ch, + strerror(errno)); munmap(mapping, hugepage_size); print_file_contents("/sys/module/tenstorrent/version", "(TTKMD version)"); print_file_contents("/proc/meminfo"); @@ -673,15 +778,19 @@ bool PCIDevice::init_hugepage(uint32_t num_host_mem_channels) { hugepage_mapping_per_channel[ch] = {mapping, hugepage_size, pin_pages.out.physical_address}; - log_debug(LogSiliconDriver, "ttSiliconDevice::init_hugepage: physical_device_id: {} ch: {} mapping_size: {} physical address 0x{:x}", physical_device_id, ch, hugepage_size, (unsigned long long)hugepage_mappings.at(device_id).at(ch).physical_address); + log_debug( + LogSiliconDriver, + "ttSiliconDevice::init_hugepage: physical_device_id: {} ch: {} mapping_size: {} physical address 0x{:x}", + physical_device_id, + ch, + hugepage_size, + (unsigned long long)hugepage_mappings.at(device_id).at(ch).physical_address); } return success; } -int PCIDevice::get_num_host_mem_channels() const { - return hugepage_mapping_per_channel.size(); -} +int PCIDevice::get_num_host_mem_channels() const { return hugepage_mapping_per_channel.size(); } hugepage_mapping PCIDevice::get_hugepage_mapping(int channel) const { if (channel < 0 || 
hugepage_mapping_per_channel.size() <= channel) { @@ -691,10 +800,10 @@ hugepage_mapping PCIDevice::get_hugepage_mapping(int channel) const { } } -void PCIDevice::print_file_contents(std::string filename, std::string hint){ - if (std::filesystem::exists(filename)){ +void PCIDevice::print_file_contents(std::string filename, std::string hint) { + if (std::filesystem::exists(filename)) { std::ifstream meminfo(filename); - if (meminfo.is_open()){ + if (meminfo.is_open()) { std::cout << std::endl << "File " << filename << " " << hint << " is: " << std::endl; std::cout << meminfo.rdbuf(); } diff --git a/device/pcie/pci_device.hpp b/device/pcie/pci_device.hpp index 2d6c2fa4f..455091bb3 100644 --- a/device/pcie/pci_device.hpp +++ b/device/pcie/pci_device.hpp @@ -12,28 +12,30 @@ #include #include -#include "device/tt_xy_pair.h" +#include "device/tlb.h" #include "device/tt_arch_types.h" #include "device/tt_cluster_descriptor_types.h" -#include "device/tlb.h" +#include "device/tt_xy_pair.h" // TODO: this is used up in cluster.cpp but that logic ought to be // lowered into the PCIDevice class since it is specific to PCIe cards. // See /vendor_ip/synopsys/052021/bh_pcie_ctl_gen5/export/configuration/DWC_pcie_ctl.h static const uint64_t UNROLL_ATU_OFFSET_BAR = 0x1200; -// TODO: this is a bit of a hack... something to revisit when we formalize an +// TODO: this is a bit of a hack... something to revisit when we formalize an // abstraction for IO. // BAR0 size for Blackhole, used to determine whether write block should use BAR0 or BAR4 static const uint64_t BAR0_BH_SIZE = 512 * 1024 * 1024; constexpr unsigned int c_hang_read_value = 0xffffffffu; -namespace tt::umd { class architecture_implementation; } +namespace tt::umd { +class architecture_implementation; +} struct dynamic_tlb { - uint64_t bar_offset; // Offset that address is mapped to, within the PCI BAR. - uint64_t remaining_size; // Bytes remaining between bar_offset and end of the TLB. 
+ uint64_t bar_offset; // Offset that address is mapped to, within the PCI BAR. + uint64_t remaining_size; // Bytes remaining between bar_offset and end of the TLB. }; struct hugepage_mapping { @@ -42,8 +44,7 @@ struct hugepage_mapping { uint64_t physical_address = 0; }; -struct PciDeviceInfo -{ +struct PciDeviceInfo { uint16_t vendor_id; uint16_t device_id; uint16_t pci_domain; @@ -57,14 +58,14 @@ struct PciDeviceInfo }; class PCIDevice { - const std::string device_path; // Path to character device: /dev/tenstorrent/N - const int pci_device_num; // N in /dev/tenstorrent/N - const int logical_id; // Unique identifier for each device in entire network topology - const int pci_device_file_desc; // Character device file descriptor - const PciDeviceInfo info; // PCI device info - const int numa_node; // -1 if non-NUMA - const int revision; // PCI revision value from sysfs - const tt::ARCH arch; // e.g. Grayskull, Wormhole, Blackhole + const std::string device_path; // Path to character device: /dev/tenstorrent/N + const int pci_device_num; // N in /dev/tenstorrent/N + const int logical_id; // Unique identifier for each device in entire network topology + const int pci_device_file_desc; // Character device file descriptor + const PciDeviceInfo info; // PCI device info + const int numa_node; // -1 if non-NUMA + const int revision; // PCI revision value from sysfs + const tt::ARCH arch; // e.g. Grayskull, Wormhole, Blackhole std::unique_ptr architecture_implementation; public: @@ -83,7 +84,7 @@ class PCIDevice { * * Opens the character device file descriptor, reads device information from * sysfs, and maps device memory region(s) into the process address space. 
- * + * * @param pci_device_number N in /dev/tenstorrent/N * @param logical_device_id unique identifier for this device in the network topology */ @@ -95,8 +96,8 @@ class PCIDevice { */ ~PCIDevice(); - PCIDevice(const PCIDevice&) = delete; // copy - void operator=(const PCIDevice&) = delete; // copy assignment + PCIDevice(const PCIDevice &) = delete; // copy + void operator=(const PCIDevice &) = delete; // copy assignment /** * @return PCI device info @@ -155,21 +156,39 @@ class PCIDevice { // NOC endpoints. Probably worth waiting for the KMD to start owning the // resource management aspect of these PCIe->NOC mappings (the "TLBs") // before doing too much work here... - void write_block(uint64_t byte_addr, uint64_t num_bytes, const uint8_t* buffer_addr); - void read_block(uint64_t byte_addr, uint64_t num_bytes, uint8_t* buffer_addr); + void write_block(uint64_t byte_addr, uint64_t num_bytes, const uint8_t *buffer_addr); + void read_block(uint64_t byte_addr, uint64_t num_bytes, uint8_t *buffer_addr); void write_regs(uint32_t byte_addr, uint32_t word_len, const void *data); void write_regs(volatile uint32_t *dest, const uint32_t *src, uint32_t word_len); void read_regs(uint32_t byte_addr, uint32_t word_len, void *data); // TLB related functions. // TODO: These are architecture specific, and will be moved out of the class. 
- void write_tlb_reg(uint32_t byte_addr, std::uint64_t value_lower, std::uint64_t value_upper, std::uint32_t tlb_cfg_reg_size); - dynamic_tlb set_dynamic_tlb(unsigned int tlb_index, tt_xy_pair start, tt_xy_pair end, - std::uint64_t address, bool multicast, std::unordered_map>& harvested_coord_translation, std::uint64_t ordering); - dynamic_tlb set_dynamic_tlb(unsigned int tlb_index, tt_xy_pair target, std::uint64_t address, std::unordered_map>& harvested_coord_translation, std::uint64_t ordering = tt::umd::tlb_data::Relaxed); - dynamic_tlb set_dynamic_tlb_broadcast(unsigned int tlb_index, std::uint64_t address, std::unordered_map>& harvested_coord_translation, tt_xy_pair start, tt_xy_pair end, std::uint64_t ordering = tt::umd::tlb_data::Relaxed); - - tt::umd::architecture_implementation* get_architecture_implementation() const; + void write_tlb_reg( + uint32_t byte_addr, std::uint64_t value_lower, std::uint64_t value_upper, std::uint32_t tlb_cfg_reg_size); + dynamic_tlb set_dynamic_tlb( + unsigned int tlb_index, + tt_xy_pair start, + tt_xy_pair end, + std::uint64_t address, + bool multicast, + std::unordered_map> &harvested_coord_translation, + std::uint64_t ordering); + dynamic_tlb set_dynamic_tlb( + unsigned int tlb_index, + tt_xy_pair target, + std::uint64_t address, + std::unordered_map> &harvested_coord_translation, + std::uint64_t ordering = tt::umd::tlb_data::Relaxed); + dynamic_tlb set_dynamic_tlb_broadcast( + unsigned int tlb_index, + std::uint64_t address, + std::unordered_map> &harvested_coord_translation, + tt_xy_pair start, + tt_xy_pair end, + std::uint64_t ordering = tt::umd::tlb_data::Relaxed); + + tt::umd::architecture_implementation *get_architecture_implementation() const; void detect_hang_read(uint32_t data_read = c_hang_read_value); // TODO: this also probably has more sense to live in the future TTDevice class. @@ -197,8 +216,8 @@ class PCIDevice { // and simplify the code. 
void *system_reg_mapping = nullptr; size_t system_reg_mapping_size; - uint32_t system_reg_start_offset; // Registers >= this are system regs, use the mapping. - uint32_t system_reg_offset_adjust; // This is the offset of the first reg in the system reg mapping. + uint32_t system_reg_start_offset; // Registers >= this are system regs, use the mapping. + uint32_t system_reg_offset_adjust; // This is the offset of the first reg in the system reg mapping. uint32_t read_checking_offset; @@ -206,11 +225,10 @@ class PCIDevice { bool is_hardware_hung(); template - T* get_register_address(uint32_t register_offset); + T *get_register_address(uint32_t register_offset); // For debug purposes when various stages fails. void print_file_contents(std::string filename, std::string hint = ""); std::vector hugepage_mapping_per_channel; }; - diff --git a/device/simulation/deprecated/tt_emulation_device.cpp b/device/simulation/deprecated/tt_emulation_device.cpp index 25026737d..e7d668933 100644 --- a/device/simulation/deprecated/tt_emulation_device.cpp +++ b/device/simulation/deprecated/tt_emulation_device.cpp @@ -3,193 +3,231 @@ * * SPDX-License-Identifier: Apache-2.0 */ -#include +#include "tt_emulation_device.h" + #include +#include #include "common/logger.hpp" #include "device/tt_cluster_descriptor.h" -#include "tt_emulation_device.h" #include "tt_emu_zemi3_wrapper.h" - tt_emulation_device::tt_emulation_device(const std::string& sdesc_path) : tt_device(sdesc_path) { - soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path)); - std::set target_devices = {0}; - // create just a default one, we do not have cluster anyway - ndesc = tt_ClusterDescriptor::create_for_grayskull_cluster(target_devices, {}); - tt_zebu_wrapper_inst = new tt_emu_zemi3_wrapper(); + soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path)); + std::set target_devices = {0}; + // create just a default one, we do not have cluster anyway + ndesc = 
tt_ClusterDescriptor::create_for_grayskull_cluster(target_devices, {}); + tt_zebu_wrapper_inst = new tt_emu_zemi3_wrapper(); - log_info(tt::LogEmulationDriver, "Created Emulation Device "); + log_info(tt::LogEmulationDriver, "Created Emulation Device "); } tt_emulation_device::~tt_emulation_device() { - ndesc.reset(); - delete tt_zebu_wrapper_inst; - log_info(tt::LogEmulationDriver, "Destroyed Emulation Device "); + ndesc.reset(); + delete tt_zebu_wrapper_inst; + log_info(tt::LogEmulationDriver, "Destroyed Emulation Device "); } - + void tt_emulation_device::write(tt_cxy_pair core, uint64_t addr, const std::vector& data) { - const uint32_t size = static_cast(data.size()); - tt_zebu_wrapper_inst->axi_write(0, core.x, core.y, addr, size, data); - log_info(tt::LogEmulationDriver, "Wrote {} bytes to address {:#016x}, core {},{}", size, addr, core.x, core.y); + const uint32_t size = static_cast(data.size()); + tt_zebu_wrapper_inst->axi_write(0, core.x, core.y, addr, size, data); + log_info(tt::LogEmulationDriver, "Wrote {} bytes to address {:#016x}, core {},{}", size, addr, core.x, core.y); } std::vector tt_emulation_device::read(tt_cxy_pair core, uint64_t addr, uint32_t size) { - std::vector data(size); - tt_zebu_wrapper_inst->axi_read(0, core.x, core.y, addr, size, data); - log_info(tt::LogEmulationDriver, "Read {} bytes from address {:#016x}", size, addr); + std::vector data(size); + tt_zebu_wrapper_inst->axi_read(0, core.x, core.y, addr, size, data); + log_info(tt::LogEmulationDriver, "Read {} bytes from address {:#016x}", size, addr); - return data; + return data; } - void tt_emulation_device::start_device(const tt_device_params& device_params) { - tt_zebu_wrapper_inst->zebu_start(); - tt_zebu_wrapper_inst->zebu_enable_waveform_dump(tt_zebu_wrapper::WAVEFORM_DUMP_QIWC); - log_info(tt::LogEmulationDriver, "Started Emulation Device "); + tt_zebu_wrapper_inst->zebu_start(); + tt_zebu_wrapper_inst->zebu_enable_waveform_dump(tt_zebu_wrapper::WAVEFORM_DUMP_QIWC); + 
log_info(tt::LogEmulationDriver, "Started Emulation Device "); } void tt_emulation_device::deassert_risc_reset() { - tt_zebu_wrapper_inst->all_tensix_reset_deassert(); - log_info(tt::LogEmulationDriver, "Deasserted all tensix RISC Reset "); + tt_zebu_wrapper_inst->all_tensix_reset_deassert(); + log_info(tt::LogEmulationDriver, "Deasserted all tensix RISC Reset "); } void tt_emulation_device::assert_risc_reset() { - tt_zebu_wrapper_inst->all_tensix_reset_assert(); - log_info(tt::LogEmulationDriver, "Asserted all tensix RISC Reset "); + tt_zebu_wrapper_inst->all_tensix_reset_assert(); + log_info(tt::LogEmulationDriver, "Asserted all tensix RISC Reset "); } -void tt_emulation_device::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets) { - tt_zebu_wrapper_inst->tensix_reset_deassert(core.x, core.y); +void tt_emulation_device::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions& soft_resets) { + tt_zebu_wrapper_inst->tensix_reset_deassert(core.x, core.y); } void tt_emulation_device::assert_risc_reset_at_core(tt_cxy_pair core) { - tt_zebu_wrapper_inst->tensix_reset_assert(core.x, core.y); + tt_zebu_wrapper_inst->tensix_reset_assert(core.x, core.y); } - - void tt_emulation_device::close_device() { log_info(tt::LogEmulationDriver, "Closing Emulation Device "); tt_zebu_wrapper_inst->zebu_finish(); } -void tt_emulation_device::start(std::vector plusargs, std::vector dump_cores, bool no_checkers, bool /*init_device*/, bool /*skip_driver_allocs*/ +void tt_emulation_device::start( + std::vector plusargs, + std::vector dump_cores, + bool no_checkers, + bool /*init_device*/, + bool /*skip_driver_allocs*/ ) { - log_info(tt::LogEmulationDriver, "Starting Emulation Device "); + log_info(tt::LogEmulationDriver, "Starting Emulation Device "); +} + +void tt_emulation_device::broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& 
rows_to_exclude, + std::set& cols_to_exclude, + const std::string& fallback_tlb) { + for (const auto& core : get_soc_descriptor(0)->cores) { + // if(cols_to_exclude.find(core.first.x) == cols_to_exclude.end() and rows_to_exclude.find(core.first.y) == + // rows_to_exclude.end() and core.second.type != CoreType::HARVESTED) { + // write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); + // } + // MT: Iterate through all the worker cores for bcast: + // if (get_soc_descriptor(0)->is_worker_core(core.first)) { + // write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); + // } + // Emulation only broadcasts to all Tensix cores or all DRAM cores. + // differentiate which bcast pattern to use based on exclude columns + if (cols_to_exclude.find(0) == cols_to_exclude.end()) { + // Detect DRAM bcast + if (get_soc_descriptor(0)->is_dram_core(core.first)) { + write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); + } + } else { + if (get_soc_descriptor(0)->is_worker_core(core.first)) { + write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); + } + } + } } - -void tt_emulation_device::broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& cols_to_exclude, const std::string& fallback_tlb) { - for(const auto& core : get_soc_descriptor(0) -> cores) { - // if(cols_to_exclude.find(core.first.x) == cols_to_exclude.end() and rows_to_exclude.find(core.first.y) == rows_to_exclude.end() and core.second.type != CoreType::HARVESTED) { - // write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); - // } - // MT: Iterate through all the worker cores for bcast: - // if (get_soc_descriptor(0)->is_worker_core(core.first)) { - // write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, 
core.first.x, core.first.y), address, ""); - // } - // Emulation only broadcasts to all Tensix cores or all DRAM cores. - // differentiate which bcast pattern to use based on exclude columns - if (cols_to_exclude.find(0) == cols_to_exclude.end()) { - // Detect DRAM bcast - if (get_soc_descriptor(0)->is_dram_core(core.first)) { - write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); - } - } else { - if (get_soc_descriptor(0)->is_worker_core(core.first)) { - write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); - } +void tt_emulation_device::rolled_write_to_device( + std::vector& base_vec, + uint32_t unroll_count, + tt_cxy_pair core, + uint64_t base_addr, + const std::string& tlb_to_use) { + std::vector vec = base_vec; + uint32_t byte_increment = 4 * vec.size(); + for (uint32_t i = 0; i < unroll_count; ++i) { + vec[0] = i; // slot id for debug + uint64_t offset_addr = base_addr + i * byte_increment; + write_to_device(vec, core, offset_addr, tlb_to_use); } - } -} -void tt_emulation_device::rolled_write_to_device(std::vector& base_vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t base_addr, const std::string& tlb_to_use) { - std::vector vec = base_vec; - uint32_t byte_increment = 4 * vec.size(); - for (uint32_t i = 0; i < unroll_count; ++i) { - vec[0] = i; // slot id for debug - uint64_t offset_addr = base_addr + i * byte_increment; - write_to_device(vec, core, offset_addr, tlb_to_use); - } } -void tt_emulation_device::write_to_device(const void *mem_ptr, uint32_t size, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { - log_assert(!(size % 4), "Writes to Emulation Backend should be 4 byte aligned!"); - std::vector mem_vector((uint32_t*)mem_ptr, (uint32_t*)mem_ptr + size / sizeof(uint32_t)); - write_to_device(mem_vector, core, addr, tlb_to_use, send_epoch_cmd, 
last_send_epoch_cmd, ordered_with_prev_remote_write); -} +void tt_emulation_device::write_to_device( + const void* mem_ptr, + uint32_t size, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd, + bool last_send_epoch_cmd, + bool ordered_with_prev_remote_write) { + log_assert(!(size % 4), "Writes to Emulation Backend should be 4 byte aligned!"); -void tt_emulation_device::write_to_device(std::vector& vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { + std::vector mem_vector((uint32_t*)mem_ptr, (uint32_t*)mem_ptr + size / sizeof(uint32_t)); + write_to_device( + mem_vector, core, addr, tlb_to_use, send_epoch_cmd, last_send_epoch_cmd, ordered_with_prev_remote_write); +} - std::vector byte_data(vec.size() * sizeof(uint32_t)); - std::memcpy(byte_data.data(), vec.data(), byte_data.size()); +void tt_emulation_device::write_to_device( + std::vector& vec, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd, + bool last_send_epoch_cmd, + bool ordered_with_prev_remote_write) { + std::vector byte_data(vec.size() * sizeof(uint32_t)); + std::memcpy(byte_data.data(), vec.data(), byte_data.size()); - write(core, addr, byte_data); + write(core, addr, byte_data); } -void tt_emulation_device::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { +void tt_emulation_device::l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { // Placeholder - implement later - https://yyz-gitlab.local.tenstorrent.com/tenstorrent/open-umd/-/issues/26 } -void tt_emulation_device::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { +void tt_emulation_device::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { // Placeholder - 
implement later - https://yyz-gitlab.local.tenstorrent.com/tenstorrent/open-umd/-/issues/26 } -void tt_emulation_device::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) { +void tt_emulation_device::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) { // Placeholder - implement later - https://yyz-gitlab.local.tenstorrent.com/tenstorrent/open-umd/-/issues/26 } +void tt_emulation_device::read_from_device( + std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& /*tlb_to_use*/) { + std::vector byte_data = read(core, addr, size); + // Verify that the received byte data can be converted to uint32_t + // if (byte_data.size() % sizeof(uint32_t) != 0) { + // throw std::runtime_error("Received byte data size is not a multiple of uint32_t size."); + // } -void tt_emulation_device::read_from_device(std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& /*tlb_to_use*/) { - std::vector byte_data = read(core, addr, size); - - // Verify that the received byte data can be converted to uint32_t - // if (byte_data.size() % sizeof(uint32_t) != 0) { - // throw std::runtime_error("Received byte data size is not a multiple of uint32_t size."); - // } - - vec.clear(); - vec.resize(byte_data.size() / sizeof(uint32_t)); - std::memcpy(vec.data(), byte_data.data(), byte_data.size()); + vec.clear(); + vec.resize(byte_data.size() / sizeof(uint32_t)); + std::memcpy(vec.data(), byte_data.data(), byte_data.size()); } void tt_emulation_device::translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c) { - // No translation is performed - return; + // No translation is performed + return; } + tt_ClusterDescriptor* tt_emulation_device::get_cluster_description() { return ndesc.get(); } std::set tt_emulation_device::get_target_mmio_device_ids() { - log_error("LogEmulationDriver: get_target_mmio_device_ids not 
implemented"); - return {}; + log_error("LogEmulationDriver: get_target_mmio_device_ids not implemented"); + return {}; } std::set tt_emulation_device::get_target_remote_device_ids() { - log_error("LogEmulationDriver: get_target_remote_device_ids not implemented"); - return {}; + log_error("LogEmulationDriver: get_target_remote_device_ids not implemented"); + return {}; } void tt_emulation_device::set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_) { dram_address_params = dram_address_params_; } + int tt_emulation_device::get_number_of_chips_in_cluster() { return detect_number_of_chips(); } -std::unordered_set tt_emulation_device::get_all_chips_in_cluster() { return { 0 }; } + +std::unordered_set tt_emulation_device::get_all_chips_in_cluster() { return {0}; } + int tt_emulation_device::detect_number_of_chips() { return 1; } bool tt_emulation_device::using_harvested_soc_descriptors() { return false; } -bool tt_emulation_device::noc_translation_en() { return false; } -std::unordered_map tt_emulation_device::get_harvesting_masks_for_soc_descriptors() { return {{0, 0}};} -std::unordered_map& tt_emulation_device::get_virtual_soc_descriptors() {return soc_descriptor_per_chip;} +bool tt_emulation_device::noc_translation_en() { return false; } -std::map tt_emulation_device::get_clocks() { - return std::map(); +std::unordered_map tt_emulation_device::get_harvesting_masks_for_soc_descriptors() { + return {{0, 0}}; } -void tt_emulation_device::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) { - l1_address_params = l1_address_params_; +std::unordered_map& tt_emulation_device::get_virtual_soc_descriptors() { + return soc_descriptor_per_chip; } +std::map tt_emulation_device::get_clocks() { return std::map(); } - +void tt_emulation_device::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) { + l1_address_params = l1_address_params_; +} diff --git 
a/device/simulation/deprecated/tt_emulation_device.h b/device/simulation/deprecated/tt_emulation_device.h index b15e2aaf5..8c411d07a 100644 --- a/device/simulation/deprecated/tt_emulation_device.h +++ b/device/simulation/deprecated/tt_emulation_device.h @@ -9,63 +9,97 @@ #include #include #include + +#include "cluster.h" #include "tt_soc_descriptor.h" #include "tt_xy_pair.h" -#include "cluster.h" // use forward declaration here so we do not need to include tt_zebu_wrapper.h class tt_zebu_wrapper; class tt_emulation_device : public tt_device { public: - virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_); // Dont care - tt_emulation_device(const std::string& sdesc_path); - virtual void start(std::vector plusargs, std::vector dump_cores, bool no_checkers, bool init_device, bool skip_driver_allocs); - virtual void start_device(const tt_device_params& device_params); - virtual void close_device(); - virtual void deassert_risc_reset(); - virtual void deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets = TENSIX_DEASSERT_SOFT_RESET); - virtual void assert_risc_reset(); - virtual void assert_risc_reset_at_core(tt_cxy_pair core); - virtual void write_to_device(std::vector& vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); - virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); - virtual void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& columns_to_exclude, const std::string& fallback_tlb); + virtual void set_device_l1_address_params(const tt_device_l1_address_params& 
l1_address_params_); // Dont care + tt_emulation_device(const std::string& sdesc_path); + virtual void start( + std::vector plusargs, + std::vector dump_cores, + bool no_checkers, + bool init_device, + bool skip_driver_allocs); + virtual void start_device(const tt_device_params& device_params); + virtual void close_device(); + virtual void deassert_risc_reset(); + virtual void deassert_risc_reset_at_core( + tt_cxy_pair core, const TensixSoftResetOptions& soft_resets = TENSIX_DEASSERT_SOFT_RESET); + virtual void assert_risc_reset(); + virtual void assert_risc_reset_at_core(tt_cxy_pair core); + virtual void write_to_device( + std::vector& vec, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd = false, + bool last_send_epoch_cmd = true, + bool ordered_with_prev_remote_write = false); + virtual void write_to_device( + const void* mem_ptr, + uint32_t size_in_bytes, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd = false, + bool last_send_epoch_cmd = true, + bool ordered_with_prev_remote_write = false); + virtual void broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& rows_to_exclude, + std::set& columns_to_exclude, + const std::string& fallback_tlb); - void l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); + void l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); + void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); + void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const 
std::unordered_set& cores = {}); - virtual void rolled_write_to_device(std::vector& base_vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t base_addr, const std::string& tlb_to_use); // See Versim Implementation - virtual void read_from_device(std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); + virtual void rolled_write_to_device( + std::vector& base_vec, + uint32_t unroll_count, + tt_cxy_pair core, + uint64_t base_addr, + const std::string& tlb_to_use); // See Versim Implementation + virtual void read_from_device( + std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); - virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c); - virtual bool using_harvested_soc_descriptors(); - virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); - virtual std::unordered_map& get_virtual_soc_descriptors(); - virtual bool noc_translation_en(); - virtual std::set get_target_mmio_device_ids(); - virtual std::set get_target_remote_device_ids(); - virtual ~tt_emulation_device(); - virtual tt_ClusterDescriptor* get_cluster_description(); - virtual void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_); - virtual int get_number_of_chips_in_cluster(); - virtual std::unordered_set get_all_chips_in_cluster(); - static int detect_number_of_chips(); - virtual std::map get_clocks(); -private: - - tt_device_l1_address_params l1_address_params; - std::shared_ptr ndesc; - tt_device_dram_address_params dram_address_params; - - // zebu wrapper, provides interface to zebu emulator device through axi and command transactors - tt_zebu_wrapper *tt_zebu_wrapper_inst = NULL; + virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c); + virtual bool using_harvested_soc_descriptors(); + virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); + virtual 
std::unordered_map& get_virtual_soc_descriptors(); + virtual bool noc_translation_en(); + virtual std::set get_target_mmio_device_ids(); + virtual std::set get_target_remote_device_ids(); + virtual ~tt_emulation_device(); + virtual tt_ClusterDescriptor* get_cluster_description(); + virtual void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_); + virtual int get_number_of_chips_in_cluster(); + virtual std::unordered_set get_all_chips_in_cluster(); + static int detect_number_of_chips(); + virtual std::map get_clocks(); +private: + tt_device_l1_address_params l1_address_params; + std::shared_ptr ndesc; + tt_device_dram_address_params dram_address_params; + // zebu wrapper, provides interface to zebu emulator device through axi and command transactors + tt_zebu_wrapper* tt_zebu_wrapper_inst = NULL; - // These functions implement the "protocol" between the RTL simulation and the UMD - void write(tt_cxy_pair core, uint64_t addr, const std::vector& data); - std::vector read(tt_cxy_pair core, uint64_t addr, uint32_t size); - + // These functions implement the "protocol" between the RTL simulation and the UMD + void write(tt_cxy_pair core, uint64_t addr, const std::vector& data); + std::vector read(tt_cxy_pair core, uint64_t addr, uint32_t size); }; - diff --git a/device/simulation/deprecated/tt_emulation_stub.cpp b/device/simulation/deprecated/tt_emulation_stub.cpp index b841359f5..bdd97b27a 100644 --- a/device/simulation/deprecated/tt_emulation_stub.cpp +++ b/device/simulation/deprecated/tt_emulation_stub.cpp @@ -3,23 +3,21 @@ * * SPDX-License-Identifier: Apache-2.0 */ -#include #include +#include #include "common/logger.hpp" #include "tt_emulation_device.h" tt_emulation_device::tt_emulation_device(const std::string& sdesc_path) : tt_device(sdesc_path) { - throw std::runtime_error("tt_emulation_device() -- Zebu Emulation is not supported in this build\n"); + throw std::runtime_error("tt_emulation_device() -- Zebu Emulation is not 
supported in this build\n"); } - tt_emulation_device::~tt_emulation_device() {} - -void tt_emulation_device::write(tt_cxy_pair core, uint64_t addr, const std::vector& data) {} -std::vector tt_emulation_device::read(tt_cxy_pair core, uint64_t addr, uint32_t size) {return {};} +void tt_emulation_device::write(tt_cxy_pair core, uint64_t addr, const std::vector& data) {} +std::vector tt_emulation_device::read(tt_cxy_pair core, uint64_t addr, uint32_t size) { return {}; } void tt_emulation_device::start_device(const tt_device_params& device_params) {} @@ -27,52 +25,99 @@ void tt_emulation_device::deassert_risc_reset() {} void tt_emulation_device::assert_risc_reset() {} -void tt_emulation_device::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets) {} +void tt_emulation_device::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions& soft_resets) {} void tt_emulation_device::assert_risc_reset_at_core(tt_cxy_pair core) {} void tt_emulation_device::close_device() {} -void tt_emulation_device::start(std::vector plusargs, std::vector dump_cores, bool no_checkers, bool /*init_device*/, bool /*skip_driver_allocs*/) {} - - -void tt_emulation_device::broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& cols_to_exclude, const std::string& fallback_tlb) {} -void tt_emulation_device::rolled_write_to_device(std::vector& base_vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t base_addr, const std::string& tlb_to_use) {} - -void tt_emulation_device::write_to_device(std::vector& vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) {} -void tt_emulation_device::write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool 
last_send_epoch_cmd, bool ordered_with_prev_remote_write) {}; -void tt_emulation_device::read_from_device(std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& /*tlb_to_use*/) {} -void tt_emulation_device::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} -void tt_emulation_device::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} -void tt_emulation_device::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) {} - +void tt_emulation_device::start( + std::vector plusargs, + std::vector dump_cores, + bool no_checkers, + bool /*init_device*/, + bool /*skip_driver_allocs*/) {} + +void tt_emulation_device::broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& rows_to_exclude, + std::set& cols_to_exclude, + const std::string& fallback_tlb) {} + +void tt_emulation_device::rolled_write_to_device( + std::vector& base_vec, + uint32_t unroll_count, + tt_cxy_pair core, + uint64_t base_addr, + const std::string& tlb_to_use) {} + +void tt_emulation_device::write_to_device( + std::vector& vec, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd, + bool last_send_epoch_cmd, + bool ordered_with_prev_remote_write) {} + +void tt_emulation_device::write_to_device( + const void* mem_ptr, + uint32_t size_in_bytes, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd, + bool last_send_epoch_cmd, + bool ordered_with_prev_remote_write){}; + +void tt_emulation_device::read_from_device( + std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& /*tlb_to_use*/) {} + +void tt_emulation_device::l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} + +void 
tt_emulation_device::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} + +void tt_emulation_device::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) {} // ------------------------- // Not sure how to implement these functions below, leaving them blank/default for now void tt_emulation_device::translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c) { - // No translation is performed - return; + // No translation is performed + return; } + tt_ClusterDescriptor* tt_emulation_device::get_cluster_description() { return ndesc.get(); } -std::set tt_emulation_device::get_target_mmio_device_ids() {return {};} +std::set tt_emulation_device::get_target_mmio_device_ids() { return {}; } -std::set tt_emulation_device::get_target_remote_device_ids() {return {};} +std::set tt_emulation_device::get_target_remote_device_ids() { return {}; } void tt_emulation_device::set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_) {} + int tt_emulation_device::get_number_of_chips_in_cluster() { return detect_number_of_chips(); } -std::unordered_set tt_emulation_device::get_all_chips_in_cluster() { return { 0 }; } + +std::unordered_set tt_emulation_device::get_all_chips_in_cluster() { return {0}; } + int tt_emulation_device::detect_number_of_chips() { return 1; } bool tt_emulation_device::using_harvested_soc_descriptors() { return false; } -bool tt_emulation_device::noc_translation_en() { return false; } -std::unordered_map tt_emulation_device::get_harvesting_masks_for_soc_descriptors() { return {{0, 0}};} - -std::unordered_map& tt_emulation_device::get_virtual_soc_descriptors() {return soc_descriptor_per_chip;} -std::map tt_emulation_device::get_clocks() {return std::map();} +bool tt_emulation_device::noc_translation_en() { return false; } -void tt_emulation_device::set_device_l1_address_params(const 
tt_device_l1_address_params& l1_address_params_) {} +std::unordered_map tt_emulation_device::get_harvesting_masks_for_soc_descriptors() { + return {{0, 0}}; +} +std::unordered_map& tt_emulation_device::get_virtual_soc_descriptors() { + return soc_descriptor_per_chip; +} +std::map tt_emulation_device::get_clocks() { return std::map(); } +void tt_emulation_device::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) {} diff --git a/device/simulation/deprecated/tt_versim_device.cpp b/device/simulation/deprecated/tt_versim_device.cpp index 7e700b2f8..9504d9f64 100644 --- a/device/simulation/deprecated/tt_versim_device.cpp +++ b/device/simulation/deprecated/tt_versim_device.cpp @@ -2,16 +2,14 @@ // // SPDX-License-Identifier: Apache-2.0 - - -#include "cluster.h" -#include "device/driver_atomics.h" -#include "common/logger.hpp" -#include #include +#include #include #include +#include "cluster.h" +#include "common/logger.hpp" +#include "device/driver_atomics.h" #include "yaml-cpp/yaml.h" // TODO: Remove dependency on command_assembler + soc @@ -19,112 +17,134 @@ #include "device/tt_cluster_descriptor.h" namespace CA = CommandAssembler; - -void translate_soc_descriptor_to_ca_soc(CA::Soc &soc, const tt_SocDescriptor soc_descriptor) { - for (auto &core : soc_descriptor.cores) { - CA::SocNocNode node; - CA::xy_pair CA_coord(core.first.x, core.first.y); - node.noc_coord = CA_coord; - node.memory_size = core.second.l1_size; - switch (core.second.type) { - case CoreType::ARC: node.arc = true; break; - case CoreType::DRAM: { - node.dram = true; - #ifdef EN_DRAM_ALIAS - node.dram_channel_id = std::get<0>(soc_descriptor.dram_core_channel_map.at(core.first)); - #endif - } break; - case CoreType::ETH: node.eth = true; break; - case CoreType::PCIE: node.pcie = true; break; - case CoreType::WORKER: node.worker = true; break; - case CoreType::HARVESTED: node.harvested = true; break; - case CoreType::ROUTER_ONLY: node.router_only = true; break; - default: 
std::cout << " Error: Unsupported CoreType type: " << static_cast(core.second.type) << std::endl; break; +void translate_soc_descriptor_to_ca_soc(CA::Soc& soc, const tt_SocDescriptor soc_descriptor) { + for (auto& core : soc_descriptor.cores) { + CA::SocNocNode node; + CA::xy_pair CA_coord(core.first.x, core.first.y); + node.noc_coord = CA_coord; + node.memory_size = core.second.l1_size; + switch (core.second.type) { + case CoreType::ARC: + node.arc = true; + break; + case CoreType::DRAM: { + node.dram = true; +#ifdef EN_DRAM_ALIAS + node.dram_channel_id = std::get<0>(soc_descriptor.dram_core_channel_map.at(core.first)); +#endif + } break; + case CoreType::ETH: + node.eth = true; + break; + case CoreType::PCIE: + node.pcie = true; + break; + case CoreType::WORKER: + node.worker = true; + break; + case CoreType::HARVESTED: + node.harvested = true; + break; + case CoreType::ROUTER_ONLY: + node.router_only = true; + break; + default: + std::cout << " Error: Unsupported CoreType type: " << static_cast(core.second.type) << std::endl; + break; + } + soc.SetNodeProperties(node.noc_coord, node); } - soc.SetNodeProperties(node.noc_coord, node); - } } //////// // Device Versim //////// +#include + #include "device.h" #include "sim_interactive.h" -#include -tt_VersimDevice::tt_VersimDevice(const std::string &sdesc_path, const std::string &ndesc_path) : tt_device(sdesc_path) { - soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path)); - std::set target_devices = {0}; - if (ndesc_path == "") { - ndesc = tt_ClusterDescriptor::create_for_grayskull_cluster(target_devices, {}); - } - else { - ndesc = tt_ClusterDescriptor::create_from_yaml(ndesc_path); - } +tt_VersimDevice::tt_VersimDevice(const std::string& sdesc_path, const std::string& ndesc_path) : tt_device(sdesc_path) { + soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path)); + std::set target_devices = {0}; + if (ndesc_path == "") { + ndesc = 
tt_ClusterDescriptor::create_for_grayskull_cluster(target_devices, {}); + } else { + ndesc = tt_ClusterDescriptor::create_from_yaml(ndesc_path); + } } -std::unordered_map& tt_VersimDevice::get_virtual_soc_descriptors() {return soc_descriptor_per_chip;} - -tt_ClusterDescriptor* tt_VersimDevice::get_cluster_description() {return ndesc.get();} -void tt_VersimDevice::start_device(const tt_device_params &device_params) { - bool no_checkers = true; - std::vector dump_cores = device_params.unroll_vcd_dump_cores(get_soc_descriptor(0) -> grid_size); - start(device_params.expand_plusargs(), dump_cores, no_checkers, device_params.init_device, false); +std::unordered_map& tt_VersimDevice::get_virtual_soc_descriptors() { + return soc_descriptor_per_chip; } -void tt_VersimDevice::close_device() { - stop(); +tt_ClusterDescriptor* tt_VersimDevice::get_cluster_description() { return ndesc.get(); } + +void tt_VersimDevice::start_device(const tt_device_params& device_params) { + bool no_checkers = true; + std::vector dump_cores = device_params.unroll_vcd_dump_cores(get_soc_descriptor(0)->grid_size); + start(device_params.expand_plusargs(), dump_cores, no_checkers, device_params.init_device, false); } +void tt_VersimDevice::close_device() { stop(); } + void tt_VersimDevice::start( std::vector plusargs, std::vector dump_cores, bool no_checkers, bool /*init_device*/, bool /*skip_driver_allocs*/ - ) { - - std::cout << "Start Versim Device " << std::endl; - std::string device_descriptor_dir = "./"; +) { + std::cout << "Start Versim Device " << std::endl; + std::string device_descriptor_dir = "./"; - std::optional vcd_suffix; - if (dump_cores.size() > 0) { - vcd_suffix = "core_dump.vcd"; - } + std::optional vcd_suffix; + if (dump_cores.size() > 0) { + vcd_suffix = "core_dump.vcd"; + } - std::vector vcd_cores; + std::vector vcd_cores; - // TODO: For now create a temporary stuff from CA and populate from descriptor before passing back to versim-core - // interface. 
mainly bypasses arch_configs etc from llir. We can populate soc directly - // MT: have to preserve ca_soc_descriptor object since versim references it at runtime - CA::xy_pair CA_grid_size((soc_descriptor_per_chip.begin() -> second).grid_size.x, (soc_descriptor_per_chip.begin() -> second).grid_size.y); - // CA::Soc ca_soc_manager(CA_grid_size); - std::unique_ptr p_ca_soc_manager_unique = std::make_unique(CA_grid_size); - translate_soc_descriptor_to_ca_soc(*p_ca_soc_manager_unique, (soc_descriptor_per_chip.begin() -> second)); - // TODO: End + // TODO: For now create a temporary stuff from CA and populate from descriptor before passing back to versim-core + // interface. mainly bypasses arch_configs etc from llir. We can populate soc directly + // MT: have to preserve ca_soc_descriptor object since versim references it at runtime + CA::xy_pair CA_grid_size( + (soc_descriptor_per_chip.begin()->second).grid_size.x, (soc_descriptor_per_chip.begin()->second).grid_size.y); + // CA::Soc ca_soc_manager(CA_grid_size); + std::unique_ptr p_ca_soc_manager_unique = std::make_unique(CA_grid_size); + translate_soc_descriptor_to_ca_soc(*p_ca_soc_manager_unique, (soc_descriptor_per_chip.begin()->second)); + // TODO: End - std::cout << "Versim Device: turn_on_device "; - std::vector trisc_sizes = {static_cast(l1_address_params.trisc0_size), static_cast(l1_address_params.trisc1_size), static_cast(l1_address_params.trisc2_size)}; - std::unique_ptr versim_unique = versim::turn_on_device(CA_grid_size, *p_ca_soc_manager_unique, plusargs, vcd_suffix, dump_cores, no_checkers, - l1_address_params.trisc_base, trisc_sizes); - versim = versim_unique.release(); + std::cout << "Versim Device: turn_on_device "; + std::vector trisc_sizes = { + static_cast(l1_address_params.trisc0_size), + static_cast(l1_address_params.trisc1_size), + static_cast(l1_address_params.trisc2_size)}; + std::unique_ptr versim_unique = versim::turn_on_device( + CA_grid_size, + *p_ca_soc_manager_unique, + plusargs, + 
vcd_suffix, + dump_cores, + no_checkers, + l1_address_params.trisc_base, + trisc_sizes); + versim = versim_unique.release(); - std::cout << "Versim Device: write info to tvm db " << std::endl; - versim::write_info_to_tvm_db(l1_address_params.trisc_base, trisc_sizes); - versim::build_and_connect_tvm_phase(); + std::cout << "Versim Device: write info to tvm db " << std::endl; + versim::write_info_to_tvm_db(l1_address_params.trisc_base, trisc_sizes); + versim::build_and_connect_tvm_phase(); - versim->spin_threads(*p_ca_soc_manager_unique, false); - versim::assert_reset(*versim); + versim->spin_threads(*p_ca_soc_manager_unique, false); + versim::assert_reset(*versim); - p_ca_soc_manager = (void*)(p_ca_soc_manager_unique.release()); + p_ca_soc_manager = (void*)(p_ca_soc_manager_unique.release()); - std::cout << "Versim Device: Done start " << std::endl; + std::cout << "Versim Device: Done start " << std::endl; } -tt_VersimDevice::~tt_VersimDevice () { - ndesc.reset(); -} +tt_VersimDevice::~tt_VersimDevice() { ndesc.reset(); } // bool tt_VersimDevice::run() { // std::cout << "Versim Device: Run " << std::endl; @@ -136,165 +156,218 @@ tt_VersimDevice::~tt_VersimDevice () { // } void tt_VersimDevice::deassert_risc_reset() { - std::cout << "Versim Device: Deassert risc resets start" << std::endl; - versim::handle_resetting_triscs(*versim); - std::cout << "Versim Device: Start main loop " << std::endl; - versim::startup_versim_main_loop(*versim); + std::cout << "Versim Device: Deassert risc resets start" << std::endl; + versim::handle_resetting_triscs(*versim); + std::cout << "Versim Device: Start main loop " << std::endl; + versim::startup_versim_main_loop(*versim); } -void tt_VersimDevice::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets) { - // This function deasserts reset on the full versim device (don't need core level granularity for versim) - deassert_risc_reset(); +void tt_VersimDevice::deassert_risc_reset_at_core(tt_cxy_pair 
core, const TensixSoftResetOptions& soft_resets) { + // This function deasserts reset on the full versim device (don't need core level granularity for versim) + deassert_risc_reset(); } void tt_VersimDevice::assert_risc_reset() { - std::cout << "Pause all the cores" << std::endl; - versim::pause(*versim); + std::cout << "Pause all the cores" << std::endl; + versim::pause(*versim); - std::cout << "Wait for cores to go to paused state" << std::endl; - versim::sleep_wait_for_paused (*versim); + std::cout << "Wait for cores to go to paused state" << std::endl; + versim::sleep_wait_for_paused(*versim); - std::cout << "Assert riscv reset" << std::endl; - versim::assert_riscv_reset(*versim); + std::cout << "Assert riscv reset" << std::endl; + versim::assert_riscv_reset(*versim); } void tt_VersimDevice::assert_risc_reset_at_core(tt_cxy_pair core) { - // This function asserts reset on the full versim device (don't need core level granularity for versim) - assert_risc_reset(); -} - -void tt_VersimDevice::rolled_write_to_device(std::vector &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { - uint32_t byte_increment = vec.size() * 4; - for (int i=0; i mem_vector(mem_ptr, mem_ptr + len); - rolled_write_to_device(mem_vector, unroll_count, core, addr, fallback_tlb); + // This function asserts reset on the full versim device (don't need core level granularity for versim) + assert_risc_reset(); } -void tt_VersimDevice::write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { - - log_debug(tt::LogSiliconDriver, "Versim Device ({}): Write vector at target core {}, address: {}", get_sim_time(*versim), core.str(), addr); - - bool aligned_32B = (soc_descriptor_per_chip.begin() -> second).cores.at(core).type == CoreType::DRAM; - // MT: Remove these completely - CommandAssembler::xy_pair CA_target(core.x, core.y); - 
CommandAssembler::memory CA_tensor_memory(addr, vec); - - nuapi::device::write_memory_to_core(*versim, CA_target, CA_tensor_memory); +void tt_VersimDevice::rolled_write_to_device( + std::vector& vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { + uint32_t byte_increment = vec.size() * 4; + for (int i = 0; i < unroll_count; i++) { + vec[0] = i; // slot id for debug + write_to_device(vec, core, addr + i * byte_increment, tlb_to_use); + } } -void tt_VersimDevice::write_to_device(const void *mem_ptr, uint32_t size, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) { - log_assert(!(size % 4), "Writes to Versim Backend should be 4 byte aligned!"); - - std::vector mem_vector((uint32_t*)mem_ptr, (uint32_t*)mem_ptr + size / sizeof(uint32_t)); - write_to_device(mem_vector, core, addr, tlb_to_use, send_epoch_cmd, last_send_epoch_cmd, ordered_with_prev_remote_write); +void tt_VersimDevice::rolled_write_to_device( + uint32_t* mem_ptr, + uint32_t len, + uint32_t unroll_count, + tt_cxy_pair core, + uint64_t addr, + const std::string& fallback_tlb) { + std::vector mem_vector(mem_ptr, mem_ptr + len); + rolled_write_to_device(mem_vector, unroll_count, core, addr, fallback_tlb); +} + +void tt_VersimDevice::write_to_device( + std::vector& vec, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd, + bool last_send_epoch_cmd, + bool ordered_with_prev_remote_write) { + log_debug( + tt::LogSiliconDriver, + "Versim Device ({}): Write vector at target core {}, address: {}", + get_sim_time(*versim), + core.str(), + addr); + + bool aligned_32B = (soc_descriptor_per_chip.begin()->second).cores.at(core).type == CoreType::DRAM; + // MT: Remove these completely + CommandAssembler::xy_pair CA_target(core.x, core.y); + CommandAssembler::memory CA_tensor_memory(addr, vec); + + nuapi::device::write_memory_to_core(*versim, 
CA_target, CA_tensor_memory); +} + +void tt_VersimDevice::write_to_device( + const void* mem_ptr, + uint32_t size, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd, + bool last_send_epoch_cmd, + bool ordered_with_prev_remote_write) { + log_assert(!(size % 4), "Writes to Versim Backend should be 4 byte aligned!"); + + std::vector mem_vector((uint32_t*)mem_ptr, (uint32_t*)mem_ptr + size / sizeof(uint32_t)); + write_to_device( + mem_vector, core, addr, tlb_to_use, send_epoch_cmd, last_send_epoch_cmd, ordered_with_prev_remote_write); +} + +void tt_VersimDevice::broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& rows_to_exclude, + std::set& cols_to_exclude, + const std::string& fallback_tlb) { + for (const auto& core : get_soc_descriptor(0)->cores) { + if (cols_to_exclude.find(core.first.x) == cols_to_exclude.end() and + rows_to_exclude.find(core.first.y) == rows_to_exclude.end() and core.second.type != CoreType::HARVESTED) { + write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); + } + } } -void tt_VersimDevice::broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& cols_to_exclude, const std::string& fallback_tlb) { - for(const auto& core : get_soc_descriptor(0) -> cores) { - if(cols_to_exclude.find(core.first.x) == cols_to_exclude.end() and rows_to_exclude.find(core.first.y) == rows_to_exclude.end() and core.second.type != CoreType::HARVESTED) { - write_to_device(mem_ptr, size_in_bytes, tt_cxy_pair(0, core.first.x, core.first.y), address, ""); - } - } -} void tt_VersimDevice::wait_for_non_mmio_flush() { - // Do nothing, since Versim does not simulate non-mmio mapped chips + // Do nothing, since Versim does not simulate non-mmio mapped chips } -void tt_VersimDevice::l1_membar(const 
chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { - tt_driver_atomics::mfence(); // Ensure no reordering of loads/stores around this +void tt_VersimDevice::l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) { + tt_driver_atomics::mfence(); // Ensure no reordering of loads/stores around this } -void tt_VersimDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) { - tt_driver_atomics::mfence(); // Ensure no reordering of loads/stores around this +void tt_VersimDevice::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) { + tt_driver_atomics::mfence(); // Ensure no reordering of loads/stores around this } -void tt_VersimDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& dram_cores) { - tt_driver_atomics::mfence(); // Ensure no reordering of loads/stores around this +void tt_VersimDevice::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& dram_cores) { + tt_driver_atomics::mfence(); // Ensure no reordering of loads/stores around this } -void tt_VersimDevice::read_from_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) { - log_debug(tt::LogSiliconDriver, "Versim Device ({}): Read vector from address: {}, with size: {} Bytes", get_sim_time(*versim), addr, size); +void tt_VersimDevice::read_from_device( + std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) { + log_debug( + tt::LogSiliconDriver, + "Versim Device ({}): Read vector from address: {}, with size: {} Bytes", + get_sim_time(*versim), + addr, + size); - CommandAssembler::xy_pair CA_target(core.x, core.y); + CommandAssembler::xy_pair CA_target(core.x, core.y); - size_t size_in_words = size / 4; - auto result = 
nuapi::device::read_memory_from_core(*versim, CA_target, addr, size_in_words); - vec = result; + size_t size_in_words = size / 4; + auto result = nuapi::device::read_memory_from_core(*versim, CA_target, addr, size_in_words); + vec = result; } -void tt_VersimDevice::read_from_device(void *mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) { - log_debug(tt::LogSiliconDriver, "Versim Device ({}): Read vector from address: {}, with size: {} Bytes", get_sim_time(*versim), addr, size); - log_assert(!(size % 4), "Reads from Versim backend should be 4 byte aligned!"); +void tt_VersimDevice::read_from_device( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) { + log_debug( + tt::LogSiliconDriver, + "Versim Device ({}): Read vector from address: {}, with size: {} Bytes", + get_sim_time(*versim), + addr, + size); + log_assert(!(size % 4), "Reads from Versim backend should be 4 byte aligned!"); - CommandAssembler::xy_pair CA_target(core.x, core.y); + CommandAssembler::xy_pair CA_target(core.x, core.y); - size_t size_in_words = size / 4; - auto result = nuapi::device::read_memory_from_core(*versim, CA_target, addr, size_in_words); - memcpy(mem_ptr, result.data(), result.size()*sizeof(uint32_t)); + size_t size_in_words = size / 4; + auto result = nuapi::device::read_memory_from_core(*versim, CA_target, addr, size_in_words); + memcpy(mem_ptr, result.data(), result.size() * sizeof(uint32_t)); } -void tt_VersimDevice::translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c) { - // No translation is performed - return; +void tt_VersimDevice::translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c) { + // No translation is performed + return; } std::set tt_VersimDevice::get_target_mmio_device_ids() { - // Must only be used for silicon - return {}; + // Must only be used for silicon + return {}; } std::set 
tt_VersimDevice::get_target_remote_device_ids() { - // Must only be used for silicon - return {}; + // Must only be used for silicon + return {}; } - -bool versim_check_dram_core_exists(const std::vector> &dram_core_channels, tt_xy_pair target_core) { +bool versim_check_dram_core_exists( + const std::vector>& dram_core_channels, tt_xy_pair target_core) { bool dram_core_exists = false; - for (const auto &dram_cores_in_channel: dram_core_channels) { - for (const auto &dram_core : dram_cores_in_channel) { - if (dram_core.x == target_core.x && dram_core.y == target_core.y) { - return true; + for (const auto& dram_cores_in_channel : dram_core_channels) { + for (const auto& dram_core : dram_cores_in_channel) { + if (dram_core.x == target_core.x && dram_core.y == target_core.y) { + return true; + } } - } } return false; } int tt_VersimDevice::get_number_of_chips_in_cluster() { return detect_number_of_chips(); } + std::unordered_set tt_VersimDevice::get_all_chips_in_cluster() { return {0}; } + int tt_VersimDevice::detect_number_of_chips() { return 1; } bool tt_VersimDevice::using_harvested_soc_descriptors() { return false; } + bool tt_VersimDevice::noc_translation_en() { return false; } -std::unordered_map tt_VersimDevice::get_harvesting_masks_for_soc_descriptors() { return {{0, 0}};} + +std::unordered_map tt_VersimDevice::get_harvesting_masks_for_soc_descriptors() { return {{0, 0}}; } // Meant to breakout running functions for simulator bool tt_VersimDevice::stop() { - std::cout << "Versim Device: Stop " << std::endl; - - versim::turn_off_device(*versim); - versim->shutdown(); - // Force free of all versim cores - for (auto x = 0; x < versim->grid_size.x; x++) { - for (auto y = 0; y < versim->grid_size.y; y++) { - delete versim->core_grid.at(x).at(y); + std::cout << "Versim Device: Stop " << std::endl; + + versim::turn_off_device(*versim); + versim->shutdown(); + // Force free of all versim cores + for (auto x = 0; x < versim->grid_size.x; x++) { + for (auto y = 0; y < 
versim->grid_size.y; y++) { + delete versim->core_grid.at(x).at(y); + } } - } - std::cout << "Versim Device: Stop completed " << std::endl; - delete versim; - return true; + std::cout << "Versim Device: Stop completed " << std::endl; + delete versim; + return true; } -std::map tt_VersimDevice::get_clocks() { - return std::map(); -} +std::map tt_VersimDevice::get_clocks() { return std::map(); } void tt_VersimDevice::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) { l1_address_params = l1_address_params_; @@ -305,11 +378,11 @@ void tt_VersimDevice::set_device_dram_address_params(const tt_device_dram_addres } std::uint32_t tt_VersimDevice::get_num_dram_channels(std::uint32_t device_id) { - return get_soc_descriptor(device_id) -> get_num_dram_channels(); + return get_soc_descriptor(device_id)->get_num_dram_channels(); } std::uint64_t tt_VersimDevice::get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) { - return get_soc_descriptor(device_id) -> dram_bank_size; // Space per channel is identical for now + return get_soc_descriptor(device_id)->dram_bank_size; // Space per channel is identical for now } std::uint32_t tt_VersimDevice::get_num_host_channels(std::uint32_t device_id) { diff --git a/device/simulation/deprecated/tt_versim_device.h b/device/simulation/deprecated/tt_versim_device.h index 05ac6b06c..2c71f1be0 100644 --- a/device/simulation/deprecated/tt_versim_device.h +++ b/device/simulation/deprecated/tt_versim_device.h @@ -11,42 +11,92 @@ #include "tt_xy_pair.h" class c_versim_core; -namespace nuapi {namespace device {template class Simulator;}} -namespace versim { - struct VersimSimulatorState; - using VersimSimulator = nuapi::device::Simulator; + +namespace nuapi { +namespace device { +template +class Simulator; } +} // namespace nuapi + +namespace versim { +struct VersimSimulatorState; +using VersimSimulator = nuapi::device::Simulator; +} // namespace versim /** * @brief Versim Backend Class, derived from 
the tt_device class * Implements APIs to communicate with a simulated (using Verilator) Tenstorrent Device. -*/ -class tt_VersimDevice: public tt_device -{ - public: + */ +class tt_VersimDevice : public tt_device { +public: virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_); virtual void set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_); - tt_VersimDevice(const std::string &sdesc_path, const std::string &ndesc_path); + tt_VersimDevice(const std::string& sdesc_path, const std::string& ndesc_path); virtual std::unordered_map& get_virtual_soc_descriptors(); - virtual void start(std::vector plusargs, std::vector dump_cores, bool no_checkers, bool init_device, bool skip_driver_allocs); - virtual void start_device(const tt_device_params &device_params); + virtual void start( + std::vector plusargs, + std::vector dump_cores, + bool no_checkers, + bool init_device, + bool skip_driver_allocs); + virtual void start_device(const tt_device_params& device_params); virtual void close_device(); virtual void deassert_risc_reset(); - virtual void deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets = TENSIX_DEASSERT_SOFT_RESET); + virtual void deassert_risc_reset_at_core( + tt_cxy_pair core, const TensixSoftResetOptions& soft_resets = TENSIX_DEASSERT_SOFT_RESET); virtual void assert_risc_reset(); virtual void assert_risc_reset_at_core(tt_cxy_pair core); - virtual void write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); - virtual void broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& columns_to_exclude, const std::string& fallback_tlb); - virtual void rolled_write_to_device(std::vector &vec, uint32_t 
unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); - virtual void read_from_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); - virtual void rolled_write_to_device(uint32_t* mem_ptr, uint32_t size_in_bytes, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb); - virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd = false, bool last_send_epoch_cmd = true, bool ordered_with_prev_remote_write = false); - virtual void read_from_device(void *mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); + virtual void write_to_device( + std::vector& vec, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd = false, + bool last_send_epoch_cmd = true, + bool ordered_with_prev_remote_write = false); + virtual void broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& rows_to_exclude, + std::set& columns_to_exclude, + const std::string& fallback_tlb); + virtual void rolled_write_to_device( + std::vector& vec, + uint32_t unroll_count, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use); + virtual void read_from_device( + std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); + virtual void rolled_write_to_device( + uint32_t* mem_ptr, + uint32_t size_in_bytes, + uint32_t unroll_count, + tt_cxy_pair core, + uint64_t addr, + const std::string& fallback_tlb); + virtual void write_to_device( + const void* mem_ptr, + uint32_t size_in_bytes, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd = false, + bool last_send_epoch_cmd = true, + bool ordered_with_prev_remote_write = false); + virtual void 
read_from_device( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use); virtual void wait_for_non_mmio_flush(); - void l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c); + void l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); + void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); + void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); + virtual void translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c); virtual bool using_harvested_soc_descriptors(); virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); virtual bool noc_translation_en(); @@ -57,12 +107,13 @@ class tt_VersimDevice: public tt_device virtual int get_number_of_chips_in_cluster(); virtual std::unordered_set get_all_chips_in_cluster(); static int detect_number_of_chips(); - virtual std::map get_clocks(); + virtual std::map get_clocks(); virtual std::uint32_t get_num_dram_channels(std::uint32_t device_id); virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel); virtual std::uint32_t get_num_host_channels(std::uint32_t device_id); virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel); - private: + +private: bool stop(); tt_device_l1_address_params l1_address_params; tt_device_dram_address_params dram_address_params; diff --git a/device/simulation/deprecated/tt_versim_stub.cpp 
b/device/simulation/deprecated/tt_versim_stub.cpp index 8cf0899b8..c80e0bddb 100644 --- a/device/simulation/deprecated/tt_versim_stub.cpp +++ b/device/simulation/deprecated/tt_versim_stub.cpp @@ -2,19 +2,18 @@ // // SPDX-License-Identifier: Apache-2.0 - -#include "cluster.h" - -#include #include +#include #include #include -tt_VersimDevice::tt_VersimDevice(const std::string &sdesc_path, const std::string &ndesc_path) : tt_device(sdesc_path) { - throw std::runtime_error("tt_VersimDevice() -- VERSIM is not supported in this build\n"); +#include "cluster.h" + +tt_VersimDevice::tt_VersimDevice(const std::string& sdesc_path, const std::string& ndesc_path) : tt_device(sdesc_path) { + throw std::runtime_error("tt_VersimDevice() -- VERSIM is not supported in this build\n"); } -tt_VersimDevice::~tt_VersimDevice () {} +tt_VersimDevice::~tt_VersimDevice() {} std::unordered_map& tt_VersimDevice::get_virtual_soc_descriptors() { throw std::runtime_error("tt_VersimDevice() -- VERSIM is not supported in this build\n"); @@ -22,23 +21,71 @@ std::unordered_map& tt_VersimDevice::get_virtual_so } int tt_VersimDevice::get_number_of_chips_in_cluster() { return detect_number_of_chips(); } + std::unordered_set tt_VersimDevice::get_all_chips_in_cluster() { return {}; } + int tt_VersimDevice::detect_number_of_chips() { return 0; } -void tt_VersimDevice::start_device(const tt_device_params &device_params) {} +void tt_VersimDevice::start_device(const tt_device_params& device_params) {} + void tt_VersimDevice::close_device() {} -void tt_VersimDevice::write_to_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) {} -void tt_VersimDevice::broadcast_write_to_cluster(const void *mem_ptr, uint32_t size_in_bytes, uint64_t address, const std::set& chips_to_exclude, std::set& rows_to_exclude, std::set& cols_to_exclude, const std::string& fallback_tlb) {} -void 
tt_VersimDevice::read_from_device(std::vector &vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) {} -void tt_VersimDevice::rolled_write_to_device(std::vector &vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) {} -void tt_VersimDevice::write_to_device(const void *mem_ptr, uint32_t len, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use, bool send_epoch_cmd, bool last_send_epoch_cmd, bool ordered_with_prev_remote_write) {} -void tt_VersimDevice::read_from_device(void *mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) {} -void tt_VersimDevice::rolled_write_to_device(uint32_t* mem_ptr, uint32_t len, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& fallback_tlb) {} + +void tt_VersimDevice::write_to_device( + std::vector& vec, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd, + bool last_send_epoch_cmd, + bool ordered_with_prev_remote_write) {} + +void tt_VersimDevice::broadcast_write_to_cluster( + const void* mem_ptr, + uint32_t size_in_bytes, + uint64_t address, + const std::set& chips_to_exclude, + std::set& rows_to_exclude, + std::set& cols_to_exclude, + const std::string& fallback_tlb) {} + +void tt_VersimDevice::read_from_device( + std::vector& vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) {} + +void tt_VersimDevice::rolled_write_to_device( + std::vector& vec, uint32_t unroll_count, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { +} + +void tt_VersimDevice::write_to_device( + const void* mem_ptr, + uint32_t len, + tt_cxy_pair core, + uint64_t addr, + const std::string& tlb_to_use, + bool send_epoch_cmd, + bool last_send_epoch_cmd, + bool ordered_with_prev_remote_write) {} + +void tt_VersimDevice::read_from_device( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) {} + +void 
tt_VersimDevice::rolled_write_to_device( + uint32_t* mem_ptr, + uint32_t len, + uint32_t unroll_count, + tt_cxy_pair core, + uint64_t addr, + const std::string& fallback_tlb) {} + void tt_VersimDevice::wait_for_non_mmio_flush() {} -void tt_VersimDevice::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} -void tt_VersimDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) {} -void tt_VersimDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& dram_cores) {} +void tt_VersimDevice::l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} + +void tt_VersimDevice::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) {} + +void tt_VersimDevice::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& dram_cores) {} void tt_VersimDevice::start( std::vector plusargs, @@ -49,36 +96,48 @@ void tt_VersimDevice::start( ) {} void tt_VersimDevice::deassert_risc_reset() {} -void tt_VersimDevice::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets) {} + +void tt_VersimDevice::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions& soft_resets) {} + void tt_VersimDevice::assert_risc_reset() {} + void tt_VersimDevice::assert_risc_reset_at_core(tt_cxy_pair core) {} -void tt_VersimDevice::translate_to_noc_table_coords(chip_id_t device_id, std::size_t &r, std::size_t &c) {}; +void tt_VersimDevice::translate_to_noc_table_coords(chip_id_t device_id, std::size_t& r, std::size_t& c){}; + // void tt_VersimDevice::dump_wall_clock_mailbox(std::string output_path, int device_id) {} -std::set tt_VersimDevice::get_target_mmio_device_ids() {return {};} -std::set tt_VersimDevice::get_target_remote_device_ids() {return {};} +std::set 
tt_VersimDevice::get_target_mmio_device_ids() { return {}; } + +std::set tt_VersimDevice::get_target_remote_device_ids() { return {}; } bool versim_check_dram_core_exists( - const std::vector> &dram_core_channels, tt_xy_pair target_core) { - return false; + const std::vector>& dram_core_channels, tt_xy_pair target_core) { + return false; } bool tt_VersimDevice::using_harvested_soc_descriptors() { return false; } + bool tt_VersimDevice::noc_translation_en() { return false; } -std::unordered_map tt_VersimDevice::get_harvesting_masks_for_soc_descriptors() { return std::unordered_map();} + +std::unordered_map tt_VersimDevice::get_harvesting_masks_for_soc_descriptors() { + return std::unordered_map(); +} bool tt_VersimDevice::stop() { return true; } void tt_VersimDevice::set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_) {} + void tt_VersimDevice::set_device_dram_address_params(const tt_device_dram_address_params& dram_address_params_) {} -std::uint32_t tt_VersimDevice::get_num_dram_channels(std::uint32_t device_id) {return 0;} -std::uint64_t tt_VersimDevice::get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) {return 0;} -std::uint32_t tt_VersimDevice::get_num_host_channels(std::uint32_t device_id) {return 0;} -std::uint32_t tt_VersimDevice::get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) {return 0;} +std::uint32_t tt_VersimDevice::get_num_dram_channels(std::uint32_t device_id) { return 0; } + +std::uint64_t tt_VersimDevice::get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) { return 0; } + +std::uint32_t tt_VersimDevice::get_num_host_channels(std::uint32_t device_id) { return 0; } -std::map tt_VersimDevice::get_clocks() {return std::map();} +std::uint32_t tt_VersimDevice::get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) { return 0; } -tt_ClusterDescriptor* tt_VersimDevice::get_cluster_description() {return ndesc.get();} +std::map 
tt_VersimDevice::get_clocks() { return std::map(); } +tt_ClusterDescriptor* tt_VersimDevice::get_cluster_description() { return ndesc.get(); } diff --git a/device/simulation/tt_simulation_device.cpp b/device/simulation/tt_simulation_device.cpp index a77a8ad3f..e703b26eb 100644 --- a/device/simulation/tt_simulation_device.cpp +++ b/device/simulation/tt_simulation_device.cpp @@ -4,43 +4,44 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include +#include "tt_simulation_device.h" + +#include +#include + #include +#include #include #include -#include -#include - -#include "common/logger.hpp" #include "common/assert.hpp" +#include "common/logger.hpp" #include "device/driver_atomics.h" #include "device/tt_cluster_descriptor.h" - -#include "tt_simulation_device.h" #include "tt_simulation_device_generated.h" -flatbuffers::FlatBufferBuilder create_flatbuffer(DEVICE_COMMAND rw, std::vector vec, tt_cxy_pair core_, uint64_t addr, uint64_t size_=0){ +flatbuffers::FlatBufferBuilder create_flatbuffer( + DEVICE_COMMAND rw, std::vector vec, tt_cxy_pair core_, uint64_t addr, uint64_t size_ = 0) { flatbuffers::FlatBufferBuilder builder; auto data = builder.CreateVector(vec); auto core = tt_vcs_core(core_.x, core_.y); - uint64_t size = size_ == 0 ? size = vec.size()*sizeof(uint32_t) : size = size_; + uint64_t size = size_ == 0 ? 
size = vec.size() * sizeof(uint32_t) : size = size_; auto device_cmd = CreateDeviceRequestResponse(builder, rw, data, &core, addr, size); builder.Finish(device_cmd); return builder; } -void print_flatbuffer(const DeviceRequestResponse *buf){ +void print_flatbuffer(const DeviceRequestResponse* buf) { std::vector data_vec(buf->data()->begin(), buf->data()->end()); uint64_t addr = buf->address(); uint32_t size = buf->size(); tt_cxy_pair core = {0, buf->core()->x(), buf->core()->y()}; - + std::stringstream ss; ss << std::hex << reinterpret_cast(addr); std::string addr_hex = ss.str(); log_info(tt::LogEmulationDriver, "{} bytes @ address {} in core ({}, {})", size, addr_hex, core.x, core.y); - for(int i = 0; i < data_vec.size(); i++){ + for (int i = 0; i < data_vec.size(); i++) { std::ios_base::fmtflags save = std::cout.flags(); std::cout << "0x" << std::hex << std::setw(8) << std::setfill('0') << data_vec[i] << " "; std::cout.flags(save); @@ -48,14 +49,14 @@ void print_flatbuffer(const DeviceRequestResponse *buf){ std::cout << std::endl; } -tt_SimulationDevice::tt_SimulationDevice(const std::string &sdesc_path) : tt_device(){ +tt_SimulationDevice::tt_SimulationDevice(const std::string& sdesc_path) : tt_device() { log_info(tt::LogEmulationDriver, "Instantiating simulation device"); soc_descriptor_per_chip.emplace(0, tt_SocDescriptor(sdesc_path)); std::set target_devices = {0}; - + // Start VCS simulator in a separate process TT_ASSERT(std::getenv("TT_REMOTE_EXE"), "TT_REMOTE_EXE not set, please provide path to the VCS binary"); - uv_loop_t *loop = uv_default_loop(); + uv_loop_t* loop = uv_default_loop(); uv_process_t child_p; uv_process_options_t child_options = {0}; @@ -69,14 +70,12 @@ tt_SimulationDevice::tt_SimulationDevice(const std::string &sdesc_path) : tt_dev log_info(tt::LogEmulationDriver, "Simulator process spawned with PID: {}", child_p.pid); } - uv_unref((uv_handle_t *) &child_p); + uv_unref((uv_handle_t*)&child_p); uv_run(loop, UV_RUN_DEFAULT); 
uv_loop_close(loop); } -tt_SimulationDevice::~tt_SimulationDevice() { - close_device(); -} +tt_SimulationDevice::~tt_SimulationDevice() { close_device(); } // Setup/Teardown Functions std::unordered_map& tt_SimulationDevice::get_virtual_soc_descriptors() { @@ -99,11 +98,11 @@ void tt_SimulationDevice::set_driver_eth_interface_params(const tt_driver_eth_in eth_interface_params = eth_interface_params_; } -void tt_SimulationDevice::start_device(const tt_device_params &device_params) { - void *buf_ptr = nullptr; +void tt_SimulationDevice::start_device(const tt_device_params& device_params) { + void* buf_ptr = nullptr; host.start_host(); - + log_info(tt::LogEmulationDriver, "Waiting for ack msg from remote..."); size_t buf_size = host.recv_from_device(&buf_ptr); auto buf = GetDeviceRequestResponse(buf_ptr); @@ -114,8 +113,9 @@ void tt_SimulationDevice::start_device(const tt_device_params &device_params) { void tt_SimulationDevice::assert_risc_reset() { log_info(tt::LogEmulationDriver, "Sending assert_risc_reset signal.."); - auto wr_buffer = create_flatbuffer(DEVICE_COMMAND_ALL_TENSIX_RESET_ASSERT, std::vector(1, 0), {0, 0, 0}, 0); - uint8_t *wr_buffer_ptr = wr_buffer.GetBufferPointer(); + auto wr_buffer = + create_flatbuffer(DEVICE_COMMAND_ALL_TENSIX_RESET_ASSERT, std::vector(1, 0), {0, 0, 0}, 0); + uint8_t* wr_buffer_ptr = wr_buffer.GetBufferPointer(); size_t wr_buffer_size = wr_buffer.GetSize(); print_flatbuffer(GetDeviceRequestResponse(wr_buffer_ptr)); @@ -124,20 +124,25 @@ void tt_SimulationDevice::assert_risc_reset() { void tt_SimulationDevice::deassert_risc_reset() { log_info(tt::LogEmulationDriver, "Sending 'deassert_risc_reset' signal.."); - auto wr_buffer = create_flatbuffer(DEVICE_COMMAND_ALL_TENSIX_RESET_DEASSERT, std::vector(1, 0), {0, 0, 0}, 0); - uint8_t *wr_buffer_ptr = wr_buffer.GetBufferPointer(); + auto wr_buffer = + create_flatbuffer(DEVICE_COMMAND_ALL_TENSIX_RESET_DEASSERT, std::vector(1, 0), {0, 0, 0}, 0); + uint8_t* wr_buffer_ptr = 
wr_buffer.GetBufferPointer(); size_t wr_buffer_size = wr_buffer.GetSize(); host.send_to_device(wr_buffer_ptr, wr_buffer_size); } -void tt_SimulationDevice::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets) { - log_info(tt::LogEmulationDriver, "Sending 'deassert_risc_reset_at_core'.. (Not implemented, defaulting to 'deassert_risc_reset' instead)"); +void tt_SimulationDevice::deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions& soft_resets) { + log_info( + tt::LogEmulationDriver, + "Sending 'deassert_risc_reset_at_core'.. (Not implemented, defaulting to 'deassert_risc_reset' instead)"); deassert_risc_reset(); } void tt_SimulationDevice::assert_risc_reset_at_core(tt_cxy_pair core) { - log_info(tt::LogEmulationDriver, "Sending 'assert_risc_reset_at_core'.. (Not implemented, defaulting to 'assert_risc_reset' instead)"); + log_info( + tt::LogEmulationDriver, + "Sending 'assert_risc_reset_at_core'.. (Not implemented, defaulting to 'assert_risc_reset' instead)"); assert_risc_reset(); } @@ -149,19 +154,21 @@ void tt_SimulationDevice::close_device() { } // Runtime Functions -void tt_SimulationDevice::write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { +void tt_SimulationDevice::write_to_device( + const void* mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use) { log_info(tt::LogEmulationDriver, "Device writing"); std::vector data((uint32_t*)mem_ptr, (uint32_t*)mem_ptr + size_in_bytes / sizeof(uint32_t)); auto wr_buffer = create_flatbuffer(DEVICE_COMMAND_WRITE, data, core, addr); - uint8_t *wr_buffer_ptr = wr_buffer.GetBufferPointer(); + uint8_t* wr_buffer_ptr = wr_buffer.GetBufferPointer(); size_t wr_buffer_size = wr_buffer.GetSize(); - - print_flatbuffer(GetDeviceRequestResponse(wr_buffer_ptr)); // sanity print + + print_flatbuffer(GetDeviceRequestResponse(wr_buffer_ptr)); // sanity print 
host.send_to_device(wr_buffer_ptr, wr_buffer_size); } -void tt_SimulationDevice::read_from_device(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { - void *rd_resp; +void tt_SimulationDevice::read_from_device( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb) { + void* rd_resp; // Send read request auto rd_req_buf = create_flatbuffer(DEVICE_COMMAND_READ, {0}, core, addr, size); @@ -171,50 +178,49 @@ void tt_SimulationDevice::read_from_device(void* mem_ptr, tt_cxy_pair core, uint size_t rd_rsp_sz = host.recv_from_device(&rd_resp); auto rd_resp_buf = GetDeviceRequestResponse(rd_resp); - if (addr != 0x40){ + if (addr != 0x40) { log_info(tt::LogEmulationDriver, "Device reading vec"); - print_flatbuffer(rd_resp_buf); // 0x40 is host polling device, don't print since it'll spam + print_flatbuffer(rd_resp_buf); // 0x40 is host polling device, don't print since it'll spam } std::memcpy(mem_ptr, rd_resp_buf->data()->data(), rd_resp_buf->data()->size() * sizeof(uint32_t)); nng_free(rd_resp, rd_rsp_sz); } void tt_SimulationDevice::wait_for_non_mmio_flush() {} + void tt_SimulationDevice::wait_for_non_mmio_flush(const chip_id_t chip) {} -void tt_SimulationDevice::l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} -void tt_SimulationDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) {} -void tt_SimulationDevice::dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} + +void tt_SimulationDevice::l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores) {} + +void tt_SimulationDevice::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels) {} + +void tt_SimulationDevice::dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, 
const std::unordered_set& cores) {} // Misc. Functions to Query/Set Device State std::unordered_map tt_SimulationDevice::get_harvesting_masks_for_soc_descriptors() { return {{0, 0}}; } -std::vector tt_SimulationDevice::detect_available_device_ids() { - return {0}; -} +std::vector tt_SimulationDevice::detect_available_device_ids() { return {0}; } -std::set tt_SimulationDevice::get_target_remote_device_ids() { - return target_remote_chips; -} +std::set tt_SimulationDevice::get_target_remote_device_ids() { return target_remote_chips; } -std::map tt_SimulationDevice::get_clocks() { - return {{0, 0}}; -} +std::map tt_SimulationDevice::get_clocks() { return {{0, 0}}; } -void *tt_SimulationDevice::host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const { +void* tt_SimulationDevice::host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const { return nullptr; } std::uint64_t tt_SimulationDevice::get_pcie_base_addr_from_device(const chip_id_t chip_id) const { - if(arch_name == tt::ARCH::WORMHOLE_B0) { + if (arch_name == tt::ARCH::WORMHOLE_B0) { return 0x800000000; - } - else if (arch_name == tt::ARCH::BLACKHOLE) { + } else if (arch_name == tt::ARCH::BLACKHOLE) { // Enable 4th ATU window. 
return 1ULL << 60; - } - else { + } else { return 0; } } @@ -224,12 +230,11 @@ std::uint32_t tt_SimulationDevice::get_num_dram_channels(std::uint32_t device_id } std::uint64_t tt_SimulationDevice::get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel) { - return get_soc_descriptor(device_id).dram_bank_size; // Space per channel is identical for now + return get_soc_descriptor(device_id).dram_bank_size; // Space per channel is identical for now } -std::uint32_t tt_SimulationDevice::get_num_host_channels(std::uint32_t device_id) { - return 1; -} +std::uint32_t tt_SimulationDevice::get_num_host_channels(std::uint32_t device_id) { return 1; } + +std::uint32_t tt_SimulationDevice::get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) { return 0; } -std::uint32_t tt_SimulationDevice::get_host_channel_size(std::uint32_t device_id, std::uint32_t channel) {return 0;} -std::uint32_t tt_SimulationDevice::get_numa_node_for_pcie_device(std::uint32_t device_id) {return 0;} +std::uint32_t tt_SimulationDevice::get_numa_node_for_pcie_device(std::uint32_t device_id) { return 0; } diff --git a/device/simulation/tt_simulation_device.h b/device/simulation/tt_simulation_device.h index 16dd47d75..5f468a11b 100644 --- a/device/simulation/tt_simulation_device.h +++ b/device/simulation/tt_simulation_device.h @@ -13,43 +13,49 @@ #include "device/cluster.h" #include "device/simulation/tt_simulation_host.hpp" -class tt_SimulationDevice: public tt_device { - public: - tt_SimulationDevice(const std::string &sdesc_path); +class tt_SimulationDevice : public tt_device { +public: + tt_SimulationDevice(const std::string& sdesc_path); ~tt_SimulationDevice(); tt_SimulationHost host; - //Setup/Teardown Functions + // Setup/Teardown Functions virtual std::unordered_map& get_virtual_soc_descriptors(); virtual void set_device_l1_address_params(const tt_device_l1_address_params& l1_address_params_); virtual void set_device_dram_address_params(const 
tt_device_dram_address_params& dram_address_params_); virtual void set_driver_host_address_params(const tt_driver_host_address_params& host_address_params_); virtual void set_driver_eth_interface_params(const tt_driver_eth_interface_params& eth_interface_params_); - virtual void start_device(const tt_device_params &device_params); + virtual void start_device(const tt_device_params& device_params); virtual void assert_risc_reset(); virtual void deassert_risc_reset(); - virtual void deassert_risc_reset_at_core(tt_cxy_pair core, const TensixSoftResetOptions &soft_resets = TENSIX_DEASSERT_SOFT_RESET); + virtual void deassert_risc_reset_at_core( + tt_cxy_pair core, const TensixSoftResetOptions& soft_resets = TENSIX_DEASSERT_SOFT_RESET); virtual void assert_risc_reset_at_core(tt_cxy_pair core); virtual void close_device(); // Runtime Functions - virtual void write_to_device(const void *mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); - virtual void read_from_device(void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); + virtual void write_to_device( + const void* mem_ptr, uint32_t size_in_bytes, tt_cxy_pair core, uint64_t addr, const std::string& tlb_to_use); + virtual void read_from_device( + void* mem_ptr, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& fallback_tlb); virtual void wait_for_non_mmio_flush(); virtual void wait_for_non_mmio_flush(const chip_id_t chip); - void l1_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& channels); - void dram_membar(const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); + void l1_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); + void dram_membar( + const chip_id_t chip, const 
std::string& fallback_tlb, const std::unordered_set& channels); + void dram_membar( + const chip_id_t chip, const std::string& fallback_tlb, const std::unordered_set& cores = {}); // Misc. Functions to Query/Set Device State // virtual bool using_harvested_soc_descriptors(); virtual std::unordered_map get_harvesting_masks_for_soc_descriptors(); static std::vector detect_available_device_ids(); virtual std::set get_target_remote_device_ids(); - virtual std::map get_clocks(); - virtual void *host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const; + virtual std::map get_clocks(); + virtual void* host_dma_address(std::uint64_t offset, chip_id_t src_device_id, uint16_t channel) const; virtual std::uint64_t get_pcie_base_addr_from_device(const chip_id_t chip_id) const; virtual std::uint32_t get_num_dram_channels(std::uint32_t device_id); virtual std::uint64_t get_dram_channel_size(std::uint32_t device_id, std::uint32_t channel); @@ -57,7 +63,7 @@ class tt_SimulationDevice: public tt_device { virtual std::uint32_t get_host_channel_size(std::uint32_t device_id, std::uint32_t channel); virtual std::uint32_t get_numa_node_for_pcie_device(std::uint32_t device_id); - private: +private: // State variables tt_device_dram_address_params dram_address_params; tt_device_l1_address_params l1_address_params; diff --git a/device/simulation/tt_simulation_host.cpp b/device/simulation/tt_simulation_host.cpp index ed9cf7e94..309bb7be4 100644 --- a/device/simulation/tt_simulation_host.cpp +++ b/device/simulation/tt_simulation_host.cpp @@ -2,19 +2,20 @@ // // SPDX-License-Identifier: Apache-2.0 -#include -#include -#include -#include -#include -#include +#include "tt_simulation_host.hpp" #include #include -#include "common/logger.hpp" +#include +#include +#include +#include +#include +#include + #include "common/assert.hpp" -#include "tt_simulation_host.hpp" +#include "common/logger.hpp" tt_SimulationHost::tt_SimulationHost() { // Initialize socket and dialer 
@@ -64,7 +65,7 @@ void tt_SimulationHost::start_host() { void tt_SimulationHost::send_to_device(uint8_t *buf, size_t buf_size) { int rv; log_debug(tt::LogEmulationDriver, "Sending messsage to remote.."); - + void *msg = nng_alloc(buf_size); std::memcpy(msg, buf, buf_size); diff --git a/device/simulation/tt_simulation_host.hpp b/device/simulation/tt_simulation_host.hpp index 6de18a04e..26897a447 100644 --- a/device/simulation/tt_simulation_host.hpp +++ b/device/simulation/tt_simulation_host.hpp @@ -1,9 +1,9 @@ // SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. // // SPDX-License-Identifier: Apache-2.0 -#include #include #include +#include #include "device/tt_xy_pair.h" @@ -20,6 +20,7 @@ class tt_SimulationHost { void start_host(); void send_to_device(uint8_t *buf, size_t buf_size); size_t recv_from_device(void **data_ptr); + private: std::unique_ptr host_socket; std::unique_ptr host_dialer; diff --git a/device/tlb.h b/device/tlb.h index 3e8fb8268..300942029 100644 --- a/device/tlb.h +++ b/device/tlb.h @@ -8,8 +8,8 @@ #include #include -#include #include +#include namespace tt::umd { @@ -41,10 +41,10 @@ struct tlb_data { // Orderings static constexpr uint64_t Relaxed = 0; - static constexpr uint64_t Strict = 1; - static constexpr uint64_t Posted = 2; + static constexpr uint64_t Strict = 1; + static constexpr uint64_t Posted = 2; - bool check(const tlb_offsets & offset) const; + bool check(const tlb_offsets &offset) const; std::pair apply_offset(const tlb_offsets &offset) const; }; diff --git a/device/tt_arch_types.h b/device/tt_arch_types.h index 8a7c5dba1..c165bf1b4 100644 --- a/device/tt_arch_types.h +++ b/device/tt_arch_types.h @@ -17,4 +17,4 @@ enum class ARCH { BLACKHOLE = 3, Invalid = 0xFF, }; -} +} // namespace tt diff --git a/device/tt_cluster_descriptor.cpp b/device/tt_cluster_descriptor.cpp index 0ed661203..b427fc9f2 100644 --- a/device/tt_cluster_descriptor.cpp +++ b/device/tt_cluster_descriptor.cpp @@ -2,24 +2,25 @@ // // SPDX-License-Identifier: 
Apache-2.0 - #include "tt_cluster_descriptor.h" -#include "libs/create_ethernet_map.h" #include #include -#include +#include #include "common/disjoint_set.hpp" #include "common/logger.hpp" -#include "yaml-cpp/yaml.h" - #include "fmt/core.h" +#include "libs/create_ethernet_map.h" +#include "yaml-cpp/yaml.h" using namespace tt; -bool tt_ClusterDescriptor::ethernet_core_has_active_ethernet_link(chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const { + +bool tt_ClusterDescriptor::ethernet_core_has_active_ethernet_link( + chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const { return this->ethernet_connections.find(local_chip) != this->ethernet_connections.end() && - this->ethernet_connections.at(local_chip).find(local_ethernet_channel) != this->ethernet_connections.at(local_chip).end(); + this->ethernet_connections.at(local_chip).find(local_ethernet_channel) != + this->ethernet_connections.at(local_chip).end(); } std::tuple tt_ClusterDescriptor::get_chip_and_channel_of_remote_ethernet_core( @@ -40,10 +41,14 @@ std::tuple tt_ClusterDescriptor::get_chip_and_cha } } -// NOTE: It might be worthwhile to precompute this for every pair of directly connected chips, depending on how extensively router needs to use it -std::vector> tt_ClusterDescriptor::get_directly_connected_ethernet_channels_between_chips(const chip_id_t &first, const chip_id_t &second) const { +// NOTE: It might be worthwhile to precompute this for every pair of directly connected chips, depending on how +// extensively router needs to use it +std::vector> +tt_ClusterDescriptor::get_directly_connected_ethernet_channels_between_chips( + const chip_id_t &first, const chip_id_t &second) const { std::vector> directly_connected_channels = {}; - if (this->enabled_active_chips.find(first) == this->enabled_active_chips.end() || this->enabled_active_chips.find(second) == this->enabled_active_chips.end()) { + if (this->enabled_active_chips.find(first) == 
this->enabled_active_chips.end() || + this->enabled_active_chips.find(second) == this->enabled_active_chips.end()) { return {}; } @@ -60,9 +65,7 @@ bool tt_ClusterDescriptor::is_chip_mmio_capable(const chip_id_t chip_id) const { return this->chips_with_mmio.find(chip_id) != this->chips_with_mmio.end(); } -bool tt_ClusterDescriptor::is_chip_remote(const chip_id_t chip_id) const { - return !is_chip_mmio_capable(chip_id); -} +bool tt_ClusterDescriptor::is_chip_remote(const chip_id_t chip_id) const { return !is_chip_mmio_capable(chip_id); } // given two coordinates, finds the number of hops between the two chips // it assumes that shelves are connected in x-dim and racks are connected in y-dim @@ -71,11 +74,21 @@ bool tt_ClusterDescriptor::is_chip_remote(const chip_id_t chip_id) const { // then once a chip on the same shelf&rack is found, // the distance from this chip to either location_a or location_b is just x&y dim difference. // the function returns the total distance of travelled between shelves and racks, plust the x&y dim difference -int tt_ClusterDescriptor::get_ethernet_link_coord_distance(const eth_coord_t &location_a, const eth_coord_t &location_b) const { - - log_trace(LogSiliconDriver, "get_ethernet_link_coord_distance from ({}, {}, {}, {}, {}) to ({}, {}, {}, {}, {})", - location_a.cluster_id, location_a.x, location_a.y, location_a.rack, location_a.shelf, - location_b.cluster_id, location_b.x, location_b.y, location_b.rack, location_b.shelf); +int tt_ClusterDescriptor::get_ethernet_link_coord_distance( + const eth_coord_t &location_a, const eth_coord_t &location_b) const { + log_trace( + LogSiliconDriver, + "get_ethernet_link_coord_distance from ({}, {}, {}, {}, {}) to ({}, {}, {}, {}, {})", + location_a.cluster_id, + location_a.x, + location_a.y, + location_a.rack, + location_a.shelf, + location_b.cluster_id, + location_b.x, + location_b.y, + location_b.rack, + location_b.shelf); if (location_a.cluster_id != location_b.cluster_id) { return 
std::numeric_limits::max(); @@ -85,166 +98,242 @@ int tt_ClusterDescriptor::get_ethernet_link_coord_distance(const eth_coord_t &lo int y_distance = std::abs(location_a.y - location_b.y); // move along y-dim to exit from the shelf to go to a higher shelf - if(location_b.shelf > location_a.shelf) { + if (location_b.shelf > location_a.shelf) { // this is already verified where galaxy_shelves_exit_chip_coords_per_y_dim is populated, but just to be safe - log_assert(galaxy_shelves_exit_chip_coords_per_y_dim.find(location_a.shelf) != galaxy_shelves_exit_chip_coords_per_y_dim.end(), + log_assert( + galaxy_shelves_exit_chip_coords_per_y_dim.find(location_a.shelf) != + galaxy_shelves_exit_chip_coords_per_y_dim.end(), "Expected shelf-to-shelf connection"); // this row does not have a shelf-to-shelf connection - if(galaxy_shelves_exit_chip_coords_per_y_dim.at(location_a.shelf).find(location_a.y) == galaxy_shelves_exit_chip_coords_per_y_dim.at(location_a.shelf).end()) { + if (galaxy_shelves_exit_chip_coords_per_y_dim.at(location_a.shelf).find(location_a.y) == + galaxy_shelves_exit_chip_coords_per_y_dim.at(location_a.shelf).end()) { return std::numeric_limits::max(); } - const Chip2ChipConnection& shelf_to_shelf_connection = galaxy_shelves_exit_chip_coords_per_y_dim.at(location_a.shelf).at(location_a.y); - log_assert(shelf_to_shelf_connection.destination_chip_coords.size(), "Expecting at least one shelf-to-shelf connection, possibly one-to-many"); + const Chip2ChipConnection &shelf_to_shelf_connection = + galaxy_shelves_exit_chip_coords_per_y_dim.at(location_a.shelf).at(location_a.y); + log_assert( + shelf_to_shelf_connection.destination_chip_coords.size(), + "Expecting at least one shelf-to-shelf connection, possibly one-to-many"); // for each shelf-to-shelf connection at location_a.y, find the distance to location_b, take min int distance = std::numeric_limits::max(); eth_coord_t exit_shelf = shelf_to_shelf_connection.source_chip_coord; - for(eth_coord_t next_shelf : 
shelf_to_shelf_connection.destination_chip_coords) { - - log_assert(exit_shelf.y == location_a.y && exit_shelf.shelf == location_a.shelf && exit_shelf.rack == location_a.rack, + for (eth_coord_t next_shelf : shelf_to_shelf_connection.destination_chip_coords) { + log_assert( + exit_shelf.y == location_a.y && exit_shelf.shelf == location_a.shelf && + exit_shelf.rack == location_a.rack, "Invalid shelf exit coordinates"); // next shelf could be at a different y-dim in nebula->galaxy systems - log_assert(next_shelf.shelf == (location_a.shelf+1) && next_shelf.rack == location_a.rack, + log_assert( + next_shelf.shelf == (location_a.shelf + 1) && next_shelf.rack == location_a.rack, "Invalid shelf entry coordinates"); // hop onto the next shelf and find distance from there int distance_to_exit = get_ethernet_link_coord_distance(location_a, exit_shelf); int distance_in_next_shelf = get_ethernet_link_coord_distance(next_shelf, location_b); // no path found - if(distance_to_exit == std::numeric_limits::max() || distance_in_next_shelf == std::numeric_limits::max()) { + if (distance_to_exit == std::numeric_limits::max() || + distance_in_next_shelf == std::numeric_limits::max()) { continue; } distance = std::min(distance, distance_to_exit + distance_in_next_shelf + 1); } - log_trace(LogSiliconDriver, "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", - location_a.x, location_a.y, location_a.rack, location_a.shelf, - location_b.x, location_b.y, location_b.rack, location_b.shelf, distance); + log_trace( + LogSiliconDriver, + "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", + location_a.x, + location_a.y, + location_a.rack, + location_a.shelf, + location_b.x, + location_b.y, + location_b.rack, + location_b.shelf, + distance); return distance; - } - else if(location_a.shelf > location_b.shelf) { - + } else if (location_a.shelf > location_b.shelf) { // this is already verified where galaxy_shelves_exit_chip_coords_per_y_dim is populated, but just to be safe - 
log_assert(galaxy_shelves_exit_chip_coords_per_y_dim.find(location_b.shelf) != galaxy_shelves_exit_chip_coords_per_y_dim.end(), + log_assert( + galaxy_shelves_exit_chip_coords_per_y_dim.find(location_b.shelf) != + galaxy_shelves_exit_chip_coords_per_y_dim.end(), "Expected shelf-to-shelf connection"); // this row does not have a shelf-to-shelf connection - if(galaxy_shelves_exit_chip_coords_per_y_dim.at(location_b.shelf).find(location_b.y) == galaxy_shelves_exit_chip_coords_per_y_dim.at(location_b.shelf).end()) { + if (galaxy_shelves_exit_chip_coords_per_y_dim.at(location_b.shelf).find(location_b.y) == + galaxy_shelves_exit_chip_coords_per_y_dim.at(location_b.shelf).end()) { return std::numeric_limits::max(); } - const Chip2ChipConnection& shelf_to_shelf_connection = galaxy_shelves_exit_chip_coords_per_y_dim.at(location_b.shelf).at(location_b.y); - log_assert(shelf_to_shelf_connection.destination_chip_coords.size(), "Expecting at least one shelf-to-shelf connection, possibly one-to-many") + const Chip2ChipConnection &shelf_to_shelf_connection = + galaxy_shelves_exit_chip_coords_per_y_dim.at(location_b.shelf).at(location_b.y); + log_assert( + shelf_to_shelf_connection.destination_chip_coords.size(), + "Expecting at least one shelf-to-shelf connection, possibly one-to-many") - // for each shelf-to-shelf connection at location_b.y, find the distance to location_a, take min - int distance = std::numeric_limits::max(); + // for each shelf-to-shelf connection at location_b.y, find the distance to location_a, take min + int distance = std::numeric_limits::max(); eth_coord_t exit_shelf = shelf_to_shelf_connection.source_chip_coord; - for(eth_coord_t next_shelf : shelf_to_shelf_connection.destination_chip_coords) { - - log_assert(exit_shelf.y == location_b.y && exit_shelf.shelf == location_b.shelf && exit_shelf.rack == location_b.rack, + for (eth_coord_t next_shelf : shelf_to_shelf_connection.destination_chip_coords) { + log_assert( + exit_shelf.y == location_b.y && 
exit_shelf.shelf == location_b.shelf && + exit_shelf.rack == location_b.rack, "Invalid shelf exit coordinates"); // next shelf could be at a different y-dim in nebula->galaxy systems - log_assert(next_shelf.shelf == (location_b.shelf+1) && next_shelf.rack == location_b.rack, + log_assert( + next_shelf.shelf == (location_b.shelf + 1) && next_shelf.rack == location_b.rack, "Invalid shelf entry coordinates"); // hop onto the next shelf and find distance from there int distance_to_exit = get_ethernet_link_coord_distance(location_b, exit_shelf); int distance_in_next_shelf = get_ethernet_link_coord_distance(next_shelf, location_a); // no path found - if(distance_to_exit == std::numeric_limits::max() || distance_in_next_shelf == std::numeric_limits::max()) { + if (distance_to_exit == std::numeric_limits::max() || + distance_in_next_shelf == std::numeric_limits::max()) { continue; } distance = std::min(distance, distance_to_exit + distance_in_next_shelf + 1); } - log_trace(LogSiliconDriver, "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", - location_a.x, location_a.y, location_a.rack, location_a.shelf, - location_b.x, location_b.y, location_b.rack, location_b.shelf, distance); + log_trace( + LogSiliconDriver, + "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", + location_a.x, + location_a.y, + location_a.rack, + location_a.shelf, + location_b.x, + location_b.y, + location_b.rack, + location_b.shelf, + distance); return distance; } // move along y-dim to exit from the shelf to go to a higher shelf - if(location_b.rack > location_a.rack) { - + if (location_b.rack > location_a.rack) { // this is already verified where galaxy_racks_exit_chip_coords_per_x_dim is populated, but just to be safe - log_assert(galaxy_racks_exit_chip_coords_per_x_dim.find(location_a.rack) != galaxy_racks_exit_chip_coords_per_x_dim.end(), + log_assert( + galaxy_racks_exit_chip_coords_per_x_dim.find(location_a.rack) != + galaxy_racks_exit_chip_coords_per_x_dim.end(), "Expected 
rack-to-rack connection"); // this row does not have a rack-to-rack connection - if(galaxy_racks_exit_chip_coords_per_x_dim.at(location_a.rack).find(location_a.x) == galaxy_racks_exit_chip_coords_per_x_dim.at(location_a.rack).end()) { + if (galaxy_racks_exit_chip_coords_per_x_dim.at(location_a.rack).find(location_a.x) == + galaxy_racks_exit_chip_coords_per_x_dim.at(location_a.rack).end()) { return std::numeric_limits::max(); } - const Chip2ChipConnection& rack_to_rack_connection = galaxy_racks_exit_chip_coords_per_x_dim.at(location_a.rack).at(location_a.x); - log_assert(rack_to_rack_connection.destination_chip_coords.size(), "Expecting at least one rack-to-rack connection, possibly one-to-many"); + const Chip2ChipConnection &rack_to_rack_connection = + galaxy_racks_exit_chip_coords_per_x_dim.at(location_a.rack).at(location_a.x); + log_assert( + rack_to_rack_connection.destination_chip_coords.size(), + "Expecting at least one rack-to-rack connection, possibly one-to-many"); // for each rack-to-rack connection at location_a.x, find the distance to location_b, take min int distance = std::numeric_limits::max(); eth_coord_t exit_rack = rack_to_rack_connection.source_chip_coord; - for(eth_coord_t next_rack : rack_to_rack_connection.destination_chip_coords) { - - log_assert(exit_rack.x == location_a.x && exit_rack.shelf == location_a.shelf && exit_rack.rack == location_a.rack, + for (eth_coord_t next_rack : rack_to_rack_connection.destination_chip_coords) { + log_assert( + exit_rack.x == location_a.x && exit_rack.shelf == location_a.shelf && exit_rack.rack == location_a.rack, "Invalid rack exit coordinates"); - log_assert(next_rack.x == location_a.x && next_rack.shelf == location_a.shelf && next_rack.rack == (location_a.rack+1), + log_assert( + next_rack.x == location_a.x && next_rack.shelf == location_a.shelf && + next_rack.rack == (location_a.rack + 1), "Invalid rack entry coordinates"); // hop onto the next rack and find distance from there int distance_to_exit = 
get_ethernet_link_coord_distance(location_a, exit_rack); int distance_in_next_rack = get_ethernet_link_coord_distance(next_rack, location_b); // no path found - if (distance_to_exit == std::numeric_limits::max() || distance_in_next_rack == std::numeric_limits::max()) { + if (distance_to_exit == std::numeric_limits::max() || + distance_in_next_rack == std::numeric_limits::max()) { continue; } distance = std::min(distance, distance_to_exit + distance_in_next_rack + 1); } - log_trace(LogSiliconDriver, "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", - location_a.x, location_a.y, location_a.rack, location_a.shelf, - location_b.x, location_b.y, location_b.rack, location_b.shelf, distance); + log_trace( + LogSiliconDriver, + "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", + location_a.x, + location_a.y, + location_a.rack, + location_a.shelf, + location_b.x, + location_b.y, + location_b.rack, + location_b.shelf, + distance); return distance; - } - else if(location_a.rack > location_b.rack) { - + } else if (location_a.rack > location_b.rack) { // this is already verified where galaxy_racks_exit_chip_coords_per_x_dim is populated, but just to be safe - log_assert(galaxy_racks_exit_chip_coords_per_x_dim.find(location_b.rack) != galaxy_racks_exit_chip_coords_per_x_dim.end(), + log_assert( + galaxy_racks_exit_chip_coords_per_x_dim.find(location_b.rack) != + galaxy_racks_exit_chip_coords_per_x_dim.end(), "Expected rack-to-rack connection"); // this row does not have a rack-to-rack connection - if(galaxy_racks_exit_chip_coords_per_x_dim.at(location_b.rack).find(location_b.x) == galaxy_racks_exit_chip_coords_per_x_dim.at(location_b.rack).end()) { + if (galaxy_racks_exit_chip_coords_per_x_dim.at(location_b.rack).find(location_b.x) == + galaxy_racks_exit_chip_coords_per_x_dim.at(location_b.rack).end()) { return std::numeric_limits::max(); } - const Chip2ChipConnection& rack_to_rack_connection = 
galaxy_racks_exit_chip_coords_per_x_dim.at(location_b.rack).at(location_b.x); - log_assert(rack_to_rack_connection.destination_chip_coords.size(), "Expecting at least one rack-to-rack connection, possibly one-to-many"); + const Chip2ChipConnection &rack_to_rack_connection = + galaxy_racks_exit_chip_coords_per_x_dim.at(location_b.rack).at(location_b.x); + log_assert( + rack_to_rack_connection.destination_chip_coords.size(), + "Expecting at least one rack-to-rack connection, possibly one-to-many"); // for each rack-to-rack connection at location_a.x, find the distance to location_b, take min int distance = std::numeric_limits::max(); eth_coord_t exit_rack = rack_to_rack_connection.source_chip_coord; - for(eth_coord_t next_rack : rack_to_rack_connection.destination_chip_coords) { - - log_assert(exit_rack.x == location_b.x && exit_rack.shelf == location_b.shelf && exit_rack.rack == location_b.rack, + for (eth_coord_t next_rack : rack_to_rack_connection.destination_chip_coords) { + log_assert( + exit_rack.x == location_b.x && exit_rack.shelf == location_b.shelf && exit_rack.rack == location_b.rack, "Invalid rack exit coordinates"); - log_assert(next_rack.x == location_b.x && next_rack.shelf == location_b.shelf && next_rack.rack == (location_b.rack+1), + log_assert( + next_rack.x == location_b.x && next_rack.shelf == location_b.shelf && + next_rack.rack == (location_b.rack + 1), "Invalid rack entry coordinates"); // hop onto the next rack and find distance from there int distance_to_exit = get_ethernet_link_coord_distance(location_b, exit_rack); int distance_in_next_rack = get_ethernet_link_coord_distance(next_rack, location_a); // no path found - if (distance_to_exit == std::numeric_limits::max() || distance_in_next_rack == std::numeric_limits::max()) { + if (distance_to_exit == std::numeric_limits::max() || + distance_in_next_rack == std::numeric_limits::max()) { continue; } distance = std::min(distance, distance_to_exit + distance_in_next_rack + 1); } - 
log_trace(LogSiliconDriver, "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", - location_a.x, location_a.y, location_a.rack, location_a.shelf, - location_b.x, location_b.y, location_b.rack, location_b.shelf, distance); + log_trace( + LogSiliconDriver, + "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", + location_a.x, + location_a.y, + location_a.rack, + location_a.shelf, + location_b.x, + location_b.y, + location_b.rack, + location_b.shelf, + distance); return distance; } - log_trace(LogSiliconDriver, "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", - location_a.x, location_a.y, location_a.rack, location_a.shelf, - location_b.x, location_b.y, location_b.rack, location_b.shelf, x_distance + y_distance); + log_trace( + LogSiliconDriver, + "\tdistance from ({}, {}, {}, {}) to ({}, {}, {}, {}) is {}", + location_a.x, + location_a.y, + location_a.rack, + location_a.shelf, + location_b.x, + location_b.y, + location_b.rack, + location_b.shelf, + x_distance + y_distance); // on same shelf/rack, the distance is just x+y difference return x_distance + y_distance; @@ -252,14 +341,13 @@ int tt_ClusterDescriptor::get_ethernet_link_coord_distance(const eth_coord_t &lo // Returns the closest mmio chip to the given chip chip_id_t tt_ClusterDescriptor::get_closest_mmio_capable_chip(const chip_id_t chip) { - log_debug(LogSiliconDriver, "get_closest_mmio_chip to chip{}", chip); if (this->is_chip_mmio_capable(chip)) { return chip; } - if(closest_mmio_chip_cache.find(chip) != closest_mmio_chip_cache.end()) { + if (closest_mmio_chip_cache.find(chip) != closest_mmio_chip_cache.end()) { return closest_mmio_chip_cache[chip]; } @@ -271,7 +359,14 @@ chip_id_t tt_ClusterDescriptor::get_closest_mmio_capable_chip(const chip_id_t ch const chip_id_t &mmio_chip = pair.first; eth_coord_t mmio_eth_coord = this->chip_locations.at(mmio_chip); - log_debug(LogSiliconDriver, "Checking chip{} at ({}, {}, {}, {})", mmio_chip, mmio_eth_coord.x, mmio_eth_coord.y, 
mmio_eth_coord.rack, mmio_eth_coord.shelf); + log_debug( + LogSiliconDriver, + "Checking chip{} at ({}, {}, {}, {})", + mmio_chip, + mmio_eth_coord.x, + mmio_eth_coord.y, + mmio_eth_coord.rack, + mmio_eth_coord.shelf); int distance = get_ethernet_link_coord_distance(mmio_eth_coord, chip_eth_coord); log_debug(LogSiliconDriver, "Distance from chip{} to chip{} is {}", chip, mmio_chip, distance); @@ -280,7 +375,8 @@ chip_id_t tt_ClusterDescriptor::get_closest_mmio_capable_chip(const chip_id_t ch closest_chip = mmio_chip; } } - log_assert(min_distance != std::numeric_limits::max(), "Chip{} is not connected to any MMIO capable chip", chip); + log_assert( + min_distance != std::numeric_limits::max(), "Chip{} is not connected to any MMIO capable chip", chip); log_assert(is_chip_mmio_capable(closest_chip), "Closest MMIO chip must be MMIO capable"); @@ -294,32 +390,37 @@ chip_id_t tt_ClusterDescriptor::get_closest_mmio_capable_chip(const chip_id_t ch std::string tt_ClusterDescriptor::get_cluster_descriptor_file_path() { static std::string yaml_path; static bool is_initialized = false; - if (!is_initialized){ - + if (!is_initialized) { // Cluster descriptor yaml will be created in a unique temporary directory. 
std::filesystem::path temp_path = std::filesystem::temp_directory_path(); std::string cluster_path_dir_template = temp_path / "umd_XXXXXX"; std::filesystem::path cluster_path_dir = mkdtemp(cluster_path_dir_template.data()); std::filesystem::path cluster_path = cluster_path_dir / "cluster_descriptor.yaml"; - if (!std::filesystem::exists(cluster_path)){ - auto val = system ( ("touch " + cluster_path.string()).c_str()); - if(val != 0) throw std::runtime_error("Cluster Generation Failed!"); + if (!std::filesystem::exists(cluster_path)) { + auto val = system(("touch " + cluster_path.string()).c_str()); + if (val != 0) { + throw std::runtime_error("Cluster Generation Failed!"); + } } - int val = create_ethernet_map((char*)cluster_path.string().c_str()); - if(val != 0) throw std::runtime_error("Cluster Generation Failed!"); + int val = create_ethernet_map((char *)cluster_path.string().c_str()); + if (val != 0) { + throw std::runtime_error("Cluster Generation Failed!"); + } yaml_path = cluster_path.string(); is_initialized = true; } return yaml_path; } -std::unique_ptr tt_ClusterDescriptor::create_from_yaml(const std::string &cluster_descriptor_file_path) { +std::unique_ptr tt_ClusterDescriptor::create_from_yaml( + const std::string &cluster_descriptor_file_path) { std::unique_ptr desc = std::unique_ptr(new tt_ClusterDescriptor()); std::ifstream fdesc(cluster_descriptor_file_path); if (fdesc.fail()) { - throw std::runtime_error(fmt::format("Error: cluster connectivity descriptor file {} does not exist!", cluster_descriptor_file_path)); + throw std::runtime_error(fmt::format( + "Error: cluster connectivity descriptor file {} does not exist!", cluster_descriptor_file_path)); } fdesc.close(); @@ -337,22 +438,31 @@ std::unique_ptr tt_ClusterDescriptor::create_from_yaml(con } std::unique_ptr tt_ClusterDescriptor::create_for_grayskull_cluster( - const std::set &logical_mmio_device_ids, - const std::vector &physical_mmio_device_ids) { + const std::set &logical_mmio_device_ids, 
const std::vector &physical_mmio_device_ids) { std::unique_ptr desc = std::unique_ptr(new tt_ClusterDescriptor()); // Some users need not care about physical ids, can provide empty set. - auto use_physical_ids = physical_mmio_device_ids.size() ? true : false; - auto largest_workload_logical_device_id = *logical_mmio_device_ids.rbegin(); // Last element in ordered set. - auto num_available_physical_devices = physical_mmio_device_ids.size(); - auto required_physical_devices = largest_workload_logical_device_id + 1; - - log_debug(tt::LogSiliconDriver, "{} - use_physical_ids: {} largest_workload_logical_device_id: {} num_available_physical_devices: {} required_physical_devices: {}", - __FUNCTION__, use_physical_ids, largest_workload_logical_device_id, num_available_physical_devices, required_physical_devices); - - log_assert(!use_physical_ids || num_available_physical_devices >= required_physical_devices, + auto use_physical_ids = physical_mmio_device_ids.size() ? true : false; + auto largest_workload_logical_device_id = *logical_mmio_device_ids.rbegin(); // Last element in ordered set. + auto num_available_physical_devices = physical_mmio_device_ids.size(); + auto required_physical_devices = largest_workload_logical_device_id + 1; + + log_debug( + tt::LogSiliconDriver, + "{} - use_physical_ids: {} largest_workload_logical_device_id: {} num_available_physical_devices: {} " + "required_physical_devices: {}", + __FUNCTION__, + use_physical_ids, + largest_workload_logical_device_id, + num_available_physical_devices, + required_physical_devices); + + log_assert( + !use_physical_ids || num_available_physical_devices >= required_physical_devices, "Insufficient silicon devices. Workload requires device_id: {} (ie. 
{} devices) but only {} present", - largest_workload_logical_device_id, required_physical_devices, num_available_physical_devices); + largest_workload_logical_device_id, + required_physical_devices, + num_available_physical_devices); // All Grayskull devices are MMIO mapped so physical_mmio_device_ids correspond to all available devices for (auto &logical_id : logical_mmio_device_ids) { @@ -361,8 +471,10 @@ std::unique_ptr tt_ClusterDescriptor::create_for_grayskull desc->all_chips.insert(logical_id); eth_coord_t chip_location{logical_id, 0, 0, 0}; desc->chip_locations.insert({logical_id, chip_location}); - desc->coords_to_chip_ids[chip_location.rack][chip_location.shelf][chip_location.y][chip_location.x] = logical_id; - log_debug(tt::LogSiliconDriver, "{} - adding logical: {} => physical: {}", __FUNCTION__, logical_id, physical_id); + desc->coords_to_chip_ids[chip_location.rack][chip_location.shelf][chip_location.y][chip_location.x] = + logical_id; + log_debug( + tt::LogSiliconDriver, "{} - adding logical: {} => physical: {}", __FUNCTION__, logical_id, physical_id); } desc->enable_all_devices(); @@ -370,7 +482,8 @@ std::unique_ptr tt_ClusterDescriptor::create_for_grayskull return desc; } -void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc) { +void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descriptor( + YAML::Node &yaml, tt_ClusterDescriptor &desc) { log_assert(yaml["ethernet_connections"].IsSequence(), "Invalid YAML"); for (YAML::Node &connected_endpoints : yaml["ethernet_connections"].as>()) { log_assert(connected_endpoints.IsSequence(), "Invalid YAML"); @@ -403,7 +516,13 @@ void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descripto log_debug(LogSiliconDriver, "Ethernet Connectivity Descriptor:"); for (const auto &[chip, chan_to_chip_chan_map] : desc.ethernet_connections) { for (const auto &[chan, chip_and_chan] : chan_to_chip_chan_map) { - 
log_debug(LogSiliconDriver, "\tchip: {}, chan: {} <--> chip: {}, chan: {}", chip, chan, chip_and_chan.x, chip_and_chan.y); + log_debug( + LogSiliconDriver, + "\tchip: {}, chan: {} <--> chip: {}, chan: {}", + chip, + chan, + chip_and_chan.x, + chip_and_chan.y); } } @@ -423,52 +542,61 @@ void tt_ClusterDescriptor::load_ethernet_connections_from_connectivity_descripto } void tt_ClusterDescriptor::fill_galaxy_connections(tt_ClusterDescriptor &desc) { - int highest_shelf_id = 0; int highest_rack_id = 0; // shelves and racks can be connected at different chip coordinates - // determine which chips are connected to the next (i.e. higher id) shelf/rack and what the coordinate of the chip on the other shelf/rack is - // this is used in get_ethernet_link_coord_distance to find the distance between two chips + // determine which chips are connected to the next (i.e. higher id) shelf/rack and what the coordinate of the chip + // on the other shelf/rack is this is used in get_ethernet_link_coord_distance to find the distance between two + // chips for (const auto &[chip_id, chip_eth_coord] : desc.chip_locations) { highest_shelf_id = std::max(highest_shelf_id, chip_eth_coord.shelf); highest_rack_id = std::max(highest_rack_id, chip_eth_coord.rack); // iterate over all neighbors - if(desc.ethernet_connections.find(chip_id) == desc.ethernet_connections.end()) { - continue; // chip has no eth connections + if (desc.ethernet_connections.find(chip_id) == desc.ethernet_connections.end()) { + continue; // chip has no eth connections } for (const auto &[chan, chip_and_chan] : desc.ethernet_connections.at(chip_id)) { const chip_id_t &neighbor_chip = std::get<0>(chip_and_chan); eth_coord_t neighbor_eth_coord = desc.chip_locations.at(neighbor_chip); // shelves are connected in x-dim - if(neighbor_eth_coord.shelf != chip_eth_coord.shelf) { - eth_coord_t higher_shelf_coord = neighbor_eth_coord.shelf > chip_eth_coord.shelf ? 
neighbor_eth_coord : chip_eth_coord; - eth_coord_t lower_shelf_coord = neighbor_eth_coord.shelf < chip_eth_coord.shelf ? neighbor_eth_coord : chip_eth_coord; + if (neighbor_eth_coord.shelf != chip_eth_coord.shelf) { + eth_coord_t higher_shelf_coord = + neighbor_eth_coord.shelf > chip_eth_coord.shelf ? neighbor_eth_coord : chip_eth_coord; + eth_coord_t lower_shelf_coord = + neighbor_eth_coord.shelf < chip_eth_coord.shelf ? neighbor_eth_coord : chip_eth_coord; int lower_shelf_id = lower_shelf_coord.shelf; int lower_shelf_y = lower_shelf_coord.y; - auto& galaxy_shelf_exit_chip_coords_per_y_dim = desc.galaxy_shelves_exit_chip_coords_per_y_dim[lower_shelf_id]; + auto &galaxy_shelf_exit_chip_coords_per_y_dim = + desc.galaxy_shelves_exit_chip_coords_per_y_dim[lower_shelf_id]; log_assert( - galaxy_shelf_exit_chip_coords_per_y_dim.find(lower_shelf_y) == galaxy_shelf_exit_chip_coords_per_y_dim.end() || - galaxy_shelf_exit_chip_coords_per_y_dim[lower_shelf_y].source_chip_coord == lower_shelf_coord, + galaxy_shelf_exit_chip_coords_per_y_dim.find(lower_shelf_y) == + galaxy_shelf_exit_chip_coords_per_y_dim.end() || + galaxy_shelf_exit_chip_coords_per_y_dim[lower_shelf_y].source_chip_coord == lower_shelf_coord, "Expected a single exit chip on each shelf row"); galaxy_shelf_exit_chip_coords_per_y_dim[lower_shelf_y].source_chip_coord = lower_shelf_coord; - galaxy_shelf_exit_chip_coords_per_y_dim[lower_shelf_y].destination_chip_coords.insert(higher_shelf_coord); + galaxy_shelf_exit_chip_coords_per_y_dim[lower_shelf_y].destination_chip_coords.insert( + higher_shelf_coord); } // racks are connected in y-dim - if(neighbor_eth_coord.rack != chip_eth_coord.rack) { - eth_coord_t higher_rack_coord = neighbor_eth_coord.rack > chip_eth_coord.rack ? neighbor_eth_coord : chip_eth_coord; - eth_coord_t lower_rack_coord = neighbor_eth_coord.rack < chip_eth_coord.rack ? 
neighbor_eth_coord : chip_eth_coord; + if (neighbor_eth_coord.rack != chip_eth_coord.rack) { + eth_coord_t higher_rack_coord = + neighbor_eth_coord.rack > chip_eth_coord.rack ? neighbor_eth_coord : chip_eth_coord; + eth_coord_t lower_rack_coord = + neighbor_eth_coord.rack < chip_eth_coord.rack ? neighbor_eth_coord : chip_eth_coord; int lower_rack_id = lower_rack_coord.rack; int lower_rack_x = lower_rack_coord.x; - auto& galaxy_rack_exit_chip_coords_per_x_dim = desc.galaxy_racks_exit_chip_coords_per_x_dim[lower_rack_id]; + auto &galaxy_rack_exit_chip_coords_per_x_dim = + desc.galaxy_racks_exit_chip_coords_per_x_dim[lower_rack_id]; log_assert( - galaxy_rack_exit_chip_coords_per_x_dim.find(lower_rack_x) == galaxy_rack_exit_chip_coords_per_x_dim.end() || - galaxy_rack_exit_chip_coords_per_x_dim[lower_rack_x].source_chip_coord == lower_rack_coord, + galaxy_rack_exit_chip_coords_per_x_dim.find(lower_rack_x) == + galaxy_rack_exit_chip_coords_per_x_dim.end() || + galaxy_rack_exit_chip_coords_per_x_dim[lower_rack_x].source_chip_coord == lower_rack_coord, "Expected a single exit chip on each rack column"); galaxy_rack_exit_chip_coords_per_x_dim[lower_rack_x].source_chip_coord = lower_rack_coord; galaxy_rack_exit_chip_coords_per_x_dim[lower_rack_x].destination_chip_coords.insert(higher_rack_coord); @@ -479,23 +607,36 @@ void tt_ClusterDescriptor::fill_galaxy_connections(tt_ClusterDescriptor &desc) { // verify that every shelf (except the highest in id) is found in galaxy_shelves_exit_chip_coords_per_y_dim // this means that we expect the shelves to be connected linearly in a daisy-chain fashion. 
// shelf0->shelf1->shelf2->...->shelfN - for(int shelf_id = 0; shelf_id < highest_shelf_id; shelf_id++) { - log_assert(desc.galaxy_shelves_exit_chip_coords_per_y_dim.find(shelf_id) != desc.galaxy_shelves_exit_chip_coords_per_y_dim.end(), - "Expected shelf {} to be connected to the next shelf", shelf_id); + for (int shelf_id = 0; shelf_id < highest_shelf_id; shelf_id++) { + log_assert( + desc.galaxy_shelves_exit_chip_coords_per_y_dim.find(shelf_id) != + desc.galaxy_shelves_exit_chip_coords_per_y_dim.end(), + "Expected shelf {} to be connected to the next shelf", + shelf_id); } // this prints the exit chip coordinates for each shelf // this is used in get_ethernet_link_coord_distance to find the distance between two chips for (const auto &[shelf, shelf_exit_chip_coords_per_y_dim] : desc.galaxy_shelves_exit_chip_coords_per_y_dim) { for (const auto &[y_dim, shelf_exit_chip_coords] : shelf_exit_chip_coords_per_y_dim) { - log_debug(LogSiliconDriver, "shelf: {} y_dim: {} exit_coord:({}, {}, {}, {})", - shelf, y_dim, - shelf_exit_chip_coords.source_chip_coord.x, shelf_exit_chip_coords.source_chip_coord.y, - shelf_exit_chip_coords.source_chip_coord.rack, shelf_exit_chip_coords.source_chip_coord.shelf); + log_debug( + LogSiliconDriver, + "shelf: {} y_dim: {} exit_coord:({}, {}, {}, {})", + shelf, + y_dim, + shelf_exit_chip_coords.source_chip_coord.x, + shelf_exit_chip_coords.source_chip_coord.y, + shelf_exit_chip_coords.source_chip_coord.rack, + shelf_exit_chip_coords.source_chip_coord.shelf); for (const auto &destination_chip_coord : shelf_exit_chip_coords.destination_chip_coords) { // print shelf_exit_chip_coord in the format: (x, y, rack, shelf) - log_debug(LogSiliconDriver, "\tdestination_chip_coord: ({}, {}, {}, {})", - destination_chip_coord.x, destination_chip_coord.y, destination_chip_coord.rack, destination_chip_coord.shelf); + log_debug( + LogSiliconDriver, + "\tdestination_chip_coord: ({}, {}, {}, {})", + destination_chip_coord.x, + destination_chip_coord.y, + 
destination_chip_coord.rack, + destination_chip_coord.shelf); } } } @@ -503,28 +644,41 @@ void tt_ClusterDescriptor::fill_galaxy_connections(tt_ClusterDescriptor &desc) { // verify that every rack (except the highest in id) is found in galaxy_racks_exit_chip_coords_per_x_dim // this means that we expect the racks to be connected linearly in a daisy-chain fashion. // rack0->rack1->rack2->...->rackN - for(int rack_id = 0; rack_id < highest_rack_id; rack_id++) { - log_assert(desc.galaxy_racks_exit_chip_coords_per_x_dim.find(rack_id) != desc.galaxy_racks_exit_chip_coords_per_x_dim.end(), - "Expected rack {} to be connected to the next rack", rack_id); + for (int rack_id = 0; rack_id < highest_rack_id; rack_id++) { + log_assert( + desc.galaxy_racks_exit_chip_coords_per_x_dim.find(rack_id) != + desc.galaxy_racks_exit_chip_coords_per_x_dim.end(), + "Expected rack {} to be connected to the next rack", + rack_id); } // this prints the exit chip coordinates for each rack // this is used in get_ethernet_link_coord_distance to find the distance between two chips for (const auto &[rack, rack_exit_chip_coords_per_x_dim] : desc.galaxy_racks_exit_chip_coords_per_x_dim) { for (const auto &[x_dim, rack_exit_chip_coords] : rack_exit_chip_coords_per_x_dim) { - log_debug(LogSiliconDriver, "rack: {} x_dim: {} exit_coord:({}, {}, {}, {})", rack, x_dim, - rack_exit_chip_coords.source_chip_coord.x, rack_exit_chip_coords.source_chip_coord.y, - rack_exit_chip_coords.source_chip_coord.rack, rack_exit_chip_coords.source_chip_coord.shelf); + log_debug( + LogSiliconDriver, + "rack: {} x_dim: {} exit_coord:({}, {}, {}, {})", + rack, + x_dim, + rack_exit_chip_coords.source_chip_coord.x, + rack_exit_chip_coords.source_chip_coord.y, + rack_exit_chip_coords.source_chip_coord.rack, + rack_exit_chip_coords.source_chip_coord.shelf); for (const auto &destination_chip_coord : rack_exit_chip_coords.destination_chip_coords) { - log_debug(LogSiliconDriver, "\tdestination_chip_coord: ({}, {}, {}, {})", - 
destination_chip_coord.x, destination_chip_coord.y, destination_chip_coord.rack, destination_chip_coord.shelf); + log_debug( + LogSiliconDriver, + "\tdestination_chip_coord: ({}, {}, {}, {})", + destination_chip_coord.x, + destination_chip_coord.y, + destination_chip_coord.rack, + destination_chip_coord.shelf); } } } } void tt_ClusterDescriptor::merge_cluster_ids(tt_ClusterDescriptor &desc) { - DisjointSet chip_sets; for (const auto &[chip, _] : desc.chip_locations) { chip_sets.add_item(chip); @@ -545,7 +699,6 @@ void tt_ClusterDescriptor::merge_cluster_ids(tt_ClusterDescriptor &desc) { } void tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc) { - for (YAML::const_iterator node = yaml["arch"].begin(); node != yaml["arch"].end(); ++node) { chip_id_t chip_id = node->first.as(); desc.all_chips.insert(chip_id); @@ -561,14 +714,13 @@ void tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(YAML::Node &y desc.chip_locations.insert({chip_id, chip_location}); desc.coords_to_chip_ids[chip_location.rack][chip_location.shelf][chip_location.y][chip_location.x] = chip_id; } - - for(const auto& chip : yaml["chips_with_mmio"]) { - if(chip.IsMap()) { + + for (const auto &chip : yaml["chips_with_mmio"]) { + if (chip.IsMap()) { const auto &chip_map = chip.as>(); const auto &chips = chip_map.begin(); desc.chips_with_mmio.insert({chips->first, chips->second}); - } - else { + } else { const auto &chip_val = chip.as(); desc.chips_with_mmio.insert({chip_val, chip_val}); } @@ -585,8 +737,8 @@ void tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(YAML::Node &y chip_location.shelf); } - if (yaml["boardtype"]) { - for (const auto& chip_board_type : yaml["boardtype"].as>()) { + if (yaml["boardtype"]) { + for (const auto &chip_board_type : yaml["boardtype"].as>()) { auto &chip = chip_board_type.first; BoardType board_type; if (chip_board_type.second == "n150") { @@ -597,25 +749,28 @@ void 
tt_ClusterDescriptor::load_chips_from_connectivity_descriptor(YAML::Node &y board_type = BoardType::GALAXY; } else if (chip_board_type.second == "e150") { board_type = BoardType::E150; - } - else if (chip_board_type.second == "p150A") { + } else if (chip_board_type.second == "p150A") { board_type = BoardType::P150A; } else { - log_warning(LogSiliconDriver, "Unknown board type for chip {}. This might happen because chip is running old firmware. Defaulting to DEFAULT", chip); + log_warning( + LogSiliconDriver, + "Unknown board type for chip {}. This might happen because chip is running old firmware. " + "Defaulting to DEFAULT", + chip); board_type = BoardType::DEFAULT; } desc.chip_board_type.insert({chip, board_type}); } } else { - for (const auto& chip: desc.all_chips) { + for (const auto &chip : desc.all_chips) { desc.chip_board_type.insert({chip, BoardType::DEFAULT}); } } } void tt_ClusterDescriptor::load_harvesting_information(YAML::Node &yaml, tt_ClusterDescriptor &desc) { - if(yaml["harvesting"]) { - for (const auto& chip_node : yaml["harvesting"].as>()) { + if (yaml["harvesting"]) { + for (const auto &chip_node : yaml["harvesting"].as>()) { chip_id_t chip = chip_node.first; auto harvesting_info = chip_node.second; desc.noc_translation_enabled.insert({chip, harvesting_info["noc_translation"].as()}); @@ -624,9 +779,7 @@ void tt_ClusterDescriptor::load_harvesting_information(YAML::Node &yaml, tt_Clus } } -void tt_ClusterDescriptor::enable_all_devices() { - this->enabled_active_chips = this->all_chips; -} +void tt_ClusterDescriptor::enable_all_devices() { this->enabled_active_chips = this->all_chips; } void tt_ClusterDescriptor::fill_chips_grouped_by_closest_mmio() { for (const auto &chip : this->all_chips) { @@ -636,8 +789,10 @@ void tt_ClusterDescriptor::fill_chips_grouped_by_closest_mmio() { } } -const std::unordered_map > > tt_ClusterDescriptor::get_ethernet_connections() const { - auto eth_connections = std::unordered_map > >(); +const std::unordered_map>> 
+tt_ClusterDescriptor::get_ethernet_connections() const { + auto eth_connections = std:: + unordered_map>>(); for (const auto &[chip, channel_mapping] : this->ethernet_connections) { if (this->enabled_active_chips.find(chip) != this->enabled_active_chips.end()) { @@ -653,7 +808,7 @@ const std::unordered_map& tt_ClusterDescriptor::get_chip_locations() const { +const std::unordered_map &tt_ClusterDescriptor::get_chip_locations() const { static auto locations = std::unordered_map(); if (locations.empty() and !this->chip_locations.empty()) { for (auto chip_id : this->enabled_active_chips) { @@ -665,9 +820,12 @@ const std::unordered_map& tt_ClusterDescriptor::get_chip } chip_id_t tt_ClusterDescriptor::get_shelf_local_physical_chip_coords(chip_id_t virtual_coord) { - log_assert(!this->chip_locations.empty(), "Getting physical chip coordinates is only valid for systems where chips have coordinates"); + log_assert( + !this->chip_locations.empty(), + "Getting physical chip coordinates is only valid for systems where chips have coordinates"); // Physical cooridnates of chip inside a single rack. Calculated based on Galaxy topology. 
- // See: https://yyz-gitlab.local.tenstorrent.com/tenstorrent/budabackend/-/wikis/uploads/23e7a5168f38dfb706f9887fde78cb03/image.png + // See: + // https://yyz-gitlab.local.tenstorrent.com/tenstorrent/budabackend/-/wikis/uploads/23e7a5168f38dfb706f9887fde78cb03/image.png int x = get_chip_locations().at(virtual_coord).x; int y = get_chip_locations().at(virtual_coord).y; return 8 * x + y; @@ -686,30 +844,31 @@ const std::unordered_map tt_ClusterDescriptor::get_chips_w return chips_map; } -const std::unordered_set& tt_ClusterDescriptor::get_all_chips() const { - return this->enabled_active_chips; -} +const std::unordered_set &tt_ClusterDescriptor::get_all_chips() const { return this->enabled_active_chips; } -const std::unordered_map& tt_ClusterDescriptor::get_harvesting_info() const { +const std::unordered_map &tt_ClusterDescriptor::get_harvesting_info() const { return harvesting_masks; } -const std::unordered_map& tt_ClusterDescriptor::get_noc_translation_table_en() const { +const std::unordered_map &tt_ClusterDescriptor::get_noc_translation_table_en() const { return noc_translation_enabled; } std::size_t tt_ClusterDescriptor::get_number_of_chips() const { return this->enabled_active_chips.size(); } int tt_ClusterDescriptor::get_ethernet_link_distance(chip_id_t chip_a, chip_id_t chip_b) const { - log_assert(!this->chip_locations.empty(), "Getting physical chip coordinates is only valid for systems where chips have coordinates"); + log_assert( + !this->chip_locations.empty(), + "Getting physical chip coordinates is only valid for systems where chips have coordinates"); return this->get_ethernet_link_coord_distance(chip_locations.at(chip_a), chip_locations.at(chip_b)); } BoardType tt_ClusterDescriptor::get_board_type(chip_id_t chip_id) const { - BoardType board_type = this->chip_board_type.at(chip_id); - return board_type; + BoardType board_type = this->chip_board_type.at(chip_id); + return board_type; } -const std::unordered_map>& 
tt_ClusterDescriptor::get_chips_grouped_by_closest_mmio() const { +const std::unordered_map> & +tt_ClusterDescriptor::get_chips_grouped_by_closest_mmio() const { return chips_grouped_by_closest_mmio; } diff --git a/device/tt_cluster_descriptor.h b/device/tt_cluster_descriptor.h index 56ec9393c..842da6ddb 100644 --- a/device/tt_cluster_descriptor.h +++ b/device/tt_cluster_descriptor.h @@ -4,23 +4,24 @@ * SPDX-License-Identifier: Apache-2.0 */ - #pragma once -#include "device/tt_xy_pair.h" - #include -#include -#include -#include #include -#include +#include +#include #include +#include +#include +#include #include -#include + #include "device/tt_cluster_descriptor_types.h" +#include "device/tt_xy_pair.h" -namespace YAML { class Node; } +namespace YAML { +class Node; +} enum BoardType : uint32_t { N150 = 0, @@ -32,90 +33,93 @@ enum BoardType : uint32_t { }; class tt_ClusterDescriptor { - private: - int get_ethernet_link_coord_distance(const eth_coord_t &location_a, const eth_coord_t &location_b) const; + int get_ethernet_link_coord_distance(const eth_coord_t &location_a, const eth_coord_t &location_b) const; protected: - - std::unordered_map > > ethernet_connections; - std::unordered_map chip_locations; - // reverse map: rack/shelf/y/x -> chip_id - std::map > > > coords_to_chip_ids; - std::unordered_map chips_with_mmio; - std::unordered_set all_chips; - std::unordered_map noc_translation_enabled = {}; - std::unordered_map harvesting_masks = {}; - std::unordered_set enabled_active_chips; - std::unordered_map closest_mmio_chip_cache = {}; - std::unordered_map chip_board_type = {}; - std::unordered_map> chips_grouped_by_closest_mmio; - - // one-to-many chip connections - struct Chip2ChipConnection { - eth_coord_t source_chip_coord; - std::unordered_set destination_chip_coords; - }; - - // shelf_id -> y dim -> list of chip2chip connections between different shelves - // assumption is that on every row of the shelf there is a chip that is connected to the other shelf - // 
there could be one-to-many connections between shelves, i.e. one chip is connected to multiple chips on the other shelf (in case of nebula->galaxy) - std::unordered_map > galaxy_shelves_exit_chip_coords_per_y_dim = {}; - // rack_id -> x dim -> list of chip2chip connections between different racks - // assumption is that on every row of the rack there is a chip that is connected to the other rack - std::unordered_map > galaxy_racks_exit_chip_coords_per_x_dim = {}; - - static void load_ethernet_connections_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc); - static void fill_galaxy_connections(tt_ClusterDescriptor &desc); - static void load_chips_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc); - static void merge_cluster_ids(tt_ClusterDescriptor &desc); - static void load_harvesting_information(YAML::Node &yaml, tt_ClusterDescriptor &desc); - - void fill_chips_grouped_by_closest_mmio(); + std::unordered_map>> + ethernet_connections; + std::unordered_map chip_locations; + // reverse map: rack/shelf/y/x -> chip_id + std::map>>> coords_to_chip_ids; + std::unordered_map chips_with_mmio; + std::unordered_set all_chips; + std::unordered_map noc_translation_enabled = {}; + std::unordered_map harvesting_masks = {}; + std::unordered_set enabled_active_chips; + std::unordered_map closest_mmio_chip_cache = {}; + std::unordered_map chip_board_type = {}; + std::unordered_map> chips_grouped_by_closest_mmio; + + // one-to-many chip connections + struct Chip2ChipConnection { + eth_coord_t source_chip_coord; + std::unordered_set destination_chip_coords; + }; + + // shelf_id -> y dim -> list of chip2chip connections between different shelves + // assumption is that on every row of the shelf there is a chip that is connected to the other shelf + // there could be one-to-many connections between shelves, i.e. 
one chip is connected to multiple chips on the other + // shelf (in case of nebula->galaxy) + std::unordered_map> galaxy_shelves_exit_chip_coords_per_y_dim = + {}; + // rack_id -> x dim -> list of chip2chip connections between different racks + // assumption is that on every row of the rack there is a chip that is connected to the other rack + std::unordered_map> galaxy_racks_exit_chip_coords_per_x_dim = {}; + + static void load_ethernet_connections_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc); + static void fill_galaxy_connections(tt_ClusterDescriptor &desc); + static void load_chips_from_connectivity_descriptor(YAML::Node &yaml, tt_ClusterDescriptor &desc); + static void merge_cluster_ids(tt_ClusterDescriptor &desc); + static void load_harvesting_information(YAML::Node &yaml, tt_ClusterDescriptor &desc); + + void fill_chips_grouped_by_closest_mmio(); public: - tt_ClusterDescriptor() = default; - tt_ClusterDescriptor(const tt_ClusterDescriptor&) = default; - - /* - * Returns the pairs of channels that are connected where the first entry in the pair corresponds to the argument ordering when calling the function - * An empty result implies that the two chips do not share any direct connection - */ - std::vector> get_directly_connected_ethernet_channels_between_chips(const chip_id_t &first, const chip_id_t &second) const; - - bool is_chip_mmio_capable(const chip_id_t chip_id) const; - bool is_chip_remote(const chip_id_t chip_id) const; - chip_id_t get_closest_mmio_capable_chip(const chip_id_t chip); - chip_id_t get_shelf_local_physical_chip_coords(chip_id_t virtual_coord); - - // TODO: These following functions will be removed, and ClusterDescriptor will be created without any parameters. - // get_cluster_descriptor_file_path will create ethernet map in the background. 
- static std::string get_cluster_descriptor_file_path(); - static std::unique_ptr create_from_yaml(const std::string &cluster_descriptor_file_path); - - // TODO: This function is used to create mock cluster descriptor yaml files, for example for simulation. - // The name of the function is kept to not gate the changes regarding create-ethernet-map. - // It should be renamed to something like create_mock_cluster_descriptor and changed in tt-metal/tt-debuda. - static std::unique_ptr create_for_grayskull_cluster( - const std::set &logical_mmio_device_ids, - const std::vector &physical_mmio_device_ids); - - const std::unordered_map& get_harvesting_info() const; - const std::unordered_map& get_noc_translation_table_en() const; - const std::unordered_map& get_chip_locations() const; - const std::unordered_map > > get_ethernet_connections() const; - const std::unordered_map get_chips_with_mmio() const; - const std::unordered_set& get_all_chips() const; - const std::unordered_map>& get_chips_grouped_by_closest_mmio() const; - std::size_t get_number_of_chips() const; - - int get_ethernet_link_distance(chip_id_t chip_a, chip_id_t chip_b) const; - - BoardType get_board_type(chip_id_t chip_id) const; - - bool ethernet_core_has_active_ethernet_link(chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const; - std::tuple get_chip_and_channel_of_remote_ethernet_core(chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const; - - void enable_all_devices(); - + tt_ClusterDescriptor() = default; + tt_ClusterDescriptor(const tt_ClusterDescriptor &) = default; + + /* + * Returns the pairs of channels that are connected where the first entry in the pair corresponds to the argument + * ordering when calling the function An empty result implies that the two chips do not share any direct connection + */ + std::vector> + get_directly_connected_ethernet_channels_between_chips(const chip_id_t &first, const chip_id_t &second) const; + + bool is_chip_mmio_capable(const 
chip_id_t chip_id) const; + bool is_chip_remote(const chip_id_t chip_id) const; + chip_id_t get_closest_mmio_capable_chip(const chip_id_t chip); + chip_id_t get_shelf_local_physical_chip_coords(chip_id_t virtual_coord); + + // TODO: These following functions will be removed, and ClusterDescriptor will be created without any parameters. + // get_cluster_descriptor_file_path will create ethernet map in the background. + static std::string get_cluster_descriptor_file_path(); + static std::unique_ptr create_from_yaml(const std::string &cluster_descriptor_file_path); + + // TODO: This function is used to create mock cluster descriptor yaml files, for example for simulation. + // The name of the function is kept to not gate the changes regarding create-ethernet-map. + // It should be renamed to something like create_mock_cluster_descriptor and changed in tt-metal/tt-debuda. + static std::unique_ptr create_for_grayskull_cluster( + const std::set &logical_mmio_device_ids, const std::vector &physical_mmio_device_ids); + + const std::unordered_map &get_harvesting_info() const; + const std::unordered_map &get_noc_translation_table_en() const; + const std::unordered_map &get_chip_locations() const; + const std:: + unordered_map>> + get_ethernet_connections() const; + const std::unordered_map get_chips_with_mmio() const; + const std::unordered_set &get_all_chips() const; + const std::unordered_map> &get_chips_grouped_by_closest_mmio() const; + std::size_t get_number_of_chips() const; + + int get_ethernet_link_distance(chip_id_t chip_a, chip_id_t chip_b) const; + + BoardType get_board_type(chip_id_t chip_id) const; + + bool ethernet_core_has_active_ethernet_link(chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const; + std::tuple get_chip_and_channel_of_remote_ethernet_core( + chip_id_t local_chip, ethernet_channel_t local_ethernet_channel) const; + + void enable_all_devices(); }; diff --git a/device/tt_cluster_descriptor_types.h 
b/device/tt_cluster_descriptor_types.h index e120ffd9b..b9e018235 100644 --- a/device/tt_cluster_descriptor_types.h +++ b/device/tt_cluster_descriptor_types.h @@ -4,17 +4,17 @@ * SPDX-License-Identifier: Apache-2.0 */ -#pragma once +#pragma once #include - #include #include using chip_id_t = int; using ethernet_channel_t = int; + struct eth_coord_t { - int cluster_id; // This is the same for connected chips. + int cluster_id; // This is the same for connected chips. int x; int y; int rack; @@ -23,21 +23,23 @@ struct eth_coord_t { // in C++20 this should be defined as: // constexpr bool operator==(const eth_coord_t &other) const noexcept = default; constexpr bool operator==(const eth_coord_t &other) const noexcept { - return (cluster_id == other.cluster_id and x == other.x and y == other.y and rack == other.rack and shelf == other.shelf); + return ( + cluster_id == other.cluster_id and x == other.x and y == other.y and rack == other.rack and + shelf == other.shelf); } }; namespace std { template <> struct hash { - std::size_t operator()(eth_coord_t const &c) const { - std::size_t seed = 0; - boost::hash_combine(seed, c.cluster_id); - boost::hash_combine(seed, c.x); - boost::hash_combine(seed, c.y); - boost::hash_combine(seed, c.rack); - boost::hash_combine(seed, c.shelf); - return seed; - } + std::size_t operator()(eth_coord_t const &c) const { + std::size_t seed = 0; + boost::hash_combine(seed, c.cluster_id); + boost::hash_combine(seed, c.x); + boost::hash_combine(seed, c.y); + boost::hash_combine(seed, c.rack); + boost::hash_combine(seed, c.shelf); + return seed; + } }; -} +} // namespace std diff --git a/device/tt_device.cpp b/device/tt_device.cpp index 9df2f3923..071f66764 100644 --- a/device/tt_device.cpp +++ b/device/tt_device.cpp @@ -2,30 +2,32 @@ // // SPDX-License-Identifier: Apache-2.0 - #ifdef TT_DEBUG_LOGGING -#define DEBUG_LOG(str) do { std::cout << str << std::endl; } while( false ) +#define DEBUG_LOG(str) \ + do { \ + std::cout << str << std::endl; \ 
+ } while (false) #else #define DEBUG_LOG(str) ((void)0) #endif #include "tt_device.h" -#include "device/tt_cluster_descriptor_types.h" -#include + #include +#include #include -#include #include +#include + +#include "device/tt_cluster_descriptor_types.h" #include "yaml-cpp/yaml.h" //////// // Device base //////// -tt_device::tt_device() : soc_descriptor_per_chip({}) { -} +tt_device::tt_device() : soc_descriptor_per_chip({}) {} -tt_device::~tt_device() { -} +tt_device::~tt_device() {} const tt_SocDescriptor& tt_device::get_soc_descriptor(chip_id_t chip_id) const { return soc_descriptor_per_chip.at(chip_id); diff --git a/device/tt_io.hpp b/device/tt_io.hpp index 8d0203e36..174903cbd 100644 --- a/device/tt_io.hpp +++ b/device/tt_io.hpp @@ -11,7 +11,7 @@ namespace tt { namespace umd { - class Cluster; +class Cluster; } /** @@ -22,20 +22,18 @@ namespace umd { * * It is the caller's responsibility to manage the lifetime of Writer objects. */ -class Writer -{ +class Writer { friend class tt::umd::Cluster; public: /** * @brief Write to a SoC core. - * + * * @param address must be aligned to the size of T - * @param value + * @param value */ template - void write(uint32_t address, T value) - { + void write(uint32_t address, T value) { auto dst = reinterpret_cast(base) + address; if (address >= tlb_size) { @@ -46,27 +44,23 @@ class Writer throw std::runtime_error("Unaligned write"); } - *reinterpret_cast(dst) = value; + *reinterpret_cast(dst) = value; } private: /** * @brief tt::umd::Cluster interface to construct a new Writer object. - * + * * @param base pointer to the base address of a mapped TLB. * @param tlb_size size of the mapped TLB. 
*/ - Writer(void *base, size_t tlb_size) - : base(base) - , tlb_size(tlb_size) - { + Writer(void *base, size_t tlb_size) : base(base), tlb_size(tlb_size) { assert(base); assert(tlb_size > 0); } - void *base{ nullptr }; - size_t tlb_size{ 0 }; + void *base{nullptr}; + size_t tlb_size{0}; }; - -} // namespace tt +} // namespace tt diff --git a/device/tt_silicon_driver_common.cpp b/device/tt_silicon_driver_common.cpp index 26360be6e..53f87753a 100644 --- a/device/tt_silicon_driver_common.cpp +++ b/device/tt_silicon_driver_common.cpp @@ -3,37 +3,37 @@ // SPDX-License-Identifier: Apache-2.0 #include "device/tt_silicon_driver_common.hpp" -#include "tt_xy_pair.h" + #include "cluster.h" +#include "tt_xy_pair.h" std::string TensixSoftResetOptionsToString(TensixSoftResetOptions value) { std::string output; - if((value & TensixSoftResetOptions::BRISC) != TensixSoftResetOptions::NONE) { + if ((value & TensixSoftResetOptions::BRISC) != TensixSoftResetOptions::NONE) { output += "BRISC | "; } - if((value & TensixSoftResetOptions::TRISC0) != TensixSoftResetOptions::NONE) { + if ((value & TensixSoftResetOptions::TRISC0) != TensixSoftResetOptions::NONE) { output += "TRISC0 | "; } - if((value & TensixSoftResetOptions::TRISC1) != TensixSoftResetOptions::NONE) { + if ((value & TensixSoftResetOptions::TRISC1) != TensixSoftResetOptions::NONE) { output += "TRISC1 | "; } - if((value & TensixSoftResetOptions::TRISC2) != TensixSoftResetOptions::NONE) { + if ((value & TensixSoftResetOptions::TRISC2) != TensixSoftResetOptions::NONE) { output += "TRISC2 | "; } - if((value & TensixSoftResetOptions::NCRISC) != TensixSoftResetOptions::NONE) { + if ((value & TensixSoftResetOptions::NCRISC) != TensixSoftResetOptions::NONE) { output += "NCRISC | "; } - if((value & TensixSoftResetOptions::STAGGERED_START) != TensixSoftResetOptions::NONE) { + if ((value & TensixSoftResetOptions::STAGGERED_START) != TensixSoftResetOptions::NONE) { output += "STAGGERED_START | "; } - if(output.empty()) { - output = 
"UNKNOWN"; - } else { - output.erase(output.end() - 3, output.end()); - } + if (output.empty()) { + output = "UNKNOWN"; + } else { + output.erase(output.end() - 3, output.end()); + } - return output; + return output; } - diff --git a/device/tt_silicon_driver_common.hpp b/device/tt_silicon_driver_common.hpp index 9f2756683..6dc6d7f49 100644 --- a/device/tt_silicon_driver_common.hpp +++ b/device/tt_silicon_driver_common.hpp @@ -9,53 +9,42 @@ #include #include -enum class TensixSoftResetOptions: std::uint32_t { +enum class TensixSoftResetOptions : std::uint32_t { NONE = 0, - BRISC = ((std::uint32_t) 1 << 11), - TRISC0 = ((std::uint32_t) 1 << 12), - TRISC1 = ((std::uint32_t) 1 << 13), - TRISC2 = ((std::uint32_t) 1 << 14), - NCRISC = ((std::uint32_t) 1 << 18), - STAGGERED_START = ((std::uint32_t) 1 << 31) + BRISC = ((std::uint32_t)1 << 11), + TRISC0 = ((std::uint32_t)1 << 12), + TRISC1 = ((std::uint32_t)1 << 13), + TRISC2 = ((std::uint32_t)1 << 14), + NCRISC = ((std::uint32_t)1 << 18), + STAGGERED_START = ((std::uint32_t)1 << 31) }; std::string TensixSoftResetOptionsToString(TensixSoftResetOptions value); + constexpr TensixSoftResetOptions operator|(TensixSoftResetOptions lhs, TensixSoftResetOptions rhs) { - return static_cast( - static_cast(lhs) | - static_cast(rhs) - ); + return static_cast(static_cast(lhs) | static_cast(rhs)); } constexpr TensixSoftResetOptions operator&(TensixSoftResetOptions lhs, TensixSoftResetOptions rhs) { - return static_cast( - static_cast(lhs) & - static_cast(rhs) - ); + return static_cast(static_cast(lhs) & static_cast(rhs)); } constexpr bool operator!=(TensixSoftResetOptions lhs, TensixSoftResetOptions rhs) { - return - static_cast(lhs) != - static_cast(rhs); + return static_cast(lhs) != static_cast(rhs); } -static constexpr TensixSoftResetOptions ALL_TRISC_SOFT_RESET = TensixSoftResetOptions::TRISC0 | - TensixSoftResetOptions::TRISC1 | - TensixSoftResetOptions::TRISC2; +static constexpr TensixSoftResetOptions ALL_TRISC_SOFT_RESET = + 
TensixSoftResetOptions::TRISC0 | TensixSoftResetOptions::TRISC1 | TensixSoftResetOptions::TRISC2; -static constexpr TensixSoftResetOptions ALL_TENSIX_SOFT_RESET = TensixSoftResetOptions::BRISC | - TensixSoftResetOptions::NCRISC | - TensixSoftResetOptions::STAGGERED_START | - ALL_TRISC_SOFT_RESET; +static constexpr TensixSoftResetOptions ALL_TENSIX_SOFT_RESET = + TensixSoftResetOptions::BRISC | TensixSoftResetOptions::NCRISC | TensixSoftResetOptions::STAGGERED_START | + ALL_TRISC_SOFT_RESET; -static constexpr TensixSoftResetOptions TENSIX_ASSERT_SOFT_RESET = TensixSoftResetOptions::BRISC | - TensixSoftResetOptions::NCRISC | - ALL_TRISC_SOFT_RESET; +static constexpr TensixSoftResetOptions TENSIX_ASSERT_SOFT_RESET = + TensixSoftResetOptions::BRISC | TensixSoftResetOptions::NCRISC | ALL_TRISC_SOFT_RESET; -static constexpr TensixSoftResetOptions TENSIX_DEASSERT_SOFT_RESET = TensixSoftResetOptions::NCRISC | - ALL_TRISC_SOFT_RESET | - TensixSoftResetOptions::STAGGERED_START; +static constexpr TensixSoftResetOptions TENSIX_DEASSERT_SOFT_RESET = + TensixSoftResetOptions::NCRISC | ALL_TRISC_SOFT_RESET | TensixSoftResetOptions::STAGGERED_START; -static constexpr TensixSoftResetOptions TENSIX_DEASSERT_SOFT_RESET_NO_STAGGER = TensixSoftResetOptions::NCRISC | - ALL_TRISC_SOFT_RESET; +static constexpr TensixSoftResetOptions TENSIX_DEASSERT_SOFT_RESET_NO_STAGGER = + TensixSoftResetOptions::NCRISC | ALL_TRISC_SOFT_RESET; diff --git a/device/tt_soc_descriptor.cpp b/device/tt_soc_descriptor.cpp index 74c35e59a..ac9b31ad1 100644 --- a/device/tt_soc_descriptor.cpp +++ b/device/tt_soc_descriptor.cpp @@ -2,66 +2,65 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "yaml-cpp/yaml.h" #include "tt_soc_descriptor.h" -#include "common/utils.hpp" - #include + #include #include #include #include #include +#include "common/utils.hpp" #include "fmt/core.h" +#include "yaml-cpp/yaml.h" // #include "l1_address_map.h" std::string format_node(tt_xy_pair xy) { return fmt::format("{}-{}", xy.x, 
xy.y); } tt_xy_pair format_node(std::string str) { - int x_coord; - int y_coord; - std::regex expr("([0-9]+)[-,xX]([0-9]+)"); - std::smatch x_y_pair; - - if (std::regex_search(str, x_y_pair, expr)) { - x_coord = std::stoi(x_y_pair[1]); - y_coord = std::stoi(x_y_pair[2]); - } else { - throw std::runtime_error(fmt::format("Could not parse the core id: {}", str)); - } + int x_coord; + int y_coord; + std::regex expr("([0-9]+)[-,xX]([0-9]+)"); + std::smatch x_y_pair; + + if (std::regex_search(str, x_y_pair, expr)) { + x_coord = std::stoi(x_y_pair[1]); + y_coord = std::stoi(x_y_pair[2]); + } else { + throw std::runtime_error(fmt::format("Could not parse the core id: {}", str)); + } - tt_xy_pair xy(x_coord, y_coord); + tt_xy_pair xy(x_coord, y_coord); - return xy; + return xy; } -const char* ws = " \t\n\r\f\v"; + +const char *ws = " \t\n\r\f\v"; // trim from end of string (right) -inline std::string& rtrim(std::string& s, const char* t = ws) -{ +inline std::string &rtrim(std::string &s, const char *t = ws) { s.erase(s.find_last_not_of(t) + 1); return s; } // trim from beginning of string (left) -inline std::string& ltrim(std::string& s, const char* t = ws) -{ +inline std::string <rim(std::string &s, const char *t = ws) { s.erase(0, s.find_first_not_of(t)); return s; } // trim from both ends of string (right then left) -inline std::string& trim(std::string& s, const char* t = ws) -{ - return ltrim(rtrim(s, t), t); -} +inline std::string &trim(std::string &s, const char *t = ws) { return ltrim(rtrim(s, t), t); } void tt_SocDescriptor::load_soc_features_from_device_descriptor(YAML::Node &device_descriptor_yaml) { overlay_version = device_descriptor_yaml["features"]["overlay"]["version"].as(); - noc_translation_id_enabled = device_descriptor_yaml["features"]["noc"] && device_descriptor_yaml["features"]["noc"]["translation_id_enabled"] ? 
device_descriptor_yaml["features"]["noc"]["translation_id_enabled"].as() : false; + noc_translation_id_enabled = + device_descriptor_yaml["features"]["noc"] && device_descriptor_yaml["features"]["noc"]["translation_id_enabled"] + ? device_descriptor_yaml["features"]["noc"]["translation_id_enabled"].as() + : false; packer_version = device_descriptor_yaml["features"]["packer"]["version"].as(); unpacker_version = device_descriptor_yaml["features"]["unpacker"]["version"].as(); dst_size_alignment = device_descriptor_yaml["features"]["math"]["dst_size_alignment"].as(); @@ -92,7 +91,8 @@ void tt_SocDescriptor::load_core_descriptors_from_device_descriptor(YAML::Node & } int current_dram_channel = 0; - for (auto channel_it = device_descriptor_yaml["dram"].begin(); channel_it != device_descriptor_yaml["dram"].end(); ++channel_it) { + for (auto channel_it = device_descriptor_yaml["dram"].begin(); channel_it != device_descriptor_yaml["dram"].end(); + ++channel_it) { dram_cores.push_back({}); auto &soc_dram_cores = dram_cores.at(dram_cores.size() - 1); const auto &dram_cores = (*channel_it).as>(); @@ -123,8 +123,8 @@ void tt_SocDescriptor::load_core_descriptors_from_device_descriptor(YAML::Node & std::vector worker_cores = device_descriptor_yaml["functional_workers"].as>(); std::set worker_routing_coords_x; std::set worker_routing_coords_y; - std::unordered_map routing_coord_worker_x; - std::unordered_map routing_coord_worker_y; + std::unordered_map routing_coord_worker_x; + std::unordered_map routing_coord_worker_y; for (const auto &core_string : worker_cores) { CoreDescriptor core_descriptor; core_descriptor.coord = format_node(core_string); @@ -139,12 +139,12 @@ void tt_SocDescriptor::load_core_descriptors_from_device_descriptor(YAML::Node & int func_x_start = 0; int func_y_start = 0; std::set::iterator it; - for (it=worker_routing_coords_x.begin(); it!=worker_routing_coords_x.end(); ++it) { + for (it = worker_routing_coords_x.begin(); it != worker_routing_coords_x.end(); 
++it) { worker_log_to_routing_x[func_x_start] = *it; routing_x_to_worker_x[*it] = func_x_start; func_x_start++; } - for (it=worker_routing_coords_y.begin(); it!=worker_routing_coords_y.end(); ++it) { + for (it = worker_routing_coords_y.begin(); it != worker_routing_coords_y.end(); ++it) { worker_log_to_routing_y[func_y_start] = *it; routing_y_to_worker_y[*it] = func_y_start; func_y_start++; @@ -227,7 +227,8 @@ tt_virtual_coords tt_SocDescriptor::to_virtual_coords(tt_translated_coords trans tt_SocDescriptor::tt_SocDescriptor(std::string device_descriptor_path, std::size_t harvesting_mask) { std::ifstream fdesc(device_descriptor_path); if (fdesc.fail()) { - throw std::runtime_error(fmt::format("Error: device descriptor file {} does not exist!", device_descriptor_path)); + throw std::runtime_error( + fmt::format("Error: device descriptor file {} does not exist!", device_descriptor_path)); } fdesc.close(); @@ -235,10 +236,12 @@ tt_SocDescriptor::tt_SocDescriptor(std::string device_descriptor_path, std::size auto grid_size_x = device_descriptor_yaml["grid"]["x_size"].as(); auto grid_size_y = device_descriptor_yaml["grid"]["y_size"].as(); - int physical_grid_size_x = device_descriptor_yaml["physical"] && device_descriptor_yaml["physical"]["x_size"] ? - device_descriptor_yaml["physical"]["x_size"].as() : grid_size_x; - int physical_grid_size_y = device_descriptor_yaml["physical"] && device_descriptor_yaml["physical"]["y_size"] ? - device_descriptor_yaml["physical"]["y_size"].as() : grid_size_y; + int physical_grid_size_x = device_descriptor_yaml["physical"] && device_descriptor_yaml["physical"]["x_size"] + ? device_descriptor_yaml["physical"]["x_size"].as() + : grid_size_x; + int physical_grid_size_y = device_descriptor_yaml["physical"] && device_descriptor_yaml["physical"]["y_size"] + ? 
device_descriptor_yaml["physical"]["y_size"].as() + : grid_size_y; load_core_descriptors_from_device_descriptor(device_descriptor_yaml); grid_size = tt_xy_pair(grid_size_x, grid_size_y); physical_grid_size = tt_xy_pair(physical_grid_size_x, physical_grid_size_y); @@ -253,7 +256,7 @@ tt_SocDescriptor::tt_SocDescriptor(std::string device_descriptor_path, std::size int tt_SocDescriptor::get_num_dram_channels() const { int num_channels = 0; - for (auto& dram_core : dram_cores) { + for (auto &dram_core : dram_cores) { if (dram_core.size() > 0) { num_channels++; } @@ -299,7 +302,7 @@ std::ostream &operator<<(std::ostream &out, const tt::ARCH &arch_name) { } else if (arch_name == tt::ARCH::WORMHOLE_B0) { out << "wormhole_b0"; } else if (arch_name == tt::ARCH::BLACKHOLE) { - out << "blackhole"; //Just how many ARCH-to-string functions do we plan to have, anyway? + out << "blackhole"; // Just how many ARCH-to-string functions do we plan to have, anyway? } else { out << "ArchNameSerializationNotImplemented"; } diff --git a/device/tt_soc_descriptor.h b/device/tt_soc_descriptor.h index e0529570a..4d3c2ad5f 100644 --- a/device/tt_soc_descriptor.h +++ b/device/tt_soc_descriptor.h @@ -7,29 +7,25 @@ #pragma once #include -#include +#include +#include #include +#include #include #include -#include -#include -#include - -#include "tt_xy_pair.h" -#include "device/tt_arch_types.h" - #include "device/coordinate_manager.h" - +#include "device/tt_arch_types.h" #include "fmt/core.h" +#include "tt_xy_pair.h" namespace YAML { - class Node; +class Node; } std::ostream &operator<<(std::ostream &out, const tt::ARCH &arch_name); -static inline std::string get_arch_str(const tt::ARCH arch_name){ +static inline std::string get_arch_str(const tt::ARCH arch_name) { std::string arch_name_str; if (arch_name == tt::ARCH::GRAYSKULL) { @@ -45,16 +41,18 @@ static inline std::string get_arch_str(const tt::ARCH arch_name){ return arch_name_str; } -static inline tt::ARCH get_arch_name(const std::string 
&arch_str){ +static inline tt::ARCH get_arch_name(const std::string &arch_str) { tt::ARCH arch; if ((arch_str == "grayskull") || (arch_str == "GRAYSKULL")) { arch = tt::ARCH::GRAYSKULL; - } else if ((arch_str == "wormhole") || (arch_str == "WORMHOLE") || (arch_str == "wormhole_b0") || (arch_str == "WORMHOLE_B0")){ + } else if ( + (arch_str == "wormhole") || (arch_str == "WORMHOLE") || (arch_str == "wormhole_b0") || + (arch_str == "WORMHOLE_B0")) { arch = tt::ARCH::WORMHOLE_B0; - } else if ((arch_str == "blackhole") || (arch_str == "BLACKHOLE")){ + } else if ((arch_str == "blackhole") || (arch_str == "BLACKHOLE")) { arch = tt::ARCH::BLACKHOLE; - }else { + } else { throw std::runtime_error( fmt::format("At LoadSocDescriptorFromYaml: \"{}\" is not recognized as tt::ARCH.", arch_str)); } @@ -69,13 +67,13 @@ tt_xy_pair format_node(std::string str); //! SocCore type enumerations /*! Superset for all chip generations */ enum class CoreType { - ARC, - DRAM, - ETH, - PCIE, - WORKER, - HARVESTED, - ROUTER_ONLY, + ARC, + DRAM, + ETH, + PCIE, + WORKER, + HARVESTED, + ROUTER_ONLY, }; @@ -84,10 +82,10 @@ enum class CoreType { Should only contain relevant configuration for SOC */ struct CoreDescriptor { - tt_xy_pair coord = tt_xy_pair(0, 0); - CoreType type; + tt_xy_pair coord = tt_xy_pair(0, 0); + CoreType type; - std::size_t l1_size = 0; + std::size_t l1_size = 0; }; //! tt_SocDescriptor contains information regarding the SOC configuration targetted. 
@@ -95,7 +93,6 @@ struct CoreDescriptor { Should only contain relevant configuration for SOC */ class tt_SocDescriptor { - public: tt::ARCH arch; tt_xy_pair grid_size; @@ -110,13 +107,15 @@ class tt_SocDescriptor { std::unordered_map worker_log_to_routing_y; std::unordered_map routing_x_to_worker_x; std::unordered_map routing_y_to_worker_y; - std::vector> dram_cores; // per channel list of dram cores + std::vector> dram_cores; // per channel list of dram cores std::unordered_map> dram_core_channel_map; // map dram core to chan/subchan - std::vector ethernet_cores; // ethernet cores (index == channel id) - std::unordered_map ethernet_core_channel_map; + std::vector ethernet_cores; // ethernet cores (index == channel id) + std::unordered_map ethernet_core_channel_map; std::vector trisc_sizes; // Most of software stack assumes same trisc size for whole chip.. std::string device_descriptor_file_path = std::string(""); + bool has(tt_xy_pair input) { return cores.find(input) != cores.end(); } + int overlay_version; int unpacker_version; int dst_size_alignment; @@ -129,15 +128,15 @@ class tt_SocDescriptor { int get_num_dram_channels() const; bool is_worker_core(const tt_xy_pair &core) const; tt_xy_pair get_core_for_dram_channel(int dram_chan, int subchannel) const; - bool is_ethernet_core(const tt_xy_pair& core) const; + bool is_ethernet_core(const tt_xy_pair &core) const; // Default constructor. Creates uninitialized object with public access to all of its attributes. tt_SocDescriptor() = default; - // Constructor used to build object from device descriptor file. + // Constructor used to build object from device descriptor file. 
tt_SocDescriptor(std::string device_descriptor_path, std::size_t harvesting_mask = 0); // Copy constructor - tt_SocDescriptor(const tt_SocDescriptor& other) : + tt_SocDescriptor(const tt_SocDescriptor &other) : arch(other.arch), grid_size(other.grid_size), physical_grid_size(other.physical_grid_size), @@ -167,7 +166,7 @@ class tt_SocDescriptor { dram_bank_size(other.dram_bank_size) { coordinate_manager.reset(new CoordinateManager(*other.coordinate_manager)); } - + // Coordinate conversions. // Conversions from logical coordinates should be used just for worker cores. @@ -189,7 +188,7 @@ class tt_SocDescriptor { void perform_harvesting(std::size_t harvesting_mask); - static std::string get_soc_descriptor_path(tt::ARCH arch); + static std::string get_soc_descriptor_path(tt::ARCH arch); private: void create_coordinate_manager(std::size_t harvesting_mask); diff --git a/device/tt_xy_pair.h b/device/tt_xy_pair.h index fde3b4578..3d982ac92 100644 --- a/device/tt_xy_pair.h +++ b/device/tt_xy_pair.h @@ -15,44 +15,56 @@ using tt_cxy_pair = tt::umd::cxy_pair; struct tt_physical_coords : public tt_xy_pair { tt_physical_coords() : tt_xy_pair() {} + tt_physical_coords(std::size_t x, std::size_t y) : tt_xy_pair(x, y) {} }; struct tt_chip_physical_coords : public tt_cxy_pair { tt_chip_physical_coords() : tt_cxy_pair() {} + tt_chip_physical_coords(std::size_t ichip, xy_pair pair) : tt_cxy_pair(ichip, pair) {} + tt_chip_physical_coords(std::size_t ichip, std::size_t x, std::size_t y) : tt_cxy_pair(ichip, x, y) {} }; struct tt_logical_coords : public tt_xy_pair { tt_logical_coords() : tt_xy_pair() {} + tt_logical_coords(std::size_t x, std::size_t y) : tt_xy_pair(x, y) {} }; struct tt_chip_logical_coords : public tt_cxy_pair { tt_chip_logical_coords() : tt_cxy_pair() {} + tt_chip_logical_coords(std::size_t ichip, xy_pair pair) : tt_cxy_pair(ichip, pair) {} + tt_chip_logical_coords(std::size_t ichip, std::size_t x, std::size_t y) : tt_cxy_pair(ichip, x, y) {} }; struct 
tt_virtual_coords : public tt_xy_pair { tt_virtual_coords() : tt_xy_pair() {} + tt_virtual_coords(std::size_t x, std::size_t y) : tt_xy_pair(x, y) {} }; struct tt_chip_virtual_coords : public tt_cxy_pair { tt_chip_virtual_coords() : tt_cxy_pair() {} + tt_chip_virtual_coords(std::size_t ichip, xy_pair pair) : tt_cxy_pair(ichip, pair) {} + tt_chip_virtual_coords(std::size_t ichip, std::size_t x, std::size_t y) : tt_cxy_pair(ichip, x, y) {} }; struct tt_translated_coords : public tt_xy_pair { tt_translated_coords() : tt_xy_pair() {} + tt_translated_coords(std::size_t x, std::size_t y) : tt_xy_pair(x, y) {} }; struct tt_chip_translated_coords : public tt_cxy_pair { tt_chip_translated_coords() : tt_cxy_pair() {} + tt_chip_translated_coords(std::size_t ichip, xy_pair pair) : tt_cxy_pair(ichip, pair) {} + tt_chip_translated_coords(std::size_t ichip, std::size_t x, std::size_t y) : tt_cxy_pair(ichip, x, y) {} }; diff --git a/device/wormhole/wormhole_coordinate_manager.cpp b/device/wormhole/wormhole_coordinate_manager.cpp index ddb088dea..e9766d16e 100644 --- a/device/wormhole/wormhole_coordinate_manager.cpp +++ b/device/wormhole/wormhole_coordinate_manager.cpp @@ -19,9 +19,11 @@ std::set WormholeCoordinateManager::get_y_coordinates_to_harvest(st } tt_translated_coords WormholeCoordinateManager::to_translated_coords(tt_logical_coords logical_coords) { - return tt_translated_coords(logical_coords.x + translated_coordinate_start_x, logical_coords.y + translated_coordinate_start_y); + return tt_translated_coords( + logical_coords.x + translated_coordinate_start_x, logical_coords.y + translated_coordinate_start_y); } tt_logical_coords WormholeCoordinateManager::to_logical_coords(tt_translated_coords translated_coords) { - return tt_logical_coords(translated_coords.x - translated_coordinate_start_x, translated_coords.y - translated_coordinate_start_y); + return tt_logical_coords( + translated_coords.x - translated_coordinate_start_x, translated_coords.y - 
translated_coordinate_start_y); } diff --git a/device/wormhole/wormhole_coordinate_manager.h b/device/wormhole/wormhole_coordinate_manager.h index 9eca9fd1d..e3e358860 100644 --- a/device/wormhole/wormhole_coordinate_manager.h +++ b/device/wormhole/wormhole_coordinate_manager.h @@ -9,16 +9,16 @@ #include "device/coordinate_manager.h" class WormholeCoordinateManager : public CoordinateManager { - public: - WormholeCoordinateManager(const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) - : CoordinateManager(worker_grid_size, workers, harvesting_mask) {} + WormholeCoordinateManager( + const tt_xy_pair& worker_grid_size, const std::vector& workers, std::size_t harvesting_mask) : + CoordinateManager(worker_grid_size, workers, harvesting_mask) {} tt_translated_coords to_translated_coords(tt_logical_coords logical_coords) override; tt_logical_coords to_logical_coords(tt_translated_coords translated_coords) override; -protected: +protected: std::set get_y_coordinates_to_harvest(std::size_t harvesting_mask) override; private: diff --git a/device/wormhole/wormhole_implementation.cpp b/device/wormhole/wormhole_implementation.cpp index 247a5eaa6..2a4c50489 100644 --- a/device/wormhole/wormhole_implementation.cpp +++ b/device/wormhole/wormhole_implementation.cpp @@ -4,13 +4,12 @@ #include "wormhole_implementation.h" -#include "src/firmware/riscv/wormhole/host_mem_address_map.h" -#include "src/firmware/riscv/wormhole/eth_interface.h" - #include "device/cluster.h" +#include "src/firmware/riscv/wormhole/eth_interface.h" +#include "src/firmware/riscv/wormhole/host_mem_address_map.h" -constexpr std::uint32_t NOC_ADDR_LOCAL_BITS = 36; // source: noc_parameters.h, common for WH && BH -constexpr std::uint32_t NOC_ADDR_NODE_ID_BITS = 6; // source: noc_parameters.h, common for WH && BH +constexpr std::uint32_t NOC_ADDR_LOCAL_BITS = 36; // source: noc_parameters.h, common for WH && BH +constexpr std::uint32_t NOC_ADDR_NODE_ID_BITS = 6; // source: 
noc_parameters.h, common for WH && BH namespace tt::umd { @@ -98,7 +97,9 @@ std::pair wormhole_implementation::get_tlb_data( } tt_driver_host_address_params wormhole_implementation::get_host_address_params() const { - return {::wormhole::host_mem::address_map::ETH_ROUTING_BLOCK_SIZE, ::wormhole::host_mem::address_map::ETH_ROUTING_BUFFERS_START}; + return { + ::wormhole::host_mem::address_map::ETH_ROUTING_BLOCK_SIZE, + ::wormhole::host_mem::address_map::ETH_ROUTING_BUFFERS_START}; } tt_driver_eth_interface_params wormhole_implementation::get_eth_interface_params() const { diff --git a/device/wormhole/wormhole_implementation.h b/device/wormhole/wormhole_implementation.h index 2346185a3..f8bf6f142 100644 --- a/device/wormhole/wormhole_implementation.h +++ b/device/wormhole/wormhole_implementation.h @@ -167,7 +167,8 @@ static constexpr uint32_t TLB_BASE_INDEX_16M = TLB_BASE_INDEX_2M + TLB_COUNT_2M; static constexpr uint32_t DYNAMIC_TLB_COUNT = 16; static constexpr uint32_t DYNAMIC_TLB_16M_SIZE = 16 * 1024 * 1024; -static constexpr uint32_t DYNAMIC_TLB_16M_CFG_ADDR = STATIC_TLB_CFG_ADDR + (TLB_BASE_INDEX_16M * TLB_CFG_REG_SIZE_BYTES); +static constexpr uint32_t DYNAMIC_TLB_16M_CFG_ADDR = + STATIC_TLB_CFG_ADDR + (TLB_BASE_INDEX_16M * TLB_CFG_REG_SIZE_BYTES); static constexpr uint32_t DYNAMIC_TLB_16M_BASE = TLB_BASE_16M; static constexpr uint32_t DYNAMIC_TLB_2M_SIZE = 2 * 1024 * 1024; @@ -205,59 +206,93 @@ static constexpr uint32_t TENSIX_SOFT_RESET_ADDR = 0xFFB121B0; } // namespace wormhole class wormhole_implementation : public architecture_implementation { - public: +public: tt::ARCH get_architecture() const override { return tt::ARCH::WORMHOLE_B0; } + uint32_t get_arc_message_arc_get_harvesting() const override { return static_cast(wormhole::arc_message_type::ARC_GET_HARVESTING); } + uint32_t get_arc_message_arc_go_busy() const override { return static_cast(wormhole::arc_message_type::ARC_GO_BUSY); } + uint32_t get_arc_message_arc_go_long_idle() const override { 
return static_cast(wormhole::arc_message_type::ARC_GO_LONG_IDLE); } + uint32_t get_arc_message_arc_go_short_idle() const override { return static_cast(wormhole::arc_message_type::ARC_GO_SHORT_IDLE); } + uint32_t get_arc_message_deassert_riscv_reset() const override { return static_cast(wormhole::arc_message_type::DEASSERT_RISCV_RESET); } + uint32_t get_arc_message_get_aiclk() const override { return static_cast(wormhole::arc_message_type::GET_AICLK); } + uint32_t get_arc_message_setup_iatu_for_peer_to_peer() const override { return static_cast(wormhole::arc_message_type::SETUP_IATU_FOR_PEER_TO_PEER); } + uint32_t get_arc_message_test() const override { return static_cast(wormhole::arc_message_type::TEST); } + uint32_t get_arc_csm_mailbox_offset() const override { return wormhole::ARC_CSM_MAILBOX_OFFSET; } + uint32_t get_arc_reset_arc_misc_cntl_offset() const override { return wormhole::ARC_RESET_ARC_MISC_CNTL_OFFSET; } + uint32_t get_arc_reset_scratch_offset() const override { return wormhole::ARC_RESET_SCRATCH_OFFSET; } + uint32_t get_dram_channel_0_peer2peer_region_start() const override { return wormhole::DRAM_CHANNEL_0_PEER2PEER_REGION_START; } + uint32_t get_dram_channel_0_x() const override { return wormhole::DRAM_CHANNEL_0_X; } + uint32_t get_dram_channel_0_y() const override { return wormhole::DRAM_CHANNEL_0_Y; } + uint32_t get_broadcast_tlb_index() const override { return wormhole::BROADCAST_TLB_INDEX; } + uint32_t get_dynamic_tlb_2m_base() const override { return wormhole::DYNAMIC_TLB_2M_BASE; } + uint32_t get_dynamic_tlb_2m_size() const override { return wormhole::DYNAMIC_TLB_2M_SIZE; } + uint32_t get_dynamic_tlb_16m_base() const override { return wormhole::DYNAMIC_TLB_16M_BASE; } + uint32_t get_dynamic_tlb_16m_size() const override { return wormhole::DYNAMIC_TLB_16M_SIZE; } + uint32_t get_dynamic_tlb_16m_cfg_addr() const override { return wormhole::DYNAMIC_TLB_16M_CFG_ADDR; } + uint32_t get_mem_large_read_tlb() const override { return 
wormhole::MEM_LARGE_READ_TLB; } + uint32_t get_mem_large_write_tlb() const override { return wormhole::MEM_LARGE_WRITE_TLB; } + uint32_t get_static_tlb_cfg_addr() const override { return wormhole::STATIC_TLB_CFG_ADDR; } + uint32_t get_static_tlb_size() const override { return wormhole::STATIC_TLB_SIZE; } + uint32_t get_reg_tlb() const override { return wormhole::REG_TLB; } + uint32_t get_tlb_base_index_16m() const override { return wormhole::TLB_BASE_INDEX_16M; } + uint32_t get_tensix_soft_reset_addr() const override { return wormhole::TENSIX_SOFT_RESET_ADDR; } + uint32_t get_grid_size_x() const override { return wormhole::GRID_SIZE_X; } + uint32_t get_grid_size_y() const override { return wormhole::GRID_SIZE_Y; } + uint32_t get_tlb_cfg_reg_size_bytes() const override { return wormhole::TLB_CFG_REG_SIZE_BYTES; } + uint32_t get_small_read_write_tlb() const override { return wormhole::MEM_SMALL_READ_WRITE_TLB; } + const std::vector& get_harvesting_noc_locations() const override { return wormhole::HARVESTING_NOC_LOCATIONS; } + const std::vector& get_t6_x_locations() const override { return wormhole::T6_X_LOCATIONS; } + const std::vector& get_t6_y_locations() const override { return wormhole::T6_Y_LOCATIONS; } std::tuple multicast_workaround(xy_pair start, xy_pair end) const override; @@ -268,7 +303,6 @@ class wormhole_implementation : public architecture_implementation { tt_driver_host_address_params get_host_address_params() const override; tt_driver_eth_interface_params get_eth_interface_params() const override; tt_driver_noc_params get_noc_params() const override; - }; } // namespace tt::umd diff --git a/device/xy_pair.cpp b/device/xy_pair.cpp index 0c3c9e20f..44b0ad8fb 100644 --- a/device/xy_pair.cpp +++ b/device/xy_pair.cpp @@ -11,6 +11,7 @@ namespace tt::umd { std::string xy_pair::str() const { return fmt::format("(x={},y={})", x, y); } + std::string cxy_pair::str() const { return fmt::format("(chip={},x={},y={})", chip, x, y); } } // namespace tt::umd diff 
--git a/device/xy_pair.h b/device/xy_pair.h index ca717052f..b989b31e8 100644 --- a/device/xy_pair.h +++ b/device/xy_pair.h @@ -12,6 +12,7 @@ namespace tt::umd { struct xy_pair { constexpr xy_pair() : x{}, y{} {} + constexpr xy_pair(std::size_t x, std::size_t y) : x(x), y(y) {} std::size_t x; @@ -30,7 +31,9 @@ constexpr inline bool operator<(const xy_pair &left, const xy_pair &right) { struct cxy_pair : public xy_pair { cxy_pair() : xy_pair{}, chip{} {} + cxy_pair(std::size_t ichip, xy_pair pair) : xy_pair(pair.x, pair.y), chip(ichip) {} + cxy_pair(std::size_t ichip, std::size_t x, std::size_t y) : xy_pair(x, y), chip(ichip) {} std::size_t chip; diff --git a/tests/.clang-format b/tests/.clang-format deleted file mode 100644 index 9d159247d..000000000 --- a/tests/.clang-format +++ /dev/null @@ -1,2 +0,0 @@ -DisableFormat: true -SortIncludes: false diff --git a/tests/api/test_chip.cpp b/tests/api/test_chip.cpp index 339d1abc6..ceeac384e 100644 --- a/tests/api/test_chip.cpp +++ b/tests/api/test_chip.cpp @@ -5,23 +5,23 @@ // This file holds Chip specific API examples. 
#include -#include "fmt/xchar.h" #include #include #include #include +#include "fmt/xchar.h" #include "tests/test_utils/generate_cluster_desc.hpp" // TODO: change to tt_cluster +#include "device/architecture_implementation.h" #include "device/cluster.h" #include "device/tt_cluster_descriptor.h" -#include "device/architecture_implementation.h" using namespace tt::umd; -inline tt_cxy_pair get_tensix_chip_core_coord(const std::unique_ptr &umd_cluster) { +inline tt_cxy_pair get_tensix_chip_core_coord(const std::unique_ptr& umd_cluster) { chip_id_t any_mmio_chip = *umd_cluster->get_target_mmio_device_ids().begin(); const tt_SocDescriptor& soc_desc = umd_cluster->get_soc_descriptor(any_mmio_chip); tt_xy_pair core = soc_desc.workers[0]; @@ -68,16 +68,17 @@ TEST(ApiChipTest, ManualTLBConfiguration) { if (!is_worker_core) { return -1; } - return core.x + core.y * umd_cluster->get_pci_device(any_mmio_chip)->get_architecture_implementation()->get_grid_size_x(); + return core.x + + core.y * + umd_cluster->get_pci_device(any_mmio_chip)->get_architecture_implementation()->get_grid_size_x(); }; std::int32_t c_zero_address = 0; // Each MMIO chip has it's own set of TLBs, so needs its own configuration. - for (chip_id_t mmio_chip: umd_cluster->get_target_mmio_device_ids()) { - + for (chip_id_t mmio_chip : umd_cluster->get_target_mmio_device_ids()) { const tt_SocDescriptor& soc_desc = umd_cluster->get_soc_descriptor(mmio_chip); - for (tt_xy_pair core: soc_desc.workers) { + for (tt_xy_pair core : soc_desc.workers) { umd_cluster->configure_tlb(mmio_chip, core, get_static_tlb_index(core), c_zero_address); } @@ -119,7 +120,7 @@ TEST(ApiChipTest, DeassertRiscResetOnCore) { if (umd_cluster == nullptr || umd_cluster->get_all_chips_in_cluster().empty()) { GTEST_SKIP() << "No chips present on the system. 
Skipping test."; } - + tt_cxy_pair chip_core_coord = get_tensix_chip_core_coord(umd_cluster); umd_cluster->assert_risc_reset_at_core(chip_core_coord); diff --git a/tests/api/test_cluster.cpp b/tests/api/test_cluster.cpp index 339d628fe..fa0345068 100644 --- a/tests/api/test_cluster.cpp +++ b/tests/api/test_cluster.cpp @@ -11,17 +11,16 @@ #include #include +#include "device/cluster.h" +#include "device/tt_cluster_descriptor.h" #include "fmt/xchar.h" #include "tests/test_utils/generate_cluster_desc.hpp" -#include "device/tt_cluster_descriptor.h" -#include "device/cluster.h" - // TODO: obviously we need some other way to set this up +#include "noc/noc_parameters.h" #include "src/firmware/riscv/wormhole/eth_l1_address_map.h" #include "src/firmware/riscv/wormhole/host_mem_address_map.h" #include "src/firmware/riscv/wormhole/l1_address_map.h" -#include "noc/noc_parameters.h" using namespace tt::umd; @@ -36,8 +35,7 @@ inline std::unique_ptr get_cluster() { if (pci_device_ids.empty()) { return nullptr; } - return std::unique_ptr( - new Cluster()); + return std::unique_ptr(new Cluster()); } // TODO: Should not be wormhole specific. 
@@ -49,11 +47,9 @@ void setup_wormhole_remote(Cluster* umd_cluster) { // Populate address map and NOC parameters that the driver needs for remote transactions umd_cluster->set_device_l1_address_params( - { - l1_mem::address_map::L1_BARRIER_BASE, + {l1_mem::address_map::L1_BARRIER_BASE, eth_l1_mem::address_map::ERISC_BARRIER_BASE, - eth_l1_mem::address_map::FW_VERSION_ADDR - }); + eth_l1_mem::address_map::FW_VERSION_ADDR}); } } diff --git a/tests/api/test_cluster_descriptor.cpp b/tests/api/test_cluster_descriptor.cpp index ff0f4c7ea..af14e77cd 100644 --- a/tests/api/test_cluster_descriptor.cpp +++ b/tests/api/test_cluster_descriptor.cpp @@ -5,19 +5,17 @@ #include #include -#include #include +#include -#include "tests/test_utils/generate_cluster_desc.hpp" #include "common/disjoint_set.hpp" - #include "device/pcie/pci_device.hpp" #include "device/tt_cluster_descriptor.h" +#include "tests/test_utils/generate_cluster_desc.hpp" // TODO: Needed for detect_arch, remove when it is part of cluster descriptor. #include "device/cluster.h" - inline std::unique_ptr get_cluster_desc() { // TODO: remove getting manually cluster descriptor from yaml. 
std::string yaml_path = tt_ClusterDescriptor::get_cluster_descriptor_file_path(); @@ -45,7 +43,6 @@ TEST(ApiClusterDescriptorTest, DetectArch) { } TEST(ApiClusterDescriptorTest, BasicFunctionality) { - std::unique_ptr cluster_desc = get_cluster_desc(); if (cluster_desc == nullptr) { @@ -57,7 +54,7 @@ TEST(ApiClusterDescriptorTest, BasicFunctionality) { std::unordered_map eth_chip_coords = cluster_desc->get_chip_locations(); std::unordered_map local_chips_to_pci_device_id = cluster_desc->get_chips_with_mmio(); std::unordered_set local_chips; - for (auto [chip, _]: local_chips_to_pci_device_id) { + for (auto [chip, _] : local_chips_to_pci_device_id) { local_chips.insert(chip); } std::unordered_set remote_chips; @@ -67,28 +64,30 @@ TEST(ApiClusterDescriptorTest, BasicFunctionality) { } } - std::unordered_map> chips_grouped_by_closest_mmio = cluster_desc->get_chips_grouped_by_closest_mmio(); + std::unordered_map> chips_grouped_by_closest_mmio = + cluster_desc->get_chips_grouped_by_closest_mmio(); } TEST(ApiClusterDescriptorTest, TestAllOfflineClusterDescriptors) { for (std::string cluster_desc_yaml : { - "blackhole_P150.yaml", - "galaxy.yaml", - "grayskull_E150.yaml", - "grayskull_E300.yaml", - "wormhole_2xN300_unconnected.yaml", - "wormhole_N150.yaml", - "wormhole_N300.yaml", - }) { + "blackhole_P150.yaml", + "galaxy.yaml", + "grayskull_E150.yaml", + "grayskull_E300.yaml", + "wormhole_2xN300_unconnected.yaml", + "wormhole_N150.yaml", + "wormhole_N300.yaml", + }) { std::cout << "Testing " << cluster_desc_yaml << std::endl; - std::unique_ptr cluster_desc = tt_ClusterDescriptor::create_from_yaml(test_utils::GetAbsPath("tests/api/cluster_descriptor_examples/" + cluster_desc_yaml)); + std::unique_ptr cluster_desc = tt_ClusterDescriptor::create_from_yaml( + test_utils::GetAbsPath("tests/api/cluster_descriptor_examples/" + cluster_desc_yaml)); std::unordered_set all_chips = cluster_desc->get_all_chips(); std::unordered_map harvesting_for_chips = 
cluster_desc->get_harvesting_info(); std::unordered_map eth_chip_coords = cluster_desc->get_chip_locations(); std::unordered_map local_chips_to_pci_device_id = cluster_desc->get_chips_with_mmio(); std::unordered_set local_chips; - for (auto [chip, _]: local_chips_to_pci_device_id) { + for (auto [chip, _] : local_chips_to_pci_device_id) { local_chips.insert(chip); } std::unordered_set remote_chips; @@ -98,12 +97,14 @@ TEST(ApiClusterDescriptorTest, TestAllOfflineClusterDescriptors) { } } - std::unordered_map> chips_grouped_by_closest_mmio = cluster_desc->get_chips_grouped_by_closest_mmio(); + std::unordered_map> chips_grouped_by_closest_mmio = + cluster_desc->get_chips_grouped_by_closest_mmio(); } } TEST(ApiClusterDescriptorTest, SeparateClusters) { - std::unique_ptr cluster_desc = tt_ClusterDescriptor::create_from_yaml(test_utils::GetAbsPath("tests/api/cluster_descriptor_examples/wormhole_2xN300_unconnected.yaml")); + std::unique_ptr cluster_desc = tt_ClusterDescriptor::create_from_yaml( + test_utils::GetAbsPath("tests/api/cluster_descriptor_examples/wormhole_2xN300_unconnected.yaml")); auto all_chips = cluster_desc->get_all_chips(); DisjointSet chip_clusters; @@ -112,9 +113,9 @@ TEST(ApiClusterDescriptorTest, SeparateClusters) { } // Merge into clusters of chips. 
- for (auto connection: cluster_desc->get_ethernet_connections()) { + for (auto connection : cluster_desc->get_ethernet_connections()) { chip_id_t chip = connection.first; - for (auto [channel, remote_chip_and_channel]: connection.second) { + for (auto [channel, remote_chip_and_channel] : connection.second) { chip_id_t remote_chip = std::get<0>(remote_chip_and_channel); chip_clusters.merge(chip, remote_chip); } diff --git a/tests/api/test_mockup_device.cpp b/tests/api/test_mockup_device.cpp index f41b5aea6..71ab4140a 100644 --- a/tests/api/test_mockup_device.cpp +++ b/tests/api/test_mockup_device.cpp @@ -25,14 +25,18 @@ std::string get_env_arch_name() { } tt::ARCH get_arch_from_string(const std::string &arch_str) { - if (arch_str == "grayskull" || arch_str == "GRAYSKULL") + if (arch_str == "grayskull" || arch_str == "GRAYSKULL") { return tt::ARCH::GRAYSKULL; - if (arch_str == "wormhole_b0" || arch_str == "WORMHOLE_B0") + } + if (arch_str == "wormhole_b0" || arch_str == "WORMHOLE_B0") { return tt::ARCH::WORMHOLE_B0; - if (arch_str == "blackhole" || arch_str == "BLACKHOLE") + } + if (arch_str == "blackhole" || arch_str == "BLACKHOLE") { return tt::ARCH::BLACKHOLE; - if (arch_str == "Invalid" || arch_str == "INVALID") + } + if (arch_str == "Invalid" || arch_str == "INVALID") { return tt::ARCH::Invalid; + } throw std::runtime_error(arch_str + " is not recognized as tt::ARCH."); } @@ -41,11 +45,16 @@ std::string get_soc_descriptor_file(tt::ARCH arch) { // const std::string umd_root = get_umd_root(); switch (arch) { - case tt::ARCH::GRAYSKULL: return test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"); - case tt::ARCH::WORMHOLE_B0: return test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"); - case tt::ARCH::BLACKHOLE: return test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch.yaml"); - case tt::ARCH::Invalid: throw std::runtime_error("Invalid arch not supported"); - default: throw std::runtime_error("Unsupported device architecture"); + case 
tt::ARCH::GRAYSKULL: + return test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"); + case tt::ARCH::WORMHOLE_B0: + return test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"); + case tt::ARCH::BLACKHOLE: + return test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch.yaml"); + case tt::ARCH::Invalid: + throw std::runtime_error("Invalid arch not supported"); + default: + throw std::runtime_error("Unsupported device architecture"); } } diff --git a/tests/api/test_soc_descriptor_bh.cpp b/tests/api/test_soc_descriptor_bh.cpp index 8032d02c5..1a74608ba 100644 --- a/tests/api/test_soc_descriptor_bh.cpp +++ b/tests/api/test_soc_descriptor_bh.cpp @@ -3,13 +3,11 @@ * * SPDX-License-Identifier: Apache-2.0 */ -#include "gtest/gtest.h" - #include "device/tt_soc_descriptor.h" +#include "gtest/gtest.h" #include "tests/test_utils/generate_cluster_desc.hpp" #include "tests/test_utils/soc_desc_test_utils.hpp" - // Blackhole workers - x-y annotation // functional_workers: // [ @@ -28,8 +26,8 @@ // Tests that all physical coordinates are same as all virtual coordinates // when there is no harvesting. TEST(SocDescriptor, SocDescriptorBHNoHarvesting) { - - tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), 0); + tt_SocDescriptor soc_desc = + tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), 0); // We expect full grid size since there is no harvesting. tt_xy_pair worker_grid_size = soc_desc.worker_grid_size; @@ -38,7 +36,7 @@ TEST(SocDescriptor, SocDescriptorBHNoHarvesting) { tt_logical_coords logical_coords = tt_logical_coords(x, y); tt_virtual_coords virtual_coords = soc_desc.to_virtual_coords(logical_coords); tt_physical_coords physical_coords = soc_desc.to_physical_coords(logical_coords); - + // Virtual and physical coordinates should be the same. 
EXPECT_EQ(physical_coords, virtual_coords); } @@ -49,7 +47,8 @@ TEST(SocDescriptor, SocDescriptorBHNoHarvesting) { // We expect that the top left core will have virtual and physical coordinates (1, 2) and (2, 2) for // the logical coordinates if the first row is harvested. TEST(SocDescriptor, SocDescriptorBHTopLeftCore) { - tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), 1); + tt_SocDescriptor soc_desc = + tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), 1); tt_xy_pair worker_grid_size = soc_desc.worker_grid_size; tt_logical_coords logical_coords = tt_logical_coords(0, 0); @@ -65,13 +64,12 @@ TEST(SocDescriptor, SocDescriptorBHTopLeftCore) { // Test logical to physical coordinate translation. // For the full grid of logical coordinates we expect that there are no duplicates of physical coordinates. -// For the reverse mapping back of physical to logical coordinates we expect that same logical coordinates are returned as from original mapping. +// For the reverse mapping back of physical to logical coordinates we expect that same logical coordinates are returned +// as from original mapping. 
TEST(SocDescriptor, SocDescriptorBHLogicalPhysicalMapping) { - const std::size_t max_num_harvested_x = 14; tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch.yaml")); for (std::size_t harvesting_mask = 0; harvesting_mask < (1 << max_num_harvested_x); harvesting_mask++) { - soc_desc.perform_harvesting(harvesting_mask); std::map logical_to_physical; @@ -97,7 +95,7 @@ TEST(SocDescriptor, SocDescriptorBHLogicalPhysicalMapping) { for (auto it : logical_to_physical) { tt_physical_coords physical_coords = it.second; tt_logical_coords logical_coords = soc_desc.to_logical_coords(physical_coords); - + // Expect that reverse mapping of physical coordinates gives the same logical coordinates // using which we got the physical coordinates. EXPECT_EQ(it.first, logical_coords); @@ -107,13 +105,12 @@ TEST(SocDescriptor, SocDescriptorBHLogicalPhysicalMapping) { // Test logical to virtual coordinate translation. // For the full grid of logical coordinates we expect that there are no duplicates of virtual coordinates. -// For the reverse mapping back of virtual to logical coordinates we expect that same logical coordinates are returned as from original mapping. +// For the reverse mapping back of virtual to logical coordinates we expect that same logical coordinates are returned +// as from original mapping. TEST(SocDescriptor, SocDescriptorBHLogicalVirtualMapping) { - const std::size_t max_num_harvested_x = 14; tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch.yaml")); for (std::size_t harvesting_mask = 0; harvesting_mask < (1 << max_num_harvested_x); harvesting_mask++) { - soc_desc.perform_harvesting(harvesting_mask); std::map logical_to_virtual; @@ -149,13 +146,12 @@ TEST(SocDescriptor, SocDescriptorBHLogicalVirtualMapping) { // Test logical to translated coordinate translation. 
// For the full grid of logical coordinates we expect that there are no duplicates of translated coordinates. -// For the reverse mapping back of translated to logical coordinates we expect that same logical coordinates are returned as from original mapping. +// For the reverse mapping back of translated to logical coordinates we expect that same logical coordinates are +// returned as from original mapping. TEST(SocDescriptor, SocDescriptorBHLogicalTranslatedMapping) { - const std::size_t max_num_harvested_x = 14; tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch.yaml")); for (std::size_t harvesting_mask = 0; harvesting_mask < (1 << max_num_harvested_x); harvesting_mask++) { - soc_desc.perform_harvesting(harvesting_mask); std::map logical_to_translated; @@ -170,7 +166,8 @@ TEST(SocDescriptor, SocDescriptorBHLogicalTranslatedMapping) { tt_translated_coords translated_coords = soc_desc.to_translated_coords(logical_coords); logical_to_translated[logical_coords] = translated_coords; - // Expect that logical to translated translation is 1-1 mapping. No duplicates for translated coordinates. + // Expect that logical to translated translation is 1-1 mapping. No duplicates for translated + // coordinates. 
EXPECT_EQ(translated_coords_set.count(translated_coords), 0); translated_coords_set.insert(translated_coords); } @@ -196,7 +193,7 @@ TEST(SocDescriptor, SocDescriptorBHVirtualEqualTranslated) { tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch.yaml")); for (std::size_t harvesting_mask = 0; harvesting_mask < (1 << max_num_harvested_x); harvesting_mask++) { soc_desc.perform_harvesting(harvesting_mask); - + std::size_t num_harvested_x = test_utils::get_num_harvested(harvesting_mask); for (std::size_t x = 0; x < soc_desc.worker_grid_size.x - num_harvested_x; x++) { @@ -209,5 +206,5 @@ TEST(SocDescriptor, SocDescriptorBHVirtualEqualTranslated) { EXPECT_EQ(translated_coords, virtual_coords); } } - } + } } diff --git a/tests/api/test_soc_descriptor_gs.cpp b/tests/api/test_soc_descriptor_gs.cpp index 1c72449b6..d1760df65 100644 --- a/tests/api/test_soc_descriptor_gs.cpp +++ b/tests/api/test_soc_descriptor_gs.cpp @@ -3,9 +3,8 @@ * * SPDX-License-Identifier: Apache-2.0 */ -#include "gtest/gtest.h" - #include "device/tt_soc_descriptor.h" +#include "gtest/gtest.h" #include "tests/test_utils/generate_cluster_desc.hpp" #include "tests/test_utils/soc_desc_test_utils.hpp" @@ -27,7 +26,6 @@ // Tests that all physical coordinates are same as all virtual coordinates // when there is no harvesting. TEST(SocDescriptor, SocDescriptorGSNoHarvesting) { - tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml")); // We expect full grid size since there is no harvesting. @@ -37,7 +35,7 @@ TEST(SocDescriptor, SocDescriptorGSNoHarvesting) { tt_logical_coords logical_coords = tt_logical_coords(x, y); tt_virtual_coords virtual_coords = soc_desc.to_virtual_coords(logical_coords); tt_physical_coords physical_coords = soc_desc.to_physical_coords(logical_coords); - + // Virtual and physical coordinates should be the same. 
EXPECT_EQ(physical_coords, virtual_coords); } @@ -48,7 +46,6 @@ TEST(SocDescriptor, SocDescriptorGSNoHarvesting) { // We expect that the top left core will have virtual and physical coordinates (1, 1) and (1, 2) for // the logical coordinates if the first row is harvested. TEST(SocDescriptor, SocDescriptorGSTopLeftCore) { - tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml")); tt_xy_pair worker_grid_size = soc_desc.worker_grid_size; @@ -75,7 +72,7 @@ TEST(SocDescriptor, SocDescriptorGSTranslatingCoords) { tt_virtual_coords virtual_coords = soc_desc.to_virtual_coords(logical_coords); tt_physical_coords physical_coords = soc_desc.to_physical_coords(logical_coords); tt_translated_coords translated_coords = soc_desc.to_translated_coords(logical_coords); - + // Virtual, physical and translated coordinates should be the same. EXPECT_EQ(physical_coords, virtual_coords); EXPECT_EQ(physical_coords, translated_coords); @@ -85,9 +82,9 @@ TEST(SocDescriptor, SocDescriptorGSTranslatingCoords) { // Test logical to physical coordinate translation. // For the full grid of logical coordinates we expect that there are no duplicates of physical coordinates. -// For the reverse mapping back of physical to logical coordinates we expect that same logical coordinates are returned as from original mapping. +// For the reverse mapping back of physical to logical coordinates we expect that same logical coordinates are returned +// as from original mapping. 
TEST(SocDescriptor, SocDescriptorGSLogicalPhysicalMapping) { - tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml")); std::map logical_to_physical; @@ -111,7 +108,7 @@ TEST(SocDescriptor, SocDescriptorGSLogicalPhysicalMapping) { for (auto it : logical_to_physical) { tt_physical_coords physical_coords = it.second; tt_logical_coords logical_coords = soc_desc.to_logical_coords(physical_coords); - + // Expect that reverse mapping of physical coordinates gives the same logical coordinates // using which we got the physical coordinates. EXPECT_EQ(it.first, logical_coords); @@ -120,9 +117,9 @@ TEST(SocDescriptor, SocDescriptorGSLogicalPhysicalMapping) { // Test logical to virtual coordinate translation. // For the full grid of logical coordinates we expect that there are no duplicates of virtual coordinates. -// For the reverse mapping back of virtual to logical coordinates we expect that same logical coordinates are returned as from original mapping. +// For the reverse mapping back of virtual to logical coordinates we expect that same logical coordinates are returned +// as from original mapping. 
TEST(SocDescriptor, SocDescriptorGSLogicalVirtualMapping) { - tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml")); std::map logical_to_virtual; diff --git a/tests/api/test_soc_descriptor_wh.cpp b/tests/api/test_soc_descriptor_wh.cpp index 37923210c..327219655 100644 --- a/tests/api/test_soc_descriptor_wh.cpp +++ b/tests/api/test_soc_descriptor_wh.cpp @@ -3,35 +3,33 @@ * * SPDX-License-Identifier: Apache-2.0 */ -#include "gtest/gtest.h" - #include "device/tt_soc_descriptor.h" +#include "gtest/gtest.h" #include "tests/test_utils/generate_cluster_desc.hpp" #include "tests/test_utils/soc_desc_test_utils.hpp" - // Wormhole workers - x-y annotation // functional_workers: // [ -// 1-1, 2-1, 3-1, 4-1, 6-1, 7-1, 8-1, 9-1, -// 1-2, 2-2, 3-2, 4-2, 6-2, 7-2, 8-2, 9-2, -// 1-3, 2-3, 3-3, 4-3, 6-3, 7-3, 8-3, 9-3, -// 1-4, 2-4, 3-4, 4-4, 6-4, 7-4, 8-4, 9-4, -// 1-5, 2-5, 3-5, 4-5, 6-5, 7-5, 8-5, 9-5, -// 1-7, 2-7, 3-7, 4-7, 6-7, 7-7, 8-7, 9-7, -// 1-8, 2-8, 3-8, 4-8, 6-8, 7-8, 8-8, 9-8, -// 1-9, 2-9, 3-9, 4-9, 6-9, 7-9, 8-9, 9-9, -// 1-10, 2-10, 3-10, 4-10, 6-10, 7-10, 8-10, 9-10, -// 1-11, 2-11, 3-11, 4-11, 6-11, 7-11, 8-11, 9-11, +// 1-1, 2-1, 3-1, 4-1, 6-1, 7-1, 8-1, 9-1, +// 1-2, 2-2, 3-2, 4-2, 6-2, 7-2, 8-2, 9-2, +// 1-3, 2-3, 3-3, 4-3, 6-3, 7-3, 8-3, 9-3, +// 1-4, 2-4, 3-4, 4-4, 6-4, 7-4, 8-4, 9-4, +// 1-5, 2-5, 3-5, 4-5, 6-5, 7-5, 8-5, 9-5, +// 1-7, 2-7, 3-7, 4-7, 6-7, 7-7, 8-7, 9-7, +// 1-8, 2-8, 3-8, 4-8, 6-8, 7-8, 8-8, 9-8, +// 1-9, 2-9, 3-9, 4-9, 6-9, 7-9, 8-9, 9-9, +// 1-10, 2-10, 3-10, 4-10, 6-10, 7-10, 8-10, 9-10, +// 1-11, 2-11, 3-11, 4-11, 6-11, 7-11, 8-11, 9-11, // ] // Tests that all physical coordinates are same as all virtual coordinates // when there is no harvesting. 
TEST(SocDescriptor, SocDescriptorWHNoHarvesting) { - const std::size_t harvesting_mask = 0; - - tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), harvesting_mask); + + tt_SocDescriptor soc_desc = + tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), harvesting_mask); // We expect full grid size since there is no harvesting. tt_xy_pair worker_grid_size = soc_desc.worker_grid_size; @@ -51,10 +49,10 @@ TEST(SocDescriptor, SocDescriptorWHNoHarvesting) { // We expect that the top left core will have virtual and physical coordinates (1, 1) and (1, 2) for // the logical coordinates if the first row is harvested. TEST(SocDescriptor, SocDescriptorWHTopLeftCore) { - const std::size_t harvesting_mask = 1; - tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), harvesting_mask); + tt_SocDescriptor soc_desc = + tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), harvesting_mask); tt_xy_pair worker_grid_size = soc_desc.worker_grid_size; tt_logical_coords logical_coords = tt_logical_coords(0, 0); @@ -70,13 +68,12 @@ TEST(SocDescriptor, SocDescriptorWHTopLeftCore) { // Test logical to physical coordinate translation. // For the full grid of logical coordinates we expect that there are no duplicates of physical coordinates. -// For the reverse mapping back of physical to logical coordinates we expect that same logical coordinates are returned as from original mapping. +// For the reverse mapping back of physical to logical coordinates we expect that same logical coordinates are returned +// as from original mapping. 
TEST(SocDescriptor, SocDescriptorWHLogicalPhysicalMapping) { - const std::size_t max_num_harvested_y = 10; tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml")); for (std::size_t harvesting_mask = 0; harvesting_mask < (1 << max_num_harvested_y); harvesting_mask++) { - soc_desc.perform_harvesting(harvesting_mask); std::map logical_to_physical; @@ -96,8 +93,9 @@ TEST(SocDescriptor, SocDescriptorWHLogicalPhysicalMapping) { physical_coords_set.insert(physical_coords); } } - - // Expect that the number of physical coordinates is equal to the number of workers minus the number of harvested rows. + + // Expect that the number of physical coordinates is equal to the number of workers minus the number of + // harvested rows. EXPECT_EQ(physical_coords_set.size(), worker_grid_size.x * (worker_grid_size.y - num_harvested_y)); for (auto it : logical_to_physical) { @@ -113,13 +111,12 @@ TEST(SocDescriptor, SocDescriptorWHLogicalPhysicalMapping) { // Test logical to virtual coordinate translation. // For the full grid of logical coordinates we expect that there are no duplicates of virtual coordinates. -// For the reverse mapping back of virtual to logical coordinates we expect that same logical coordinates are returned as from original mapping. +// For the reverse mapping back of virtual to logical coordinates we expect that same logical coordinates are returned +// as from original mapping. 
TEST(SocDescriptor, SocDescriptorWHLogicalVirtualMapping) { - const std::size_t max_num_harvested_y = 10; tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml")); for (std::size_t harvesting_mask = 0; harvesting_mask < (1 << max_num_harvested_y); harvesting_mask++) { - soc_desc.perform_harvesting(harvesting_mask); std::map logical_to_virtual; @@ -153,17 +150,18 @@ TEST(SocDescriptor, SocDescriptorWHLogicalVirtualMapping) { // Test top left corner translation from logical to translated coordinates. TEST(SocDescriptor, SocDescriptorWHLogicalTranslatedTopLeft) { - const std::size_t translated_x_start = 18; const std::size_t translated_y_start = 18; - const tt_translated_coords expected_translated_coords = tt_translated_coords(translated_x_start, translated_y_start); + const tt_translated_coords expected_translated_coords = + tt_translated_coords(translated_x_start, translated_y_start); const std::size_t max_num_harvested_y = 10; tt_SocDescriptor soc_desc = tt_SocDescriptor(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml")); - // We go up to numbers less than 2^10 - 1 to test all possible harvesting masks, we don't want to try to convert if everything is harvested. + // We go up to numbers less than 2^10 - 1 to test all possible harvesting masks, we don't want to try to convert if + // everything is harvested. 
for (std::size_t harvesting_mask = 0; harvesting_mask < (1 << max_num_harvested_y) - 1; harvesting_mask++) { soc_desc.perform_harvesting(harvesting_mask); - + tt_xy_pair worker_grid_size = soc_desc.worker_grid_size; std::size_t num_harvested_y = test_utils::get_num_harvested(harvesting_mask); diff --git a/tests/blackhole/test_bh_common.h b/tests/blackhole/test_bh_common.h index 5d115e31b..7d89dfe53 100644 --- a/tests/blackhole/test_bh_common.h +++ b/tests/blackhole/test_bh_common.h @@ -3,12 +3,11 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once -#include "tt_xy_pair.h" -#include "tt_cluster_descriptor.h" #include "cluster.h" - -#include "tests/test_utils/stimulus_generators.hpp" #include "eth_l1_address_map.h" +#include "tests/test_utils/stimulus_generators.hpp" +#include "tt_cluster_descriptor.h" +#include "tt_xy_pair.h" using namespace tt::umd; @@ -16,68 +15,68 @@ namespace tt::umd::test::utils { static void set_params_for_remote_txn(Cluster& device) { // Populate address map and NOC parameters that the driver needs for remote transactions - device.set_device_l1_address_params({l1_mem::address_map::L1_BARRIER_BASE, eth_l1_mem::address_map::ERISC_BARRIER_BASE, eth_l1_mem::address_map::FW_VERSION_ADDR}); + device.set_device_l1_address_params( + {l1_mem::address_map::L1_BARRIER_BASE, + eth_l1_mem::address_map::ERISC_BARRIER_BASE, + eth_l1_mem::address_map::FW_VERSION_ADDR}); } class BlackholeTestFixture : public ::testing::Test { - protected: - // You can remove any or all of the following functions if their bodies would - // be empty. - - std::unique_ptr device; +protected: + // You can remove any or all of the following functions if their bodies would + // be empty. - BlackholeTestFixture() { + std::unique_ptr device; - } + BlackholeTestFixture() {} - ~BlackholeTestFixture() override { - // You can do clean-up work that doesn't throw exceptions here. - } + ~BlackholeTestFixture() override { + // You can do clean-up work that doesn't throw exceptions here. 
+ } - virtual int get_detected_num_chips() = 0; - virtual bool is_test_skipped() = 0; + virtual int get_detected_num_chips() = 0; + virtual bool is_test_skipped() = 0; - // If the constructor and destructor are not enough for setting up - // and cleaning up each test, you can define the following methods: + // If the constructor and destructor are not enough for setting up + // and cleaning up each test, you can define the following methods: - void SetUp() override { - // Code here will be called immediately after the constructor (right - // before each test). + void SetUp() override { + // Code here will be called immediately after the constructor (right + // before each test). - if (is_test_skipped()) { - GTEST_SKIP() << "Test is skipped due to incorrect number of chips"; - } + if (is_test_skipped()) { + GTEST_SKIP() << "Test is skipped due to incorrect number of chips"; + } - // std::cout << "Setting Up Test." << std::endl; - assert(get_detected_num_chips() > 0); - auto devices = std::vector(get_detected_num_chips()); - std::iota(devices.begin(), devices.end(), 0); - std::set target_devices = {devices.begin(), devices.end()}; - uint32_t num_host_mem_ch_per_mmio_device = 1; - device = std::make_unique(num_host_mem_ch_per_mmio_device, false, true, true); - assert(device != nullptr); - assert(device->get_cluster_description()->get_number_of_chips() == get_detected_num_chips()); + // std::cout << "Setting Up Test." 
<< std::endl; + assert(get_detected_num_chips() > 0); + auto devices = std::vector(get_detected_num_chips()); + std::iota(devices.begin(), devices.end(), 0); + std::set target_devices = {devices.begin(), devices.end()}; + uint32_t num_host_mem_ch_per_mmio_device = 1; + device = std::make_unique(num_host_mem_ch_per_mmio_device, false, true, true); + assert(device != nullptr); + assert(device->get_cluster_description()->get_number_of_chips() == get_detected_num_chips()); - set_params_for_remote_txn(*device); + set_params_for_remote_txn(*device); - tt_device_params default_params; - device->start_device(default_params); + tt_device_params default_params; + device->start_device(default_params); - device->deassert_risc_reset(); + device->deassert_risc_reset(); - device->wait_for_non_mmio_flush(); - } + device->wait_for_non_mmio_flush(); + } - void TearDown() override { - // Code here will be called immediately after each test (right - // before the destructor). + void TearDown() override { + // Code here will be called immediately after each test (right + // before the destructor). - if (!is_test_skipped()) { - // std::cout << "Tearing Down Test." << std::endl; - device->close_device(); + if (!is_test_skipped()) { + // std::cout << "Tearing Down Test." 
<< std::endl; + device->close_device(); + } } - } - }; -} // namespace tt::umd::test::utils +} // namespace tt::umd::test::utils diff --git a/tests/blackhole/test_silicon_driver_bh.cpp b/tests/blackhole/test_silicon_driver_bh.cpp index b2b7bde10..39b3c89ef 100644 --- a/tests/blackhole/test_silicon_driver_bh.cpp +++ b/tests/blackhole/test_silicon_driver_bh.cpp @@ -2,30 +2,41 @@ // // SPDX-License-Identifier: Apache-2.0 -#include "gtest/gtest.h" #include -#include "eth_l1_address_map.h" -#include "l1_address_map.h" -#include "host_mem_address_map.h" -#include + #include +#include #include "device/blackhole/blackhole_implementation.h" #include "device/tt_cluster_descriptor.h" -#include "tests/test_utils/generate_cluster_desc.hpp" +#include "eth_l1_address_map.h" +#include "gtest/gtest.h" +#include "host_mem_address_map.h" +#include "l1_address_map.h" #include "tests/test_utils/device_test_utils.hpp" +#include "tests/test_utils/generate_cluster_desc.hpp" using namespace tt::umd; void set_params_for_remote_txn(Cluster& device) { // Populate address map and NOC parameters that the driver needs for remote transactions - device.set_device_l1_address_params({l1_mem::address_map::L1_BARRIER_BASE, eth_l1_mem::address_map::ERISC_BARRIER_BASE, eth_l1_mem::address_map::FW_VERSION_ADDR}); + device.set_device_l1_address_params( + {l1_mem::address_map::L1_BARRIER_BASE, + eth_l1_mem::address_map::ERISC_BARRIER_BASE, + eth_l1_mem::address_map::FW_VERSION_ADDR}); } std::int32_t get_static_tlb_index(tt_xy_pair target) { - bool is_eth_location = std::find(std::begin(tt::umd::blackhole::ETH_LOCATIONS), std::end(tt::umd::blackhole::ETH_LOCATIONS), target) != std::end(tt::umd::blackhole::ETH_LOCATIONS); - bool is_tensix_location = std::find(std::begin(tt::umd::blackhole::T6_X_LOCATIONS), std::end(tt::umd::blackhole::T6_X_LOCATIONS), target.x) != std::end(tt::umd::blackhole::T6_X_LOCATIONS) && - std::find(std::begin(tt::umd::blackhole::T6_Y_LOCATIONS), 
std::end(tt::umd::blackhole::T6_Y_LOCATIONS), target.y) != std::end(tt::umd::blackhole::T6_Y_LOCATIONS); + bool is_eth_location = + std::find(std::begin(tt::umd::blackhole::ETH_LOCATIONS), std::end(tt::umd::blackhole::ETH_LOCATIONS), target) != + std::end(tt::umd::blackhole::ETH_LOCATIONS); + bool is_tensix_location = + std::find( + std::begin(tt::umd::blackhole::T6_X_LOCATIONS), std::end(tt::umd::blackhole::T6_X_LOCATIONS), target.x) != + std::end(tt::umd::blackhole::T6_X_LOCATIONS) && + std::find( + std::begin(tt::umd::blackhole::T6_Y_LOCATIONS), std::end(tt::umd::blackhole::T6_Y_LOCATIONS), target.y) != + std::end(tt::umd::blackhole::T6_Y_LOCATIONS); if (is_eth_location) { if (target.y == 6) { target.y = 1; @@ -61,7 +72,8 @@ std::int32_t get_static_tlb_index(tt_xy_pair target) { std::set get_target_devices() { std::set target_devices; - std::unique_ptr cluster_desc_uniq = tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); + std::unique_ptr cluster_desc_uniq = + tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); for (int i = 0; i < cluster_desc_uniq->get_number_of_chips(); i++) { target_devices.insert(i); } @@ -73,8 +85,15 @@ TEST(SiliconDriverBH, CreateDestroy) { uint32_t num_host_mem_ch_per_mmio_device = 1; tt_device_params default_params; // Initialize the driver with a 1x1 descriptor and explictly do not perform harvesting - for(int i = 0; i < 50; i++) { - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, false); + for (int i = 0; i < 50; i++) { + Cluster device = Cluster( + test_utils::GetAbsPath("tests/soc_descs/blackhole_140_arch_no_eth.yaml"), + tt_ClusterDescriptor::get_cluster_descriptor_file_path(), + target_devices, + num_host_mem_ch_per_mmio_device, + false, + true, + false); 
set_params_for_remote_txn(device); device.start_device(default_params); device.deassert_risc_reset(); @@ -85,81 +104,113 @@ TEST(SiliconDriverBH, CreateDestroy) { // TEST(SiliconDriverWH, Harvesting) { // std::set target_devices = {0, 1}; // std::unordered_map simulated_harvesting_masks = {{0, 30}, {1, 60}}; - + // { -// std::unique_ptr cluster_desc_uniq = tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); +// std::unique_ptr cluster_desc_uniq = +// tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); // if (cluster_desc_uniq->get_number_of_chips() != target_devices.size()) { -// GTEST_SKIP() << "SiliconDriverWH.Harvesting skipped because it can only be run on a two chip nebula system"; +// GTEST_SKIP() << "SiliconDriverWH.Harvesting skipped because it can only be run on a two chip nebula +// system"; // } // } // uint32_t num_host_mem_ch_per_mmio_device = 1; -// Cluster device = Cluster("./tests/soc_descs/wormhole_b0_8x10.yaml", tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true, simulated_harvesting_masks); +// Cluster device = Cluster( +// "./tests/soc_descs/wormhole_b0_8x10.yaml", +// tt_ClusterDescriptor::get_cluster_descriptor_file_path(), +// target_devices, +// num_host_mem_ch_per_mmio_device, +// false, +// true, +// true, +// simulated_harvesting_masks); // auto sdesc_per_chip = device.get_virtual_soc_descriptors(); // ASSERT_EQ(device.using_harvested_soc_descriptors(), true) << "Expected Driver to have performed harvesting"; -// for(const auto& chip : sdesc_per_chip) { -// ASSERT_EQ(chip.second.workers.size(), 48) << "Expected SOC descriptor with harvesting to have 48 workers for chip" << chip.first; +// for (const auto& chip : sdesc_per_chip) { +// ASSERT_EQ(chip.second.workers.size(), 48) +// << "Expected SOC descriptor with harvesting to have 48 workers for chip" << chip.first; // } -// 
ASSERT_EQ(device.get_harvesting_masks_for_soc_descriptors().at(0), 30) << "Expected first chip to have harvesting mask of 30"; -// ASSERT_EQ(device.get_harvesting_masks_for_soc_descriptors().at(1), 60) << "Expected second chip to have harvesting mask of 60"; +// ASSERT_EQ(device.get_harvesting_masks_for_soc_descriptors().at(0), 30) +// << "Expected first chip to have harvesting mask of 30"; +// ASSERT_EQ(device.get_harvesting_masks_for_soc_descriptors().at(1), 60) +// << "Expected second chip to have harvesting mask of 60"; // } // TEST(SiliconDriverWH, CustomSocDesc) { // std::set target_devices = {0, 1}; // std::unordered_map simulated_harvesting_masks = {{0, 30}, {1, 60}}; // { -// std::unique_ptr cluster_desc_uniq = tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); +// std::unique_ptr cluster_desc_uniq = +// tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); // if (cluster_desc_uniq->get_number_of_chips() != target_devices.size()) { -// GTEST_SKIP() << "SiliconDriverWH.Harvesting skipped because it can only be run on a two chip nebula system"; +// GTEST_SKIP() << "SiliconDriverWH.Harvesting skipped because it can only be run on a two chip nebula +// system"; // } // } // uint32_t num_host_mem_ch_per_mmio_device = 1; // // Initialize the driver with a 1x1 descriptor and explictly do not perform harvesting -// Cluster device = Cluster("./tests/soc_descs/wormhole_b0_1x1.yaml", tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, false, simulated_harvesting_masks); +// Cluster device = Cluster( +// "./tests/soc_descs/wormhole_b0_1x1.yaml", +// tt_ClusterDescriptor::get_cluster_descriptor_file_path(), +// target_devices, +// num_host_mem_ch_per_mmio_device, +// false, +// true, +// false, +// simulated_harvesting_masks); // auto sdesc_per_chip = device.get_virtual_soc_descriptors(); - -// 
ASSERT_EQ(device.using_harvested_soc_descriptors(), false) << "SOC descriptors should not be modified when harvesting is disabled"; -// for(const auto& chip : sdesc_per_chip) { + +// ASSERT_EQ(device.using_harvested_soc_descriptors(), false) +// << "SOC descriptors should not be modified when harvesting is disabled"; +// for (const auto& chip : sdesc_per_chip) { // ASSERT_EQ(chip.second.workers.size(), 1) << "Expected 1x1 SOC descriptor to be unmodified by driver"; // } // } // TEST(SiliconDriverWH, HarvestingRuntime) { - -// auto get_static_tlb_index_callback = [] (tt_xy_pair target) { -// return get_static_tlb_index(target); -// }; +// auto get_static_tlb_index_callback = [](tt_xy_pair target) { return get_static_tlb_index(target); }; // std::set target_devices = {0, 1}; // std::unordered_map simulated_harvesting_masks = {{0, 30}, {1, 60}}; // { -// std::unique_ptr cluster_desc_uniq = tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); +// std::unique_ptr cluster_desc_uniq = +// tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); // if (cluster_desc_uniq->get_number_of_chips() != target_devices.size()) { -// GTEST_SKIP() << "SiliconDriverWH.Harvesting skipped because it can only be run on a two chip nebula system"; +// GTEST_SKIP() << "SiliconDriverWH.Harvesting skipped because it can only be run on a two chip nebula +// system"; // } // } // uint32_t num_host_mem_ch_per_mmio_device = 1; - -// Cluster device = Cluster("./tests/soc_descs/wormhole_b0_8x10.yaml", tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, true, simulated_harvesting_masks); + +// Cluster device = Cluster( +// "./tests/soc_descs/wormhole_b0_8x10.yaml", +// tt_ClusterDescriptor::get_cluster_descriptor_file_path(), +// target_devices, +// num_host_mem_ch_per_mmio_device, +// false, +// true, +// true, +// simulated_harvesting_masks); // 
set_params_for_remote_txn(device); // auto mmio_devices = device.get_target_mmio_device_ids(); - -// for(int i = 0; i < target_devices.size(); i++) { + +// for (int i = 0; i < target_devices.size(); i++) { // // Iterate over MMIO devices and only setup static TLBs for worker cores -// if(std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { +// if (std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { // auto& sdesc = device.get_virtual_soc_descriptors().at(i); -// for(auto& core : sdesc.workers) { -// // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. -// device.configure_tlb(i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); +// for (auto& core : sdesc.workers) { +// // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. +// device.configure_tlb( +// i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); // } -// } +// } // } // device.setup_core_to_tlb_map(get_static_tlb_index_callback); - + // tt_device_params default_params; // device.start_device(default_params); // device.deassert_risc_reset(); @@ -169,29 +220,57 @@ TEST(SiliconDriverBH, CreateDestroy) { // std::vector readback_vec = {}; // std::vector zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - -// for(int i = 0; i < target_devices.size(); i++) { +// for (int i = 0; i < target_devices.size(); i++) { // std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; // std::uint32_t dynamic_write_address = 0x40000000; -// for(int loop = 0; loop < 100; loop++){ // Write to each core a 100 times at different statically mapped addresses -// for(auto& core : device.get_virtual_soc_descriptors().at(i).workers) { -// device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, ""); -// device.write_to_device(vector_to_write.data(), 
vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), dynamic_write_address, "SMALL_READ_WRITE_TLB"); -// device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited - +// for (int loop = 0; loop < 100; +// loop++) { // Write to each core a 100 times at different statically mapped addresses +// for (auto& core : device.get_virtual_soc_descriptors().at(i).workers) { +// device.write_to_device( +// vector_to_write.data(), +// vector_to_write.size() * sizeof(std::uint32_t), +// tt_cxy_pair(i, core), +// address, +// ""); +// device.write_to_device( +// vector_to_write.data(), +// vector_to_write.size() * sizeof(std::uint32_t), +// tt_cxy_pair(i, core), +// dynamic_write_address, +// "SMALL_READ_WRITE_TLB"); +// device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited + // test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, 40, ""); -// test_utils::read_data_from_device(device, dynamic_readback_vec, tt_cxy_pair(i, core), dynamic_write_address, 40, "SMALL_READ_WRITE_TLB"); -// ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; -// ASSERT_EQ(vector_to_write, dynamic_readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; +// test_utils::read_data_from_device( +// device, +// dynamic_readback_vec, +// tt_cxy_pair(i, core), +// dynamic_write_address, +// 40, +// "SMALL_READ_WRITE_TLB"); +// ASSERT_EQ(vector_to_write, readback_vec) +// << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; +// ASSERT_EQ(vector_to_write, dynamic_readback_vec) +// << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; // device.wait_for_non_mmio_flush(); - -// device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), 
tt_cxy_pair(i, core), dynamic_write_address, "SMALL_READ_WRITE_TLB"); // Clear any written data -// device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, ""); // Clear any written data + +// device.write_to_device( +// zeros.data(), +// zeros.size() * sizeof(std::uint32_t), +// tt_cxy_pair(i, core), +// dynamic_write_address, +// "SMALL_READ_WRITE_TLB"); // Clear any written data +// device.write_to_device( +// zeros.data(), +// zeros.size() * sizeof(std::uint32_t), +// tt_cxy_pair(i, core), +// address, +// ""); // Clear any written data // device.wait_for_non_mmio_flush(); // readback_vec = {}; // dynamic_readback_vec = {}; // } -// address += 0x20; // Increment by uint32_t size for each write +// address += 0x20; // Increment by uint32_t size for each write // dynamic_write_address += 0x20; // } // } @@ -199,45 +278,44 @@ TEST(SiliconDriverBH, CreateDestroy) { // } TEST(SiliconDriverBH, UnalignedStaticTLB_RW) { - auto get_static_tlb_index_callback = [] (tt_xy_pair target) { - return get_static_tlb_index(target); - }; + auto get_static_tlb_index_callback = [](tt_xy_pair target) { return get_static_tlb_index(target); }; std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { // Iterate over MMIO devices and only setup static TLBs for worker cores - if(std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { + if (std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { auto& sdesc = device.get_virtual_soc_descriptors().at(i); - for(auto& core : sdesc.workers) { - // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. 
- device.configure_tlb(i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); + for (auto& core : sdesc.workers) { + // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. + device.configure_tlb( + i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); } device.setup_core_to_tlb_map(i, get_static_tlb_index_callback); - } + } } - + tt_device_params default_params; device.start_device(default_params); device.deassert_risc_reset(); std::vector unaligned_sizes = {3, 14, 21, 255, 362, 430, 1022, 1023, 1025}; - for(int i = 0; i < target_devices.size(); i++) { - for(const auto& size : unaligned_sizes) { + for (int i = 0; i < target_devices.size(); i++) { + for (const auto& size : unaligned_sizes) { std::vector write_vec(size, 0); - for(int i = 0; i < size; i++){ + for (int i = 0; i < size; i++) { write_vec[i] = size + i; } std::vector readback_vec(size, 0); std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; - for(int loop = 0; loop < 50; loop++){ - for(auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + for (int loop = 0; loop < 50; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(i).workers) { device.write_to_device(write_vec.data(), size, tt_cxy_pair(i, core), address, ""); device.wait_for_non_mmio_flush(); device.read_from_device(readback_vec.data(), tt_cxy_pair(i, core), address, size, ""); @@ -251,37 +329,35 @@ TEST(SiliconDriverBH, UnalignedStaticTLB_RW) { } address += 0x20; } - } } device.close_device(); } TEST(SiliconDriverBH, StaticTLB_RW) { - auto get_static_tlb_index_callback = [] (tt_xy_pair target) { - return get_static_tlb_index(target); - }; + auto get_static_tlb_index_callback = [](tt_xy_pair target) { return get_static_tlb_index(target); }; std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, 
false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { // Iterate over MMIO devices and only setup static TLBs for worker cores - if(std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { + if (std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { auto& sdesc = device.get_virtual_soc_descriptors().at(i); - for(auto& core : sdesc.workers) { - // Statically mapping a 2MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. - device.configure_tlb(i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); + for (auto& core : sdesc.workers) { + // Statically mapping a 2MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. + device.configure_tlb( + i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); } device.setup_core_to_tlb_map(i, get_static_tlb_index_callback); - } + } } - + printf("MT: Static TLBs set\n"); tt_device_params default_params; @@ -292,27 +368,40 @@ TEST(SiliconDriverBH, StaticTLB_RW) { std::vector readback_vec = {}; std::vector zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // Check functionality of Static TLBs by reading adn writing from statically mapped address space - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; - for(int loop = 0; loop < 1; loop++){ // Write to each core a 100 times at different statically mapped addresses - for(auto& core : device.get_virtual_soc_descriptors().at(i).workers) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, ""); - device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited + for (int loop = 
0; loop < 1; + loop++) { // Write to each core a 100 times at different statically mapped addresses + for (auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + ""); + device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, 40, ""); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; device.wait_for_non_mmio_flush(); - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "SMALL_READ_WRITE_TLB"); // Clear any written data + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "SMALL_READ_WRITE_TLB"); // Clear any written data device.wait_for_non_mmio_flush(); readback_vec = {}; } - address += 0x20; // Increment by uint32_t size for each write + address += 0x20; // Increment by uint32_t size for each write } } - device.close_device(); + device.close_device(); } TEST(SiliconDriverBH, DynamicTLB_RW) { - // Don't use any static TLBs in this test. All writes go through a dynamic TLB that needs to be reconfigured for each transaction + // Don't use any static TLBs in this test. 
All writes go through a dynamic TLB that needs to be reconfigured for + // each transaction std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; @@ -329,42 +418,68 @@ TEST(SiliconDriverBH, DynamicTLB_RW) { std::vector zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; std::vector readback_vec = {}; - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; - for(int loop = 0; loop < 100; loop++){ // Write to each core a 100 times at different statically mapped addresses - for(auto& core : device.get_virtual_soc_descriptors().at(i).workers) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "SMALL_READ_WRITE_TLB"); - device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, 40, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + for (int loop = 0; loop < 100; + loop++) { // Write to each core a 100 times at different statically mapped addresses + for (auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "SMALL_READ_WRITE_TLB"); + device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, core), address, 40, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; device.wait_for_non_mmio_flush(); - device.write_to_device(zeros.data(), 
zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "SMALL_READ_WRITE_TLB"); + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "SMALL_READ_WRITE_TLB"); device.wait_for_non_mmio_flush(); readback_vec = {}; } - address += 0x20; // Increment by uint32_t size for each write + address += 0x20; // Increment by uint32_t size for each write } } printf("Target Tensix cores completed\n"); - + // Target DRAM channel 0 constexpr int NUM_CHANNELS = 8; std::vector dram_vector_to_write = {10, 11, 12, 13, 14, 15, 16, 17, 18, 19}; std::uint32_t address = 0x400; - for(int i = 0; i < target_devices.size(); i++) { - for(int loop = 0; loop < 100; loop++){ // Write to each core a 100 times at different statically mapped addresses - for (int ch=0; ch chan = device.get_virtual_soc_descriptors().at(i).dram_cores.at(ch); tt_xy_pair subchan = chan.at(0); - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, subchan), address, "SMALL_READ_WRITE_TLB"); - device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, subchan), address, 40, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << subchan.x << "-" << subchan.y << "does not match what was written"; + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, subchan), + address, + "SMALL_READ_WRITE_TLB"); + device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, subchan), address, 40, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << subchan.x << "-" + << subchan.y << "does not match what was written"; 
device.wait_for_non_mmio_flush(); - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, subchan), address, "SMALL_READ_WRITE_TLB"); + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, subchan), + address, + "SMALL_READ_WRITE_TLB"); device.wait_for_non_mmio_flush(); readback_vec = {}; - address += 0x20; // Increment by uint32_t size for each write + address += 0x20; // Increment by uint32_t size for each write } } } @@ -381,7 +496,7 @@ TEST(SiliconDriverBH, MultiThreadedDevice) { uint32_t num_host_mem_ch_per_mmio_device = 1; Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); - + set_params_for_remote_txn(device); tt_device_params default_params; @@ -392,11 +507,18 @@ TEST(SiliconDriverBH, MultiThreadedDevice) { std::vector vector_to_write = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; std::vector readback_vec = {}; std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; - for(int loop = 0; loop < 100; loop++) { - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, "SMALL_READ_WRITE_TLB"); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + for (int loop = 0; loop < 100; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(0, core), + address, + "SMALL_READ_WRITE_TLB"); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from core " 
<< core.x << "-" << core.y << "does not match what was written"; readback_vec = {}; } address += 0x20; @@ -407,12 +529,19 @@ TEST(SiliconDriverBH, MultiThreadedDevice) { std::vector vector_to_write = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; std::vector readback_vec = {}; std::uint32_t address = 0x30000000; - for(auto& core_ls : device.get_virtual_soc_descriptors().at(0).dram_cores) { - for(int loop = 0; loop < 100; loop++) { - for(auto& core : core_ls) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, "SMALL_READ_WRITE_TLB"); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + for (auto& core_ls : device.get_virtual_soc_descriptors().at(0).dram_cores) { + for (int loop = 0; loop < 100; loop++) { + for (auto& core : core_ls) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(0, core), + address, + "SMALL_READ_WRITE_TLB"); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y + << "does not match what was written"; readback_vec = {}; } address += 0x20; @@ -427,13 +556,11 @@ TEST(SiliconDriverBH, MultiThreadedDevice) { TEST(SiliconDriverBH, MultiThreadedMemBar) { // Have 2 threads read and write from a single device concurrently - // All (fairly large) transactions go through a static TLB. + // All (fairly large) transactions go through a static TLB. // We want to make sure the memory barrier is thread/process safe. 
// Memory barrier flags get sent to address 0 for all channels in this test - auto get_static_tlb_index_callback = [] (tt_xy_pair target) { - return get_static_tlb_index(target); - }; + auto get_static_tlb_index_callback = [](tt_xy_pair target) { return get_static_tlb_index(target); }; std::set target_devices = get_target_devices(); uint32_t base_addr = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; @@ -441,11 +568,11 @@ TEST(SiliconDriverBH, MultiThreadedMemBar) { Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { // Iterate over devices and only setup static TLBs for functional worker cores auto& sdesc = device.get_virtual_soc_descriptors().at(i); - for(auto& core : sdesc.workers) { - // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. + for (auto& core : sdesc.workers) { + // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. 
device.configure_tlb(i, core, get_static_tlb_index_callback(core), base_addr); } device.setup_core_to_tlb_map(i, get_static_tlb_index_callback); @@ -454,24 +581,41 @@ TEST(SiliconDriverBH, MultiThreadedMemBar) { tt_device_params default_params; device.start_device(default_params); device.deassert_risc_reset(); - + std::vector readback_membar_vec = {}; - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { - test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), l1_mem::address_map::L1_BARRIER_BASE, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all workers + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + test_utils::read_data_from_device( + device, + readback_membar_vec, + tt_cxy_pair(0, core), + l1_mem::address_map::L1_BARRIER_BASE, + 4, + "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all workers readback_membar_vec = {}; } - for(int chan = 0; chan < device.get_virtual_soc_descriptors().at(0).get_num_dram_channels(); chan++) { + for (int chan = 0; chan < device.get_virtual_soc_descriptors().at(0).get_num_dram_channels(); chan++) { auto core = device.get_virtual_soc_descriptors().at(0).get_core_for_dram_channel(chan, 0); - test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all DRAM + test_utils::read_data_from_device( + device, readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all DRAM readback_membar_vec = {}; } - - for(auto& core : device.get_virtual_soc_descriptors().at(0).ethernet_cores) { - 
test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), eth_l1_mem::address_map::ERISC_BARRIER_BASE, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all ethernet cores + + for (auto& core : device.get_virtual_soc_descriptors().at(0).ethernet_cores) { + test_utils::read_data_from_device( + device, + readback_membar_vec, + tt_cxy_pair(0, core), + eth_l1_mem::address_map::ERISC_BARRIER_BASE, + 4, + "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + readback_membar_vec.at(0), + 187); // Ensure that memory barriers were correctly initialized on all ethernet cores readback_membar_vec = {}; } @@ -481,38 +625,43 @@ TEST(SiliconDriverBH, MultiThreadedMemBar) { std::vector vec2(2560); std::vector zeros(2560, 0); - for(int i = 0; i < vec1.size(); i++) { + for (int i = 0; i < vec1.size(); i++) { vec1.at(i) = i; } - for(int i = 0; i < vec2.size(); i++) { + for (int i = 0; i < vec2.size(); i++) { vec2.at(i) = vec1.size() + i; } std::thread th1 = std::thread([&] { std::uint32_t address = base_addr; - for(int loop = 0; loop < 50; loop++) { - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + for (int loop = 0; loop < 50; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { std::vector readback_vec = {}; - device.write_to_device(vec1.data(), vec1.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); + device.write_to_device( + vec1.data(), vec1.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); device.l1_membar(0, "SMALL_READ_WRITE_TLB", {core}); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 4*vec1.size(), ""); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(0, core), address, 4 * vec1.size(), ""); ASSERT_EQ(readback_vec, vec1); - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), 
address, ""); + device.write_to_device( + zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); readback_vec = {}; } - } }); std::thread th2 = std::thread([&] { std::uint32_t address = base_addr + vec1.size() * 4; - for(int loop = 0; loop < 50; loop++) { - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + for (int loop = 0; loop < 50; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { std::vector readback_vec = {}; - device.write_to_device(vec2.data(), vec2.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); + device.write_to_device( + vec2.data(), vec2.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); device.l1_membar(0, "SMALL_READ_WRITE_TLB", {core}); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 4*vec2.size(), ""); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(0, core), address, 4 * vec2.size(), ""); ASSERT_EQ(readback_vec, vec2); - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, "") ; + device.write_to_device( + zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); readback_vec = {}; } } @@ -521,26 +670,42 @@ TEST(SiliconDriverBH, MultiThreadedMemBar) { th1.join(); th2.join(); - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { - test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), l1_mem::address_map::L1_BARRIER_BASE, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers end up in the correct sate for workers + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + test_utils::read_data_from_device( + device, + readback_membar_vec, + tt_cxy_pair(0, core), + l1_mem::address_map::L1_BARRIER_BASE, + 4, + "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + 
readback_membar_vec.at(0), 187); // Ensure that memory barriers end up in the correct sate for workers readback_membar_vec = {}; } - for(auto& core : device.get_virtual_soc_descriptors().at(0).ethernet_cores) { - test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), eth_l1_mem::address_map::ERISC_BARRIER_BASE, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers end up in the correct sate for ethernet cores + for (auto& core : device.get_virtual_soc_descriptors().at(0).ethernet_cores) { + test_utils::read_data_from_device( + device, + readback_membar_vec, + tt_cxy_pair(0, core), + eth_l1_mem::address_map::ERISC_BARRIER_BASE, + 4, + "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + readback_membar_vec.at(0), + 187); // Ensure that memory barriers end up in the correct sate for ethernet cores readback_membar_vec = {}; } device.close_device(); } -TEST(SiliconDriverBH, DISABLED_BroadcastWrite) { // Cannot broadcast to tensix/ethernet and DRAM simultaneously on Blackhole .. wait_for_non_mmio_flush() is not working as expected? +TEST(SiliconDriverBH, DISABLED_BroadcastWrite) { // Cannot broadcast to tensix/ethernet and DRAM simultaneously on + // Blackhole .. wait_for_non_mmio_flush() is not working as expected? // Broadcast multiple vectors to tensix and dram grid. 
Verify broadcasted data is read back correctly std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); @@ -555,49 +720,80 @@ TEST(SiliconDriverBH, DISABLED_BroadcastWrite) { // Cannot broadcast to tensix/e std::set rows_to_exclude_for_dram_broadcast = {}; std::set cols_to_exclude_for_dram_broadcast = {1, 2, 3, 4, 6, 7, 8, 9}; - for(const auto& size : broadcast_sizes) { + for (const auto& size : broadcast_sizes) { std::vector vector_to_write(size); std::vector zeros(size); std::vector readback_vec = {}; - for(int i = 0; i < size; i++) { + for (int i = 0; i < size; i++) { vector_to_write[i] = i; zeros[i] = 0; } // Broadcast to Tensix - device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude, cols_to_exclude, "LARGE_WRITE_TLB"); - device.wait_for_non_mmio_flush(); // flush here so we don't simultaneously broadcast to DRAM? + device.broadcast_write_to_cluster( + vector_to_write.data(), + vector_to_write.size() * 4, + address, + {}, + rows_to_exclude, + cols_to_exclude, + "LARGE_WRITE_TLB"); + device.wait_for_non_mmio_flush(); // flush here so we don't simultaneously broadcast to DRAM? 
// Broadcast to DRAM - device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude_for_dram_broadcast, cols_to_exclude_for_dram_broadcast, "LARGE_WRITE_TLB"); + device.broadcast_write_to_cluster( + vector_to_write.data(), + vector_to_write.size() * 4, + address, + {}, + rows_to_exclude_for_dram_broadcast, + cols_to_exclude_for_dram_broadcast, + "LARGE_WRITE_TLB"); device.wait_for_non_mmio_flush(); - for(const auto i : target_devices) { - for(const auto& core : device.get_virtual_soc_descriptors().at(i).workers) { - if(rows_to_exclude.find(core.y) != rows_to_exclude.end()) continue; - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was broadcasted"; - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "LARGE_WRITE_TLB"); // Clear any written data + for (const auto i : target_devices) { + for (const auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + if (rows_to_exclude.find(core.y) != rows_to_exclude.end()) { + continue; + } + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y + << "does not match what was broadcasted"; + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "LARGE_WRITE_TLB"); // Clear any written data readback_vec = {}; } - for(int chan = 0; chan < device.get_virtual_soc_descriptors().at(i).get_num_dram_channels(); chan++) { + for (int chan = 0; chan < device.get_virtual_soc_descriptors().at(i).get_num_dram_channels(); chan++) { const auto& core = 
device.get_virtual_soc_descriptors().at(i).get_core_for_dram_channel(chan, 0); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from DRAM core " << i << " " << core.x << "-" << core.y << " does not match what was broadcasted " << size; - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "LARGE_WRITE_TLB"); // Clear any written data + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from DRAM core " << i << " " << core.x << "-" << core.y + << " does not match what was broadcasted " << size; + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "LARGE_WRITE_TLB"); // Clear any written data readback_vec = {}; } } // Wait for data to be cleared before writing next block device.wait_for_non_mmio_flush(); } - device.close_device(); + device.close_device(); } -TEST(SiliconDriverBH, DISABLED_VirtualCoordinateBroadcast) { // same problem as above.. +TEST(SiliconDriverBH, DISABLED_VirtualCoordinateBroadcast) { // same problem as above.. // Broadcast multiple vectors to tensix and dram grid. 
Verify broadcasted data is read back correctly std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); @@ -605,12 +801,14 @@ TEST(SiliconDriverBH, DISABLED_VirtualCoordinateBroadcast) { // same problem as tt_device_params default_params; device.start_device(default_params); auto eth_version = device.get_ethernet_fw_version(); - bool virtual_bcast_supported = (eth_version >= tt_version(6, 8, 0) || eth_version == tt_version(6, 7, 241)) && device.translation_tables_en; + bool virtual_bcast_supported = + (eth_version >= tt_version(6, 8, 0) || eth_version == tt_version(6, 7, 241)) && device.translation_tables_en; if (!virtual_bcast_supported) { device.close_device(); - GTEST_SKIP() << "SiliconDriverWH.VirtualCoordinateBroadcast skipped since ethernet version does not support Virtual Coordinate Broadcast or NOC translation is not enabled"; + GTEST_SKIP() << "SiliconDriverWH.VirtualCoordinateBroadcast skipped since ethernet version does not support " + "Virtual Coordinate Broadcast or NOC translation is not enabled"; } - + device.deassert_risc_reset(); std::vector broadcast_sizes = {1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384}; uint32_t address = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; @@ -619,38 +817,69 @@ TEST(SiliconDriverBH, DISABLED_VirtualCoordinateBroadcast) { // same problem as std::set rows_to_exclude_for_dram_broadcast = {}; std::set cols_to_exclude_for_dram_broadcast = {1, 2, 3, 4, 6, 7, 8, 9}; - for(const auto& size : broadcast_sizes) { + for (const auto& size : broadcast_sizes) { std::vector vector_to_write(size); std::vector zeros(size); std::vector readback_vec = {}; - for(int i = 0; i < size; i++) { + for (int i = 0; i < size; i++) { vector_to_write[i] = i; zeros[i] = 0; } // Broadcast to Tensix - 
device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude, cols_to_exclude, "LARGE_WRITE_TLB"); + device.broadcast_write_to_cluster( + vector_to_write.data(), + vector_to_write.size() * 4, + address, + {}, + rows_to_exclude, + cols_to_exclude, + "LARGE_WRITE_TLB"); // Broadcast to DRAM - device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude_for_dram_broadcast, cols_to_exclude_for_dram_broadcast, "LARGE_WRITE_TLB"); + device.broadcast_write_to_cluster( + vector_to_write.data(), + vector_to_write.size() * 4, + address, + {}, + rows_to_exclude_for_dram_broadcast, + cols_to_exclude_for_dram_broadcast, + "LARGE_WRITE_TLB"); device.wait_for_non_mmio_flush(); - for(const auto i : target_devices) { - for(const auto& core : device.get_virtual_soc_descriptors().at(i).workers) { - if(rows_to_exclude.find(core.y) != rows_to_exclude.end()) continue; - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was broadcasted"; - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "LARGE_WRITE_TLB"); // Clear any written data + for (const auto i : target_devices) { + for (const auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + if (rows_to_exclude.find(core.y) != rows_to_exclude.end()) { + continue; + } + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y + << "does not match what was broadcasted"; + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + 
address, + "LARGE_WRITE_TLB"); // Clear any written data readback_vec = {}; } - for(int chan = 0; chan < device.get_virtual_soc_descriptors().at(i).get_num_dram_channels(); chan++) { + for (int chan = 0; chan < device.get_virtual_soc_descriptors().at(i).get_num_dram_channels(); chan++) { const auto& core = device.get_virtual_soc_descriptors().at(i).get_core_for_dram_channel(chan, 0); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from DRAM core " << i << " " << core.x << "-" << core.y << " does not match what was broadcasted " << size; - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "LARGE_WRITE_TLB"); // Clear any written data + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from DRAM core " << i << " " << core.x << "-" << core.y + << " does not match what was broadcasted " << size; + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "LARGE_WRITE_TLB"); // Clear any written data readback_vec = {}; } } // Wait for data to be cleared before writing next block device.wait_for_non_mmio_flush(); } - device.close_device(); + device.close_device(); } diff --git a/tests/emulation/test_emulation_device.cpp b/tests/emulation/test_emulation_device.cpp index 8ff436ba2..b4136807d 100644 --- a/tests/emulation/test_emulation_device.cpp +++ b/tests/emulation/test_emulation_device.cpp @@ -3,10 +3,10 @@ * * SPDX-License-Identifier: Apache-2.0 */ -#include "gtest/gtest.h" -#include "device/tt_soc_descriptor.h" #include "device/cluster.h" #include "device/tt_emulation_device.h" +#include "device/tt_soc_descriptor.h" +#include "gtest/gtest.h" // DEPRECATED 
TEST SUITE !!! @@ -22,7 +22,7 @@ TEST(EmulationDeviceGS, BasicEmuTest) { uint64_t l1_addr = 0x1000; std::vector wdata(size); std::vector rdata(size); - + try { device.start_device(default_params); @@ -31,13 +31,23 @@ TEST(EmulationDeviceGS, BasicEmuTest) { } device.write_to_device(wdata.data(), wdata.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), l1_addr, "l1"); test_utils::read_data_from_device(device, rdata, tt_cxy_pair(0, core), l1_addr, size, "l1"); - ASSERT_EQ(wdata, rdata) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + ASSERT_EQ(wdata, rdata) << "Vector read back from core " << core.x << "-" << core.y + << "does not match what was written"; device.deassert_risc_reset(); - device.write_to_device(wdata.data(), wdata.size() * sizeof(std::uint32_t), tt_cxy_pair(0, tt_xy_pair(phys_x, phys_y)), l1_addr, "l1"); + device.write_to_device( + wdata.data(), + wdata.size() * sizeof(std::uint32_t), + tt_cxy_pair(0, tt_xy_pair(phys_x, phys_y)), + l1_addr, + "l1"); device.assert_risc_reset(); - device.write_to_device(wdata.data(), wdata.size() * sizeof(std::uint32_t), tt_cxy_pair(0, tt_xy_pair(phys_x, phys_y)), l1_addr, "l1"); - + device.write_to_device( + wdata.data(), + wdata.size() * sizeof(std::uint32_t), + tt_cxy_pair(0, tt_xy_pair(phys_x, phys_y)), + l1_addr, + "l1"); } catch (const std::exception &e) { std::cout << "Error: " << e.what() << std::endl; diff --git a/tests/galaxy/test_galaxy_common.cpp b/tests/galaxy/test_galaxy_common.cpp index 546c4c7f1..4cff57f15 100644 --- a/tests/galaxy/test_galaxy_common.cpp +++ b/tests/galaxy/test_galaxy_common.cpp @@ -10,9 +10,18 @@ void move_data( Cluster& device, tt_multichip_core_addr sender_core, tt_multichip_core_addr receiver_core, uint32_t size) { std::vector readback_vec = {}; test_utils::read_data_from_device( - device, readback_vec, tt_cxy_pair(sender_core.chip, sender_core.core), sender_core.addr, size, "SMALL_READ_WRITE_TLB"); + device, + readback_vec, + 
tt_cxy_pair(sender_core.chip, sender_core.core), + sender_core.addr, + size, + "SMALL_READ_WRITE_TLB"); device.write_to_device( - readback_vec.data(), readback_vec.size() * sizeof(std::uint32_t), tt_cxy_pair(receiver_core.chip, receiver_core.core), receiver_core.addr, "SMALL_READ_WRITE_TLB"); + readback_vec.data(), + readback_vec.size() * sizeof(std::uint32_t), + tt_cxy_pair(receiver_core.chip, receiver_core.core), + receiver_core.addr, + "SMALL_READ_WRITE_TLB"); device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited return; @@ -25,7 +34,12 @@ void broadcast_data( uint32_t size) { std::vector readback_vec = {}; test_utils::read_data_from_device( - device, readback_vec, tt_cxy_pair(sender_core.chip, sender_core.core), sender_core.addr, size, "SMALL_READ_WRITE_TLB"); + device, + readback_vec, + tt_cxy_pair(sender_core.chip, sender_core.core), + sender_core.addr, + size, + "SMALL_READ_WRITE_TLB"); for (const auto& receiver_core : receiver_cores) { device.write_to_device( readback_vec.data(), diff --git a/tests/galaxy/test_galaxy_common.h b/tests/galaxy/test_galaxy_common.h index 057719014..c9febe0a8 100644 --- a/tests/galaxy/test_galaxy_common.h +++ b/tests/galaxy/test_galaxy_common.h @@ -4,18 +4,16 @@ * SPDX-License-Identifier: Apache-2.0 */ - #pragma once #include +#include #include #include #include -#include #include "device/cluster.h" #include "device/tt_xy_pair.h" - #include "fmt/core.h" // static const std::string SOC_DESC_PATH = "./tests/soc_descs/wormhole_b0_8x10.yaml"; @@ -24,14 +22,14 @@ using namespace tt::umd; struct tt_multichip_core_addr { tt_multichip_core_addr() : core{}, chip{}, addr{} {} + tt_multichip_core_addr(chip_id_t chip, tt_xy_pair core, std::uint64_t addr) : core(core), chip(chip), addr(addr) {} tt_xy_pair core; chip_id_t chip; std::uint64_t addr; - std::string str() const { - return fmt::format("(chip={},x={},y={},addr=0x{:x})", chip, core.x, core.y, addr); - } + + std::string str() const { return 
fmt::format("(chip={},x={},y={},addr=0x{:x})", chip, core.x, core.y, addr); } }; // SIMPLE DATAMOVEMENT API BASED ON UMD diff --git a/tests/galaxy/test_umd_concurrent_threads.cpp b/tests/galaxy/test_umd_concurrent_threads.cpp index 3c9eb0922..a8b1c6f78 100644 --- a/tests/galaxy/test_umd_concurrent_threads.cpp +++ b/tests/galaxy/test_umd_concurrent_threads.cpp @@ -2,22 +2,21 @@ // // SPDX-License-Identifier: Apache-2.0 +#include #include #include -#include -#include "gtest/gtest.h" -#include "common/logger.hpp" -#include "tt_cluster_descriptor.h" #include "cluster.h" +#include "common/logger.hpp" #include "eth_interface.h" +#include "gtest/gtest.h" #include "host_mem_address_map.h" #include "l1_address_map.h" - #include "test_galaxy_common.h" -#include "tests/wormhole/test_wh_common.h" -#include "tests/test_utils/generate_cluster_desc.hpp" #include "tests/test_utils/device_test_utils.hpp" +#include "tests/test_utils/generate_cluster_desc.hpp" +#include "tests/wormhole/test_wh_common.h" +#include "tt_cluster_descriptor.h" static const std::string SOC_DESC_PATH = "tests/soc_descs/wormhole_b0_8x10.yaml"; @@ -52,7 +51,12 @@ TEST(GalaxyConcurrentThreads, WriteToAllChipsL1) { uint32_t num_host_mem_ch_per_mmio_device = 1; Cluster device = Cluster( - test_utils::GetAbsPath(SOC_DESC_PATH), cluster_desc_path, all_devices, num_host_mem_ch_per_mmio_device, false, true); + test_utils::GetAbsPath(SOC_DESC_PATH), + cluster_desc_path, + all_devices, + num_host_mem_ch_per_mmio_device, + false, + true); const auto sdesc_per_chip = device.get_virtual_soc_descriptors(); tt::umd::test::utils::set_params_for_remote_txn(device); @@ -70,7 +74,12 @@ TEST(GalaxyConcurrentThreads, WriteToAllChipsL1) { std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; for (const auto& chip : target_devices_th1) { for (auto& core : sdesc_per_chip.at(chip).workers) { - device.write_to_device(vector_to_write_th1.data(), vector_to_write_th1.size() * sizeof(std::uint32_t), tt_cxy_pair(chip, core), 
address, "SMALL_READ_WRITE_TLB"); + device.write_to_device( + vector_to_write_th1.data(), + vector_to_write_th1.size() * sizeof(std::uint32_t), + tt_cxy_pair(chip, core), + address, + "SMALL_READ_WRITE_TLB"); } } device.wait_for_non_mmio_flush(); @@ -91,7 +100,12 @@ TEST(GalaxyConcurrentThreads, WriteToAllChipsL1) { std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; for (const auto& chip : target_devices_th2) { for (auto& core : sdesc_per_chip.at(chip).workers) { - device.write_to_device(vector_to_write_th2.data(), vector_to_write_th2.size() * sizeof(std::uint32_t), tt_cxy_pair(chip, core), address, "SMALL_READ_WRITE_TLB"); + device.write_to_device( + vector_to_write_th2.data(), + vector_to_write_th2.size() * sizeof(std::uint32_t), + tt_cxy_pair(chip, core), + address, + "SMALL_READ_WRITE_TLB"); } } device.wait_for_non_mmio_flush(); @@ -140,7 +154,12 @@ TEST(GalaxyConcurrentThreads, WriteToAllChipsDram) { uint32_t num_host_mem_ch_per_mmio_device = 1; Cluster device = Cluster( - test_utils::GetAbsPath(SOC_DESC_PATH), cluster_desc_path, all_devices, num_host_mem_ch_per_mmio_device, false, true); + test_utils::GetAbsPath(SOC_DESC_PATH), + cluster_desc_path, + all_devices, + num_host_mem_ch_per_mmio_device, + false, + true); const auto sdesc_per_chip = device.get_virtual_soc_descriptors(); tt::umd::test::utils::set_params_for_remote_txn(device); @@ -162,7 +181,12 @@ TEST(GalaxyConcurrentThreads, WriteToAllChipsDram) { std::uint32_t address = 0x4000000; for (const auto& chip : target_devices_th1) { for (auto& core : dram_cores) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(chip, core), address, "SMALL_READ_WRITE_TLB"); + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(chip, core), + address, + "SMALL_READ_WRITE_TLB"); } } device.wait_for_non_mmio_flush(); @@ -182,7 +206,12 @@ TEST(GalaxyConcurrentThreads, WriteToAllChipsDram) 
{ std::uint32_t address = 0x5000000; for (const auto& chip : target_devices_th2) { for (auto& core : sdesc_per_chip.at(chip).workers) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(chip, core), address, "SMALL_READ_WRITE_TLB"); + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(chip, core), + address, + "SMALL_READ_WRITE_TLB"); } } device.wait_for_non_mmio_flush(); @@ -217,7 +246,12 @@ TEST(GalaxyConcurrentThreads, PushInputsWhileSignalingCluster) { uint32_t num_host_mem_ch_per_mmio_device = 1; Cluster device = Cluster( - test_utils::GetAbsPath(SOC_DESC_PATH), cluster_desc_path, target_devices, num_host_mem_ch_per_mmio_device, false, true); + test_utils::GetAbsPath(SOC_DESC_PATH), + cluster_desc_path, + target_devices, + num_host_mem_ch_per_mmio_device, + false, + true); const auto sdesc_per_chip = device.get_virtual_soc_descriptors(); tt::umd::test::utils::set_params_for_remote_txn(device); @@ -239,7 +273,12 @@ TEST(GalaxyConcurrentThreads, PushInputsWhileSignalingCluster) { chip_id_t mmio_chip = cluster_desc->get_chips_with_mmio().begin()->first; std::vector readback_vec = {}; std::uint32_t address = 0x0; - device.write_to_device(large_vector.data(), large_vector.size() * sizeof(std::uint32_t), tt_cxy_pair(mmio_chip, tt_xy_pair(0, 0)), address, "SMALL_READ_WRITE_TLB"); + device.write_to_device( + large_vector.data(), + large_vector.size() * sizeof(std::uint32_t), + tt_cxy_pair(mmio_chip, tt_xy_pair(0, 0)), + address, + "SMALL_READ_WRITE_TLB"); test_utils::read_data_from_device( device, readback_vec, @@ -257,14 +296,24 @@ TEST(GalaxyConcurrentThreads, PushInputsWhileSignalingCluster) { std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; for (const auto& chip : target_devices) { for (auto& core : sdesc_per_chip.at(chip).workers) { - device.write_to_device(small_vector.data(), small_vector.size() * sizeof(std::uint32_t), 
tt_cxy_pair(chip, core), address, "SMALL_READ_WRITE_TLB"); + device.write_to_device( + small_vector.data(), + small_vector.size() * sizeof(std::uint32_t), + tt_cxy_pair(chip, core), + address, + "SMALL_READ_WRITE_TLB"); } } device.wait_for_non_mmio_flush(); for (const auto& chip : target_devices) { for (auto& core : sdesc_per_chip.at(chip).workers) { test_utils::read_data_from_device( - device, readback_vec, tt_cxy_pair(chip, core), address, small_vector.size() * 4, "SMALL_READ_WRITE_TLB"); + device, + readback_vec, + tt_cxy_pair(chip, core), + address, + small_vector.size() * 4, + "SMALL_READ_WRITE_TLB"); EXPECT_EQ(small_vector, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; readback_vec = {}; diff --git a/tests/galaxy/test_umd_remote_api.cpp b/tests/galaxy/test_umd_remote_api.cpp index 4e8130b5a..8904ec6f4 100644 --- a/tests/galaxy/test_umd_remote_api.cpp +++ b/tests/galaxy/test_umd_remote_api.cpp @@ -2,21 +2,20 @@ // // SPDX-License-Identifier: Apache-2.0 -#include #include +#include -#include "gtest/gtest.h" -#include "common/logger.hpp" -#include "tt_cluster_descriptor.h" #include "cluster.h" +#include "common/logger.hpp" #include "eth_interface.h" +#include "gtest/gtest.h" #include "host_mem_address_map.h" #include "l1_address_map.h" - #include "test_galaxy_common.h" -#include "tests/wormhole/test_wh_common.h" -#include "tests/test_utils/generate_cluster_desc.hpp" #include "tests/test_utils/device_test_utils.hpp" +#include "tests/test_utils/generate_cluster_desc.hpp" +#include "tests/wormhole/test_wh_common.h" +#include "tt_cluster_descriptor.h" static const std::string SOC_DESC_PATH = "tests/soc_descs/wormhole_b0_8x10.yaml"; @@ -32,7 +31,12 @@ void run_remote_read_write_test(uint32_t vector_size, bool dram_write) { uint32_t num_host_mem_ch_per_mmio_device = 1; Cluster device = Cluster( - test_utils::GetAbsPath(SOC_DESC_PATH), cluster_desc_path, target_devices, num_host_mem_ch_per_mmio_device, 
false, true); + test_utils::GetAbsPath(SOC_DESC_PATH), + cluster_desc_path, + target_devices, + num_host_mem_ch_per_mmio_device, + false, + true); const auto sdesc_per_chip = device.get_virtual_soc_descriptors(); tt::umd::test::utils::set_params_for_remote_txn(device); @@ -64,7 +68,12 @@ void run_remote_read_write_test(uint32_t vector_size, bool dram_write) { for (const auto& core : target_cores) { tt_cxy_pair target_core = tt_cxy_pair(chip, core); auto start = std::chrono::high_resolution_clock::now(); - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), target_core, address, "SMALL_READ_WRITE_TLB"); + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + target_core, + address, + "SMALL_READ_WRITE_TLB"); device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited auto end = std::chrono::high_resolution_clock::now(); auto duration = double(std::chrono::duration_cast(end - start).count()); @@ -72,7 +81,8 @@ void run_remote_read_write_test(uint32_t vector_size, bool dram_write) { // std::cout << " chip " << chip << " core " << target_core.str() << " " << duration << std::endl; start = std::chrono::high_resolution_clock::now(); - test_utils::read_data_from_device(device, readback_vec, target_core, address, write_size, "SMALL_READ_WRITE_TLB"); + test_utils::read_data_from_device( + device, readback_vec, target_core, address, write_size, "SMALL_READ_WRITE_TLB"); end = std::chrono::high_resolution_clock::now(); duration = double(std::chrono::duration_cast(end - start).count()); // std::cout << " read chip " << chip << " core " << target_core.str()<< " " << duration << std::endl; @@ -145,7 +155,12 @@ void run_data_mover_test( uint32_t num_host_mem_ch_per_mmio_device = 1; Cluster device = Cluster( - test_utils::GetAbsPath(SOC_DESC_PATH), cluster_desc_path, target_devices, num_host_mem_ch_per_mmio_device, false, true); + 
test_utils::GetAbsPath(SOC_DESC_PATH), + cluster_desc_path, + target_devices, + num_host_mem_ch_per_mmio_device, + false, + true); tt::umd::test::utils::set_params_for_remote_txn(device); @@ -162,7 +177,11 @@ void run_data_mover_test( std::vector send_bw; // Set up data in sender core device.write_to_device( - vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(sender_core.chip, sender_core.core), sender_core.addr, "SMALL_READ_WRITE_TLB"); + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(sender_core.chip, sender_core.core), + sender_core.addr, + "SMALL_READ_WRITE_TLB"); device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited // Send data from sender core to receiver core @@ -261,7 +280,12 @@ void run_data_broadcast_test( uint32_t num_host_mem_ch_per_mmio_device = 1; Cluster device = Cluster( - test_utils::GetAbsPath(SOC_DESC_PATH), cluster_desc_path, target_devices, num_host_mem_ch_per_mmio_device, false, true); + test_utils::GetAbsPath(SOC_DESC_PATH), + cluster_desc_path, + target_devices, + num_host_mem_ch_per_mmio_device, + false, + true); tt::umd::test::utils::set_params_for_remote_txn(device); @@ -278,7 +302,11 @@ void run_data_broadcast_test( std::vector send_bw; // Set up data in sender core device.write_to_device( - vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(sender_core.chip, sender_core.core), sender_core.addr, "SMALL_READ_WRITE_TLB"); + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(sender_core.chip, sender_core.core), + sender_core.addr, + "SMALL_READ_WRITE_TLB"); device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited // Send data from sender core to receiver core diff --git a/tests/galaxy/test_umd_remote_api_stability.cpp b/tests/galaxy/test_umd_remote_api_stability.cpp index a91dccce8..af2cc5fb4 100644 --- 
a/tests/galaxy/test_umd_remote_api_stability.cpp +++ b/tests/galaxy/test_umd_remote_api_stability.cpp @@ -7,173 +7,167 @@ #include #include -#include "tt_cluster_descriptor.h" #include "cluster.h" - #include "common/logger.hpp" #include "eth_interface.h" #include "filesystem" #include "gtest/gtest.h" #include "host_mem_address_map.h" #include "l1_address_map.h" -#include "tt_soc_descriptor.h" - -#include "tests/test_utils/stimulus_generators.hpp" -#include "tests/test_utils/generate_cluster_desc.hpp" #include "tests/galaxy/test_galaxy_common.h" +#include "tests/test_utils/generate_cluster_desc.hpp" +#include "tests/test_utils/stimulus_generators.hpp" #include "tests/wormhole/test_wh_common.h" +#include "tt_cluster_descriptor.h" +#include "tt_soc_descriptor.h" namespace tt::umd::test::utils { - class WormholeGalaxyStabilityTestFixture : public WormholeTestFixture { - private: - static int detected_num_chips; - static bool skip_tests; - - protected: - - static constexpr int EXPECTED_MIN_CHIPS = 32; - static uint32_t scale_number_of_tests; - - static void SetUpTestSuite() { - std::unique_ptr cluster_desc = tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); - detected_num_chips = cluster_desc->get_number_of_chips(); - if (detected_num_chips < EXPECTED_MIN_CHIPS) { - skip_tests = true; +private: + static int detected_num_chips; + static bool skip_tests; + +protected: + static constexpr int EXPECTED_MIN_CHIPS = 32; + static uint32_t scale_number_of_tests; + + static void SetUpTestSuite() { + std::unique_ptr cluster_desc = + tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); + detected_num_chips = cluster_desc->get_number_of_chips(); + if (detected_num_chips < EXPECTED_MIN_CHIPS) { + skip_tests = true; + } + if (char const* scale_number_of_tests_env = std::getenv("SCALE_NUMBER_OF_TESTS")) { + scale_number_of_tests = std::atoi(scale_number_of_tests_env); + } } - if(char const* 
scale_number_of_tests_env = std::getenv("SCALE_NUMBER_OF_TESTS")) { - scale_number_of_tests = std::atoi(scale_number_of_tests_env); - } - } - - virtual int get_detected_num_chips() { - return detected_num_chips; - } - virtual bool is_test_skipped() { - return skip_tests; - } + virtual int get_detected_num_chips() { return detected_num_chips; } + virtual bool is_test_skipped() { return skip_tests; } }; - int WormholeGalaxyStabilityTestFixture::detected_num_chips = -1; bool WormholeGalaxyStabilityTestFixture::skip_tests = false; uint32_t WormholeGalaxyStabilityTestFixture::scale_number_of_tests = 1; - TEST_F(WormholeGalaxyStabilityTestFixture, MixedRemoteTransfers) { int seed = 0; - + assert(device != nullptr); - log_info(LogSiliconDriver,"Started MixedRemoteTransfers"); + log_info(LogSiliconDriver, "Started MixedRemoteTransfers"); std::vector command_history; try { RunMixedTransfersUniformDistributions( - *this->device, + *this->device, 100000 * scale_number_of_tests, seed, - transfer_type_weights_t{.write = 0.40, .read = 0.4}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 30000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 30000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 30000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - &command_history - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 30000), + // Set to true if you want to emit the 
command history code to command line + false, + &command_history); } catch (...) { print_command_history_executable_code(command_history); } - } TEST_F(WormholeGalaxyStabilityTestFixture, DISABLED_MultithreadedMixedRemoteTransfersMediumSmall) { int seed = 0; - log_info(LogSiliconDriver,"Started MultithreadedMixedRemoteTransfersMediumSmall"); + log_info(LogSiliconDriver, "Started MultithreadedMixedRemoteTransfersMediumSmall"); assert(device != nullptr); - std::thread t1([&](){ + std::thread t1([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 50000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0.50, .read = 0.50}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 30000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 30000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 30000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - nullptr - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 30000), + // Set to true if you want to emit the command history code to command line + false, + nullptr); }); - std::thread t2([&](){ + std::thread t2([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 50000 * scale_number_of_tests, 100, - transfer_type_weights_t{.write = 0.25, .read = 0.50}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 30000), 
//WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 30000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 30000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - nullptr - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + // Set to true if you want to emit the command history code to command line + std::uniform_int_distribution(0x4, 30000), + false, + nullptr); }); - std::thread t3([&](){ + std::thread t3([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 50000 * scale_number_of_tests, 23, - transfer_type_weights_t{.write = 0.5, .read = 0.25}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 30000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 30000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 30000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - nullptr - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + // Set to true if you want to emit the command history code to 
command line + std::uniform_int_distribution(0x4, 30000), + false, + nullptr); }); - std::thread t4([&](){ + std::thread t4([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 100000 * scale_number_of_tests, 99, - transfer_type_weights_t{.write = 0.1, .read = 0.1}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - nullptr - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // Set to true if you want to emit the command history code to command line + false, + nullptr); }); t1.join(); @@ -182,4 +176,4 @@ TEST_F(WormholeGalaxyStabilityTestFixture, DISABLED_MultithreadedMixedRemoteTran t4.join(); } -} // namespace tt::umd::test::utils +} // namespace tt::umd::test::utils diff --git a/tests/grayskull/test_silicon_driver.cpp b/tests/grayskull/test_silicon_driver.cpp index c61a3a2ef..4d90b9310 100644 --- a/tests/grayskull/test_silicon_driver.cpp +++ b/tests/grayskull/test_silicon_driver.cpp @@ -4,14 +4,14 @@ #include -#include "gtest/gtest.h" #include "cluster.h" -#include "device/tt_soc_descriptor.h" #include "device/tt_cluster_descriptor.h" +#include "device/tt_soc_descriptor.h" #include "device/wormhole/wormhole_implementation.h" +#include "gtest/gtest.h" #include 
"l1_address_map.h" -#include "tests/test_utils/generate_cluster_desc.hpp" #include "tests/test_utils/device_test_utils.hpp" +#include "tests/test_utils/generate_cluster_desc.hpp" using namespace tt::umd; @@ -19,7 +19,7 @@ TEST(SiliconDriverGS, CreateDestroySequential) { std::set target_devices = {0}; uint32_t num_host_mem_ch_per_mmio_device = 1; tt_device_params default_params; - for(int i = 0; i < 100; i++) { + for (int i = 0; i < 100; i++) { Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true); device.start_device(default_params); device.deassert_risc_reset(); @@ -33,13 +33,13 @@ TEST(SiliconDriverGS, CreateMultipleInstance) { tt_device_params default_params; default_params.init_device = false; std::unordered_map concurrent_devices = {}; - for(int i = 0; i < 100; i++) { + for (int i = 0; i < 100; i++) { concurrent_devices.insert({i, new Cluster(num_host_mem_ch_per_mmio_device, false, true)}); - concurrent_devices.at(i) -> start_device(default_params); + concurrent_devices.at(i)->start_device(default_params); } - for(auto& device : concurrent_devices) { - device.second -> close_device(); + for (auto& device : concurrent_devices) { + device.second->close_device(); delete device.second; } } @@ -52,11 +52,15 @@ TEST(SiliconDriverGS, Harvesting) { auto sdesc_per_chip = device.get_virtual_soc_descriptors(); ASSERT_EQ(device.using_harvested_soc_descriptors(), true) << "Expected Driver to have performed harvesting"; - for(const auto& chip : sdesc_per_chip) { - ASSERT_LE(chip.second.workers.size(), 96) << "Expected SOC descriptor with harvesting to have less than or equal to 96 workers for chip " << chip.first; + for (const auto& chip : sdesc_per_chip) { + ASSERT_LE(chip.second.workers.size(), 96) + << "Expected SOC descriptor with harvesting to have less than or equal to 96 workers for chip " + << chip.first; } - ASSERT_EQ(device.get_harvesting_masks_for_soc_descriptors().at(0) & simulated_harvesting_masks[0], 6) << "Expected first chip to include 
simulated harvesting mask of 6"; - // ASSERT_EQ(device.get_harvesting_masks_for_soc_descriptors().at(1), 12) << "Expected second chip to have harvesting mask of 12"; + ASSERT_EQ(device.get_harvesting_masks_for_soc_descriptors().at(0) & simulated_harvesting_masks[0], 6) + << "Expected first chip to include simulated harvesting mask of 6"; + // ASSERT_EQ(device.get_harvesting_masks_for_soc_descriptors().at(1), 12) << "Expected second chip to have + // harvesting mask of 12"; device.close_device(); } @@ -65,16 +69,25 @@ TEST(SiliconDriverGS, CustomSocDesc) { std::unordered_map simulated_harvesting_masks = {{0, 6}, {1, 12}}; uint32_t num_host_mem_ch_per_mmio_device = 1; // Initialize the driver with a 1x1 descriptor and explictly do not perform harvesting - Cluster device = Cluster(test_utils::GetAbsPath("./tests/soc_descs/grayskull_1x1_arch.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, false, simulated_harvesting_masks); + Cluster device = Cluster( + test_utils::GetAbsPath("./tests/soc_descs/grayskull_1x1_arch.yaml"), + tt_ClusterDescriptor::get_cluster_descriptor_file_path(), + target_devices, + num_host_mem_ch_per_mmio_device, + false, + true, + false, + simulated_harvesting_masks); auto sdesc_per_chip = device.get_virtual_soc_descriptors(); - ASSERT_EQ(device.using_harvested_soc_descriptors(), false) << "SOC descriptors should not be modified when harvesting is disabled"; - for(const auto& chip : sdesc_per_chip) { + ASSERT_EQ(device.using_harvested_soc_descriptors(), false) + << "SOC descriptors should not be modified when harvesting is disabled"; + for (const auto& chip : sdesc_per_chip) { ASSERT_EQ(chip.second.workers.size(), 1) << "Expected 1x1 SOC descriptor to be unmodified by driver"; } } TEST(SiliconDriverGS, HarvestingRuntime) { - auto get_static_tlb_index = [] (tt_xy_pair target) { + auto get_static_tlb_index = [](tt_xy_pair target) { int flat_index = target.y * 
tt::umd::wormhole::GRID_SIZE_X + target.x; if (flat_index == 0) { return -1; @@ -87,10 +100,10 @@ TEST(SiliconDriverGS, HarvestingRuntime) { uint32_t num_host_mem_ch_per_mmio_device = 1; Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true, simulated_harvesting_masks); - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { // Iterate over devices and only setup static TLBs for functional worker cores auto& sdesc = device.get_virtual_soc_descriptors().at(i); - for(auto& core : sdesc.workers) { + for (auto& core : sdesc.workers) { // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. device.configure_tlb(i, core, get_static_tlb_index(core), l1_mem::address_map::DATA_BUFFER_SPACE_BASE); } @@ -108,29 +121,59 @@ TEST(SiliconDriverGS, HarvestingRuntime) { std::vector zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; float timeout_in_seconds = 10; // Check functionality of Static TLBs by reading adn writing from statically mapped address space - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { std::uint32_t address = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; std::uint32_t dynamic_write_address = 0x30000000; - for(int loop = 0; loop < 100; loop++){ // Write to each core a 100 times at different statically mapped addresses - for(auto& core : device.get_virtual_soc_descriptors().at(i).workers) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, ""); - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), dynamic_write_address, "SMALL_READ_WRITE_TLB"); + for (int loop = 0; loop < 100; + loop++) { // Write to each core a 100 times at different statically mapped addresses + for (auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + device.write_to_device( + 
vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + ""); + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + dynamic_write_address, + "SMALL_READ_WRITE_TLB"); auto start_time = std::chrono::high_resolution_clock::now(); - while(!(vector_to_write == readback_vec)) { - float wait_duration = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start_time).count(); - if(wait_duration > timeout_in_seconds) { + while (!(vector_to_write == readback_vec)) { + float wait_duration = std::chrono::duration_cast( + std::chrono::high_resolution_clock::now() - start_time) + .count(); + if (wait_duration > timeout_in_seconds) { break; } test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, 40, ""); - test_utils::read_data_from_device(device, dynamic_readback_vec, tt_cxy_pair(i, core), dynamic_write_address, 40, "SMALL_READ_WRITE_TLB"); + test_utils::read_data_from_device( + device, + dynamic_readback_vec, + tt_cxy_pair(i, core), + dynamic_write_address, + 40, + "SMALL_READ_WRITE_TLB"); } - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "SMALL_READ_WRITE_TLB"); // Clear any written data - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), dynamic_write_address, "SMALL_READ_WRITE_TLB"); // Clear any written data + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "SMALL_READ_WRITE_TLB"); // Clear any written data + device.write_to_device( + 
zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + dynamic_write_address, + "SMALL_READ_WRITE_TLB"); // Clear any written data readback_vec = {}; dynamic_readback_vec = {}; } - address += 0x20; // Increment by uint32_t size for each write + address += 0x20; // Increment by uint32_t size for each write dynamic_write_address += 0x20; } } @@ -138,7 +181,7 @@ TEST(SiliconDriverGS, HarvestingRuntime) { } TEST(SiliconDriverGS, StaticTLB_RW) { - auto get_static_tlb_index = [] (tt_xy_pair target) { + auto get_static_tlb_index = [](tt_xy_pair target) { int flat_index = target.y * tt::umd::wormhole::GRID_SIZE_X + target.x; if (flat_index == 0) { return -1; @@ -149,12 +192,13 @@ TEST(SiliconDriverGS, StaticTLB_RW) { uint32_t num_host_mem_ch_per_mmio_device = 1; Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true); - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { // Iterate over devices and only setup static TLBs for worker cores auto& sdesc = device.get_virtual_soc_descriptors().at(i); - for(auto& core : sdesc.workers) { + for (auto& core : sdesc.workers) { // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. 
- device.configure_tlb(i, core, get_static_tlb_index(core), l1_mem::address_map::DATA_BUFFER_SPACE_BASE, TLB_DATA::Posted); + device.configure_tlb( + i, core, get_static_tlb_index(core), l1_mem::address_map::DATA_BUFFER_SPACE_BASE, TLB_DATA::Posted); } device.setup_core_to_tlb_map(i, get_static_tlb_index); } @@ -168,36 +212,52 @@ TEST(SiliconDriverGS, StaticTLB_RW) { std::vector zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; float timeout_in_seconds = 10; // Check functionality of Static TLBs by reading adn writing from statically mapped address space - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { std::uint32_t address = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; - for(int loop = 0; loop < 100; loop++){ // Write to each core a 100 times at different statically mapped addresses - for(auto& core : device.get_virtual_soc_descriptors().at(i).workers) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, ""); + for (int loop = 0; loop < 100; + loop++) { // Write to each core a 100 times at different statically mapped addresses + for (auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + ""); auto start_time = std::chrono::high_resolution_clock::now(); - while(!(vector_to_write == readback_vec)) { - float wait_duration = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start_time).count(); - if(wait_duration > timeout_in_seconds) { + while (!(vector_to_write == readback_vec)) { + float wait_duration = std::chrono::duration_cast( + std::chrono::high_resolution_clock::now() - start_time) + .count(); + if (wait_duration > timeout_in_seconds) { break; } test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, 40, ""); } - ASSERT_EQ(vector_to_write, 
readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "SMALL_READ_WRITE_TLB"); // Clear any written data + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "SMALL_READ_WRITE_TLB"); // Clear any written data readback_vec = {}; } - address += 0x20; // Increment by uint32_t size for each write + address += 0x20; // Increment by uint32_t size for each write } } device.close_device(); } TEST(SiliconDriverGS, DynamicTLB_RW) { - // Don't use any static TLBs in this test. All writes go through a dynamic TLB that needs to be reconfigured for each transaction + // Don't use any static TLBs in this test. All writes go through a dynamic TLB that needs to be reconfigured for + // each transaction std::set target_devices = {0}; uint32_t num_host_mem_ch_per_mmio_device = 1; Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true); - device.set_fallback_tlb_ordering_mode("SMALL_READ_WRITE_TLB", TLB_DATA::Posted); // Explicitly test API to set fallback tlb ordering mode + device.set_fallback_tlb_ordering_mode( + "SMALL_READ_WRITE_TLB", TLB_DATA::Posted); // Explicitly test API to set fallback tlb ordering mode tt_device_params default_params; device.start_device(default_params); device.deassert_risc_reset(); @@ -207,25 +267,40 @@ TEST(SiliconDriverGS, DynamicTLB_RW) { std::vector readback_vec = {}; float timeout_in_seconds = 10; - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { std::uint32_t address = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; - for(int loop = 0; loop < 100; loop++){ // Write to each core a 100 times at different statically 
mapped addresses - for(auto& core : device.get_virtual_soc_descriptors().at(i).workers) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "SMALL_READ_WRITE_TLB"); + for (int loop = 0; loop < 100; + loop++) { // Write to each core a 100 times at different statically mapped addresses + for (auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "SMALL_READ_WRITE_TLB"); auto start_time = std::chrono::high_resolution_clock::now(); - while(!(vector_to_write == readback_vec)) { - float wait_duration = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start_time).count(); - if(wait_duration > timeout_in_seconds) { + while (!(vector_to_write == readback_vec)) { + float wait_duration = std::chrono::duration_cast( + std::chrono::high_resolution_clock::now() - start_time) + .count(); + if (wait_duration > timeout_in_seconds) { break; } - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, 40, "SMALL_READ_WRITE_TLB"); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, core), address, 40, "SMALL_READ_WRITE_TLB"); } - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "SMALL_READ_WRITE_TLB"); // Clear any written data + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "SMALL_READ_WRITE_TLB"); // Clear any written data readback_vec = {}; } - address += 0x20; // Increment by 
uint32_t size for each write + address += 0x20; // Increment by uint32_t size for each write } } device.close_device(); @@ -239,7 +314,7 @@ TEST(SiliconDriverGS, MultiThreadedDevice) { uint32_t num_host_mem_ch_per_mmio_device = 1; Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true); - + tt_device_params default_params; device.start_device(default_params); device.deassert_risc_reset(); @@ -249,18 +324,27 @@ TEST(SiliconDriverGS, MultiThreadedDevice) { std::vector readback_vec = {}; float timeout_in_seconds = 10; std::uint32_t address = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; - for(int loop = 0; loop < 100; loop++) { - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, "SMALL_READ_WRITE_TLB"); + for (int loop = 0; loop < 100; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(0, core), + address, + "SMALL_READ_WRITE_TLB"); auto start_time = std::chrono::high_resolution_clock::now(); - while(!(vector_to_write == readback_vec)) { - float wait_duration = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start_time).count(); - if(wait_duration > timeout_in_seconds) { + while (!(vector_to_write == readback_vec)) { + float wait_duration = std::chrono::duration_cast( + std::chrono::high_resolution_clock::now() - start_time) + .count(); + if (wait_duration > timeout_in_seconds) { break; } - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); } - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not 
match what was written"; + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; readback_vec = {}; } address += 0x20; @@ -272,19 +356,28 @@ TEST(SiliconDriverGS, MultiThreadedDevice) { std::vector readback_vec = {}; float timeout_in_seconds = 10; std::uint32_t address = 0x30000000; - for(auto& core_ls : device.get_virtual_soc_descriptors().at(0).dram_cores) { - for(int loop = 0; loop < 100; loop++) { - for(auto& core : core_ls) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, "SMALL_READ_WRITE_TLB"); + for (auto& core_ls : device.get_virtual_soc_descriptors().at(0).dram_cores) { + for (int loop = 0; loop < 100; loop++) { + for (auto& core : core_ls) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(0, core), + address, + "SMALL_READ_WRITE_TLB"); auto start_time = std::chrono::high_resolution_clock::now(); - while(!(vector_to_write == readback_vec)) { - float wait_duration = std::chrono::duration_cast(std::chrono::high_resolution_clock::now() - start_time).count(); - if(wait_duration > timeout_in_seconds) { + while (!(vector_to_write == readback_vec)) { + float wait_duration = std::chrono::duration_cast( + std::chrono::high_resolution_clock::now() - start_time) + .count(); + if (wait_duration > timeout_in_seconds) { break; + } + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); } - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); - } - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y + << "does not match what 
was written"; readback_vec = {}; } address += 0x20; @@ -297,14 +390,14 @@ TEST(SiliconDriverGS, MultiThreadedDevice) { device.close_device(); } -TEST(SiliconDriverGS, MultiThreadedMemBar) { // this tests takes ~5 mins to run - // Have 2 threads read and write from a single device concurrently - // All (fairly large) transactions go through a static TLB. - // We want to make sure the memory barrier is thread/process safe. +TEST(SiliconDriverGS, MultiThreadedMemBar) { // this tests takes ~5 mins to run + // Have 2 threads read and write from a single device concurrently + // All (fairly large) transactions go through a static TLB. + // We want to make sure the memory barrier is thread/process safe. // Memory barrier flags get sent to address 0 for all channels in this test - auto get_static_tlb_index = [] (tt_xy_pair target) { + auto get_static_tlb_index = [](tt_xy_pair target) { int flat_index = target.y * tt::umd::wormhole::GRID_SIZE_X + target.x; if (flat_index == 0) { return -1; @@ -317,11 +410,11 @@ TEST(SiliconDriverGS, MultiThreadedMemBar) { // this tests takes ~5 mins to run uint32_t num_host_mem_ch_per_mmio_device = 1; Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true); - - for(int i = 0; i < target_devices.size(); i++) { + + for (int i = 0; i < target_devices.size(); i++) { // Iterate over devices and only setup static TLBs for functional worker cores auto& sdesc = device.get_virtual_soc_descriptors().at(i); - for(auto& core : sdesc.workers) { + for (auto& core : sdesc.workers) { // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. 
device.configure_tlb(i, core, get_static_tlb_index(core), base_addr); } @@ -332,22 +425,28 @@ TEST(SiliconDriverGS, MultiThreadedMemBar) { // this tests takes ~5 mins to run device.start_device(default_params); device.deassert_risc_reset(); std::vector readback_membar_vec = {}; - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { - test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all workers + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + test_utils::read_data_from_device( + device, readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all workers readback_membar_vec = {}; } - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { - test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all workers + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + test_utils::read_data_from_device( + device, readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all workers readback_membar_vec = {}; } - for(int chan = 0; chan < device.get_virtual_soc_descriptors().at(0).get_num_dram_channels(); chan++) { + for (int chan = 0; chan < device.get_virtual_soc_descriptors().at(0).get_num_dram_channels(); chan++) { auto core = device.get_virtual_soc_descriptors().at(0).get_core_for_dram_channel(chan, 0); - test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), 0, 4, 
"SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all DRAM + test_utils::read_data_from_device( + device, readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all DRAM readback_membar_vec = {}; } // Launch 2 thread accessing different locations of L1 and using memory barrier between write and read @@ -356,23 +455,26 @@ TEST(SiliconDriverGS, MultiThreadedMemBar) { // this tests takes ~5 mins to run std::vector vec2(25600); std::vector zeros(25600, 0); - for(int i = 0; i < vec1.size(); i++) { + for (int i = 0; i < vec1.size(); i++) { vec1.at(i) = i; } - for(int i = 0; i < vec2.size(); i++) { + for (int i = 0; i < vec2.size(); i++) { vec2.at(i) = vec1.size() + i; } std::thread th1 = std::thread([&] { std::uint32_t address = base_addr; - for(int loop = 0; loop < 100; loop++) { - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + for (int loop = 0; loop < 100; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { std::vector readback_vec = {}; - device.write_to_device(vec1.data(), vec1.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); + device.write_to_device( + vec1.data(), vec1.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); device.l1_membar(0, "", {core}); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 4*vec1.size(), ""); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(0, core), address, 4 * vec1.size(), ""); ASSERT_EQ(readback_vec, vec1); - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); + device.write_to_device( + zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); readback_vec = {}; } } @@ -380,14 +482,17 
@@ TEST(SiliconDriverGS, MultiThreadedMemBar) { // this tests takes ~5 mins to run std::thread th2 = std::thread([&] { std::uint32_t address = base_addr + vec1.size() * 4; - for(int loop = 0; loop < 100; loop++) { - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + for (int loop = 0; loop < 100; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { std::vector readback_vec = {}; - device.write_to_device(vec2.data(), vec2.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); + device.write_to_device( + vec2.data(), vec2.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); device.l1_membar(0, "", {core}); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 4*vec2.size(), ""); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(0, core), address, 4 * vec2.size(), ""); ASSERT_EQ(readback_vec, vec2); - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, "") ; + device.write_to_device( + zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); readback_vec = {}; } } @@ -396,9 +501,10 @@ TEST(SiliconDriverGS, MultiThreadedMemBar) { // this tests takes ~5 mins to run th1.join(); th2.join(); - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { - test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers end up in correct sate workers + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + test_utils::read_data_from_device( + device, readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers end up in correct state workers readback_membar_vec = {}; } @@ -409,14 +515,14 @@ TEST(SiliconDriverGS,
MultiThreadedMemBar) { // this tests takes ~5 mins to run * Copied from Wormhole unit tests. */ TEST(SiliconDriverGS, SysmemTestWithPcie) { - Cluster cluster(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), - "", // test_utils::GetClusterDescYAML(), - {0}, - 1, // one "host memory channel", currently a 1G huge page - false, // skip driver allocs - no (don't skip) - true, // clean system resources - yes - true); // perform harvesting - yes - + Cluster cluster( + test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), + "", // test_utils::GetClusterDescYAML(), + {0}, + 1, // one "host memory channel", currently a 1G huge page + false, // skip driver allocs - no (don't skip) + true, // clean system resources - yes + true); // perform harvesting - yes cluster.start_device(tt_device_params{}); // no special parameters @@ -432,7 +538,7 @@ TEST(SiliconDriverGS, SysmemTestWithPcie) { // Bad API: how big is the buffer? How do we know it's big enough? // Situation today is that there's a 1G hugepage behind it, although this is // unclear from the API and may change in the future. 
- uint8_t *sysmem = (uint8_t*)cluster.host_dma_address(0, 0, 0); + uint8_t* sysmem = (uint8_t*)cluster.host_dma_address(0, 0, 0); ASSERT_NE(sysmem, nullptr); uint64_t base_address = cluster.get_pcie_base_addr_from_device(mmio_chip_id); diff --git a/tests/microbenchmark/device_fixture.hpp b/tests/microbenchmark/device_fixture.hpp index c53d5f234..b4b744b81 100644 --- a/tests/microbenchmark/device_fixture.hpp +++ b/tests/microbenchmark/device_fixture.hpp @@ -2,26 +2,27 @@ // // SPDX-License-Identifier: Apache-2.0 -#include -#include +#include + #include +#include +#include #include -#include #include "cluster.h" -#include "l1_address_map.h" #include "device/tt_soc_descriptor.h" +#include "l1_address_map.h" #include "tests/test_utils/generate_cluster_desc.hpp" using tt::umd::Cluster; class uBenchmarkFixture : public ::testing::Test { - protected: +protected: void SetUp() override { // get arch name? results_csv.open("ubench_results.csv", std::ios_base::app); - auto get_static_tlb_index = [] (tt_xy_pair target) { + auto get_static_tlb_index = [](tt_xy_pair target) { int flat_index = target.y * 10 + target.x; // grid_size_x = 10 for GS/WH ????? 
something is wrong here if (flat_index == 0) { return -1; @@ -30,12 +31,18 @@ class uBenchmarkFixture : public ::testing::Test { }; std::set target_devices = {0}; uint32_t num_host_mem_ch_per_mmio_device = 1; - device = std::make_shared(test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), "", target_devices, num_host_mem_ch_per_mmio_device, false, true); - - for(int i = 0; i < target_devices.size(); i++) { + device = std::make_shared( + test_utils::GetAbsPath("tests/soc_descs/grayskull_10x12.yaml"), + "", + target_devices, + num_host_mem_ch_per_mmio_device, + false, + true); + + for (int i = 0; i < target_devices.size(); i++) { // Iterate over devices and only setup static TLBs for functional worker cores auto& sdesc = device->get_virtual_soc_descriptors().at(i); - for(auto& core : sdesc.workers) { + for (auto& core : sdesc.workers) { // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. device->configure_tlb(i, core, get_static_tlb_index(core), l1_mem::address_map::DATA_BUFFER_SPACE_BASE); } diff --git a/tests/microbenchmark/test_rw_tensix.cpp b/tests/microbenchmark/test_rw_tensix.cpp index 274e17a70..9d1973b23 100644 --- a/tests/microbenchmark/test_rw_tensix.cpp +++ b/tests/microbenchmark/test_rw_tensix.cpp @@ -6,11 +6,11 @@ #include -#include "nanobench.h" #include "device_fixture.hpp" +#include "nanobench.h" #include "tests/test_utils/device_test_utils.hpp" -std::uint32_t generate_random_address(std::uint32_t max, std::uint32_t min=0) { +std::uint32_t generate_random_address(std::uint32_t max, std::uint32_t min = 0) { ankerl::nanobench::Rng gen(80085); std::uniform_int_distribution<> dis(min, max); // between 0 and 1MB return dis(gen); @@ -19,81 +19,119 @@ std::uint32_t generate_random_address(std::uint32_t max, std::uint32_t min=0) { TEST_F(uBenchmarkFixture, WriteAllCores32Bytes) { std::vector vector_to_write = {0, 1, 2, 3, 4, 5, 6, 7}; std::uint64_t address = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; - 
std::uint64_t bad_address = 0x30000000; // this address is not mapped, should trigger fallback write/read path + std::uint64_t bad_address = 0x30000000; // this address is not mapped, should trigger fallback write/read path ankerl::nanobench::Bench bench_static; ankerl::nanobench::Bench bench_dynamic; - for(auto& core : device->get_virtual_soc_descriptors().at(0).workers) { + for (auto& core : device->get_virtual_soc_descriptors().at(0).workers) { std::stringstream wname; wname << "Write to device core (" << core.x << ", " << core.y << ")"; // Write 32 bytes through static tlbs - bench_static.title("Write 32 bytes").unit("writes").minEpochIterations(50).output(nullptr).run(wname.str(), [&] { - device->write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, "SMALL_READ_WRITE_TLB"); - }); + bench_static.title("Write 32 bytes") + .unit("writes") + .minEpochIterations(50) + .output(nullptr) + .run(wname.str(), [&] { + device->write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(0, core), + address, + "SMALL_READ_WRITE_TLB"); + }); // Write through "fallback/dynamic" tlb - bench_dynamic.title("Write 32 bytes fallback").unit("writes").minEpochIterations(50).output(nullptr).run(wname.str(), [&] { - device->write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), bad_address, "SMALL_READ_WRITE_TLB"); - }); + bench_dynamic.title("Write 32 bytes fallback") + .unit("writes") + .minEpochIterations(50) + .output(nullptr) + .run(wname.str(), [&] { + device->write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(0, core), + bad_address, + "SMALL_READ_WRITE_TLB"); + }); wname.clear(); } bench_static.render(ankerl::nanobench::templates::csv(), results_csv); bench_dynamic.render(ankerl::nanobench::templates::csv(), results_csv); } -TEST_F(uBenchmarkFixture, 
ReadAllCores32Bytes){ +TEST_F(uBenchmarkFixture, ReadAllCores32Bytes) { std::vector readback_vec = {}; std::uint64_t address = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; - std::uint64_t bad_address = 0x30000000; // this address is not mapped, should trigger fallback write/read path + std::uint64_t bad_address = 0x30000000; // this address is not mapped, should trigger fallback write/read path ankerl::nanobench::Bench bench_static; ankerl::nanobench::Bench bench_dynamic; - - for(auto& core : device->get_virtual_soc_descriptors().at(0).workers) { + + for (auto& core : device->get_virtual_soc_descriptors().at(0).workers) { std::stringstream rname; // Read through static tlbs rname << "Read from device core (" << core.x << ", " << core.y << ")"; bench_static.title("Read 32 bytes").unit("reads").minEpochIterations(50).output(nullptr).run(rname.str(), [&] { - test_utils::read_data_from_device(*device, readback_vec, tt_cxy_pair(0, core), address, 0x20, "SMALL_READ_WRITE_TLB"); + test_utils::read_data_from_device( + *device, readback_vec, tt_cxy_pair(0, core), address, 0x20, "SMALL_READ_WRITE_TLB"); }); // Read through "fallback/dynamic" tlb - bench_dynamic.title("Read 32 bytes fallback").unit("reads").minEpochIterations(50).output(nullptr).run(rname.str(), [&] { - test_utils::read_data_from_device(*device, readback_vec, tt_cxy_pair(0, core), bad_address, 0x20, "SMALL_READ_WRITE_TLB"); - }); + bench_dynamic.title("Read 32 bytes fallback") + .unit("reads") + .minEpochIterations(50) + .output(nullptr) + .run(rname.str(), [&] { + test_utils::read_data_from_device( + *device, readback_vec, tt_cxy_pair(0, core), bad_address, 0x20, "SMALL_READ_WRITE_TLB"); + }); rname.clear(); } bench_static.render(ankerl::nanobench::templates::csv(), results_csv); bench_dynamic.render(ankerl::nanobench::templates::csv(), results_csv); } -TEST_F(uBenchmarkFixture, Write32BytesRandomAddr){ +TEST_F(uBenchmarkFixture, Write32BytesRandomAddr) { std::vector vector_to_write = {0, 1, 2, 3, 4, 5, 6, 
7}; std::uint32_t address; ankerl::nanobench::Bench bench; - for(auto& core : device->get_virtual_soc_descriptors().at(0).workers) { - address = generate_random_address(1<<20); // between 0 and 1MB + for (auto& core : device->get_virtual_soc_descriptors().at(0).workers) { + address = generate_random_address(1 << 20); // between 0 and 1MB std::stringstream wname; wname << "Write to device core (" << core.x << ", " << core.y << ") @ address " << std::hex << address; - bench.title("Write 32 bytes random address").unit("writes").minEpochIterations(50).output(nullptr).run(wname.str(), [&] { - device->write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, "SMALL_READ_WRITE_TLB"); - }); + bench.title("Write 32 bytes random address") + .unit("writes") + .minEpochIterations(50) + .output(nullptr) + .run(wname.str(), [&] { + device->write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(0, core), + address, + "SMALL_READ_WRITE_TLB"); + }); wname.clear(); } bench.render(ankerl::nanobench::templates::csv(), results_csv); } -TEST_F(uBenchmarkFixture, Read32BytesRandomAddr){ +TEST_F(uBenchmarkFixture, Read32BytesRandomAddr) { std::vector readback_vec = {}; std::uint32_t address; ankerl::nanobench::Bench bench; - for(auto& core : device->get_virtual_soc_descriptors().at(0).workers) { - address = generate_random_address(1<<20); // between 0 and 1MB + for (auto& core : device->get_virtual_soc_descriptors().at(0).workers) { + address = generate_random_address(1 << 20); // between 0 and 1MB std::stringstream rname; rname << "Read from device core (" << core.x << ", " << core.y << ") @ address " << std::hex << address; - bench.title("Read 32 bytes random address").unit("reads").minEpochIterations(50).output(nullptr).run(rname.str(), [&] { - test_utils::read_data_from_device(*device, readback_vec, tt_cxy_pair(0, core), address, 0x20, "SMALL_READ_WRITE_TLB"); - }); + 
bench.title("Read 32 bytes random address") + .unit("reads") + .minEpochIterations(50) + .output(nullptr) + .run(rname.str(), [&] { + test_utils::read_data_from_device( + *device, readback_vec, tt_cxy_pair(0, core), address, 0x20, "SMALL_READ_WRITE_TLB"); + }); rname.clear(); } bench.render(ankerl::nanobench::templates::csv(), results_csv); diff --git a/tests/pcie/test_pcie_device.cpp b/tests/pcie/test_pcie_device.cpp index 942ad0745..f29221037 100644 --- a/tests/pcie/test_pcie_device.cpp +++ b/tests/pcie/test_pcie_device.cpp @@ -5,7 +5,6 @@ */ #include -#include "fmt/xchar.h" #include #include @@ -13,7 +12,7 @@ #include #include "device/pcie/pci_device.hpp" - +#include "fmt/xchar.h" TEST(PcieDeviceTest, Numa) { std::vector nodes; diff --git a/tests/simulation/device_fixture.hpp b/tests/simulation/device_fixture.hpp index 3cba1b981..36e981e4e 100644 --- a/tests/simulation/device_fixture.hpp +++ b/tests/simulation/device_fixture.hpp @@ -5,15 +5,14 @@ #pragma once #include - -#include "tt_simulation_device.h" -#include "common/logger.hpp" -#include "tests/test_utils/generate_cluster_desc.hpp" - #include +#include #include #include -#include + +#include "common/logger.hpp" +#include "tests/test_utils/generate_cluster_desc.hpp" +#include "tt_simulation_device.h" class SimulationDeviceFixture : public ::testing::Test { protected: @@ -24,9 +23,7 @@ class SimulationDeviceFixture : public ::testing::Test { device->start_device(default_params); } - static void TearDownTestSuite() { - device->close_device(); - } + static void TearDownTestSuite() { device->close_device(); } static std::unique_ptr device; }; diff --git a/tests/simulation/test_simulation_device.cpp b/tests/simulation/test_simulation_device.cpp index 1ac6146a2..3b3015e01 100644 --- a/tests/simulation/test_simulation_device.cpp +++ b/tests/simulation/test_simulation_device.cpp @@ -3,86 +3,79 @@ // SPDX-License-Identifier: Apache-2.0 #include + #include "device_fixture.hpp" #include 
"tests/test_utils/device_test_utils.hpp" -std::vector generate_data(uint32_t size_in_bytes){ - size_t size = size_in_bytes/sizeof(uint32_t); +std::vector generate_data(uint32_t size_in_bytes) { + size_t size = size_in_bytes / sizeof(uint32_t); std::vector data(size); std::random_device rd; std::mt19937 gen(rd()); std::uniform_int_distribution dis(0, 100); - for(uint32_t i = 0; i < size; i++){ + for (uint32_t i = 0; i < size; i++) { data[i] = dis(gen); } return data; } -class LoopbackAllCoresParam : public SimulationDeviceFixture , - public ::testing::WithParamInterface {}; +class LoopbackAllCoresParam : public SimulationDeviceFixture, public ::testing::WithParamInterface {}; INSTANTIATE_TEST_SUITE_P( - LoopbackAllCores, - LoopbackAllCoresParam, - ::testing::Values( - tt_xy_pair{0, 1}, - tt_xy_pair{1, 1}, - tt_xy_pair{1, 0} - ) -); - -TEST_P(LoopbackAllCoresParam, LoopbackSingleTensix){ - std::vector wdata = {1,2,3,4,5}; + LoopbackAllCores, LoopbackAllCoresParam, ::testing::Values(tt_xy_pair{0, 1}, tt_xy_pair{1, 1}, tt_xy_pair{1, 0})); + +TEST_P(LoopbackAllCoresParam, LoopbackSingleTensix) { + std::vector wdata = {1, 2, 3, 4, 5}; std::vector rdata(wdata.size(), 0); tt_cxy_pair core = {0, GetParam()}; - device->write_to_device(wdata.data(), wdata.size()*sizeof(uint32_t), core, 0x100, ""); - device->read_from_device(rdata.data(), core, 0x100, rdata.size()*sizeof(uint32_t), ""); - + device->write_to_device(wdata.data(), wdata.size() * sizeof(uint32_t), core, 0x100, ""); + device->read_from_device(rdata.data(), core, 0x100, rdata.size() * sizeof(uint32_t), ""); + ASSERT_EQ(wdata, rdata); } -bool loopback_stress_size(std::unique_ptr &device, tt_xy_pair core, uint32_t byte_shift){ +bool loopback_stress_size(std::unique_ptr &device, tt_xy_pair core, uint32_t byte_shift) { uint64_t addr = 0x0; std::vector wdata = generate_data(1 << byte_shift); std::vector rdata(wdata.size(), 0); - device->write_to_device(wdata.data(), wdata.size()*sizeof(uint32_t), tt_cxy_pair{0, core}, 
addr, ""); - device->read_from_device(rdata.data(), tt_cxy_pair{0, core}, addr, rdata.size()*sizeof(uint32_t), ""); - + device->write_to_device(wdata.data(), wdata.size() * sizeof(uint32_t), tt_cxy_pair{0, core}, addr, ""); + device->read_from_device(rdata.data(), tt_cxy_pair{0, core}, addr, rdata.size() * sizeof(uint32_t), ""); + return wdata == rdata; } -TEST_P(LoopbackAllCoresParam, LoopbackStressSize){ +TEST_P(LoopbackAllCoresParam, LoopbackStressSize) { tt_xy_pair core = GetParam(); tt_xy_pair dram = {1, 0}; if (core == dram) { - for (uint32_t i = 2; i <= 30; ++i) { // 2^30 = 1 GB + for (uint32_t i = 2; i <= 30; ++i) { // 2^30 = 1 GB ASSERT_TRUE(loopback_stress_size(device, core, i)); } } else { - for (uint32_t i = 2; i <= 20; ++i) { // 2^20 = 1 MB + for (uint32_t i = 2; i <= 20; ++i) { // 2^20 = 1 MB ASSERT_TRUE(loopback_stress_size(device, core, i)); } } } -TEST_F(SimulationDeviceFixture, LoopbackTwoTensix){ - std::vector wdata1 = {1,2,3,4,5}; - std::vector wdata2 = {6,7,8,9,10}; +TEST_F(SimulationDeviceFixture, LoopbackTwoTensix) { + std::vector wdata1 = {1, 2, 3, 4, 5}; + std::vector wdata2 = {6, 7, 8, 9, 10}; std::vector rdata1(wdata1.size()); std::vector rdata2(wdata2.size()); tt_cxy_pair core1 = {0, 0, 1}; tt_cxy_pair core2 = {0, 1, 1}; - device->write_to_device(wdata1.data(), wdata1.size()*sizeof(uint32_t), core1, 0x100, ""); - device->write_to_device(wdata2.data(), wdata2.size()*sizeof(uint32_t), core2, 0x100, ""); + device->write_to_device(wdata1.data(), wdata1.size() * sizeof(uint32_t), core1, 0x100, ""); + device->write_to_device(wdata2.data(), wdata2.size() * sizeof(uint32_t), core2, 0x100, ""); + + device->read_from_device(rdata1.data(), core1, 0x100, rdata1.size() * sizeof(uint32_t), ""); + device->read_from_device(rdata2.data(), core2, 0x100, rdata2.size() * sizeof(uint32_t), ""); - device->read_from_device(rdata1.data(), core1, 0x100, rdata1.size()*sizeof(uint32_t), ""); - device->read_from_device(rdata2.data(), core2, 0x100, 
rdata2.size()*sizeof(uint32_t), ""); - ASSERT_EQ(wdata1, rdata1); ASSERT_EQ(wdata2, rdata2); } diff --git a/tests/test_utils/device_test_utils.hpp b/tests/test_utils/device_test_utils.hpp index 87446be3f..ae9292e90 100644 --- a/tests/test_utils/device_test_utils.hpp +++ b/tests/test_utils/device_test_utils.hpp @@ -15,7 +15,7 @@ namespace test_utils { template -static void size_buffer_to_capacity(std::vector &data_buf, std::size_t size_in_bytes) { +static void size_buffer_to_capacity(std::vector& data_buf, std::size_t size_in_bytes) { std::size_t target_size = 0; if (size_in_bytes > 0) { target_size = ((size_in_bytes - 1) / sizeof(T)) + 1; @@ -23,22 +23,27 @@ static void size_buffer_to_capacity(std::vector &data_buf, std::size_t size_i data_buf.resize(target_size); } -static void read_data_from_device(tt_device& device, std::vector &vec, tt_cxy_pair core, uint64_t addr, uint32_t size, const std::string& tlb_to_use) { +static void read_data_from_device( + tt_device& device, + std::vector& vec, + tt_cxy_pair core, + uint64_t addr, + uint32_t size, + const std::string& tlb_to_use) { size_buffer_to_capacity(vec, size); device.read_from_device(vec.data(), core, addr, size, tlb_to_use); } -inline void fill_with_random_bytes(uint8_t* data, size_t n) -{ +inline void fill_with_random_bytes(uint8_t* data, size_t n) { static std::random_device rd; static std::mt19937_64 gen(rd()); uint64_t* data64 = reinterpret_cast(data); - std::generate_n(data64, n/8, [&]() { return gen(); }); + std::generate_n(data64, n / 8, [&]() { return gen(); }); // Handle remaining bytes - for (size_t i = (n/8)*8; i < n; ++i) { + for (size_t i = (n / 8) * 8; i < n; ++i) { data[i] = static_cast(gen()); } } -} +} // namespace test_utils diff --git a/tests/test_utils/generate_cluster_desc.hpp b/tests/test_utils/generate_cluster_desc.hpp index 145f011a3..539dd39f4 100644 --- a/tests/test_utils/generate_cluster_desc.hpp +++ b/tests/test_utils/generate_cluster_desc.hpp @@ -7,24 +7,26 @@ #pragma once #include 
-#include #include +#include #include "fmt/core.h" namespace test_utils { -inline std::string GetAbsPath(std::string path_){ - // Note that __FILE__ might be resolved at compile time to an absolute or relative address, depending on the compiler. +inline std::string GetAbsPath(std::string path_) { + // Note that __FILE__ might be resolved at compile time to an absolute or relative address, depending on the + // compiler. std::filesystem::path current_file_path = std::filesystem::path(__FILE__); std::filesystem::path umd_root; if (current_file_path.is_absolute()) { umd_root = current_file_path.parent_path().parent_path().parent_path(); } else { - std::filesystem::path umd_root_relative = std::filesystem::relative(std::filesystem::path(__FILE__).parent_path().parent_path().parent_path(), "../"); + std::filesystem::path umd_root_relative = + std::filesystem::relative(std::filesystem::path(__FILE__).parent_path().parent_path().parent_path(), "../"); umd_root = std::filesystem::canonical(umd_root_relative); } std::filesystem::path abs_path = umd_root / path_; return abs_path.string(); } -} // namespace test_utils +} // namespace test_utils diff --git a/tests/test_utils/soc_desc_test_utils.hpp b/tests/test_utils/soc_desc_test_utils.hpp index 30fb90d2e..884a3504a 100644 --- a/tests/test_utils/soc_desc_test_utils.hpp +++ b/tests/test_utils/soc_desc_test_utils.hpp @@ -15,4 +15,4 @@ static std::size_t get_num_harvested(std::size_t harvesting_mask) { return __builtin_popcount(harvesting_mask); } -} +} // namespace test_utils diff --git a/tests/test_utils/stimulus_generators.hpp b/tests/test_utils/stimulus_generators.hpp index b831f0a25..320156a5b 100644 --- a/tests/test_utils/stimulus_generators.hpp +++ b/tests/test_utils/stimulus_generators.hpp @@ -4,18 +4,17 @@ * SPDX-License-Identifier: Apache-2.0 */ #pragma once -#include "tt_xy_pair.h" -#include "tt_cluster_descriptor.h" -#include "cluster.h" - - +#include #include #include #include #include -#include #include -#include 
+#include + +#include "cluster.h" +#include "tt_cluster_descriptor.h" +#include "tt_xy_pair.h" /* Sizes: * Distribution (including min/max) @@ -40,7 +39,6 @@ namespace tt::umd::test::utils { static const std::string SOC_DESC_PATH = "tests/soc_descs/wormhole_b0_8x10.yaml"; - enum RemoteTransferType : uint8_t { WRITE = 0, READ }; template < @@ -50,7 +48,7 @@ template < class DISTRIBUTION_T, typename GENERATOR_T = std::mt19937> class ConstrainedTemplateTemplateGenerator { - public: +public: ConstrainedTemplateTemplateGenerator( int seed, DISTRIBUTION_T const& distribution, @@ -62,24 +60,17 @@ class ConstrainedTemplateTemplateGenerator { return constrain(sample); } - private: +private: GENERATOR_T generator; DISTRIBUTION_T distribution; std::function constrain; }; - -template < - typename SAMPLE_T, - typename UNCONSTRAINED_SAMPLE_T, - class DISTRIBUTION_T, - typename GENERATOR_T = std::mt19937> +template class ConstrainedTemplateGenerator { - public: +public: ConstrainedTemplateGenerator( - int seed, - DISTRIBUTION_T const& distribution, - std::function constrain) : + int seed, DISTRIBUTION_T const& distribution, std::function constrain) : generator(seed), distribution(distribution), constrain(constrain) {} SAMPLE_T generate() { @@ -87,14 +78,14 @@ class ConstrainedTemplateGenerator { return constrain(sample); } - private: +private: GENERATOR_T generator; DISTRIBUTION_T distribution; std::function constrain; }; - -using DefaultTransferTypeGenerator = ConstrainedTemplateTemplateGenerator; +using DefaultTransferTypeGenerator = + ConstrainedTemplateTemplateGenerator; using address_t = uint32_t; using destination_t = tt_cxy_pair; @@ -107,6 +98,7 @@ struct write_transfer_sample_t { std::string tlb_to_use; // (payload.data(), size, destination, address, tlb_to_use, false, false); }; + struct read_transfer_sample_t { destination_t destination; address_t address; @@ -115,7 +107,8 @@ struct read_transfer_sample_t { // (payload.data(), destination, address, size, tlb_to_use); }; 
-using remote_transfer_sample_t = std::tuple>; +using remote_transfer_sample_t = + std::tuple>; template < template @@ -130,7 +123,8 @@ template < struct WriteCommandGenerator { using destination_generator_t = ConstrainedTemplateTemplateGenerator; using address_generator_t = ConstrainedTemplateTemplateGenerator; - using size_generator_t = ConstrainedTemplateTemplateGenerator; + using size_generator_t = + ConstrainedTemplateTemplateGenerator; WriteCommandGenerator( destination_generator_t const& destination_generator, @@ -159,7 +153,8 @@ template < struct WriteEpochCmdCommandGenerator { using destination_generator_t = ConstrainedTemplateTemplateGenerator; using address_generator_t = ConstrainedTemplateTemplateGenerator; - using size_generator_t = ConstrainedTemplateTemplateGenerator; + using size_generator_t = + ConstrainedTemplateTemplateGenerator; using last_cmd_generator_t = ConstrainedTemplateGenerator; using ordered_generator_t = ConstrainedTemplateGenerator; @@ -196,8 +191,10 @@ template < typename GENERATOR_T = std::mt19937> struct RolledWriteCommandGenerator { using destination_generator_t = ConstrainedTemplateTemplateGenerator; - using address_generator_t = ConstrainedTemplateTemplateGenerator; - using size_generator_t = ConstrainedTemplateTemplateGenerator; + using address_generator_t = + ConstrainedTemplateTemplateGenerator; + using size_generator_t = + ConstrainedTemplateTemplateGenerator; using unroll_count_generator_t = ConstrainedTemplateTemplateGenerator; RolledWriteCommandGenerator( @@ -229,7 +226,8 @@ template < struct ReadCommandGenerator { using destination_generator_t = ConstrainedTemplateTemplateGenerator; using address_generator_t = ConstrainedTemplateTemplateGenerator; - using size_generator_t = ConstrainedTemplateTemplateGenerator; + using size_generator_t = + ConstrainedTemplateTemplateGenerator; ReadCommandGenerator( destination_generator_t const& destination_generator, @@ -239,8 +237,6 @@ struct ReadCommandGenerator { 
address_generator(address_generator), size_generator(size_generator) {} - - destination_generator_t destination_generator; address_generator_t address_generator; size_generator_t size_generator; @@ -265,12 +261,14 @@ template < typename GENERATOR_T = std::mt19937> class TestGenerator { - using transfer_type_generator_t = DefaultTransferTypeGenerator; // ConstrainedTemplateTemplateGenerator; - using write_command_generator_t = WriteCommandGenerator; - using read_command_generator_t = ReadCommandGenerator; - - public: + // ConstrainedTemplateTemplateGenerator; + using transfer_type_generator_t = DefaultTransferTypeGenerator; + using write_command_generator_t = + WriteCommandGenerator; + using read_command_generator_t = + ReadCommandGenerator; + +public: TestGenerator( int seed, transfer_type_generator_t const& transfer_type_distribution, @@ -279,13 +277,10 @@ class TestGenerator { generator(seed), transfer_type_distribution(transfer_type_distribution), write_command_generator(write_command_generator), - read_command_generator(read_command_generator) - { - } + read_command_generator(read_command_generator) {} // Generate a sample (transfer type, size, destination, address) based on custom distributions remote_transfer_sample_t generate_sample() { - // Randomly select a transfer type RemoteTransferType transfer_type = transfer_type_distribution.generate(); assert(transfer_type < 4 && transfer_type >= 0); @@ -294,22 +289,26 @@ class TestGenerator { destination_t const& destination = write_command_generator.destination_generator.generate(); address_t const& address = write_command_generator.address_generator.generate(); transfer_size_t const& size_in_bytes = write_command_generator.size_generator.generate(); - return {transfer_type, write_transfer_sample_t{ - .destination = destination, - .address = address, - .size_in_bytes = size_in_bytes, - .tlb_to_use = "LARGE_WRITE_TLB"}}; + return { + transfer_type, + write_transfer_sample_t{ + .destination = destination, + 
.address = address, + .size_in_bytes = size_in_bytes, + .tlb_to_use = "LARGE_WRITE_TLB"}}; } break; case RemoteTransferType::READ: { destination_t const& destination = read_command_generator.destination_generator.generate(); address_t const& address = read_command_generator.address_generator.generate(); transfer_size_t const& size_in_bytes = read_command_generator.size_generator.generate(); - return {transfer_type, read_transfer_sample_t{ - .destination = destination, - .address = address, - .size_in_bytes = size_in_bytes, - .tlb_to_use = "LARGE_READ_TLB"}}; + return { + transfer_type, + read_transfer_sample_t{ + .destination = destination, + .address = address, + .size_in_bytes = size_in_bytes, + .tlb_to_use = "LARGE_READ_TLB"}}; } break; default: @@ -317,7 +316,7 @@ class TestGenerator { }; } - private: +private: std::mt19937 generator; transfer_type_generator_t transfer_type_distribution; @@ -331,15 +330,32 @@ struct transfer_type_weights_t { double read; }; - -static auto address_aligner = [](address_t addr) -> address_t { addr = (((addr - 1) / 32) + 1) * 32; assert(addr % 32 == 0); return addr;}; -static auto transfer_size_aligner = [](transfer_size_t size) -> transfer_size_t { size = (((size - 1) / 4) + 1) * 4; assert(size > 0); assert(size % 4 == 0); return size; }; -static auto address_aligner_32B = [](transfer_size_t size) -> transfer_size_t { size = (((size - 1) / 32) + 1) * 32; assert(size > 0); return size;}; -static auto size_aligner_32B = [](transfer_size_t size) -> transfer_size_t { size = (((size - 1) / 32) + 1) * 32; assert(size > 0); return size;}; -template +static auto address_aligner = [](address_t addr) -> address_t { + addr = (((addr - 1) / 32) + 1) * 32; + assert(addr % 32 == 0); + return addr; +}; +static auto transfer_size_aligner = [](transfer_size_t size) -> transfer_size_t { + size = (((size - 1) / 4) + 1) * 4; + assert(size > 0); + assert(size % 4 == 0); + return size; +}; +static auto address_aligner_32B = [](transfer_size_t size) -> 
transfer_size_t { + size = (((size - 1) / 32) + 1) * 32; + assert(size > 0); + return size; +}; +static auto size_aligner_32B = [](transfer_size_t size) -> transfer_size_t { + size = (((size - 1) / 32) + 1) * 32; + assert(size > 0); + return size; +}; +template static auto passthrough_constrainer = [](T const& t) -> T { return t; }; -static inline std::vector generate_core_index_locations(tt_ClusterDescriptor const& cluster_desc, tt_SocDescriptor const& soc_desc) { +static inline std::vector generate_core_index_locations( + tt_ClusterDescriptor const& cluster_desc, tt_SocDescriptor const& soc_desc) { std::vector core_index_to_location = {}; for (chip_id_t chip : cluster_desc.get_all_chips()) { @@ -360,16 +376,19 @@ static void print_command(remote_transfer_sample_t const& command) { case RemoteTransferType::WRITE: { write_transfer_sample_t const& command_args = std::get(std::get<1>(command)); std::cout << "Transfer type: WRITE, destination: (c=" << command_args.destination.chip - << ", y=" << command_args.destination.y << ", x=" << command_args.destination.x - << "), address: " << command_args.address << ", size_in_bytes: " << command_args.size_in_bytes << std::endl; + << ", y=" << command_args.destination.y << ", x=" << command_args.destination.x + << "), address: " << command_args.address << ", size_in_bytes: " << command_args.size_in_bytes + << std::endl; } break; case RemoteTransferType::READ: { read_transfer_sample_t const& command_args = std::get(std::get<1>(command)); std::cout << "Transfer type: READ, destination: (c=" << command_args.destination.chip - << ", y=" << command_args.destination.y << ", x=" << command_args.destination.x - << "), address: " << command_args.address << ", size_in_bytes: " << command_args.size_in_bytes << std::endl; + << ", y=" << command_args.destination.y << ", x=" << command_args.destination.x + << "), address: " << command_args.address << ", size_in_bytes: " << command_args.size_in_bytes + << std::endl; } break; - default: throw 
std::runtime_error("Invalid transfer type"); + default: + throw std::runtime_error("Invalid transfer type"); }; } @@ -379,12 +398,9 @@ int bytes_to_words(int num_bytes) { } static inline void dispatch_remote_transfer_command( - Cluster &driver, - remote_transfer_sample_t const& command, - std::vector &payload) { - + Cluster& driver, remote_transfer_sample_t const& command, std::vector& payload) { RemoteTransferType transfer_type = std::get<0>(command); - auto resize_payload = [](std::vector &payload, int size_in_bytes) { + auto resize_payload = [](std::vector& payload, int size_in_bytes) { payload.resize(bytes_to_words(size_in_bytes)); }; @@ -392,28 +408,37 @@ static inline void dispatch_remote_transfer_command( case RemoteTransferType::WRITE: { write_transfer_sample_t const& command_args = std::get(std::get<1>(command)); assert(command_args.size_in_bytes >= sizeof(uint32_t)); - resize_payload(payload,command_args.size_in_bytes); - driver.write_to_device(payload.data(), bytes_to_words(command_args.size_in_bytes), command_args.destination, command_args.address, command_args.tlb_to_use); + resize_payload(payload, command_args.size_in_bytes); + driver.write_to_device( + payload.data(), + bytes_to_words(command_args.size_in_bytes), + command_args.destination, + command_args.address, + command_args.tlb_to_use); } break; case RemoteTransferType::READ: { read_transfer_sample_t const& command_args = std::get(std::get<1>(command)); assert(command_args.size_in_bytes >= sizeof(uint32_t)); - resize_payload(payload,command_args.size_in_bytes); - driver.read_from_device(payload.data(), command_args.destination, command_args.address, command_args.size_in_bytes, command_args.tlb_to_use); + resize_payload(payload, command_args.size_in_bytes); + driver.read_from_device( + payload.data(), + command_args.destination, + command_args.address, + command_args.size_in_bytes, + command_args.tlb_to_use); } break; default: throw std::runtime_error("Invalid transfer type"); }; } - static void 
print_command_executable_code(remote_transfer_sample_t const& command) { - auto emit_payload_resize_string = [](int size_bytes, int size_word) { std::cout << "payload.resize(((" << size_bytes << " - 1) / " << size_word << ") + 1);" << std::endl; }; auto emit_bytes_to_words_len_string = [](std::string const& var_name, int size_in_bytes, int size_word) { - std::cout << "int " << var_name << " = (((" << size_in_bytes << " - 1) / " << size_word << ") + 1);" << std::endl; + std::cout << "int " << var_name << " = (((" << size_in_bytes << " - 1) / " << size_word << ") + 1);" + << std::endl; }; std::cout << "{" << std::endl; @@ -421,19 +446,25 @@ static void print_command_executable_code(remote_transfer_sample_t const& comman case RemoteTransferType::WRITE: { write_transfer_sample_t const& command_args = std::get(std::get<1>(command)); assert(command_args.size_in_bytes >= sizeof(uint32_t)); - std::cout << "tt_cxy_pair const& destination = tt_cxy_pair(" << command_args.destination.chip << ", " << command_args.destination.x << ", " << command_args.destination.y << ");" << std::endl; + std::cout << "tt_cxy_pair const& destination = tt_cxy_pair(" << command_args.destination.chip << ", " + << command_args.destination.x << ", " << command_args.destination.y << ");" << std::endl; std::cout << "assert(" << command_args.size_in_bytes << " >= sizeof(uint32_t));" << std::endl; emit_bytes_to_words_len_string("len", command_args.size_in_bytes, sizeof(uint32_t)); emit_payload_resize_string(command_args.size_in_bytes, sizeof(uint32_t)); - std::cout << "device->write_to_device(payload.data(), len, destination, " << command_args.address << ", \"" << command_args.tlb_to_use << "\");" << std::endl; - // driver.write_to_device(payload.data(), command_args.size, command_args.destination, command_args.address, command_args.tlb_to_use, false, false); + std::cout << "device->write_to_device(payload.data(), len, destination, " << command_args.address << ", \"" + << command_args.tlb_to_use << 
"\");" << std::endl; + // driver.write_to_device(payload.data(), command_args.size, command_args.destination, command_args.address, + // command_args.tlb_to_use, false, false); } break; case RemoteTransferType::READ: { read_transfer_sample_t const& command_args = std::get(std::get<1>(command)); - std::cout << "tt_cxy_pair const& destination = tt_cxy_pair(" << command_args.destination.chip << ", " << command_args.destination.x << ", " << command_args.destination.y << ");" << std::endl; + std::cout << "tt_cxy_pair const& destination = tt_cxy_pair(" << command_args.destination.chip << ", " + << command_args.destination.x << ", " << command_args.destination.y << ");" << std::endl; emit_payload_resize_string(command_args.size_in_bytes, sizeof(uint32_t)); - std::cout << "device->read_from_device(payload.data(), destination, " << command_args.address << ", " << command_args.size_in_bytes << ", \"" << command_args.tlb_to_use << "\");" << std::endl; - // driver.read_from_device(payload.data(), command_args.destination, command_args.address, command_args.size, command_args.tlb_to_use); + std::cout << "device->read_from_device(payload.data(), destination, " << command_args.address << ", " + << command_args.size_in_bytes << ", \"" << command_args.tlb_to_use << "\");" << std::endl; + // driver.read_from_device(payload.data(), command_args.destination, command_args.address, + // command_args.size, command_args.tlb_to_use); } break; default: throw std::runtime_error("Invalid transfer type"); @@ -450,32 +481,36 @@ static void print_command_history_executable_code(std::vector class WRITE_DEST_DISTR_T, - template class WRITE_ADDR_DISTR_T, +template < + template + class WRITE_DEST_DISTR_T, + template + class WRITE_ADDR_DISTR_T, class WRITE_SIZE_DISTR_OUT_T, - template class WRITE_SIZE_DISTR_T, + template + class WRITE_SIZE_DISTR_T, - template class READ_DEST_DISTR_T, - template class READ_ADDR_DISTR_T, - class READ_SIZE_DISTR_OUT_T, - template class READ_SIZE_DISTR_T -> + template + 
class READ_DEST_DISTR_T, + template + class READ_ADDR_DISTR_T, + class READ_SIZE_DISTR_OUT_T, + template + class READ_SIZE_DISTR_T> void RunMixedTransfers( - Cluster& device, + Cluster& device, int num_samples, int seed, transfer_type_weights_t const& transfer_type_weights, - WriteCommandGenerator const& write_command_generator, - ReadCommandGenerator const& read_command_generator, - + WriteCommandGenerator const& + write_command_generator, + ReadCommandGenerator const& + read_command_generator, + bool record_command_history = false, - std::vector *command_history = nullptr -) { + std::vector* command_history = nullptr) { SCOPED_TRACE("RunMixedTransfers"); auto test_generator = TestGenerator( seed, @@ -490,7 +525,7 @@ void RunMixedTransfers( if (record_command_history) { assert(command_history != nullptr); - assert(command_history->size() == 0); // only support passing in empty command histories + assert(command_history->size() == 0); // only support passing in empty command histories command_history->reserve(num_samples); } std::vector payload = {}; @@ -513,16 +548,17 @@ void RunMixedTransfers( } } - -static ConstrainedTemplateTemplateGenerator get_default_address_generator(int seed, address_t start, address_t end) { +static ConstrainedTemplateTemplateGenerator +get_default_address_generator(int seed, address_t start, address_t end) { auto const& address_distribution = std::uniform_int_distribution(start, end); - return ConstrainedTemplateTemplateGenerator(seed + 1, address_distribution, address_aligner); + return ConstrainedTemplateTemplateGenerator( + seed + 1, address_distribution, address_aligner); } - -static ConstrainedTemplateTemplateGenerator get_default_full_dram_dest_generator(int seed, Cluster *device) { +static ConstrainedTemplateTemplateGenerator +get_default_full_dram_dest_generator(int seed, Cluster* device) { assert(device != nullptr); - tt_ClusterDescriptor *cluster_desc = device->get_cluster_description(); + tt_ClusterDescriptor* cluster_desc = 
device->get_cluster_description(); tt_SocDescriptor const& soc_desc = device->get_virtual_soc_descriptors().at(0); std::vector core_index_to_location = generate_core_index_locations(*cluster_desc, soc_desc); @@ -536,19 +572,23 @@ static WriteCommandGenerator< std::uniform_int_distribution, std::uniform_int_distribution, transfer_size_t, - std::uniform_int_distribution -> build_dummy_write_command_generator(Cluster &device) { - tt_ClusterDescriptor *cluster_desc = device.get_cluster_description(); + std::uniform_int_distribution> +build_dummy_write_command_generator(Cluster& device) { + tt_ClusterDescriptor* cluster_desc = device.get_cluster_description(); tt_SocDescriptor const& soc_desc = device.get_virtual_soc_descriptors().at(0); std::vector core_index_to_location = generate_core_index_locations(*cluster_desc, soc_desc); auto dest_generator = ConstrainedTemplateTemplateGenerator( 0, std::uniform_int_distribution(0, core_index_to_location.size() - 1), [core_index_to_location](int dest) -> destination_t { return core_index_to_location.at(dest); }); - auto addr_generator = ConstrainedTemplateTemplateGenerator(0 , std::uniform_int_distribution(0,0), address_aligner); - auto addr_generator_32B_aligned = ConstrainedTemplateTemplateGenerator(0, std::uniform_int_distribution(0,0), address_aligner_32B); - auto write_size_generator = ConstrainedTemplateTemplateGenerator( - 0, std::uniform_int_distribution(0,0), transfer_size_aligner); + auto addr_generator = ConstrainedTemplateTemplateGenerator( + 0, std::uniform_int_distribution(0, 0), address_aligner); + auto addr_generator_32B_aligned = + ConstrainedTemplateTemplateGenerator( + 0, std::uniform_int_distribution(0, 0), address_aligner_32B); + auto write_size_generator = + ConstrainedTemplateTemplateGenerator( + 0, std::uniform_int_distribution(0, 0), transfer_size_aligner); return WriteCommandGenerator(dest_generator, addr_generator, write_size_generator); } @@ -557,24 +597,25 @@ static ReadCommandGenerator< 
std::uniform_int_distribution, std::uniform_int_distribution, transfer_size_t, - std::uniform_int_distribution -> build_dummy_read_command_generator(Cluster &device) { - tt_ClusterDescriptor *cluster_desc = device.get_cluster_description(); + std::uniform_int_distribution> +build_dummy_read_command_generator(Cluster& device) { + tt_ClusterDescriptor* cluster_desc = device.get_cluster_description(); tt_SocDescriptor const& soc_desc = device.get_virtual_soc_descriptors().at(0); std::vector core_index_to_location = generate_core_index_locations(*cluster_desc, soc_desc); auto dest_generator = ConstrainedTemplateTemplateGenerator( 0, std::uniform_int_distribution(0, core_index_to_location.size() - 1), [core_index_to_location](int dest) -> destination_t { return core_index_to_location.at(dest); }); - auto addr_generator = ConstrainedTemplateTemplateGenerator(0, std::uniform_int_distribution(0,0), address_aligner); - auto read_size_generator = ConstrainedTemplateTemplateGenerator( - 0, std::uniform_int_distribution(0,0), transfer_size_aligner); + auto addr_generator = ConstrainedTemplateTemplateGenerator( + 0, std::uniform_int_distribution(0, 0), address_aligner); + auto read_size_generator = + ConstrainedTemplateTemplateGenerator( + 0, std::uniform_int_distribution(0, 0), transfer_size_aligner); return ReadCommandGenerator(dest_generator, addr_generator, read_size_generator); - } -template< +template < template class ADDR_GENERATOR_T, typename ADDR_DISTR_T, @@ -583,10 +624,9 @@ template< template class READ_SIZE_GENERATOR_T, template - class UNROLL_COUNT_GENERATOR_T -> + class UNROLL_COUNT_GENERATOR_T> void RunMixedTransfersUniformDistributions( - Cluster& device, + Cluster& device, int num_samples, int seed, @@ -597,11 +637,10 @@ void RunMixedTransfersUniformDistributions( float percent_not_last_epoch_cmd, float percent_not_remote_ordered, READ_SIZE_GENERATOR_T const& read_size_distribution, - + bool record_command_history = false, - std::vector *command_history = 
nullptr -) { - tt_ClusterDescriptor *cluster_desc = device.get_cluster_description(); + std::vector* command_history = nullptr) { + tt_ClusterDescriptor* cluster_desc = device.get_cluster_description(); tt_SocDescriptor const& soc_desc = device.get_virtual_soc_descriptors().at(0); std::vector core_index_to_location = generate_core_index_locations(*cluster_desc, soc_desc); @@ -609,21 +648,30 @@ void RunMixedTransfersUniformDistributions( seed, std::uniform_int_distribution(0, core_index_to_location.size() - 1), [&core_index_to_location](int dest) -> destination_t { return core_index_to_location.at(dest); }); - auto addr_generator = ConstrainedTemplateTemplateGenerator(seed + 1, address_distribution, address_aligner); - auto addr_generator_32B_aligned = ConstrainedTemplateTemplateGenerator(seed + 1, address_distribution, address_aligner_32B); - auto write_size_generator = ConstrainedTemplateTemplateGenerator( - seed + 2, write_size_distribution, transfer_size_aligner); - auto read_size_generator = ConstrainedTemplateTemplateGenerator( - seed + 2, read_size_distribution, transfer_size_aligner); + auto addr_generator = ConstrainedTemplateTemplateGenerator( + seed + 1, address_distribution, address_aligner); + auto addr_generator_32B_aligned = + ConstrainedTemplateTemplateGenerator( + seed + 1, address_distribution, address_aligner_32B); + auto write_size_generator = + ConstrainedTemplateTemplateGenerator( + seed + 2, write_size_distribution, transfer_size_aligner); + auto read_size_generator = + ConstrainedTemplateTemplateGenerator( + seed + 2, read_size_distribution, transfer_size_aligner); auto last_epoch_cmd_generator = ConstrainedTemplateGenerator( - seed + 3, std::bernoulli_distribution(percent_not_last_epoch_cmd), [](bool last_epoch_cmd) -> bool { return last_epoch_cmd; }); + seed + 3, std::bernoulli_distribution(percent_not_last_epoch_cmd), [](bool last_epoch_cmd) -> bool { + return last_epoch_cmd; + }); auto ordered_generator = ConstrainedTemplateGenerator( - 
seed + 3, std::bernoulli_distribution(percent_not_remote_ordered), [](bool ordered_with_prev_remote_write) -> bool { return ordered_with_prev_remote_write; }); + seed + 3, + std::bernoulli_distribution(percent_not_remote_ordered), + [](bool ordered_with_prev_remote_write) -> bool { return ordered_with_prev_remote_write; }); auto unroll_count_generator = ConstrainedTemplateTemplateGenerator( seed + 4, unroll_count_distribution, [](int unroll_count) -> int { return unroll_count; }); RunMixedTransfers( - device, + device, num_samples, seed, @@ -631,12 +679,9 @@ void RunMixedTransfersUniformDistributions( WriteCommandGenerator(dest_generator, addr_generator, write_size_generator), ReadCommandGenerator(dest_generator, addr_generator, read_size_generator), - - record_command_history, - command_history - ); + record_command_history, + command_history); } - } // namespace tt::umd::test::utils diff --git a/tests/unit_test_main.cpp b/tests/unit_test_main.cpp index ff89a8892..c48ceb235 100644 --- a/tests/unit_test_main.cpp +++ b/tests/unit_test_main.cpp @@ -3,10 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 #include "gtest/gtest.h" - #include "gtest_initializer.hpp" int main(int argc, char **argv) { - initialize_gtest(argc, argv); - return RUN_ALL_TESTS(); + initialize_gtest(argc, argv); + return RUN_ALL_TESTS(); } diff --git a/tests/wormhole/test_silicon_driver_wh.cpp b/tests/wormhole/test_silicon_driver_wh.cpp index 48834d483..cee3b8383 100644 --- a/tests/wormhole/test_silicon_driver_wh.cpp +++ b/tests/wormhole/test_silicon_driver_wh.cpp @@ -1,32 +1,40 @@ // SPDX-FileCopyrightText: (c) 2023 Tenstorrent Inc. 
// // SPDX-License-Identifier: Apache-2.0 -#include #include +#include -#include "gtest/gtest.h" #include "cluster.h" -#include "eth_l1_address_map.h" -#include "l1_address_map.h" -#include "host_mem_address_map.h" - #include "device/tt_cluster_descriptor.h" #include "device/wormhole/wormhole_implementation.h" -#include "tests/test_utils/generate_cluster_desc.hpp" +#include "eth_l1_address_map.h" +#include "gtest/gtest.h" +#include "host_mem_address_map.h" +#include "l1_address_map.h" #include "tests/test_utils/device_test_utils.hpp" +#include "tests/test_utils/generate_cluster_desc.hpp" using namespace tt::umd; - void set_params_for_remote_txn(Cluster& device) { // Populate address map and NOC parameters that the driver needs for remote transactions - device.set_device_l1_address_params({l1_mem::address_map::L1_BARRIER_BASE, eth_l1_mem::address_map::ERISC_BARRIER_BASE, eth_l1_mem::address_map::FW_VERSION_ADDR}); + device.set_device_l1_address_params( + {l1_mem::address_map::L1_BARRIER_BASE, + eth_l1_mem::address_map::ERISC_BARRIER_BASE, + eth_l1_mem::address_map::FW_VERSION_ADDR}); } std::int32_t get_static_tlb_index(tt_xy_pair target) { - bool is_eth_location = std::find(std::cbegin(tt::umd::wormhole::ETH_LOCATIONS), std::cend(tt::umd::wormhole::ETH_LOCATIONS), target) != std::cend(tt::umd::wormhole::ETH_LOCATIONS); - bool is_tensix_location = std::find(std::cbegin(tt::umd::wormhole::T6_X_LOCATIONS), std::cend(tt::umd::wormhole::T6_X_LOCATIONS), target.x) != std::cend(tt::umd::wormhole::T6_X_LOCATIONS) && - std::find(std::cbegin(tt::umd::wormhole::T6_Y_LOCATIONS), std::cend(tt::umd::wormhole::T6_Y_LOCATIONS), target.y) != std::cend(tt::umd::wormhole::T6_Y_LOCATIONS); + bool is_eth_location = + std::find(std::cbegin(tt::umd::wormhole::ETH_LOCATIONS), std::cend(tt::umd::wormhole::ETH_LOCATIONS), target) != + std::cend(tt::umd::wormhole::ETH_LOCATIONS); + bool is_tensix_location = + std::find( + std::cbegin(tt::umd::wormhole::T6_X_LOCATIONS), 
std::cend(tt::umd::wormhole::T6_X_LOCATIONS), target.x) != + std::cend(tt::umd::wormhole::T6_X_LOCATIONS) && + std::find( + std::cbegin(tt::umd::wormhole::T6_Y_LOCATIONS), std::cend(tt::umd::wormhole::T6_Y_LOCATIONS), target.y) != + std::cend(tt::umd::wormhole::T6_Y_LOCATIONS); if (is_eth_location) { if (target.y == 6) { target.y = 1; @@ -65,7 +73,8 @@ std::int32_t get_static_tlb_index(tt_xy_pair target) { std::set get_target_devices() { std::set target_devices; - std::unique_ptr cluster_desc_uniq = tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); + std::unique_ptr cluster_desc_uniq = + tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); for (int i = 0; i < cluster_desc_uniq->get_number_of_chips(); i++) { target_devices.insert(i); } @@ -77,8 +86,15 @@ TEST(SiliconDriverWH, CreateDestroy) { uint32_t num_host_mem_ch_per_mmio_device = 1; tt_device_params default_params; // Initialize the driver with a 1x1 descriptor and explictly do not perform harvesting - for(int i = 0; i < 50; i++) { - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_1x1.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, false); + for (int i = 0; i < 50; i++) { + Cluster device = Cluster( + test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_1x1.yaml"), + tt_ClusterDescriptor::get_cluster_descriptor_file_path(), + target_devices, + num_host_mem_ch_per_mmio_device, + false, + true, + false); set_params_for_remote_txn(device); device.start_device(default_params); device.deassert_risc_reset(); @@ -97,11 +113,13 @@ TEST(SiliconDriverWH, Harvesting) { ASSERT_EQ(device.using_harvested_soc_descriptors(), true) << "Expected Driver to have performed harvesting"; - for(const auto& chip : sdesc_per_chip) { - ASSERT_EQ(chip.second.workers.size(), 48) << "Expected SOC descriptor with harvesting to have 48 workers 
for chip" << chip.first; + for (const auto& chip : sdesc_per_chip) { + ASSERT_EQ(chip.second.workers.size(), 48) + << "Expected SOC descriptor with harvesting to have 48 workers for chip" << chip.first; } - for(int i = 0; i < num_devices; i++){ - ASSERT_EQ(device.get_harvesting_masks_for_soc_descriptors().at(i), simulated_harvesting_masks.at(i)) << "Expecting chip " << i << " to have harvesting mask of " << simulated_harvesting_masks.at(i); + for (int i = 0; i < num_devices; i++) { + ASSERT_EQ(device.get_harvesting_masks_for_soc_descriptors().at(i), simulated_harvesting_masks.at(i)) + << "Expecting chip " << i << " to have harvesting mask of " << simulated_harvesting_masks.at(i); } } @@ -111,11 +129,20 @@ TEST(SiliconDriverWH, CustomSocDesc) { uint32_t num_host_mem_ch_per_mmio_device = 1; // Initialize the driver with a 1x1 descriptor and explictly do not perform harvesting - Cluster device = Cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_1x1.yaml"), tt_ClusterDescriptor::get_cluster_descriptor_file_path(), target_devices, num_host_mem_ch_per_mmio_device, false, true, false, simulated_harvesting_masks); + Cluster device = Cluster( + test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_1x1.yaml"), + tt_ClusterDescriptor::get_cluster_descriptor_file_path(), + target_devices, + num_host_mem_ch_per_mmio_device, + false, + true, + false, + simulated_harvesting_masks); auto sdesc_per_chip = device.get_virtual_soc_descriptors(); - ASSERT_EQ(device.using_harvested_soc_descriptors(), false) << "SOC descriptors should not be modified when harvesting is disabled"; - for(const auto& chip : sdesc_per_chip) { + ASSERT_EQ(device.using_harvested_soc_descriptors(), false) + << "SOC descriptors should not be modified when harvesting is disabled"; + for (const auto& chip : sdesc_per_chip) { ASSERT_EQ(chip.second.workers.size(), 1) << "Expected 1x1 SOC descriptor to be unmodified by driver"; } } @@ -190,9 +217,7 @@ TEST(SiliconDriverWH, HarvestingRuntime) { #endif 
TEST(SiliconDriverWH, UnalignedStaticTLB_RW) { - auto get_static_tlb_index_callback = [] (tt_xy_pair target) { - return get_static_tlb_index(target); - }; + auto get_static_tlb_index_callback = [](tt_xy_pair target) { return get_static_tlb_index(target); }; std::set target_devices = get_target_devices(); int num_devices = target_devices.size(); @@ -202,13 +227,14 @@ TEST(SiliconDriverWH, UnalignedStaticTLB_RW) { set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { // Iterate over MMIO devices and only setup static TLBs for worker cores - if(std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { + if (std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { auto& sdesc = device.get_virtual_soc_descriptors().at(i); - for(auto& core : sdesc.workers) { + for (auto& core : sdesc.workers) { // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. 
- device.configure_tlb(i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); + device.configure_tlb( + i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); } device.setup_core_to_tlb_map(i, get_static_tlb_index_callback); } @@ -219,16 +245,16 @@ TEST(SiliconDriverWH, UnalignedStaticTLB_RW) { device.deassert_risc_reset(); std::vector unaligned_sizes = {3, 14, 21, 255, 362, 430, 1022, 1023, 1025}; - for(int i = 0; i < num_devices; i++) { - for(const auto& size : unaligned_sizes) { + for (int i = 0; i < num_devices; i++) { + for (const auto& size : unaligned_sizes) { std::vector write_vec(size, 0); - for(int i = 0; i < size; i++){ + for (int i = 0; i < size; i++) { write_vec[i] = size + i; } std::vector readback_vec(size, 0); std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; - for(int loop = 0; loop < 50; loop++){ - for(auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + for (int loop = 0; loop < 50; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(i).workers) { device.write_to_device(write_vec.data(), size, tt_cxy_pair(i, core), address, ""); device.wait_for_non_mmio_flush(); device.read_from_device(readback_vec.data(), tt_cxy_pair(i, core), address, size, ""); @@ -242,16 +268,13 @@ TEST(SiliconDriverWH, UnalignedStaticTLB_RW) { } address += 0x20; } - } } device.close_device(); } TEST(SiliconDriverWH, StaticTLB_RW) { - auto get_static_tlb_index_callback = [] (tt_xy_pair target) { - return get_static_tlb_index(target); - }; + auto get_static_tlb_index_callback = [](tt_xy_pair target) { return get_static_tlb_index(target); }; std::set target_devices = get_target_devices(); @@ -260,19 +283,19 @@ TEST(SiliconDriverWH, StaticTLB_RW) { set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { // Iterate over MMIO 
devices and only setup static TLBs for worker cores - if(std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { + if (std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { auto& sdesc = device.get_virtual_soc_descriptors().at(i); - for(auto& core : sdesc.workers) { + for (auto& core : sdesc.workers) { // Statically mapping a 1MB TLB to this core, starting from address NCRISC_FIRMWARE_BASE. - device.configure_tlb(i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); + device.configure_tlb( + i, core, get_static_tlb_index_callback(core), l1_mem::address_map::NCRISC_FIRMWARE_BASE); } device.setup_core_to_tlb_map(i, get_static_tlb_index_callback); } } - tt_device_params default_params; device.start_device(default_params); device.deassert_risc_reset(); @@ -281,27 +304,41 @@ TEST(SiliconDriverWH, StaticTLB_RW) { std::vector readback_vec = {}; std::vector zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; // Check functionality of Static TLBs by reading adn writing from statically mapped address space - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; - for(int loop = 0; loop < 100; loop++){ // Write to each core a 100 times at different statically mapped addresses - for(auto& core : device.get_virtual_soc_descriptors().at(i).workers) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, ""); - device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited + // Write to each core a 100 times at different statically mapped addresses + for (int loop = 0; loop < 100; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), 
+ address, + ""); + // Barrier to ensure that all writes over ethernet were commited + device.wait_for_non_mmio_flush(); test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, 40, ""); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; device.wait_for_non_mmio_flush(); - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "SMALL_READ_WRITE_TLB"); // Clear any written data + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "SMALL_READ_WRITE_TLB"); // Clear any written data device.wait_for_non_mmio_flush(); readback_vec = {}; } - address += 0x20; // Increment by uint32_t size for each write + address += 0x20; // Increment by uint32_t size for each write } } device.close_device(); } TEST(SiliconDriverWH, DynamicTLB_RW) { - // Don't use any static TLBs in this test. All writes go through a dynamic TLB that needs to be reconfigured for each transaction + // Don't use any static TLBs in this test. 
All writes go through a dynamic TLB that needs to be reconfigured for + // each transaction std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; @@ -317,20 +354,34 @@ TEST(SiliconDriverWH, DynamicTLB_RW) { std::vector zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; std::vector readback_vec = {}; - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; - for(int loop = 0; loop < 100; loop++){ // Write to each core a 100 times at different statically mapped addresses - for(auto& core : device.get_virtual_soc_descriptors().at(i).workers) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "SMALL_READ_WRITE_TLB"); - device.wait_for_non_mmio_flush(); // Barrier to ensure that all writes over ethernet were commited - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, 40, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + // Write to each core a 100 times at different statically mapped addresses + for (int loop = 0; loop < 100; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "SMALL_READ_WRITE_TLB"); + // Barrier to ensure that all writes over ethernet were commited + device.wait_for_non_mmio_flush(); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, core), address, 40, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; device.wait_for_non_mmio_flush(); - device.write_to_device(zeros.data(), 
zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "SMALL_READ_WRITE_TLB"); + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "SMALL_READ_WRITE_TLB"); device.wait_for_non_mmio_flush(); readback_vec = {}; } - address += 0x20; // Increment by uint32_t size for each write + address += 0x20; // Increment by uint32_t size for each write } } device.close_device(); @@ -344,7 +395,7 @@ TEST(SiliconDriverWH, MultiThreadedDevice) { uint32_t num_host_mem_ch_per_mmio_device = 1; Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); - + set_params_for_remote_txn(device); tt_device_params default_params; @@ -355,11 +406,18 @@ TEST(SiliconDriverWH, MultiThreadedDevice) { std::vector vector_to_write = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; std::vector readback_vec = {}; std::uint32_t address = l1_mem::address_map::NCRISC_FIRMWARE_BASE; - for(int loop = 0; loop < 100; loop++) { - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, "SMALL_READ_WRITE_TLB"); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + for (int loop = 0; loop < 100; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(0, core), + address, + "SMALL_READ_WRITE_TLB"); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from core " << core.x << "-" << core.y << "does not match what was 
written"; readback_vec = {}; } address += 0x20; @@ -370,12 +428,19 @@ TEST(SiliconDriverWH, MultiThreadedDevice) { std::vector vector_to_write = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; std::vector readback_vec = {}; std::uint32_t address = 0x30000000; - for(auto& core_ls : device.get_virtual_soc_descriptors().at(0).dram_cores) { - for(int loop = 0; loop < 100; loop++) { - for(auto& core : core_ls) { - device.write_to_device(vector_to_write.data(), vector_to_write.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, "SMALL_READ_WRITE_TLB"); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was written"; + for (auto& core_ls : device.get_virtual_soc_descriptors().at(0).dram_cores) { + for (int loop = 0; loop < 100; loop++) { + for (auto& core : core_ls) { + device.write_to_device( + vector_to_write.data(), + vector_to_write.size() * sizeof(std::uint32_t), + tt_cxy_pair(0, core), + address, + "SMALL_READ_WRITE_TLB"); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(0, core), address, 40, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y + << "does not match what was written"; readback_vec = {}; } address += 0x20; @@ -394,9 +459,7 @@ TEST(SiliconDriverWH, MultiThreadedMemBar) { // We want to make sure the memory barrier is thread/process safe. 
// Memory barrier flags get sent to address 0 for all channels in this test - auto get_static_tlb_index_callback = [] (tt_xy_pair target) { - return get_static_tlb_index(target); - }; + auto get_static_tlb_index_callback = [](tt_xy_pair target) { return get_static_tlb_index(target); }; std::set target_devices = get_target_devices(); uint32_t base_addr = l1_mem::address_map::DATA_BUFFER_SPACE_BASE; @@ -406,11 +469,11 @@ TEST(SiliconDriverWH, MultiThreadedMemBar) { set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); - for(int i = 0; i < target_devices.size(); i++) { + for (int i = 0; i < target_devices.size(); i++) { // Iterate over devices and only setup static TLBs for functional worker cores - if(std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { + if (std::find(mmio_devices.begin(), mmio_devices.end(), i) != mmio_devices.end()) { auto& sdesc = device.get_virtual_soc_descriptors().at(i); - for(auto& core : sdesc.workers) { + for (auto& core : sdesc.workers) { // Statically mapping a 1MB TLB to this core, starting from address DATA_BUFFER_SPACE_BASE. 
device.configure_tlb(i, core, get_static_tlb_index_callback(core), base_addr); } @@ -423,22 +486,39 @@ TEST(SiliconDriverWH, MultiThreadedMemBar) { device.deassert_risc_reset(); std::vector readback_membar_vec = {}; - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { - test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), l1_mem::address_map::L1_BARRIER_BASE, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all workers + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + test_utils::read_data_from_device( + device, + readback_membar_vec, + tt_cxy_pair(0, core), + l1_mem::address_map::L1_BARRIER_BASE, + 4, + "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all workers readback_membar_vec = {}; } - for(int chan = 0; chan < device.get_virtual_soc_descriptors().at(0).get_num_dram_channels(); chan++) { + for (int chan = 0; chan < device.get_virtual_soc_descriptors().at(0).get_num_dram_channels(); chan++) { auto core = device.get_virtual_soc_descriptors().at(0).get_core_for_dram_channel(chan, 0); - test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all DRAM + test_utils::read_data_from_device( + device, readback_membar_vec, tt_cxy_pair(0, core), 0, 4, "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all DRAM readback_membar_vec = {}; } - for(auto& core : device.get_virtual_soc_descriptors().at(0).ethernet_cores) { - test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), eth_l1_mem::address_map::ERISC_BARRIER_BASE, 4, "SMALL_READ_WRITE_TLB"); - 
ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers were correctly initialized on all ethernet cores + for (auto& core : device.get_virtual_soc_descriptors().at(0).ethernet_cores) { + test_utils::read_data_from_device( + device, + readback_membar_vec, + tt_cxy_pair(0, core), + eth_l1_mem::address_map::ERISC_BARRIER_BASE, + 4, + "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + readback_membar_vec.at(0), + 187); // Ensure that memory barriers were correctly initialized on all ethernet cores readback_membar_vec = {}; } @@ -448,38 +528,43 @@ TEST(SiliconDriverWH, MultiThreadedMemBar) { std::vector vec2(2560); std::vector zeros(2560, 0); - for(int i = 0; i < vec1.size(); i++) { + for (int i = 0; i < vec1.size(); i++) { vec1.at(i) = i; } - for(int i = 0; i < vec2.size(); i++) { + for (int i = 0; i < vec2.size(); i++) { vec2.at(i) = vec1.size() + i; } std::thread th1 = std::thread([&] { std::uint32_t address = base_addr; - for(int loop = 0; loop < 50; loop++) { - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + for (int loop = 0; loop < 50; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { std::vector readback_vec = {}; - device.write_to_device(vec1.data(), vec1.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); + device.write_to_device( + vec1.data(), vec1.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); device.l1_membar(0, "SMALL_READ_WRITE_TLB", {core}); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 4*vec1.size(), ""); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(0, core), address, 4 * vec1.size(), ""); ASSERT_EQ(readback_vec, vec1); - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); + device.write_to_device( + zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); readback_vec = {}; } - } }); 
std::thread th2 = std::thread([&] { std::uint32_t address = base_addr + vec1.size() * 4; - for(int loop = 0; loop < 50; loop++) { - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + for (int loop = 0; loop < 50; loop++) { + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { std::vector readback_vec = {}; - device.write_to_device(vec2.data(), vec2.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); + device.write_to_device( + vec2.data(), vec2.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); device.l1_membar(0, "SMALL_READ_WRITE_TLB", {core}); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(0, core), address, 4*vec2.size(), ""); + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(0, core), address, 4 * vec2.size(), ""); ASSERT_EQ(readback_vec, vec2); - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, "") ; + device.write_to_device( + zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(0, core), address, ""); readback_vec = {}; } } @@ -488,27 +573,41 @@ TEST(SiliconDriverWH, MultiThreadedMemBar) { th1.join(); th2.join(); - for(auto& core : device.get_virtual_soc_descriptors().at(0).workers) { - test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), l1_mem::address_map::L1_BARRIER_BASE, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers end up in the correct sate for workers + for (auto& core : device.get_virtual_soc_descriptors().at(0).workers) { + test_utils::read_data_from_device( + device, + readback_membar_vec, + tt_cxy_pair(0, core), + l1_mem::address_map::L1_BARRIER_BASE, + 4, + "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + readback_membar_vec.at(0), 187); // Ensure that memory barriers end up in the correct sate for workers readback_membar_vec = {}; } - for(auto& core : 
device.get_virtual_soc_descriptors().at(0).ethernet_cores) { - test_utils::read_data_from_device(device, readback_membar_vec, tt_cxy_pair(0, core), eth_l1_mem::address_map::ERISC_BARRIER_BASE, 4, "SMALL_READ_WRITE_TLB"); - ASSERT_EQ(readback_membar_vec.at(0), 187); // Ensure that memory barriers end up in the correct sate for ethernet cores + for (auto& core : device.get_virtual_soc_descriptors().at(0).ethernet_cores) { + test_utils::read_data_from_device( + device, + readback_membar_vec, + tt_cxy_pair(0, core), + eth_l1_mem::address_map::ERISC_BARRIER_BASE, + 4, + "SMALL_READ_WRITE_TLB"); + ASSERT_EQ( + readback_membar_vec.at(0), + 187); // Ensure that memory barriers end up in the correct sate for ethernet cores readback_membar_vec = {}; } device.close_device(); } - TEST(SiliconDriverWH, BroadcastWrite) { // Broadcast multiple vectors to tensix and dram grid. Verify broadcasted data is read back correctly std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); @@ -523,33 +622,64 @@ TEST(SiliconDriverWH, BroadcastWrite) { std::set rows_to_exclude_for_dram_broadcast = {}; std::set cols_to_exclude_for_dram_broadcast = {1, 2, 3, 4, 6, 7, 8, 9}; - for(const auto& size : broadcast_sizes) { + for (const auto& size : broadcast_sizes) { std::vector vector_to_write(size); std::vector zeros(size); std::vector readback_vec = {}; - for(int i = 0; i < size; i++) { + for (int i = 0; i < size; i++) { vector_to_write[i] = i; zeros[i] = 0; } // Broadcast to Tensix - device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude, cols_to_exclude, "LARGE_WRITE_TLB"); + device.broadcast_write_to_cluster( + vector_to_write.data(), + vector_to_write.size() * 4, + address, + {}, + rows_to_exclude, + cols_to_exclude, + 
"LARGE_WRITE_TLB"); // Broadcast to DRAM - device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude_for_dram_broadcast, cols_to_exclude_for_dram_broadcast, "LARGE_WRITE_TLB"); + device.broadcast_write_to_cluster( + vector_to_write.data(), + vector_to_write.size() * 4, + address, + {}, + rows_to_exclude_for_dram_broadcast, + cols_to_exclude_for_dram_broadcast, + "LARGE_WRITE_TLB"); device.wait_for_non_mmio_flush(); - for(const auto i : target_devices) { - for(const auto& core : device.get_virtual_soc_descriptors().at(i).workers) { - if(rows_to_exclude.find(core.y) != rows_to_exclude.end()) continue; - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y << "does not match what was broadcasted"; - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "LARGE_WRITE_TLB"); // Clear any written data + for (const auto i : target_devices) { + for (const auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + if (rows_to_exclude.find(core.y) != rows_to_exclude.end()) { + continue; + } + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y + << "does not match what was broadcasted"; + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "LARGE_WRITE_TLB"); // Clear any written data readback_vec = {}; } - for(int chan = 0; chan < device.get_virtual_soc_descriptors().at(i).get_num_dram_channels(); chan++) { + for (int chan = 0; chan < device.get_virtual_soc_descriptors().at(i).get_num_dram_channels(); chan++) { const auto& core = 
device.get_virtual_soc_descriptors().at(i).get_core_for_dram_channel(chan, 0); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from DRAM core " << i << " " << core.x << "-" << core.y << " does not match what was broadcasted " << size; - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "LARGE_WRITE_TLB"); // Clear any written data + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from DRAM core " << i << " " << core.x << "-" << core.y + << " does not match what was broadcasted " << size; + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "LARGE_WRITE_TLB"); // Clear any written data readback_vec = {}; } } @@ -564,7 +694,7 @@ TEST(SiliconDriverWH, VirtualCoordinateBroadcast) { std::set target_devices = get_target_devices(); uint32_t num_host_mem_ch_per_mmio_device = 1; - + Cluster device = Cluster(num_host_mem_ch_per_mmio_device, false, true, true); set_params_for_remote_txn(device); auto mmio_devices = device.get_target_mmio_device_ids(); @@ -572,10 +702,12 @@ TEST(SiliconDriverWH, VirtualCoordinateBroadcast) { tt_device_params default_params; device.start_device(default_params); auto eth_version = device.get_ethernet_fw_version(); - bool virtual_bcast_supported = (eth_version >= tt_version(6, 8, 0) || eth_version == tt_version(6, 7, 241)) && device.translation_tables_en; + bool virtual_bcast_supported = + (eth_version >= tt_version(6, 8, 0) || eth_version == tt_version(6, 7, 241)) && device.translation_tables_en; if (!virtual_bcast_supported) { device.close_device(); - GTEST_SKIP() << "SiliconDriverWH.VirtualCoordinateBroadcast skipped since 
ethernet version does not support Virtual Coordinate Broadcast or NOC translation is not enabled"; + GTEST_SKIP() << "SiliconDriverWH.VirtualCoordinateBroadcast skipped since ethernet version does not support " + "Virtual Coordinate Broadcast or NOC translation is not enabled"; } device.deassert_risc_reset(); @@ -586,33 +718,64 @@ TEST(SiliconDriverWH, VirtualCoordinateBroadcast) { std::set rows_to_exclude_for_dram_broadcast = {}; std::set cols_to_exclude_for_dram_broadcast = {1, 2, 3, 4, 6, 7, 8, 9}; - for(const auto& size : broadcast_sizes) { + for (const auto& size : broadcast_sizes) { std::vector vector_to_write(size); std::vector zeros(size); std::vector readback_vec = {}; - for(int i = 0; i < size; i++) { + for (int i = 0; i < size; i++) { vector_to_write[i] = i; zeros[i] = 0; } // Broadcast to Tensix - device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude, cols_to_exclude, "LARGE_WRITE_TLB"); + device.broadcast_write_to_cluster( + vector_to_write.data(), + vector_to_write.size() * 4, + address, + {}, + rows_to_exclude, + cols_to_exclude, + "LARGE_WRITE_TLB"); // Broadcast to DRAM - device.broadcast_write_to_cluster(vector_to_write.data(), vector_to_write.size() * 4, address, {}, rows_to_exclude_for_dram_broadcast, cols_to_exclude_for_dram_broadcast, "LARGE_WRITE_TLB"); + device.broadcast_write_to_cluster( + vector_to_write.data(), + vector_to_write.size() * 4, + address, + {}, + rows_to_exclude_for_dram_broadcast, + cols_to_exclude_for_dram_broadcast, + "LARGE_WRITE_TLB"); device.wait_for_non_mmio_flush(); - for(const auto i : target_devices) { - for(const auto& core : device.get_virtual_soc_descriptors().at(i).workers) { - if(rows_to_exclude.find(core.y) != rows_to_exclude.end()) continue; - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " 
<< core.x << "-" << core.y << "does not match what was broadcasted"; - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "LARGE_WRITE_TLB"); // Clear any written data + for (const auto i : target_devices) { + for (const auto& core : device.get_virtual_soc_descriptors().at(i).workers) { + if (rows_to_exclude.find(core.y) != rows_to_exclude.end()) { + continue; + } + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from core " << core.x << "-" << core.y + << "does not match what was broadcasted"; + device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "LARGE_WRITE_TLB"); // Clear any written data readback_vec = {}; } - for(int chan = 0; chan < device.get_virtual_soc_descriptors().at(i).get_num_dram_channels(); chan++) { + for (int chan = 0; chan < device.get_virtual_soc_descriptors().at(i).get_num_dram_channels(); chan++) { const auto& core = device.get_virtual_soc_descriptors().at(i).get_core_for_dram_channel(chan, 0); - test_utils::read_data_from_device(device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); - ASSERT_EQ(vector_to_write, readback_vec) << "Vector read back from DRAM core " << i << " " << core.x << "-" << core.y << " does not match what was broadcasted " << size; - device.write_to_device(zeros.data(), zeros.size() * sizeof(std::uint32_t), tt_cxy_pair(i, core), address, "LARGE_WRITE_TLB"); // Clear any written data + test_utils::read_data_from_device( + device, readback_vec, tt_cxy_pair(i, core), address, vector_to_write.size() * 4, "LARGE_READ_TLB"); + ASSERT_EQ(vector_to_write, readback_vec) + << "Vector read back from DRAM core " << i << " " << core.x << "-" << core.y + << " does not match what was broadcasted " << size; + 
device.write_to_device( + zeros.data(), + zeros.size() * sizeof(std::uint32_t), + tt_cxy_pair(i, core), + address, + "LARGE_WRITE_TLB"); // Clear any written data readback_vec = {}; } } @@ -622,7 +785,6 @@ TEST(SiliconDriverWH, VirtualCoordinateBroadcast) { device.close_device(); } - /** * This is a basic DMA test -- not using the PCIe controller's DMA engine, but * rather using the ability of the NOC to access the host system bus via traffic @@ -647,10 +809,11 @@ TEST(SiliconDriverWH, VirtualCoordinateBroadcast) { TEST(SiliconDriverWH, SysmemTestWithPcie) { auto target_devices = get_target_devices(); - Cluster cluster(1, // one "host memory channel", currently a 1G huge page - false, // skip driver allocs - no (don't skip) - true, // clean system resources - yes - true); // perform harvesting - yes + Cluster cluster( + 1, // one "host memory channel", currently a 1G huge page + false, // skip driver allocs - no (don't skip) + true, // clean system resources - yes + true); // perform harvesting - yes set_params_for_remote_txn(cluster); cluster.start_device(tt_device_params{}); // no special parameters @@ -667,7 +830,7 @@ TEST(SiliconDriverWH, SysmemTestWithPcie) { // Bad API: how big is the buffer? How do we know it's big enough? // Situation today is that there's a 1G hugepage behind it, although this is // unclear from the API and may change in the future. - uint8_t *sysmem = (uint8_t*)cluster.host_dma_address(0, 0, 0); + uint8_t* sysmem = (uint8_t*)cluster.host_dma_address(0, 0, 0); ASSERT_NE(sysmem, nullptr); // This is the address inside the Wormhole PCIe block that is mapped to the @@ -710,13 +873,14 @@ TEST(SiliconDriverWH, RandomSysmemTestWithPcie) { const size_t num_channels = 2; // ideally 4, but CI seems to have 2... 
auto target_devices = get_target_devices(); - Cluster cluster(test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), - tt_ClusterDescriptor::get_cluster_descriptor_file_path(), - target_devices, - num_channels, - false, // skip driver allocs - no (don't skip) - true, // clean system resources - yes - true); // perform harvesting - yes + Cluster cluster( + test_utils::GetAbsPath("tests/soc_descs/wormhole_b0_8x10.yaml"), + tt_ClusterDescriptor::get_cluster_descriptor_file_path(), + target_devices, + num_channels, + false, // skip driver allocs - no (don't skip) + true, // clean system resources - yes + true); // perform harvesting - yes set_params_for_remote_txn(cluster); cluster.start_device(tt_device_params{}); // no special parameters @@ -725,7 +889,7 @@ TEST(SiliconDriverWH, RandomSysmemTestWithPcie) { const auto PCIE = cluster.get_soc_descriptor(mmio_chip_id).pcie_cores.at(0); const tt_cxy_pair PCIE_CORE(mmio_chip_id, PCIE.x, PCIE.y); const size_t ONE_GIG = 1 << 30; - const size_t num_tests = 0x20000; // runs in a reasonable amount of time + const size_t num_tests = 0x20000; // runs in a reasonable amount of time // PCIe core is at (x=0, y=3) on Wormhole NOC0. 
ASSERT_EQ(PCIE.x, 0); @@ -735,13 +899,13 @@ TEST(SiliconDriverWH, RandomSysmemTestWithPcie) { auto generate_aligned_address = [&](uint64_t lo, uint64_t hi) -> uint64_t { static std::random_device rd; static std::mt19937_64 gen(rd()); - std::uniform_int_distribution dis(lo/ALIGNMENT, hi/ALIGNMENT); + std::uniform_int_distribution dis(lo / ALIGNMENT, hi / ALIGNMENT); return dis(gen) * ALIGNMENT; }; uint64_t base_address = cluster.get_pcie_base_addr_from_device(mmio_chip_id); for (size_t channel = 0; channel < num_channels; ++channel) { - uint8_t *sysmem = (uint8_t*)cluster.host_dma_address(0, 0, channel); + uint8_t* sysmem = (uint8_t*)cluster.host_dma_address(0, 0, channel); ASSERT_NE(sysmem, nullptr); test_utils::fill_with_random_bytes(sysmem, ONE_GIG); @@ -774,4 +938,3 @@ TEST(SiliconDriverWH, RandomSysmemTestWithPcie) { } } } - diff --git a/tests/wormhole/test_umd_remote_api_stability.cpp b/tests/wormhole/test_umd_remote_api_stability.cpp index dbc648f3d..39ebf39f0 100644 --- a/tests/wormhole/test_umd_remote_api_stability.cpp +++ b/tests/wormhole/test_umd_remote_api_stability.cpp @@ -2,58 +2,51 @@ // // SPDX-License-Identifier: Apache-2.0 +#include #include +#include #include #include #include -#include "tt_cluster_descriptor.h" #include "cluster.h" - #include "common/logger.hpp" #include "eth_interface.h" #include "filesystem" #include "gtest/gtest.h" #include "host_mem_address_map.h" #include "l1_address_map.h" -#include "tt_soc_descriptor.h" - -#include "tests/test_utils/stimulus_generators.hpp" -#include "tests/test_utils/generate_cluster_desc.hpp" #include "test_wh_common.h" - -#include -#include +#include "tests/test_utils/generate_cluster_desc.hpp" +#include "tests/test_utils/stimulus_generators.hpp" +#include "tt_cluster_descriptor.h" +#include "tt_soc_descriptor.h" namespace tt::umd::test::utils { class WormholeNebulaX2TestFixture : public WormholeTestFixture { - private: - static int detected_num_chips; - static bool skip_tests; - - protected: - - static 
constexpr int EXPECTED_NUM_CHIPS = 2; - static uint32_t scale_number_of_tests; - - static void SetUpTestSuite() { - std::unique_ptr cluster_desc = tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); - detected_num_chips = cluster_desc->get_number_of_chips(); - if (detected_num_chips != EXPECTED_NUM_CHIPS) { - skip_tests = true; +private: + static int detected_num_chips; + static bool skip_tests; + +protected: + static constexpr int EXPECTED_NUM_CHIPS = 2; + static uint32_t scale_number_of_tests; + + static void SetUpTestSuite() { + std::unique_ptr cluster_desc = + tt_ClusterDescriptor::create_from_yaml(tt_ClusterDescriptor::get_cluster_descriptor_file_path()); + detected_num_chips = cluster_desc->get_number_of_chips(); + if (detected_num_chips != EXPECTED_NUM_CHIPS) { + skip_tests = true; + } + if (char const* scale_number_of_tests_env = std::getenv("SCALE_NUMBER_OF_TESTS")) { + scale_number_of_tests = std::atoi(scale_number_of_tests_env); + } } - if(char const* scale_number_of_tests_env = std::getenv("SCALE_NUMBER_OF_TESTS")) { - scale_number_of_tests = std::atoi(scale_number_of_tests_env); - } - } - virtual int get_detected_num_chips() { - return detected_num_chips; - } + virtual int get_detected_num_chips() { return detected_num_chips; } - virtual bool is_test_skipped() { - return skip_tests; - } + virtual bool is_test_skipped() { return skip_tests; } }; int WormholeNebulaX2TestFixture::detected_num_chips = -1; @@ -63,28 +56,29 @@ uint32_t WormholeNebulaX2TestFixture::scale_number_of_tests = 1; TEST_F(WormholeNebulaX2TestFixture, MixedRemoteTransfersMediumSmall) { int seed = 0; - log_info(LogSiliconDriver,"Started MixedRemoteTransfersMediumSmall"); + log_info(LogSiliconDriver, "Started MixedRemoteTransfersMediumSmall"); std::vector command_history; try { assert(device != nullptr); RunMixedTransfersUniformDistributions( - *device, + *device, 100000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0.25, 
.read = 0.25}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - &command_history - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // Set to true if you want to emit the command history code to command line + false, + &command_history); } catch (...) 
{ print_command_history_executable_code(command_history); } @@ -93,88 +87,92 @@ TEST_F(WormholeNebulaX2TestFixture, MixedRemoteTransfersMediumSmall) { TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersMediumSmall) { int seed = 0; - log_info(LogSiliconDriver,"Started MultithreadedMixedRemoteTransfersMediumSmall"); + log_info(LogSiliconDriver, "Started MultithreadedMixedRemoteTransfersMediumSmall"); assert(device != nullptr); std::vector command_history0; std::vector command_history1; std::vector command_history2; std::vector command_history3; - std::thread t1([&](){ + std::thread t1([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 100000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0.50, .read = 0.50}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - &command_history0 - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // Set to true if you want to emit the command history code to command line + false, + &command_history0); }); - std::thread t2([&](){ + std::thread t2([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 100000 * scale_number_of_tests, 100, - transfer_type_weights_t{.write = 0.25, .read = 0.50}, - - 
std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - &command_history1 - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // Set to true if you want to emit the command history code to command line + false, + &command_history1); }); - std::thread t3([&](){ + std::thread t3([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 100000 * scale_number_of_tests, 23, - transfer_type_weights_t{.write = 0.5, .read = 0.25}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command 
line - &command_history2 - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // Set to true if you want to emit the command history code to command line + false, + &command_history2); }); - std::thread t4([&](){ + std::thread t4([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 100000 * scale_number_of_tests, 99, - transfer_type_weights_t{.write = 1.0, .read = 0.0}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - &command_history3 - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // Set to true if you want to emit the command history code to command line + false, + &command_history3); }); t1.join(); @@ -186,154 +184,155 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersMediumSmall TEST_F(WormholeNebulaX2TestFixture, MixedRemoteTransfersLarge) { int seed = 0; - log_info(LogSiliconDriver,"Started MixedRemoteTransfersLarge"); + log_info(LogSiliconDriver, "Started MixedRemoteTransfersLarge"); assert(device != nullptr); std::vector command_history; try { RunMixedTransfersUniformDistributions( - *device, + *device, 10000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0.15, .read 
= 0.15}, - - std::uniform_int_distribution(0x10000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 300000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x10000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 300000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 300000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - &command_history - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + // Set to true if you want to emit the command history code to command line + std::uniform_int_distribution(0x4, 300000), + false, + &command_history); } catch (...) { print_command_history_executable_code(command_history); } - } TEST_F(WormholeNebulaX2TestFixture, WritesOnlyNormalDistributionMean10kStd3kMinSizeTruncate4) { int seed = 0; - log_info(LogSiliconDriver,"Started WritesOnlyNormalDistributionMean10kStd3kMinSizeTruncate4"); + log_info(LogSiliconDriver, "Started WritesOnlyNormalDistributionMean10kStd3kMinSizeTruncate4"); assert(device != nullptr); std::vector command_history; auto write_size_generator = ConstrainedTemplateTemplateGenerator( - seed, std::normal_distribution<>(10000, 3000), [](double x) -> transfer_size_t { return size_aligner_32B(static_cast((x >= 4) ? x : 4)); }); - + seed, std::normal_distribution<>(10000, 3000), [](double x) -> transfer_size_t { + return size_aligner_32B(static_cast((x >= 4) ? 
x : 4)); + }); auto dest_generator = get_default_full_dram_dest_generator(seed, device.get()); auto address_generator = get_default_address_generator(seed, 0x100000, 0x5000000); try { RunMixedTransfers( - *device, + *device, 10000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 1., .read = 0.}, - WriteCommandGenerator(dest_generator, address_generator, write_size_generator), build_dummy_read_command_generator(*device), - - false, // Set to true if you want to emit the command history code to command line - &command_history - ); + // Set to true if you want to emit the command history code to command line + false, + &command_history); } catch (...) { print_command_history_executable_code(command_history); } - } TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLMS) { int seed = 0; - log_info(LogSiliconDriver,"Started MultithreadedMixedRemoteTransfersLMS"); + log_info(LogSiliconDriver, "Started MultithreadedMixedRemoteTransfersLMS"); assert(device != nullptr); std::vector command_history0; std::vector command_history1; std::vector command_history2; std::vector command_history3; - std::thread t1([&](){ + std::thread t1([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 100000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0.50, .read = 0.50}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(4, 300000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(4, 300000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& 
read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - &command_history0 - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // Set to true if you want to emit the command history code to command line + false, + &command_history0); }); - std::thread t2([&](){ + std::thread t2([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 100000 * scale_number_of_tests, 100, - transfer_type_weights_t{.write = 0.25, .read = 0.50}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - &command_history1 - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // Set to true if you want to emit the command history code to command line + false, + &command_history1); }); - std::thread t3([&](){ + std::thread t3([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 100000 * scale_number_of_tests, 23, - transfer_type_weights_t{.write = 0.5, .read = 0.25}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - 
std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - &command_history2 - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // Set to true if you want to emit the command history code to command line + false, + &command_history2); }); - std::thread t4([&](){ + std::thread t4([&]() { RunMixedTransfersUniformDistributions( - *device, + *device, 100000 * scale_number_of_tests, 99, - transfer_type_weights_t{.write = 1.0, .read = 0.0}, - - std::uniform_int_distribution(0x100000, 0x200000), // address generator distribution - std::uniform_int_distribution(0x4, 3000), //WRITE_SIZE_GENERATOR_T const& write_size_distribution, - std::uniform_int_distribution(2, 4), //UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + // address generator distribution + std::uniform_int_distribution(0x100000, 0x200000), + // WRITE_SIZE_GENERATOR_T const& write_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // UNROLL_COUNT_GENERATOR_T const& unroll_count_distribution + std::uniform_int_distribution(2, 4), 0.75, 0.75, - std::uniform_int_distribution(0x4, 3000), //READ_SIZE_GENERATOR_T const& read_size_distribution, - - false, // Set to true if you want to emit the command history code to command line - &command_history3 - ); + // READ_SIZE_GENERATOR_T const& read_size_distribution, + std::uniform_int_distribution(0x4, 3000), + // Set to true if you want to emit the command history 
code to command line + false, + &command_history3); }); t1.join(); @@ -345,85 +344,80 @@ TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLMS) { TEST_F(WormholeNebulaX2TestFixture, MultithreadedMixedRemoteTransfersLargeWritesSmallReads) { int seed = 0; - log_info(LogSiliconDriver,"Started MultithreadedMixedRemoteTransfersLargeWritesSmallReads"); + log_info(LogSiliconDriver, "Started MultithreadedMixedRemoteTransfersLargeWritesSmallReads"); assert(device != nullptr); std::vector command_history0; std::vector command_history1; - auto write_size_generator = ConstrainedTemplateTemplateGenerator( - seed, std::uniform_int_distribution(1000000, 30000000), [](transfer_size_t x) -> transfer_size_t { return size_aligner_32B(static_cast((x >= 4) ? x : 4)); }); - auto read_size_generator = ConstrainedTemplateTemplateGenerator( - seed, std::uniform_int_distribution(16, 4096), [](transfer_size_t x) -> transfer_size_t { return size_aligner_32B(static_cast((x >= 4) ? x : 4)); }); + auto write_size_generator = + ConstrainedTemplateTemplateGenerator( + seed, + std::uniform_int_distribution(1000000, 30000000), + [](transfer_size_t x) -> transfer_size_t { + return size_aligner_32B(static_cast((x >= 4) ? x : 4)); + }); + auto read_size_generator = + ConstrainedTemplateTemplateGenerator( + seed, std::uniform_int_distribution(16, 4096), [](transfer_size_t x) -> transfer_size_t { + return size_aligner_32B(static_cast((x >= 4) ? 
x : 4)); + }); auto dest_generator = get_default_full_dram_dest_generator(seed, device.get()); auto address_generator = get_default_address_generator(seed, 0x100000, 0x5000000); - std::thread write_cmds_thread1([&](){ + std::thread write_cmds_thread1([&]() { RunMixedTransfers( *device, 10000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 1., .read = 0.}, - WriteCommandGenerator(dest_generator, address_generator, write_size_generator), build_dummy_read_command_generator(*device), - - false, // Set to true if you want to emit the command history code to command line - &command_history0 - ); + // Set to true if you want to emit the command history code to command line + false, + &command_history0); }); - std::thread write_cmds_thread2([&](){ + std::thread write_cmds_thread2([&]() { RunMixedTransfers( *device, 10000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 1., .read = 0.}, - WriteCommandGenerator(dest_generator, address_generator, write_size_generator), build_dummy_read_command_generator(*device), - - false, // Set to true if you want to emit the command history code to command line - &command_history0 - ); + // Set to true if you want to emit the command history code to command line + false, + &command_history0); }); - std::thread read_cmd_threads1([&](){ + std::thread read_cmd_threads1([&]() { RunMixedTransfers( *device, 10000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0, .read = 1.}, - build_dummy_write_command_generator(*device), ReadCommandGenerator(dest_generator, address_generator, read_size_generator), - - false, // Set to true if you want to emit the command history code to command line - &command_history0 - ); + // Set to true if you want to emit the command history code to command line + false, + &command_history0); }); - std::thread read_cmd_threads2([&](){ + std::thread read_cmd_threads2([&]() { RunMixedTransfers( *device, 10000 * scale_number_of_tests, 0, - transfer_type_weights_t{.write = 0, .read 
= 1.}, - build_dummy_write_command_generator(*device), ReadCommandGenerator(dest_generator, address_generator, read_size_generator), - - false, // Set to true if you want to emit the command history code to command line - &command_history0 - ); + // Set to true if you want to emit the command history code to command line + false, + &command_history0); }); write_cmds_thread1.join(); write_cmds_thread2.join(); read_cmd_threads1.join(); read_cmd_threads2.join(); - } -} // namespace tt::umd::test::utils +} // namespace tt::umd::test::utils diff --git a/tests/wormhole/test_wh_common.h b/tests/wormhole/test_wh_common.h index 812f8b98b..c358cb3ad 100644 --- a/tests/wormhole/test_wh_common.h +++ b/tests/wormhole/test_wh_common.h @@ -5,80 +5,77 @@ */ #pragma once -#include "tt_cluster_descriptor.h" #include "cluster.h" -#include "tt_xy_pair.h" #include "eth_l1_address_map.h" - -#include "tests/test_utils/stimulus_generators.hpp" #include "tests/test_utils/generate_cluster_desc.hpp" +#include "tests/test_utils/stimulus_generators.hpp" +#include "tt_cluster_descriptor.h" +#include "tt_xy_pair.h" namespace tt::umd::test::utils { static void set_params_for_remote_txn(Cluster& device) { // Populate address map and NOC parameters that the driver needs for remote transactions - device.set_device_l1_address_params({l1_mem::address_map::L1_BARRIER_BASE, eth_l1_mem::address_map::ERISC_BARRIER_BASE, eth_l1_mem::address_map::FW_VERSION_ADDR}); + device.set_device_l1_address_params( + {l1_mem::address_map::L1_BARRIER_BASE, + eth_l1_mem::address_map::ERISC_BARRIER_BASE, + eth_l1_mem::address_map::FW_VERSION_ADDR}); } class WormholeTestFixture : public ::testing::Test { - protected: - // You can remove any or all of the following functions if their bodies would - // be empty. - - std::unique_ptr device; +protected: + // You can remove any or all of the following functions if their bodies would + // be empty. 
- WormholeTestFixture() { + std::unique_ptr device; - } + WormholeTestFixture() {} - ~WormholeTestFixture() override { - // You can do clean-up work that doesn't throw exceptions here. - } + ~WormholeTestFixture() override { + // You can do clean-up work that doesn't throw exceptions here. + } - virtual int get_detected_num_chips() = 0; - virtual bool is_test_skipped() = 0; + virtual int get_detected_num_chips() = 0; + virtual bool is_test_skipped() = 0; - // If the constructor and destructor are not enough for setting up - // and cleaning up each test, you can define the following methods: + // If the constructor and destructor are not enough for setting up + // and cleaning up each test, you can define the following methods: - void SetUp() override { - // Code here will be called immediately after the constructor (right - // before each test). + void SetUp() override { + // Code here will be called immediately after the constructor (right + // before each test). - if (is_test_skipped()) { - GTEST_SKIP() << "Test is skipped due to incorrect number of chips"; - } + if (is_test_skipped()) { + GTEST_SKIP() << "Test is skipped due to incorrect number of chips"; + } - // std::cout << "Setting Up Test." 
<< std::endl; - assert(get_detected_num_chips() > 0); - auto devices = std::vector(get_detected_num_chips()); - std::iota(devices.begin(), devices.end(), 0); - std::set target_devices = {devices.begin(), devices.end()}; - uint32_t num_host_mem_ch_per_mmio_device = 1; - device = std::make_unique(num_host_mem_ch_per_mmio_device, false, true, true); - assert(device != nullptr); - assert(device->get_cluster_description()->get_number_of_chips() == get_detected_num_chips()); + assert(get_detected_num_chips() > 0); + auto devices = std::vector(get_detected_num_chips()); + std::iota(devices.begin(), devices.end(), 0); + std::set target_devices = {devices.begin(), devices.end()}; + uint32_t num_host_mem_ch_per_mmio_device = 1; + device = std::make_unique(num_host_mem_ch_per_mmio_device, false, true, true); + assert(device != nullptr); + assert(device->get_cluster_description()->get_number_of_chips() == get_detected_num_chips()); - set_params_for_remote_txn(*device); + set_params_for_remote_txn(*device); - tt_device_params default_params; - device->start_device(default_params); + tt_device_params default_params; + device->start_device(default_params); - device->deassert_risc_reset(); + device->deassert_risc_reset(); - device->wait_for_non_mmio_flush(); - } + device->wait_for_non_mmio_flush(); + } - void TearDown() override { - // Code here will be called immediately after each test (right - // before the destructor). + void TearDown() override { + // Code here will be called immediately after each test (right + // before the destructor). - if (!is_test_skipped()) { - // std::cout << "Tearing Down Test." 
<< std::endl; - device->close_device(); + if (!is_test_skipped()) { + device->close_device(); + } } - } - }; -} // namespace tt::umd::test::utils +} // namespace tt::umd::test::utils From 6cf3bb055b81f0f959c5863c1603d0e44c3e6c58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bojan=20Ro=C5=A1ko?= <156314064+broskoTT@users.noreply.github.com> Date: Tue, 19 Nov 2024 11:13:03 +0100 Subject: [PATCH 6/8] Fix eth_coord hash dependency (#317) ### Issue A follow up of #306 ### Description Adding a new boost dependency complicates things. The code, as is currently on main, won't build in tt_metal. ### List of the changes - Remove boost dependency, and use one liner hash function ### Testing No testing ### API Changes There are no API changes in this PR. --- cmake/dependencies.cmake | 1 - device/tt_cluster_descriptor_types.h | 17 +++++++++++------ tests/CMakeLists.txt | 1 - 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/cmake/dependencies.cmake b/cmake/dependencies.cmake index b8f968859..96ebbaf53 100644 --- a/cmake/dependencies.cmake +++ b/cmake/dependencies.cmake @@ -46,7 +46,6 @@ function(fetch_dependencies) # boost::interprocess ############################################################################################################################ include(${PROJECT_SOURCE_DIR}/cmake/fetch_boost.cmake) - fetch_boost_library(container_hash) fetch_boost_library(interprocess) ############################################################################################################################ diff --git a/device/tt_cluster_descriptor_types.h b/device/tt_cluster_descriptor_types.h index b9e018235..81b652f5d 100644 --- a/device/tt_cluster_descriptor_types.h +++ b/device/tt_cluster_descriptor_types.h @@ -6,7 +6,6 @@ #pragma once -#include #include #include @@ -29,16 +28,22 @@ struct eth_coord_t { } }; +// Small performant hash combiner taken from boost library. +// Not using boost::hash_combine due to dependency complications. 
+inline void boost_hash_combine(std::size_t &seed, const int value) { + seed ^= value + 0x9e3779b9 + (seed << 6) + (seed >> 2); +} + namespace std { template <> struct hash { std::size_t operator()(eth_coord_t const &c) const { std::size_t seed = 0; - boost::hash_combine(seed, c.cluster_id); - boost::hash_combine(seed, c.x); - boost::hash_combine(seed, c.y); - boost::hash_combine(seed, c.rack); - boost::hash_combine(seed, c.shelf); + boost_hash_combine(seed, c.cluster_id); + boost_hash_combine(seed, c.x); + boost_hash_combine(seed, c.y); + boost_hash_combine(seed, c.rack); + boost_hash_combine(seed, c.shelf); return seed; } }; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 9afafb9d5..6829c91b0 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -13,7 +13,6 @@ target_link_libraries( gtest pthread fmt::fmt-header-only - Boost::container_hash ) target_include_directories( test_common From 5e4f52afa391fb8e0e4e8a39417c5598c1f586cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bojan=20Ro=C5=A1ko?= <156314064+broskoTT@users.noreply.github.com> Date: Thu, 21 Nov 2024 14:44:42 +0100 Subject: [PATCH 7/8] Refactor our CI yamls (#319) ### Issue No current issue, related to already closed #45 ### Description As we're using CI for some time, I found that it actually brings unnecessary complications to have a layer of yamls just designating what is done on pr, optionally on pr, and on push. This can be directly added to each of the jobs. Another effect is that in the "Actions" tab, we won't have On PR, Optional On PR, and On Push where each includes several different jobs. We will now have Build device, build and run tests, and pre-commit , which will run both on PR branches and main. ### List of the changes - Created a build-and-run-all-tests.yml which wraps build-tests and run-tests for all archs. This just nicely corresponds exactly to previous on-pr-opt. 
- Deleted on-pr, on-pr-opt and on-push ymls - Added to all referenced .ymls the config to run on pr and/or on push, so that same jobs are called on same triggers. - Added timeout parameter to a couple of jobs. - I will change in settings which jobs are mandatory, if they are now named slightly different. ### Testing CI passes on this PR ### API Changes There are no API changes in this PR. --- ...pr-opt.yml => build-and-run-all-tests.yml} | 6 ++- .github/workflows/build-device.yml | 18 ++++--- .github/workflows/build-image.yml | 10 +++- .github/workflows/build-tests.yml | 3 +- .github/workflows/on-pr.yml | 18 ------- .github/workflows/on-push.yml | 54 ------------------- .github/workflows/pre-commit.yml | 5 +- .github/workflows/run-tests.yml | 3 +- .github/workflows/test-runner.yaml | 12 ++++- 9 files changed, 40 insertions(+), 89 deletions(-) rename .github/workflows/{on-pr-opt.yml => build-and-run-all-tests.yml} (90%) delete mode 100644 .github/workflows/on-pr.yml delete mode 100644 .github/workflows/on-push.yml diff --git a/.github/workflows/on-pr-opt.yml b/.github/workflows/build-and-run-all-tests.yml similarity index 90% rename from .github/workflows/on-pr-opt.yml rename to .github/workflows/build-and-run-all-tests.yml index 18364734f..ff75e6c28 100644 --- a/.github/workflows/on-pr-opt.yml +++ b/.github/workflows/build-and-run-all-tests.yml @@ -1,10 +1,12 @@ -# Optional PR checks -name: On PR - Optional +# Build and then run all tests, on all supported archs. +name: Build and run all tests on: workflow_dispatch: pull_request: branches: ["main"] + push: + branches: ["main"] jobs: build-tests: diff --git a/.github/workflows/build-device.yml b/.github/workflows/build-device.yml index 5a8c06485..335dd2c08 100644 --- a/.github/workflows/build-device.yml +++ b/.github/workflows/build-device.yml @@ -1,19 +1,19 @@ # Builds device. # Build is performed on all supported OS versions. 
-name: Build Target +name: Build Device on: - workflow_call: - inputs: - timeout: - required: true - type: number workflow_dispatch: inputs: timeout: required: true - description: 'The timeout for the build job in minutes' + description: 'The timeout for the job in minutes' type: number + default: 15 + pull_request: + branches: ["main"] + push: + branches: ["main"] env: BUILD_TARGET: device @@ -25,7 +25,9 @@ env: jobs: build: - timeout-minutes: ${{ inputs.timeout }} + # Due to parsing bug, fromJSON is used to convert string to number. + # In pull_request or push events, the input context is not available, stating the default again here. + timeout-minutes: ${{ fromJSON(inputs.timeout || '15') }} strategy: fail-fast: false matrix: diff --git a/.github/workflows/build-image.yml b/.github/workflows/build-image.yml index 3c21f65d5..5affd5c26 100644 --- a/.github/workflows/build-image.yml +++ b/.github/workflows/build-image.yml @@ -3,11 +3,17 @@ name: Build and Publish Docker Image on: workflow_dispatch: - workflow_call: + inputs: + timeout: + required: true + description: 'The timeout for the job in minutes' + type: number + default: 15 jobs: build: - timeout-minutes: 15 + # Due to parsing bug, fromJSON is used to convert string to number + timeout-minutes: ${{ fromJSON(inputs.timeout) }} strategy: fail-fast: false matrix: diff --git a/.github/workflows/build-tests.yml b/.github/workflows/build-tests.yml index 08dc84ee8..3916e4bf5 100644 --- a/.github/workflows/build-tests.yml +++ b/.github/workflows/build-tests.yml @@ -37,7 +37,8 @@ env: jobs: build: - timeout-minutes: ${{ inputs.timeout }} + # Due to parsing bug, fromJSON is used to convert string to number + timeout-minutes: ${{ fromJSON(inputs.timeout) }} strategy: fail-fast: false matrix: diff --git a/.github/workflows/on-pr.yml b/.github/workflows/on-pr.yml deleted file mode 100644 index 158026ddd..000000000 --- a/.github/workflows/on-pr.yml +++ /dev/null @@ -1,18 +0,0 @@ -# Mandatory PR checks -name: On PR - -on: 
- workflow_dispatch: - pull_request: - branches: ["main"] - -jobs: - build-all: - secrets: inherit - uses: ./.github/workflows/build-device.yml - with: - timeout: 15 - - pre-commit: - secrets: inherit - uses: ./.github/workflows/pre-commit.yml diff --git a/.github/workflows/on-push.yml b/.github/workflows/on-push.yml deleted file mode 100644 index 673be510a..000000000 --- a/.github/workflows/on-push.yml +++ /dev/null @@ -1,54 +0,0 @@ -name: On Push - -on: - workflow_dispatch: - push: - branches: ["main"] - -jobs: - build-all: - secrets: inherit - uses: ./.github/workflows/build-device.yml - with: - timeout: 15 - - pre-commit: - secrets: inherit - uses: ./.github/workflows/pre-commit.yml - - build-tests: - secrets: inherit - strategy: - fail-fast: false - matrix: - test-group: [ - # Enable once we have functional cards with specified architecture. - {arch: grayskull}, - {arch: wormhole_b0}, - # {arch: blackhole}, - ] - uses: ./.github/workflows/build-tests.yml - with: - arch: ${{ matrix.test-group.arch }} - timeout: 15 - - test-all: - secrets: inherit - needs: build-tests - strategy: - fail-fast: false - matrix: - test-group: [ - # Enable once we have functional cards. 
- {arch: grayskull, card: e75, timeout: 10}, - {arch: grayskull, card: e150, timeout: 10}, - {arch: grayskull, card: e300, timeout: 10}, - {arch: wormhole_b0, card: n150, timeout: 5}, - {arch: wormhole_b0, card: n300, timeout: 15}, - # {arch: blackhole}, - ] - uses: ./.github/workflows/run-tests.yml - with: - arch: ${{ matrix.test-group.arch }} - card: ${{ matrix.test-group.card }} - timeout: ${{ matrix.test-group.timeout }} diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index a4ecb678f..c4b2b9a07 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -2,8 +2,11 @@ name: Run Pre-commit Hooks on: - workflow_call: workflow_dispatch: + pull_request: + branches: ["main"] + push: + branches: ["main"] jobs: pre-commit: diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 98b2a5266..e9d1e6b2a 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -45,7 +45,8 @@ env: jobs: test: - timeout-minutes: ${{ inputs.timeout }} + # Due to parsing bug, fromJSON is used to convert string to number + timeout-minutes: ${{ fromJSON(inputs.timeout) }} strategy: fail-fast: false matrix: diff --git a/.github/workflows/test-runner.yaml b/.github/workflows/test-runner.yaml index 74a4d6bf9..c871c7738 100644 --- a/.github/workflows/test-runner.yaml +++ b/.github/workflows/test-runner.yaml @@ -2,10 +2,17 @@ name: Check runner on: workflow_dispatch: + inputs: + timeout: + required: true + description: 'The timeout for the job in minutes' + type: number + default: 10 jobs: check-runners-host: - timeout-minutes: 10 + # Due to parsing bug, fromJSON is used to convert string to number + timeout-minutes: ${{ fromJSON(inputs.timeout) }} strategy: fail-fast: false matrix: @@ -52,7 +59,8 @@ jobs: du -h --max-depth=1 | sort -rh check-runners-docker: - timeout-minutes: 10 + # Due to parsing bug, fromJSON is used to convert string to number + timeout-minutes: ${{ 
fromJSON(inputs.timeout) }} strategy: fail-fast: false matrix: From 90ea1ad6c84f7f2d3a6f18af1e374f6f893b56d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bojan=20Ro=C5=A1ko?= <156314064+broskoTT@users.noreply.github.com> Date: Thu, 21 Nov 2024 16:23:35 +0100 Subject: [PATCH 8/8] Workflow to track client integration build (#318) ### Issue No tracking issue, but a follow up issue will be #321 tt-lens is a private repo, so it is not trivial to add a build job such as this one. ### Description Adding a job, which won't be mandatory, which tries to build newest tt-metal with the current umd (a branch from this PR if triggered on PR, or main if on main). The idea is that to see asap if something is going to break tt-metal build, which seems to be happening often in the past period. ### List of the changes - Add workflow for building tt-metal, but with the UMD from this branch ### Testing CI on this branch passed. ### API Changes There are no API changes in this PR. --- .github/workflows/build-clients.yml | 56 +++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 .github/workflows/build-clients.yml diff --git a/.github/workflows/build-clients.yml b/.github/workflows/build-clients.yml new file mode 100644 index 000000000..a25023676 --- /dev/null +++ b/.github/workflows/build-clients.yml @@ -0,0 +1,56 @@ +name: Build clients on newest UMD + +on: + workflow_dispatch: + inputs: + timeout: + required: true + description: 'The timeout for the job in minutes' + type: number + default: 30 + pull_request: + branches: ["main"] + push: + branches: ["main"] + +jobs: + build-tt-metal: + # Due to parsing bug, fromJSON is used to convert string to number. + # In pull_request or push events, the input context is not available, stating the default again here. 
+ timeout-minutes: ${{ fromJSON(inputs.timeout || '30') }} + strategy: + fail-fast: false + matrix: + arch_name: [grayskull, wormhole_b0, blackhole] + + name: Build tt-metal for ${{ matrix.arch_name }} with newest UMD + runs-on: ubuntu-20.04 + container: + image: ghcr.io/tenstorrent/tt-metal/tt-metalium/ubuntu-20.04-amd64:latest + options: --user root + + steps: + - name: Checkout client repo + uses: actions/checkout@v4 + with: + # Clone under tt-metal directory + path: tt-metal + repository: tenstorrent/tt-metal + submodules: recursive + lfs: 'true' + + - name: Checkout UMD + uses: actions/checkout@v4 + with: + # Clone directly into tt-metal directory for umd + path: tt-metal/tt_metal/third_party/umd + submodules: recursive + lfs: 'true' + + - name: Build tt-metal + run: | + cd tt-metal + export ARCH_NAME=${{ matrix.arch_name }} + export TT_METAL_HOME=$(pwd) + export PYTHONPATH=$(pwd) + ./build_metal.sh