Skip to content

Commit

Permalink
Enable FD + 2D Fabric on all systems
Browse files Browse the repository at this point in the history
  • Loading branch information
aliuTT committed Feb 28, 2025
1 parent 5965157 commit 8b2d814
Show file tree
Hide file tree
Showing 19 changed files with 226 additions and 130 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2296,9 +2296,6 @@ CoreCoord Device::grid_size() const;
// Get the Tensix Grid Size for this device. Queries SOC Descriptor + harvesting info + system type.
CoreCoord Device::logical_grid_size() const;

// Get the devices connected to this device. Relies on UMD for the cluster descriptor
std::unordered_set<chip_id_t> Device::get_ethernet_connected_device_ids();

// Get the worker core grid-size on this device. Queries SOC Descriptor + harvesting info + system type + core descriptor.
CoreCoord Device::compute_with_storage_grid_size() const;

Expand Down
1 change: 1 addition & 0 deletions tests/scripts/run_cpp_fabric_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ cd $TT_METAL_HOME
echo "Running fabric unit tests now...";

TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/tt_fabric/fabric_unit_tests --gtest_filter="FabricFixture.*"
./build/test/tt_metal/tt_fabric/fabric_unit_tests --gtest_filter="FabricFixture.*"

#############################################
# FABRIC SANITY TESTS #
Expand Down
1 change: 1 addition & 0 deletions tests/scripts/t3000/run_t3000_unit_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ run_t3000_ttfabric_tests() {
echo "LOG_METAL: Running run_t3000_ttfabric_tests"
TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/tt_fabric/fabric_unit_tests --gtest_filter=ControlPlaneFixture.*T3k*
TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/tt_fabric/fabric_unit_tests --gtest_filter="FabricFixture.*"
./build/test/tt_metal/tt_fabric/fabric_unit_tests --gtest_filter="FabricFixture.*"
# Unicast tests
TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_wormhole_b0 --fabric_command 1 --board_type t3k --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16
TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_wormhole_b0 --fabric_command 64 --board_type t3k --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16
Expand Down
1 change: 1 addition & 0 deletions tests/scripts/tg/run_tg_unit_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ run_tg_tests() {
echo "LOG_FABRIC: running run_tg_fabric_tests"
TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/tt_fabric/fabric_unit_tests --gtest_filter=ControlPlaneFixture.*TG*
TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/tt_fabric/fabric_unit_tests --gtest_filter="FabricFixture.*"
./build/test/tt_metal/tt_fabric/fabric_unit_tests --gtest_filter="FabricFixture.*"
# Unicast tests
TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_wormhole_b0 --fabric_command 1 --board_type glx32 --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16
TT_METAL_SLOW_DISPATCH_MODE=1 ./build/test/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity_wormhole_b0 --fabric_command 64 --board_type glx32 --data_kb_per_tx 10 --num_src_endpoints 20 --num_dest_endpoints 8 --num_links 16
Expand Down
28 changes: 23 additions & 5 deletions tests/tt_metal/tt_fabric/common/fabric_fixture.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,10 @@ class FabricFixture : public ::testing::Test {

void SetUp() override {
slow_dispatch_ = getenv("TT_METAL_SLOW_DISPATCH_MODE");
if (!slow_dispatch_) {
tt::log_info(
tt::LogTest, "Fabric test suite can only be run with slow dispatch or TT_METAL_SLOW_DISPATCH_MODE set");
GTEST_SKIP();
if (slow_dispatch_) {
tt::log_info(tt::LogTest, "Running fabric api tests with slow dispatch");
} else {
tt::log_info(tt::LogTest, "Running fabric api tests with fast dispatch");
}
// Set up all available devices
this->arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name());
Expand All @@ -52,12 +52,30 @@ class FabricFixture : public ::testing::Test {
for (unsigned int id = 0; id < num_devices; id++) {
ids.push_back(id);
}
tt::tt_metal::detail::InitializeFabricSetting(tt::tt_metal::detail::FabricSetting::FABRIC);
tt::tt_metal::detail::InitializeFabricConfig(tt::FabricConfig::FABRIC_2D);
devices_map_ = tt::tt_metal::detail::CreateDevices(ids);
for (auto& [id, device] : devices_map_) {
devices_.push_back(device);
}
}
void RunProgramNonblocking(tt::tt_metal::IDevice* device, tt::tt_metal::Program& program) {
if (this->slow_dispatch_) {
tt::tt_metal::detail::LaunchProgram(device, program, false);
} else {
tt::tt_metal::CommandQueue& cq = device->command_queue();
tt::tt_metal::EnqueueProgram(cq, program, false);
}
}
void WaitForSingleProgramDone(tt::tt_metal::IDevice* device, tt::tt_metal::Program& program) {
if (this->slow_dispatch_) {
// Wait for the program to finish
tt::tt_metal::detail::WaitProgramDone(device, program);
} else {
// Wait for all programs on cq to finish
tt::tt_metal::CommandQueue& cq = device->command_queue();
tt::tt_metal::Finish(cq);
}
}

void TearDown() override { tt::tt_metal::detail::CloseDevices(devices_map_); }
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -152,10 +152,10 @@ TEST_F(FabricFixture, TestAsyncWrite) {
tt_metal::SetRuntimeArgs(receiver_program, receiver_kernel, receiver_logical_core, receiver_runtime_args);

// Launch sender and receiver programs and wait for them to finish
tt_metal::detail::LaunchProgram(receiver_device, receiver_program, false);
tt_metal::detail::LaunchProgram(sender_device, sender_program, false);
tt_metal::detail::WaitProgramDone(sender_device, sender_program);
tt_metal::detail::WaitProgramDone(receiver_device, receiver_program);
this->RunProgramNonblocking(receiver_device, receiver_program);
this->RunProgramNonblocking(sender_device, sender_program);
this->WaitForSingleProgramDone(sender_device, sender_program);
this->WaitForSingleProgramDone(receiver_device, receiver_program);

// Validate the data received by the receiver
std::vector<uint32_t> received_buffer_data;
Expand Down Expand Up @@ -302,10 +302,10 @@ TEST_F(FabricFixture, TestAtomicInc) {
tt_metal::SetRuntimeArgs(receiver_program, receiver_kernel, receiver_logical_core, receiver_runtime_args);

// Launch sender and receiver programs and wait for them to finish
tt_metal::detail::LaunchProgram(receiver_device, receiver_program, false);
tt_metal::detail::LaunchProgram(sender_device, sender_program, false);
tt_metal::detail::WaitProgramDone(sender_device, sender_program);
tt_metal::detail::WaitProgramDone(receiver_device, receiver_program);
this->RunProgramNonblocking(receiver_device, receiver_program);
this->RunProgramNonblocking(sender_device, sender_program);
this->WaitForSingleProgramDone(sender_device, sender_program);
this->WaitForSingleProgramDone(receiver_device, receiver_program);

// Validate the data received by the receiver
std::vector<uint32_t> received_buffer_data;
Expand Down Expand Up @@ -469,10 +469,10 @@ TEST_F(FabricFixture, TestAsyncWriteAtomicInc) {
tt_metal::SetRuntimeArgs(receiver_program, receiver_kernel, receiver_logical_core, receiver_runtime_args);

// Launch sender and receiver programs and wait for them to finish
tt_metal::detail::LaunchProgram(receiver_device, receiver_program, false);
tt_metal::detail::LaunchProgram(sender_device, sender_program, false);
tt_metal::detail::WaitProgramDone(sender_device, sender_program);
tt_metal::detail::WaitProgramDone(receiver_device, receiver_program);
this->RunProgramNonblocking(receiver_device, receiver_program);
this->RunProgramNonblocking(sender_device, sender_program);
this->WaitForSingleProgramDone(sender_device, sender_program);
this->WaitForSingleProgramDone(receiver_device, receiver_program);

// Validate the data received by the receiver
std::vector<uint32_t> received_buffer_data;
Expand Down Expand Up @@ -586,7 +586,7 @@ TEST_F(FabricFixture, TestAsyncWriteMulticast) {
};
tt_metal::SetRuntimeArgs(receiver_program, receiver_kernel, receiver_logical_core, receiver_runtime_args);

tt_metal::detail::LaunchProgram(receiver_device, receiver_program, false);
this->RunProgramNonblocking(receiver_device, receiver_program);
receiver_programs.push_back(std::move(receiver_program));
receiver_buffers.push_back(std::move(receiver_buffer));
}
Expand Down Expand Up @@ -676,12 +676,12 @@ TEST_F(FabricFixture, TestAsyncWriteMulticast) {
tt_metal::SetRuntimeArgs(sender_program, sender_kernel, sender_logical_core, sender_runtime_args);

// Launch sender and receiver programs and wait for them to finish
tt_metal::detail::LaunchProgram(sender_device, sender_program, false);
tt_metal::detail::WaitProgramDone(sender_device, sender_program);
this->RunProgramNonblocking(sender_device, sender_program);
this->WaitForSingleProgramDone(sender_device, sender_program);
for (auto [routing_direction, physical_end_device_ids] : physical_end_device_ids_by_dir) {
for (uint32_t i = 0; i < physical_end_device_ids.size(); i++) {
auto* receiver_device = DevicePool::instance().get_active_device(physical_end_device_ids[i]);
tt_metal::detail::WaitProgramDone(receiver_device, receiver_programs[i]);
this->WaitForSingleProgramDone(receiver_device, receiver_programs[i]);
}
}

Expand Down Expand Up @@ -799,7 +799,7 @@ TEST_F(FabricFixture, TestAsyncWriteMulticastMultidirectional) {
};
tt_metal::SetRuntimeArgs(receiver_program, receiver_kernel, receiver_logical_core, receiver_runtime_args);

tt_metal::detail::LaunchProgram(receiver_device, receiver_program, false);
this->RunProgramNonblocking(receiver_device, receiver_program);
receiver_programs.push_back(std::move(receiver_program));
receiver_buffers.push_back(std::move(receiver_buffer));
}
Expand Down Expand Up @@ -894,12 +894,12 @@ TEST_F(FabricFixture, TestAsyncWriteMulticastMultidirectional) {
tt_metal::SetRuntimeArgs(sender_program, sender_kernel, sender_logical_core, sender_runtime_args);

// Launch sender and receiver programs and wait for them to finish
tt_metal::detail::LaunchProgram(sender_device, sender_program, false);
tt_metal::detail::WaitProgramDone(sender_device, sender_program);
this->RunProgramNonblocking(sender_device, sender_program);
this->WaitForSingleProgramDone(sender_device, sender_program);
for (auto [routing_direction, physical_end_device_ids] : physical_end_device_ids_by_dir) {
for (uint32_t i = 0; i < physical_end_device_ids.size(); i++) {
auto* receiver_device = DevicePool::instance().get_active_device(physical_end_device_ids[i]);
tt_metal::detail::WaitProgramDone(receiver_device, receiver_programs[i]);
this->WaitForSingleProgramDone(receiver_device, receiver_programs[i]);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ int main(int argc, char** argv) {

std::map<chip_id_t, std::vector<CoreCoord>> device_router_map;

auto const& device_active_eth_cores = device_map[test_device_id_l]->get_active_ethernet_cores();
const auto& device_active_eth_cores = device_map[test_device_id_l]->get_active_ethernet_cores();

if (device_active_eth_cores.size() == 0) {
log_info(
Expand Down Expand Up @@ -309,20 +309,19 @@ int main(int argc, char** argv) {
log_info(LogTest, "GK Socket Info Addr = 0x{:08X}", socket_info_addr);

for (auto device : device_map) {
auto neighbors = tt::Cluster::instance().get_ethernet_connected_device_ids(device.second->id());
auto neighbors = tt::Cluster::instance().get_ethernet_cores_grouped_by_connected_chips(device.second->id());
std::vector<CoreCoord> device_router_cores;
std::vector<CoreCoord> device_router_phys_cores;
uint32_t router_mask = 0;
for (auto neighbor : neighbors) {
if (device_map.contains(neighbor)) {
for (const auto& [neighbor_chip, connected_logical_cores] : neighbors) {
if (device_map.contains(neighbor_chip)) {
if (!router_core_found && device.first == test_device_id_l) {
// pick a router so that tx and read in routing tables from this core on the
// sender device.
router_logical_core = device.second->get_ethernet_sockets(neighbor)[0];
router_logical_core = connected_logical_cores[0];
router_phys_core = device.second->ethernet_core_from_logical_core(router_logical_core);
router_core_found = true;
}
auto connected_logical_cores = device.second->get_ethernet_sockets(neighbor);
for (auto logical_core : connected_logical_cores) {
device_router_cores.push_back(logical_core);
device_router_phys_cores.push_back(
Expand All @@ -334,7 +333,7 @@ int main(int argc, char** argv) {
LogTest,
"Device {} skiping Neighbor Device {} since it is not in test device map.",
device.first,
neighbor);
neighbor_chip);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,10 @@ typedef struct test_board {
throw std::runtime_error("Odd number of chips detected, not supported currently");
}

if (metal_fabric_init_level != 0) {
tt::tt_metal::detail::InitializeFabricSetting(tt::tt_metal::detail::FabricSetting::FABRIC);
if (metal_fabric_init_level == 0) {
tt::tt_metal::detail::InitializeFabricConfig(tt::FabricConfig::CUSTOM);
} else if (metal_fabric_init_level == 1) {
tt::tt_metal::detail::InitializeFabricConfig(tt::FabricConfig::FABRIC_2D);
}
device_handle_map = tt::tt_metal::detail::CreateDevices(available_chip_ids);
if (metal_fabric_init_level == 0) {
Expand Down Expand Up @@ -299,8 +301,8 @@ typedef struct test_board {
// for each physical chip id, store the neighbors
// TDOD: update the logic to find inter-mesh neighbors
for (auto chip_id : physical_chip_ids) {
auto neighbors = tt::Cluster::instance().get_ethernet_connected_device_ids(chip_id);
for (auto neighbor : neighbors) {
auto neighbors = tt::Cluster::instance().get_ethernet_cores_grouped_by_connected_chips(chip_id);
for (const auto& [neighbor, cores] : neighbors) {
// only append valid chip IDs since the neighbors could include mmio chips (wh galaxy) or
// could be outside of the board type (in case of partial galaxy configurations)
if (is_valid_chip_id(neighbor)) {
Expand Down Expand Up @@ -502,13 +504,12 @@ typedef struct test_device {
core_range_end_virtual = device_handle->worker_core_from_logical_core(CoreCoord(7, 7));

// populate router cores
auto neighbors = tt::Cluster::instance().get_ethernet_connected_device_ids(physical_chip_id);
for (auto neighbor : neighbors) {
if (!(board_handle->is_valid_chip_id(neighbor))) {
auto neighbors = tt::Cluster::instance().get_ethernet_cores_grouped_by_connected_chips(device_handle->id());
for (const auto& [neighbor_chip, connected_logical_cores] : neighbors) {
if (!(board_handle->is_valid_chip_id(neighbor_chip))) {
continue;
}

auto connected_logical_cores = device_handle->get_ethernet_sockets(neighbor);
for (auto logical_core : connected_logical_cores) {
router_logical_cores.push_back(logical_core);
router_virtual_cores.push_back(device_handle->ethernet_core_from_logical_core(logical_core));
Expand Down
2 changes: 2 additions & 0 deletions tt_metal/api/tt-metalium/control_plane.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ class ControlPlane {

routing_plane_id_t get_routing_plane_id(chan_id_t eth_chan_id) const;

size_t get_num_active_fabric_routers(mesh_id_t mesh_id, chip_id_t chip_id) const;

private:
std::unique_ptr<RoutingTableGenerator> routing_table_generator_;
std::vector<std::vector<chip_id_t>> logical_mesh_chip_id_to_physical_chip_id_mapping_;
Expand Down
7 changes: 1 addition & 6 deletions tt_metal/api/tt-metalium/device_pool.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,10 @@ class DevicePool {
DevicePool(DevicePool&& other) noexcept = delete;

static DevicePool& instance() noexcept {
TT_ASSERT((_inst != nullptr) and (_inst->initialized), "Trying to get DevicePool without initializing it");
TT_ASSERT(_inst != nullptr, "Trying to get DevicePool without initializing it");
return *_inst;
}

static void initialize_fabric_setting(detail::FabricSetting fabric_setting) noexcept;

static void initialize(
const std::vector<chip_id_t>& device_ids,
const uint8_t num_hw_cqs,
Expand Down Expand Up @@ -81,11 +79,8 @@ class DevicePool {
bool skip_remote_devices;
std::unordered_set<uint32_t> firmware_built_keys;

detail::FabricSetting fabric_setting = detail::FabricSetting::DEFAULT;
std::unique_ptr<tt::tt_fabric::ControlPlane> control_plane;

bool initialized = false;

// Determine which CPU cores the worker threads need to be placed on for each device
std::unordered_map<uint32_t, uint32_t> worker_thread_to_cpu_core_map;
std::unordered_map<uint32_t, uint32_t> completion_queue_reader_to_cpu_core_map;
Expand Down
9 changes: 4 additions & 5 deletions tt_metal/api/tt-metalium/tt_metal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "dispatch_core_manager.hpp"
#include "buffer.hpp"
#include "profiler.hpp"
#include "llrt/tt_cluster.hpp"

namespace tt::tt_metal {
inline namespace v0 {
Expand All @@ -23,15 +24,13 @@ class IDevice;

namespace detail {

enum class FabricSetting { DISABLED = 0, FABRIC = 1, EDM = 2, DEFAULT = 3 };

bool DispatchStateCheck(bool isFastDispatch);

bool InWorkerThread();
inline bool InMainThread() { return not InWorkerThread(); }

// Call before CreateDevices to enable fabric, which uses all ethernet cores and some tensix cores
void InitializeFabricSetting(detail::FabricSetting fabric_setting);
// Call before CreateDevices to enable fabric, which uses all free ethernet cores
void InitializeFabricConfig(FabricConfig fabric_config);

std::map<chip_id_t, IDevice*> CreateDevices(
// TODO: delete this in favour of DevicePool
Expand Down Expand Up @@ -165,7 +164,7 @@ void CompileProgram(IDevice* device, Program& program, bool fd_bootloader_mode =
* | program | The program holding the runtime args | const Program & | |
* Yes |
*/
void WriteRuntimeArgsToDevice(IDevice* device, Program& program);
void WriteRuntimeArgsToDevice(IDevice* device, Program& program, bool fd_bootloader_mode = false);

// Configures a given device with a given program.
// - Loads all kernel binaries into L1s of assigned Tensix cores
Expand Down
7 changes: 0 additions & 7 deletions tt_metal/fabric/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,3 @@ target_precompile_headers(
)

target_compile_options(fabric PRIVATE -Wno-int-to-pointer-cast)

#set_target_properties(
# fabric
# PROPERTIES
# INSTALL_RPATH
# "${PROJECT_BINARY_DIR}/lib"
#)
Loading

0 comments on commit 8b2d814

Please sign in to comment.