MeshTrace Initial Implementation

- Add distributed APIs to trace MeshWorkloads in MeshDevice DRAM - Supports tracing heterogenous workloads and those running on a subset of the MeshDevice - Add an explicit MeshTrace assembly step that allows a single set of dispatch commands to be reused across physical devices running the same programs - Cleanup logic inside EnqueueTraceCommand and move it to a shared header between distributed and tt_metal/dispatch - Add tests for tracing: - Homogenous workloads - Heterogenous workloads - Workloads Running on SubDevices
tenstorrent · Feb 21, 2025 · 9113d2e · 9113d2e
1 parent 53f3d05
commit 9113d2e
Show file tree

Hide file tree

Showing 36 changed files with 1,930 additions and 641 deletions.
diff --git a/tests/tt_metal/distributed/CMakeLists.txt b/tests/tt_metal/distributed/CMakeLists.txt
@@ -6,6 +6,7 @@ set(UNIT_TESTS_DISTRIBUTED_SRC
     ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_sub_device.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_allocator.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_events.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_trace.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp
 )
 

diff --git a/tests/tt_metal/distributed/test_mesh_trace.cpp b/tests/tt_metal/distributed/test_mesh_trace.cpp
diff --git a/tests/tt_metal/distributed/test_mesh_workload.cpp b/tests/tt_metal/distributed/test_mesh_workload.cpp
@@ -9,7 +9,6 @@
 #include <tt-metalium/tt_metal.hpp>
 #include <tt-metalium/bfloat16.hpp>
 
-#include "tests/tt_metal/tt_metal/dispatch/dispatch_test_utils.hpp"
 #include "tests/tt_metal/tt_metal/common/multi_device_fixture.hpp"
 #include "tests/tt_metal/distributed/utils.hpp"
 
@@ -23,257 +22,6 @@ struct CBConfig {
     tt::DataFormat data_format;
 };
 
-std::vector<std::shared_ptr<Program>> create_random_programs(
-    uint32_t num_programs,
-    CoreCoord worker_grid_size,
-    uint32_t seed,
-    const std::unordered_set<CoreCoord>& active_eth_cores = {}) {
-    uint32_t MAX_LOOP = 100;
-    uint32_t page_size = 1024;
-    uint32_t max_eth_cores = 3;
-
-    uint32_t BRISC_OUTER_LOOP, BRISC_MIDDLE_LOOP, BRISC_INNER_LOOP, NUM_CBS, NUM_SEMS;
-    uint32_t NCRISC_OUTER_LOOP, NCRISC_MIDDLE_LOOP, NCRISC_INNER_LOOP;
-    uint32_t TRISC_OUTER_LOOP, TRISC_MIDDLE_LOOP, TRISC_INNER_LOOP;
-    uint32_t ERISC_OUTER_LOOP, ERISC_MIDDLE_LOOP, ERISC_INNER_LOOP;
-    bool USE_MAX_RT_ARGS;
-
-    CoreRange cr({0, 0}, {worker_grid_size.x - 1, worker_grid_size.y - 1});
-    CoreRangeSet cr_set(cr);
-
-    std::vector<std::shared_ptr<Program>> programs;
-
-    std::map<string, string> data_movement_defines = {{"DATA_MOVEMENT", "1"}};
-    std::map<string, string> compute_defines = {{"COMPUTE", "1"}};
-    std::map<string, string> erisc_defines = {{"ERISC", "1"}};
-
-    for (uint32_t i = 0; i < num_programs; i++) {
-        Program& program = *programs.emplace_back(std::make_shared<Program>());
-        // ========== Set configs for BRISC ==========
-        if (i == 0) {
-            // Ensures that we get at least one compilation with the max amount to
-            // ensure it compiles and runs
-            BRISC_OUTER_LOOP = MAX_LOOP;
-            BRISC_MIDDLE_LOOP = MAX_LOOP;
-            BRISC_INNER_LOOP = MAX_LOOP;
-            NUM_CBS = NUM_CIRCULAR_BUFFERS;
-            NUM_SEMS = NUM_SEMAPHORES;
-            USE_MAX_RT_ARGS = true;
-        } else {
-            BRISC_OUTER_LOOP = rand() % (MAX_LOOP) + 1;
-            BRISC_MIDDLE_LOOP = rand() % (MAX_LOOP) + 1;
-            BRISC_INNER_LOOP = rand() % (MAX_LOOP) + 1;
-            NUM_CBS = rand() % (NUM_CIRCULAR_BUFFERS) + 1;
-            NUM_SEMS = rand() % (NUM_SEMAPHORES) + 1;
-            USE_MAX_RT_ARGS = false;
-        }
-        // Create CBs
-        for (uint32_t j = 0; j < NUM_CBS; j++) {
-            CircularBufferConfig cb_config = CircularBufferConfig(page_size * (j + 1), {{j, tt::DataFormat::Float16_b}})
-                                                 .set_page_size(j, page_size * (j + 1));
-            auto cb = CreateCircularBuffer(program, cr_set, cb_config);
-        }
-
-        // Create Semaphores
-        for (uint32_t j = 0; j < NUM_SEMS; j++) {
-            CreateSemaphore(program, cr_set, j + 1);
-            uint32_t curr_idx = 0;
-            if (active_eth_cores.size()) {
-                auto active_eth_core = active_eth_cores.begin();
-                for (int k = 0; k < max_eth_cores && active_eth_core != active_eth_cores.end();
-                     ++i, ++active_eth_core) {
-                    CreateSemaphore(program, *active_eth_core, j + 1, CoreType::ETH);
-                }
-            }
-        }
-
-        // Create RTAs
-        auto [brisc_unique_rtargs, brisc_common_rtargs] = create_runtime_args(USE_MAX_RT_ARGS);
-        uint32_t num_brisc_unique_rtargs = brisc_unique_rtargs.size();
-        uint32_t num_brisc_common_rtargs = brisc_common_rtargs.size();
-        std::vector<uint32_t> brisc_compile_args = {
-            BRISC_OUTER_LOOP,
-            BRISC_MIDDLE_LOOP,
-            BRISC_INNER_LOOP,
-            NUM_CBS,
-            NUM_SEMS,
-            num_brisc_unique_rtargs,
-            num_brisc_common_rtargs,
-            page_size};
-
-        // ========== Set configs for NCRISC ==========
-        if (i == 0) {
-            NCRISC_OUTER_LOOP = MAX_LOOP;
-            NCRISC_MIDDLE_LOOP = MAX_LOOP;
-            NCRISC_INNER_LOOP = MAX_LOOP;
-        } else {
-            NCRISC_OUTER_LOOP = rand() % (MAX_LOOP) + 1;
-            NCRISC_MIDDLE_LOOP = rand() % (MAX_LOOP) + 1;
-            NCRISC_INNER_LOOP = rand() % (MAX_LOOP) + 1;
-        }
-
-        auto [ncrisc_unique_rtargs, ncrisc_common_rtargs] = create_runtime_args(USE_MAX_RT_ARGS);
-        uint32_t num_ncrisc_unique_rtargs = ncrisc_unique_rtargs.size();
-        uint32_t num_ncrisc_common_rtargs = ncrisc_common_rtargs.size();
-        std::vector<uint32_t> ncrisc_compile_args = {
-            NCRISC_OUTER_LOOP,
-            NCRISC_MIDDLE_LOOP,
-            NCRISC_INNER_LOOP,
-            NUM_CBS,
-            NUM_SEMS,
-            num_ncrisc_unique_rtargs,
-            num_ncrisc_common_rtargs,
-            page_size};
-
-        // ========== Set configs for TRISC ==========
-        if (i == 0) {
-            TRISC_OUTER_LOOP = MAX_LOOP;
-            TRISC_MIDDLE_LOOP = MAX_LOOP;
-            TRISC_INNER_LOOP = MAX_LOOP;
-        } else {
-            TRISC_OUTER_LOOP = rand() % (MAX_LOOP) + 1;
-            TRISC_MIDDLE_LOOP = rand() % (MAX_LOOP) + 1;
-            TRISC_INNER_LOOP = rand() % (MAX_LOOP) + 1;
-        }
-
-        auto [trisc_unique_rtargs, trisc_common_rtargs] = create_runtime_args(USE_MAX_RT_ARGS);
-        uint32_t num_trisc_unique_rtargs = trisc_unique_rtargs.size();
-        uint32_t num_trisc_common_rtargs = trisc_common_rtargs.size();
-        std::vector<uint32_t> trisc_compile_args = {
-            TRISC_OUTER_LOOP,
-            TRISC_MIDDLE_LOOP,
-            TRISC_INNER_LOOP,
-            NUM_CBS,
-            NUM_SEMS,
-            num_trisc_unique_rtargs,
-            num_trisc_common_rtargs,
-            page_size};
-
-        if (i == 0) {
-            ERISC_OUTER_LOOP = MAX_LOOP;
-            ERISC_MIDDLE_LOOP = MAX_LOOP;
-            ERISC_INNER_LOOP = MAX_LOOP;
-        } else {
-            ERISC_OUTER_LOOP = rand() % (MAX_LOOP) + 1;
-            ERISC_MIDDLE_LOOP = rand() % (MAX_LOOP) + 1;
-            ERISC_INNER_LOOP = rand() % (MAX_LOOP) + 1;
-        }
-        // Only setup RTAs on ERISC. No Common RTAs.
-        uint32_t max_erisc_rtas = 64;
-        uint32_t num_erisc_rtas = rand() % (max_erisc_rtas + 1);
-        auto [erisc_unique_rtargs, erisc_common_rtargs] = create_runtime_args(num_erisc_rtas, 0, 0, 0);
-        uint32_t num_erisc_unique_rtargs = erisc_unique_rtargs.size();
-        uint32_t num_erisc_common_rt_args = erisc_common_rtargs.size();
-
-        std::vector<uint32_t> erisc_compile_time_args = {
-            ERISC_OUTER_LOOP,
-            ERISC_MIDDLE_LOOP,
-            ERISC_INNER_LOOP,
-            0, /* CBs are not supported on ERISC cores */
-            NUM_SEMS,
-            num_erisc_unique_rtargs,
-            num_erisc_common_rt_args,
-            page_size};
-
-        // Create Kernels
-        bool at_least_one_kernel = false;
-        if (i == 0 or ((rand() % 2) == 0)) {
-            auto dummy_brisc_kernel = CreateKernel(
-                program,
-                "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp",
-                cr_set,
-                DataMovementConfig{
-                    .processor = DataMovementProcessor::RISCV_0,
-                    .noc = NOC::RISCV_0_default,
-                    .compile_args = brisc_compile_args,
-                    .defines = data_movement_defines});
-            SetRuntimeArgs(program, dummy_brisc_kernel, cr_set, brisc_unique_rtargs);
-            SetCommonRuntimeArgs(program, dummy_brisc_kernel, brisc_common_rtargs);
-            at_least_one_kernel = true;
-        }
-
-        if (i == 0 or ((rand() % 2) == 0)) {
-            auto dummy_ncrisc_kernel = CreateKernel(
-                program,
-                "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp",
-                cr_set,
-                DataMovementConfig{
-                    .processor = DataMovementProcessor::RISCV_1,
-                    .noc = NOC::RISCV_1_default,
-                    .compile_args = ncrisc_compile_args,
-                    .defines = data_movement_defines});
-            SetRuntimeArgs(program, dummy_ncrisc_kernel, cr_set, ncrisc_unique_rtargs);
-            SetCommonRuntimeArgs(program, dummy_ncrisc_kernel, ncrisc_common_rtargs);
-            at_least_one_kernel = true;
-        }
-
-        if (i == 0 or ((rand() % 2) == 0)) {
-            auto dummy_trisc_kernel = CreateKernel(
-                program,
-                "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp",
-                cr_set,
-                ComputeConfig{
-                    .math_approx_mode = false, .compile_args = trisc_compile_args, .defines = compute_defines});
-            SetRuntimeArgs(program, dummy_trisc_kernel, cr_set, trisc_unique_rtargs);
-            SetCommonRuntimeArgs(program, dummy_trisc_kernel, trisc_common_rtargs);
-            at_least_one_kernel = true;
-        }
-
-        if (not at_least_one_kernel) {
-            uint32_t random_risc = rand() % 3 + 1;
-            if (random_risc == 1) {
-                auto dummy_brisc_kernel = CreateKernel(
-                    program,
-                    "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp",
-                    cr_set,
-                    DataMovementConfig{
-                        .processor = DataMovementProcessor::RISCV_0,
-                        .noc = NOC::RISCV_0_default,
-                        .compile_args = brisc_compile_args,
-                        .defines = data_movement_defines});
-                SetRuntimeArgs(program, dummy_brisc_kernel, cr_set, brisc_unique_rtargs);
-                SetCommonRuntimeArgs(program, dummy_brisc_kernel, brisc_common_rtargs);
-            } else if (random_risc == 2) {
-                auto dummy_ncrisc_kernel = CreateKernel(
-                    program,
-                    "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp",
-                    cr_set,
-                    DataMovementConfig{
-                        .processor = DataMovementProcessor::RISCV_1,
-                        .noc = NOC::RISCV_1_default,
-                        .compile_args = ncrisc_compile_args,
-                        .defines = data_movement_defines});
-                SetRuntimeArgs(program, dummy_ncrisc_kernel, cr_set, ncrisc_unique_rtargs);
-                SetCommonRuntimeArgs(program, dummy_ncrisc_kernel, ncrisc_common_rtargs);
-            } else if (random_risc == 3) {
-                auto dummy_trisc_kernel = CreateKernel(
-                    program,
-                    "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp",
-                    cr_set,
-                    ComputeConfig{
-                        .math_approx_mode = false, .compile_args = trisc_compile_args, .defines = compute_defines});
-                SetRuntimeArgs(program, dummy_trisc_kernel, cr_set, trisc_unique_rtargs);
-                SetCommonRuntimeArgs(program, dummy_trisc_kernel, trisc_common_rtargs);
-            } else {
-                TT_THROW("Invalid");
-            }
-        }
-        if (active_eth_cores.size()) {
-            auto active_eth_core = active_eth_cores.begin();
-            for (int k = 0; k < max_eth_cores && active_eth_core != active_eth_cores.end(); ++i, ++active_eth_core) {
-                auto dummy_erisc_kernel = CreateKernel(
-                    program,
-                    "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/command_queue/random_program.cpp",
-                    *active_eth_core,
-                    EthernetConfig{
-                        .noc = NOC::NOC_0, .compile_args = erisc_compile_time_args, .defines = erisc_defines});
-                SetRuntimeArgs(program, dummy_erisc_kernel, *active_eth_core, erisc_unique_rtargs);
-            }
-        }
-    }
-    return programs;
-}
-
 std::vector<CBHandle> initialize_dummy_circular_buffers(
     Program& program, const CoreRangeSet& cr_set, const std::vector<CBConfig>& cb_configs) {
     std::vector<CBHandle> cb_handles;
@@ -402,7 +150,7 @@ TEST_F(MeshWorkloadTestT3000, MeshWorkloadOnActiveEthAsserts) {
     for (std::size_t logical_x = 0; logical_x < x_end; logical_x++) {
         for (std::size_t logical_y = 0; logical_y < y_end; logical_y++) {
             IDevice* device = mesh_device_->get_device(logical_y, logical_x);
-            auto programs = create_random_programs(
+            auto programs = tt::tt_metal::distributed::test::utils::create_random_programs(
                 1, mesh_device_->compute_with_storage_grid_size(), seed, device->get_active_ethernet_cores(true));
             LogicalDeviceRange devices = {{logical_x, logical_y}, {logical_x, logical_y}};
             AddProgramToMeshWorkload(*workload, *programs[0], devices);
@@ -422,7 +170,8 @@ TEST_F(MeshWorkloadTestT3000, SimultaneousMeshWorkloads) {
 
     log_info("Create MeshWorkloads with multiple programs each");
 
-    auto programs = create_random_programs(num_programs, mesh_device_->compute_with_storage_grid_size(), seed);
+    auto programs = tt::tt_metal::distributed::test::utils::create_random_programs(
+        num_programs, mesh_device_->compute_with_storage_grid_size(), seed);
     std::vector<std::shared_ptr<MeshWorkload>> mesh_workloads = {};
 
     log_info(tt::LogTest, "Compile and load {} MeshWorkloads", num_programs);
@@ -442,7 +191,8 @@ TEST_F(MeshWorkloadTestT3000, SimultaneousMeshWorkloads) {
         EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), *random_workload, false);
         mesh_workloads.push_back(random_workload);
     }
-    programs = create_random_programs(num_programs, mesh_device_->compute_with_storage_grid_size(), seed);
+    programs = tt::tt_metal::distributed::test::utils::create_random_programs(
+        num_programs, mesh_device_->compute_with_storage_grid_size(), seed);
     for (int i = 0; i < num_programs; i += 4) {
         std::shared_ptr<MeshWorkload> random_workload = std::make_shared<MeshWorkload>();
         LogicalDeviceRange devices_0 = LogicalDeviceRange({0, 0}, {0, 1});
@@ -456,7 +206,8 @@ TEST_F(MeshWorkloadTestT3000, SimultaneousMeshWorkloads) {
         EnqueueMeshWorkload(mesh_device_->mesh_command_queue(), *random_workload, false);
         mesh_workloads.push_back(random_workload);
     }
-    programs = create_random_programs(num_heterogeneous_programs, mesh_device_->compute_with_storage_grid_size(), seed);
+    programs = tt::tt_metal::distributed::test::utils::create_random_programs(
+        num_heterogeneous_programs, mesh_device_->compute_with_storage_grid_size(), seed);
     for (int i = 0; i < num_heterogeneous_programs; i += 8) {
         std::shared_ptr<MeshWorkload> random_workload = std::make_shared<MeshWorkload>();
         LogicalDeviceRange devices_0 = LogicalDeviceRange({0, 0}, {0, 0});
@@ -500,7 +251,8 @@ TEST_F(MeshWorkloadTestSuite, RandomizedMeshWorkload) {
     log_info(tt::LogTest, "Using Test Seed: {}", seed);
     srand(seed);
     log_info("Create {} MeshWorkloads", num_programs);
-    auto programs = create_random_programs(num_programs, mesh_device_->compute_with_storage_grid_size(), seed);
+    auto programs = tt::tt_metal::distributed::test::utils::create_random_programs(
+        num_programs, mesh_device_->compute_with_storage_grid_size(), seed);
     std::mt19937 rng(seed);
     std::uniform_int_distribution<int> gen_x(1, mesh_device_->num_cols());
     std::uniform_int_distribution<int> gen_y(1, mesh_device_->num_rows());