#0: comprehensive mem benchmark tool
Reland the previously reverted commit 532dd26
nhuang-tt committed Feb 22, 2025
1 parent c9feb5d commit f92eee1
Showing 14 changed files with 1,179 additions and 10 deletions.
7 changes: 2 additions & 5 deletions tt_metal/api/tt-metalium/command_queue_interface.hpp
@@ -3,19 +3,16 @@
// SPDX-License-Identifier: Apache-2.0

#pragma once
#include <climits>
#include <magic_enum/magic_enum.hpp>
#include <mutex>
#include <tt-metalium/tt_align.hpp>
#include <unordered_map>

#include "cq_commands.hpp"
#include "dispatch_core_manager.hpp"
#include "launch_message_ring_buffer_state.hpp"
#include "memcpy.hpp"
#include "hal.hpp"
#include "dispatch_settings.hpp"
#include "helpers.hpp"
#include "buffer.hpp"
#include "umd/device/tt_core_coordinates.h"

@@ -193,8 +190,8 @@ class DispatchMemMap {
uint32_t prefetch_dispatch_unreserved_base =
device_cq_addrs_[tt::utils::underlying_type<CommandQueueDeviceAddrType>(
CommandQueueDeviceAddrType::UNRESERVED)];
cmddat_q_base_ = prefetch_dispatch_unreserved_base + round_size(settings.prefetch_q_size_, pcie_alignment);
scratch_db_base_ = cmddat_q_base_ + round_size(settings.prefetch_cmddat_q_size_, pcie_alignment);
cmddat_q_base_ = align(prefetch_dispatch_unreserved_base + settings.prefetch_q_size_, pcie_alignment);
scratch_db_base_ = align(cmddat_q_base_ + settings.prefetch_cmddat_q_size_, pcie_alignment);
dispatch_buffer_base_ = align(prefetch_dispatch_unreserved_base, 1 << DispatchSettings::DISPATCH_BUFFER_LOG_PAGE_SIZE);
dispatch_buffer_block_size_pages_ = settings.dispatch_pages_ / DispatchSettings::DISPATCH_BUFFER_SIZE_BLOCKS;
const uint32_t dispatch_cb_end = dispatch_buffer_base_ + settings.dispatch_size_;
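
For context on the hunk above: the old code rounded each size up and then added it to the base, while the new code adds the raw size and aligns the resulting address itself. Below is a minimal standalone sketch of that arithmetic difference, using made-up numbers and local stand-ins (`round_size`, `align_up`) rather than the actual tt-metalium helpers.

```cpp
#include <cstdint>
#include <iostream>

// Local stand-ins for the helpers used in the hunk above.
static uint32_t round_size(uint32_t sz, uint32_t alignment) {
    return (sz + alignment - 1) / alignment * alignment;
}
static uint32_t align_up(uint32_t addr, uint32_t alignment) {
    return (addr + alignment - 1) / alignment * alignment;
}

int main() {
    // Hypothetical values, not taken from the commit.
    const uint32_t unreserved_base = 100;  // deliberately unaligned base
    const uint32_t prefetch_q_size = 24;
    const uint32_t pcie_alignment = 64;

    // Old layout: round the size first, then add it to the (possibly unaligned) base.
    uint32_t old_cmddat_q_base = unreserved_base + round_size(prefetch_q_size, pcie_alignment);

    // New layout: add the raw size, then align the resulting address itself.
    uint32_t new_cmddat_q_base = align_up(unreserved_base + prefetch_q_size, pcie_alignment);

    std::cout << old_cmddat_q_base << "\n";  // 100 + 64 = 164, not 64-byte aligned
    std::cout << new_cmddat_q_base << "\n";  // align(124, 64) = 128, aligned and more compact
    return 0;
}
```
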
2 changes: 1 addition & 1 deletion tt_metal/impl/dispatch/util/dispatch_settings.cpp
@@ -8,7 +8,7 @@
#include "magic_enum/magic_enum.hpp"
#include "umd/device/tt_core_coordinates.h"
#include <dispatch_settings.hpp>
#include <helpers.hpp>
#include "size_literals.hpp"

namespace tt::tt_metal {

@@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

@@ -18,8 +18,6 @@ constexpr auto operator""_MB(const unsigned long long v) -> uint32_t { return 10
constexpr auto operator""_GB(const unsigned long long v) -> uint32_t { return 1024 * 1024 * 1024 * v; }

// Returns the size rounded up to the given alignment
inline uint32_t round_size(uint32_t sz, uint32_t alignment) {
return ((sz + alignment - 1) / alignment * alignment);
}
inline uint32_t round_size(uint32_t sz, uint32_t alignment) { return ((sz + alignment - 1) / alignment * alignment); }

} // namespace tt::tt_metal
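
A brief usage sketch of the size literal and the one-line `round_size` shown above; the helpers are re-declared locally so the snippet stands alone and are not the project's actual header.

```cpp
#include <cstdint>

// Re-declared locally for a self-contained sketch; in the tree these live
// in the tt::tt_metal size-literal header touched above.
constexpr auto operator""_MB(const unsigned long long v) -> uint32_t { return 1024 * 1024 * v; }

inline uint32_t round_size(uint32_t sz, uint32_t alignment) { return (sz + alignment - 1) / alignment * alignment; }

int main() {
    static_assert(2_MB == 2u * 1024 * 1024, "literal expands to bytes");
    // Pad an odd size up to a 4 KiB boundary.
    return round_size(1_MB + 123, 4096) == 1_MB + 4096 ? 0 : 1;
}
```
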
2 changes: 2 additions & 0 deletions tt_metal/tools/CMakeLists.txt
@@ -1,6 +1,7 @@
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/profiler)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/watcher_dump)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/lightmetal_runner)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/mem_bench)

set(TOOLS_SRC ${CMAKE_CURRENT_SOURCE_DIR}/memset.cpp)

@@ -10,6 +11,7 @@ target_link_libraries(
PUBLIC
profiler
Metalium::Metal::LLRT
Metalium::Metal
PRIVATE
TT::Metalium::HostDevCommon
)
42 changes: 42 additions & 0 deletions tt_metal/tools/mem_bench/CMakeLists.txt
@@ -0,0 +1,42 @@
set(IMPL_SRC
${CMAKE_CURRENT_SOURCE_DIR}/mem_bench.cpp
${CMAKE_CURRENT_SOURCE_DIR}/host_utils.cpp
${CMAKE_CURRENT_SOURCE_DIR}/device_utils.cpp
)

set(HEADERS_SRC
${CMAKE_CURRENT_SOURCE_DIR}/host_utils.hpp
${CMAKE_CURRENT_SOURCE_DIR}/device_utils.hpp
${CMAKE_CURRENT_SOURCE_DIR}/work_thread.hpp
)

add_executable(
mem_bench
${IMPL_SRC}
${HEADERS_SRC}
)
target_link_libraries(
mem_bench
PRIVATE
Metalium::Metal
TT::Metalium::Common
Metalium::Metal::Impl
Metalium::Metal::LLRT
numa
benchmark::benchmark
)
target_include_directories(
mem_bench
PRIVATE
${PROJECT_SOURCE_DIR}
${PROJECT_SOURCE_DIR}/tt_metal
${PROJECT_SOURCE_DIR}/tt_metal/common
${PROJECT_SOURCE_DIR}/tests
${CMAKE_CURRENT_SOURCE_DIR}
)
set_target_properties(
mem_bench
PROPERTIES
RUNTIME_OUTPUT_DIRECTORY
${PROJECT_BINARY_DIR}/tools
)
46 changes: 46 additions & 0 deletions tt_metal/tools/mem_bench/README.md
@@ -0,0 +1,46 @@
# tt mem_bench

Utility to measure host and device bandwidth on Tenstorrent devices.

## Build

Tools are included in `tt_metal` builds. A release build is required for accurate performance measurements.

## Usage

By default, each test is run for 5 iterations and only basic tests are executed. All test patterns can be executed by specifying `--full`. Additional run parameters are listed below.

Tests report host bandwidth and/or device bandwidth. When device bandwidth is reported, both the average across all cores and the bandwidth of a single core are shown.

> [!NOTE]
> The `tt_metal` library log level can be adjusted by exporting `TT_METAL_LOGGER_LEVEL=fatal|info|error|debug`.

> [!NOTE]
> On NUMA systems, the host page for the device's command queue data is pinned on the memory node closest to the device. If `tt_metal` runs on a different node, bandwidth degrades because traffic must cross sockets, so it is important to run `tt_metal` on the node closest to the device. On Linux, the execution policy can be set with `numactl`. E.g., if the device is attached to node 0, `numactl --cpubind=0 --membind=0 <command>` allocates resources closer to the device.

```
./build/tools/mem_bench --help
benchmark [--benchmark_list_tests={true|false}]
[--benchmark_filter=<regex>]
[--benchmark_min_time=`<integer>x` OR `<float>s` ]
[--benchmark_min_warmup_time=<min_warmup_time>]
[--benchmark_repetitions=<num_repetitions>]
[--benchmark_dry_run={true|false}]
[--benchmark_enable_random_interleaving={true|false}]
[--benchmark_report_aggregates_only={true|false}]
[--benchmark_display_aggregates_only={true|false}]
[--benchmark_format=<console|json|csv>]
[--benchmark_out=<filename>]
[--benchmark_out_format=<json|console|csv>]
[--benchmark_color={auto|true|false}]
[--benchmark_counters_tabular={true|false}]
[--benchmark_context=<key>=<value>,...]
[--benchmark_time_unit={ns|us|ms|s}]
[--v=<verbosity>]
[--help] Shows this help message
[--full] Run all tests
Counters
bytes_per_second: Aggregate Host copy to hugepage bandwidth. 0 if not measured.
dev_bw: Average device core PCIe pull bandwidth. 0 if not measured.
```
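
The `bytes_per_second` and `dev_bw` counters above are Google Benchmark counters (the tool links against `benchmark::benchmark`). The sketch below is illustrative only, not the real mem_bench implementation: a fake host-copy benchmark between heap buffers that reports the same two counters, with `BM_HostCopy` being a hypothetical name.

```cpp
#include <benchmark/benchmark.h>
#include <cstring>
#include <vector>

// Illustrative stand-in: the real mem_bench measures hugepage and PCIe
// traffic; this sketch just memcpy's between ordinary heap buffers.
static void BM_HostCopy(benchmark::State& state) {
    const size_t bytes = static_cast<size_t>(state.range(0));
    std::vector<char> src(bytes, 1), dst(bytes, 0);

    for (auto _ : state) {
        std::memcpy(dst.data(), src.data(), bytes);
        benchmark::DoNotOptimize(dst.data());
    }

    // bytes_per_second: aggregate host copy bandwidth over all iterations.
    state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * bytes);
    // dev_bw: would be filled in from kernel-side counters on a real run;
    // 0 mirrors the "0 if not measured" behaviour described above.
    state.counters["dev_bw"] = 0;
}
BENCHMARK(BM_HostCopy)->Arg(1 << 20);

BENCHMARK_MAIN();
```
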
78 changes: 78 additions & 0 deletions tt_metal/tools/mem_bench/context.hpp
@@ -0,0 +1,78 @@
// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include <string>
#include <map>
#include <tt-metalium/device.hpp>
#include <tt-metalium/hal_exp.hpp>
#include <tt-metalium/tt_align.hpp>

namespace tt::tt_metal::tools::mem_bench {

struct TestResult {
double host_bytes_processed{0};
double host_time_elapsed{0};
double host_wait_for_kernel_time_elapsed{0};

double total_cores_cycles{0};
double total_cores_time{0};
double total_cores_bytes_rd{0};
double total_cores_bytes_wr{0};

double kernel_0_cycles{0};
double kernel_0_time{0};
double kernel_0_bytes_rd{0};
double kernel_0_bytes_wr{0};

// Any additional values to be included in benchmark reports
std::map<std::string, double> arb_counters;
};

struct L1MemoryMap {
uint32_t cycles;
uint32_t rd_bytes;
uint32_t wr_bytes;
uint32_t unreserved;
};

struct Context {
std::map<chip_id_t, IDevice*> devices;
L1MemoryMap device_address;
uint32_t total_size{0};
uint32_t page_size{0};
int threads{0};
int number_reader_kernels{0};
int number_writer_kernels{0};
bool enable_host_copy_with_kernels{0};
int iterations{0};

Context(
const std::map<chip_id_t, IDevice*>& devices_,
uint32_t total_size_,
uint32_t page_size_,
int threads_,
int readers_,
int writers_,
bool enable_host_copy_with_kernels_,
int iterations_) {
auto l1_alignment = experimental::hal::get_l1_alignment();
auto l1_base = experimental::hal::get_tensix_l1_unreserved_base();
device_address.cycles = l1_base;
device_address.rd_bytes = align(device_address.cycles + sizeof(uint32_t), l1_alignment);
device_address.wr_bytes = align(device_address.rd_bytes + sizeof(uint32_t), l1_alignment);
device_address.unreserved = align(device_address.wr_bytes + sizeof(uint32_t), l1_alignment);
devices = devices_;
total_size = total_size_;
page_size = page_size_;
threads = threads_;
number_reader_kernels = readers_;
number_writer_kernels = writers_;
enable_host_copy_with_kernels = enable_host_copy_with_kernels_;
iterations = iterations_;
}
};

} // namespace tt::tt_metal::tools::mem_bench
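
A minimal sketch of the L1 counter layout the `Context` constructor builds: each 4-byte counter gets its own aligned slot, followed by the unreserved scratch region. The L1 base and alignment below are assumptions; the real values come from `experimental::hal` at runtime, and `align_up` is a local stand-in for `tt::align`.

```cpp
#include <cstdint>
#include <iostream>

// Local stand-in for tt::align.
static uint32_t align_up(uint32_t addr, uint32_t alignment) {
    return (addr + alignment - 1) / alignment * alignment;
}

int main() {
    const uint32_t l1_base = 0x10000;  // hypothetical tensix L1 unreserved base
    const uint32_t l1_alignment = 16;  // hypothetical L1 alignment

    // Same layout logic as the Context constructor above.
    uint32_t cycles     = l1_base;                                              // 0x10000
    uint32_t rd_bytes   = align_up(cycles + sizeof(uint32_t), l1_alignment);    // 0x10010
    uint32_t wr_bytes   = align_up(rd_bytes + sizeof(uint32_t), l1_alignment);  // 0x10020
    uint32_t unreserved = align_up(wr_bytes + sizeof(uint32_t), l1_alignment);  // 0x10030

    std::cout << std::hex << cycles << " " << rd_bytes << " "
              << wr_bytes << " " << unreserved << "\n";
    return 0;
}
```
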
92 changes: 92 additions & 0 deletions tt_metal/tools/mem_bench/device_utils.cpp
@@ -0,0 +1,92 @@
// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <tt-metalium/host_api.hpp>
#include <tt-metalium/tt_metal.hpp>
#include "device_utils.hpp"
#include "context.hpp"

namespace tt::tt_metal::tools::mem_bench {

std::vector<uint32_t> read_cores(tt::tt_metal::IDevice* device, const CoreRange& cores, uint32_t addr) {
std::vector<uint32_t> data;
for (int xi = cores.start_coord.x; xi <= cores.end_coord.x; ++xi) {
for (int yi = cores.start_coord.y; yi <= cores.end_coord.y; ++yi) {
std::vector<uint32_t> single_data;
tt::tt_metal::detail::ReadFromDeviceL1(device, CoreCoord{xi, yi}, addr, sizeof(uint32_t), single_data);
data.push_back(single_data[0]);
}
}
return data;
}

std::optional<CoreRange> configure_kernels(
tt::tt_metal::IDevice* device,
tt::tt_metal::Program& program,
const Context& context,
uint32_t start_y,
uint32_t num_kernels,
bool is_writer,
uint32_t pcie_size,
uint32_t pcie_offset) {
constexpr std::string_view k_PcieBenchKernel = "tt_metal/tools/mem_bench/kernels/mem_bench_kernel.cpp";
const auto grid_size = device->logical_grid_size();
const auto max_x = grid_size.x;
const auto max_y = grid_size.y;
uint32_t total_kernel_transfer = context.total_size;
uint32_t kernel_transfer_size = context.page_size;

if (!kernel_transfer_size) {
kernel_transfer_size = total_kernel_transfer;
} else if (!num_kernels) {
return {};
}

// The kernels either fit within a single (possibly partial) row
// or span a whole number of full rows.
CoreCoord start_coord{0, start_y};
CoreCoord end_coord;
if (num_kernels <= max_x) {
end_coord.x = start_coord.x + num_kernels - 1;
end_coord.y = start_coord.y;
} else {
const auto number_of_rows = num_kernels / max_x;
const auto last_row_width = (num_kernels % max_x) ? num_kernels % max_x : max_x;
end_coord.x = start_coord.x + last_row_width - 1;
end_coord.y = number_of_rows - 1;
}
CoreRange core_range{start_coord, end_coord};

std::vector<uint32_t> pcie_bench_compile_args(12, 0);
if (is_writer) {
pcie_bench_compile_args[5] = 0; // reserved_0
pcie_bench_compile_args[6] = pcie_offset; // pcie_wr_base
pcie_bench_compile_args[7] = pcie_size; // pcie_wr_size
pcie_bench_compile_args[8] = kernel_transfer_size; // pcie_wr_transfer_size
} else {
pcie_bench_compile_args[0] = context.device_address.unreserved; // my_rd_dst_addr
pcie_bench_compile_args[1] = pcie_offset; // pcie_rd_base
pcie_bench_compile_args[2] = pcie_size; // pcie_rd_size
pcie_bench_compile_args[3] = kernel_transfer_size; // pcie_rd_transfer_size
}
pcie_bench_compile_args[4] = context.device_address.rd_bytes; // my_bytes_rd_addr
pcie_bench_compile_args[9] = context.device_address.wr_bytes; // my_bytes_wr_addr
pcie_bench_compile_args[10] = total_kernel_transfer;
pcie_bench_compile_args[11] = context.device_address.cycles;

[[maybe_unused]] auto kernel = tt::tt_metal::CreateKernel(
program,
std::string{k_PcieBenchKernel},
core_range,
DataMovementConfig{
.processor = DataMovementProcessor::RISCV_1,
.noc = tt::tt_metal::NOC_0,
.compile_args = pcie_bench_compile_args,
.defines = {},
});

return core_range;
}

} // namespace tt::tt_metal::tools::mem_bench
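
To make the row math in `configure_kernels` concrete, here is a standalone sketch of the same placement logic with an assumed grid width; `Coord` and `place` are simplified stand-ins, not the real `CoreCoord`/grid types.

```cpp
#include <cstdint>
#include <cstdio>

struct Coord { uint32_t x, y; };

static void place(uint32_t num_kernels, uint32_t max_x, uint32_t start_y) {
    Coord start{0, start_y};
    Coord end{};
    if (num_kernels <= max_x) {
        // All kernels fit in one (possibly partial) row.
        end = {start.x + num_kernels - 1, start.y};
    } else {
        // Otherwise the count is expected to span whole rows.
        const uint32_t rows = num_kernels / max_x;
        const uint32_t last_row_width = (num_kernels % max_x) ? num_kernels % max_x : max_x;
        end = {start.x + last_row_width - 1, rows - 1};
    }
    std::printf("%u kernels on %u-wide grid -> (%u,%u)..(%u,%u)\n",
                num_kernels, max_x, start.x, start.y, end.x, end.y);
}

int main() {
    place(3, 8, 0);   // partial row:   (0,0)..(2,0)
    place(16, 8, 0);  // two full rows: (0,0)..(7,1)
    return 0;
}
```
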
26 changes: 26 additions & 0 deletions tt_metal/tools/mem_bench/device_utils.hpp
@@ -0,0 +1,26 @@
// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include <optional>
#include <vector>
#include <tt-metalium/device.hpp>
#include <tt-metalium/core_coord.hpp>
#include "context.hpp"

namespace tt::tt_metal::tools::mem_bench {

std::vector<uint32_t> read_cores(tt::tt_metal::IDevice* device, const CoreRange& cores, uint32_t addr);

std::optional<CoreRange> configure_kernels(
tt::tt_metal::IDevice* device,
tt::tt_metal::Program& program,
const Context& context,
uint32_t start_y,
uint32_t num_kernels,
bool is_writer,
uint32_t pcie_size,
uint32_t pcie_offset = 0);

} // namespace tt::tt_metal::tools::mem_bench