diff --git a/tt_metal/api/tt-metalium/command_queue_interface.hpp b/tt_metal/api/tt-metalium/command_queue_interface.hpp index 30de4f2e6312..53f6eb068ea3 100644 --- a/tt_metal/api/tt-metalium/command_queue_interface.hpp +++ b/tt_metal/api/tt-metalium/command_queue_interface.hpp @@ -3,11 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once -#include #include #include #include -#include #include "cq_commands.hpp" #include "dispatch_core_manager.hpp" @@ -15,7 +13,6 @@ #include "memcpy.hpp" #include "hal.hpp" #include "dispatch_settings.hpp" -#include "helpers.hpp" #include "buffer.hpp" #include "umd/device/tt_core_coordinates.h" @@ -193,8 +190,8 @@ class DispatchMemMap { uint32_t prefetch_dispatch_unreserved_base = device_cq_addrs_[tt::utils::underlying_type( CommandQueueDeviceAddrType::UNRESERVED)]; - cmddat_q_base_ = prefetch_dispatch_unreserved_base + round_size(settings.prefetch_q_size_, pcie_alignment); - scratch_db_base_ = cmddat_q_base_ + round_size(settings.prefetch_cmddat_q_size_, pcie_alignment); + cmddat_q_base_ = align(prefetch_dispatch_unreserved_base + settings.prefetch_q_size_, pcie_alignment); + scratch_db_base_ = align(cmddat_q_base_ + settings.prefetch_cmddat_q_size_, pcie_alignment); dispatch_buffer_base_ = align(prefetch_dispatch_unreserved_base, 1 << DispatchSettings::DISPATCH_BUFFER_LOG_PAGE_SIZE); dispatch_buffer_block_size_pages_ = settings.dispatch_pages_ / DispatchSettings::DISPATCH_BUFFER_SIZE_BLOCKS; const uint32_t dispatch_cb_end = dispatch_buffer_base_ + settings.dispatch_size_; diff --git a/tt_metal/impl/dispatch/util/dispatch_settings.cpp b/tt_metal/impl/dispatch/util/dispatch_settings.cpp index 7912a1f825d9..a6003177a968 100644 --- a/tt_metal/impl/dispatch/util/dispatch_settings.cpp +++ b/tt_metal/impl/dispatch/util/dispatch_settings.cpp @@ -8,7 +8,7 @@ #include "magic_enum/magic_enum.hpp" #include "umd/device/tt_core_coordinates.h" #include -#include +#include "size_literals.hpp" namespace tt::tt_metal { diff --git 
// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

// Size literals for readable byte-count constants.
// NOTE(review): results truncate to uint32_t, so 4_GB and above wrap — confirm
// callers never exceed 32 bits.
constexpr auto operator""_MB(const unsigned long long v) -> uint32_t { return 1024 * 1024 * v; }
constexpr auto operator""_GB(const unsigned long long v) -> uint32_t { return 1024 * 1024 * 1024 * v; }

// Rounds `sz` up to the nearest multiple of `alignment`.
// `alignment` must be non-zero; a size that is already a multiple of the
// alignment is returned unchanged.
inline uint32_t round_size(uint32_t sz, uint32_t alignment) {
    const uint32_t remainder = sz % alignment;
    return (remainder == 0) ? sz : (sz + alignment - remainder);
}
${CMAKE_CURRENT_SOURCE_DIR}/device_utils.cpp +) + +set(HEADERS_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/host_utils.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_utils.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/work_thread.hpp +) + +add_executable( + mem_bench + ${IMPL_SRC} + ${HEADERS_SRC} +) +target_link_libraries( + mem_bench + PRIVATE + Metalium::Metal + TT::Metalium::Common + Metalium::Metal::Impl + Metalium::Metal::LLRT + numa + benchmark::benchmark +) +target_include_directories( + mem_bench + PRIVATE + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/tt_metal + ${PROJECT_SOURCE_DIR}/tt_metal/common + ${PROJECT_SOURCE_DIR}/tests + ${CMAKE_CURRENT_SOURCE_DIR} +) +set_target_properties( + mem_bench + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY + ${PROJECT_BINARY_DIR}/tools +) diff --git a/tt_metal/tools/mem_bench/README.md b/tt_metal/tools/mem_bench/README.md new file mode 100644 index 000000000000..03f2731d0d08 --- /dev/null +++ b/tt_metal/tools/mem_bench/README.md @@ -0,0 +1,46 @@ +# tt mem_bench + +Utility to measure host and device bandwidth on Tenstorrent devices. + +## Build + +Tools are included in `tt_metal` builds. Using a release build is required for accurate perf measurements. + +## Usage + +By default, each test is run for 5 iterations and only basic tests are executed. All test patterns can be executed by specifying `--full`. Additional run parameters are listed below. + +Tests will report host bandwidth and/or device bandwidth. If device bandwidth is reported, then the average of all cores is reported as well as bandwidth for just a single core. + +> [!NOTE] +The `tt_metal` library log level can be adjusted by exporting `TT_METAL_LOGGER_LEVEL=fatal|info|error|debug`. + +> [!NOTE] +On NUMA systems, the host page for the device's command queue data is pinned on the memory node closest to where the device is located. If `tt_metal` is run on a different node then bandwidth will degrade because it'll need to cross sockets. 
Therefore, it's important to run `tt_metal` on the closest node. On Linux, the execution policy can be set using `numactl`. E.g., if the device is located on node 0, then `numactl --cpubind=0 --membind=0 ` will allocate resources closer to the device. + +``` +./build/tools/mem_bench --help +benchmark [--benchmark_list_tests={true|false}] + [--benchmark_filter=] + [--benchmark_min_time=`x` OR `s` ] + [--benchmark_min_warmup_time=] + [--benchmark_repetitions=] + [--benchmark_dry_run={true|false}] + [--benchmark_enable_random_interleaving={true|false}] + [--benchmark_report_aggregates_only={true|false}] + [--benchmark_display_aggregates_only={true|false}] + [--benchmark_format=] + [--benchmark_out=] + [--benchmark_out_format=] + [--benchmark_color={auto|true|false}] + [--benchmark_counters_tabular={true|false}] + [--benchmark_context==,...] + [--benchmark_time_unit={ns|us|ms|s}] + [--v=] + [--help] Shows this help message + [--full] Run all tests + +Counters + bytes_per_second: Aggregate Host copy to hugepage bandwidth. 0 if not measured. + dev_bw: Average device core PCIe pull bandwidth. 0 if not measured. +``` diff --git a/tt_metal/tools/mem_bench/context.hpp b/tt_metal/tools/mem_bench/context.hpp new file mode 100644 index 000000000000..4bf8d8ff4503 --- /dev/null +++ b/tt_metal/tools/mem_bench/context.hpp @@ -0,0 +1,78 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include +#include + +namespace tt::tt_metal::tools::mem_bench { + +struct TestResult { + double host_bytes_processed{0}; + double host_time_elapsed{0}; + double host_wait_for_kernel_time_elapsed{0}; + + double total_cores_cycles{0}; + double total_cores_time{0}; + double total_cores_bytes_rd{0}; + double total_cores_bytes_wr{0}; + + double kernel_0_cycles{0}; + double kernel_0_time{0}; + double kernel_0_bytes_rd{0}; + double kernel_0_bytes_wr{0}; + + // Any additional values to be included in benchmark reports + std::map arb_counters; +}; + +struct L1MemoryMap { + uint32_t cycles; + uint32_t rd_bytes; + uint32_t wr_bytes; + uint32_t unreserved; +}; + +struct Context { + std::map devices; + L1MemoryMap device_address; + uint32_t total_size{0}; + uint32_t page_size{0}; + int threads{0}; + int number_reader_kernels{0}; + int number_writer_kernels{0}; + bool enable_host_copy_with_kernels{0}; + int iterations{0}; + + Context( + const std::map& devices_, + uint32_t total_size_, + uint32_t page_size_, + int threads_, + int readers_, + int writers_, + bool enable_host_copy_with_kernels_, + int iterations_) { + auto l1_alignment = experimental::hal::get_l1_alignment(); + auto l1_base = experimental::hal::get_tensix_l1_unreserved_base(); + device_address.cycles = l1_base; + device_address.rd_bytes = align(device_address.cycles + sizeof(uint32_t), l1_alignment); + device_address.wr_bytes = align(device_address.rd_bytes + sizeof(uint32_t), l1_alignment); + device_address.unreserved = align(device_address.wr_bytes + sizeof(uint32_t), l1_alignment); + devices = devices_; + total_size = total_size_; + page_size = page_size_; + threads = threads_; + number_reader_kernels = readers_; + number_writer_kernels = writers_; + enable_host_copy_with_kernels = enable_host_copy_with_kernels_; + iterations = iterations_; + } +}; + +} // namespace tt::tt_metal::tools::mem_bench diff --git 
a/tt_metal/tools/mem_bench/device_utils.cpp b/tt_metal/tools/mem_bench/device_utils.cpp new file mode 100644 index 000000000000..bd650a3c0527 --- /dev/null +++ b/tt_metal/tools/mem_bench/device_utils.cpp @@ -0,0 +1,92 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include "device_utils.hpp" +#include "context.hpp" + +namespace tt::tt_metal::tools::mem_bench { + +std::vector read_cores(tt::tt_metal::IDevice* device, const CoreRange& cores, uint32_t addr) { + std::vector data; + for (int xi = cores.start_coord.x; xi <= cores.end_coord.x; ++xi) { + for (int yi = cores.start_coord.y; yi <= cores.end_coord.y; ++yi) { + std::vector single_data; + tt::tt_metal::detail::ReadFromDeviceL1(device, CoreCoord{xi, yi}, addr, sizeof(uint32_t), single_data); + data.push_back(single_data[0]); + } + } + return data; +} + +std::optional configure_kernels( + tt::tt_metal::IDevice* device, + tt::tt_metal::Program& program, + const Context& context, + uint32_t start_y, + uint32_t num_kernels, + bool is_writer, + uint32_t pcie_size, + uint32_t pcie_offset) { + constexpr std::string_view k_PcieBenchKernel = "tt_metal/tools/mem_bench/kernels/mem_bench_kernel.cpp"; + const auto grid_size = device->logical_grid_size(); + const auto max_x = grid_size.x; + const auto max_y = grid_size.y; + uint32_t total_kernel_transfer = context.total_size; + uint32_t kernel_transfer_size = context.page_size; + + if (!kernel_transfer_size) { + kernel_transfer_size = total_kernel_transfer; + } else if (!num_kernels) { + return {}; + } + + // Number readers either less than one row + // or a multiple of the rows + CoreCoord start_coord{0, start_y}; + CoreCoord end_coord; + if (num_kernels <= max_x) { + end_coord.x = start_coord.x + num_kernels - 1; + end_coord.y = start_coord.y; + } else { + const auto number_of_rows = num_kernels / max_x; + const auto last_row_width = (num_kernels % max_x) ? 
num_kernels % max_x : max_x; + end_coord.x = start_coord.x + last_row_width - 1; + end_coord.y = number_of_rows - 1; + } + CoreRange core_range{start_coord, end_coord}; + + std::vector pcie_bench_compile_args(12, 0); + if (is_writer) { + pcie_bench_compile_args[5] = 0; // reserved_0 + pcie_bench_compile_args[6] = pcie_offset; // pcie_wr_base + pcie_bench_compile_args[7] = pcie_size; // pcie_wr_size + pcie_bench_compile_args[8] = kernel_transfer_size; // pcie_wr_transfer_size + } else { + pcie_bench_compile_args[0] = context.device_address.unreserved; // my_rd_dst_addr + pcie_bench_compile_args[1] = pcie_offset; // pcie_rd_base + pcie_bench_compile_args[2] = pcie_size; // pcie_rd_size + pcie_bench_compile_args[3] = kernel_transfer_size; // pcie_rd_transfer_size + } + pcie_bench_compile_args[4] = context.device_address.rd_bytes; // my_bytes_rd_addr + pcie_bench_compile_args[9] = context.device_address.wr_bytes; // my_bytes_wr_addr + pcie_bench_compile_args[10] = total_kernel_transfer; + pcie_bench_compile_args[11] = context.device_address.cycles; + + [[maybe_unused]] auto kernel = tt::tt_metal::CreateKernel( + program, + std::string{k_PcieBenchKernel}, + core_range, + DataMovementConfig{ + .processor = DataMovementProcessor::RISCV_1, + .noc = tt::tt_metal::NOC_0, + .compile_args = pcie_bench_compile_args, + .defines = {}, + }); + + return core_range; +} + +} // namespace tt::tt_metal::tools::mem_bench diff --git a/tt_metal/tools/mem_bench/device_utils.hpp b/tt_metal/tools/mem_bench/device_utils.hpp new file mode 100644 index 000000000000..ab20ebfc3cc8 --- /dev/null +++ b/tt_metal/tools/mem_bench/device_utils.hpp @@ -0,0 +1,26 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include "context.hpp" + +namespace tt::tt_metal::tools::mem_bench { + +std::vector read_cores(tt::tt_metal::IDevice* device, const CoreRange& cores, uint32_t addr); + +std::optional configure_kernels( + tt::tt_metal::IDevice* device, + tt::tt_metal::Program& program, + const Context& context, + uint32_t start_y, + uint32_t num_kernels, + bool is_writer, + uint32_t pcie_size, + uint32_t pcie_offset = 0); + +} // namespace tt::tt_metal::tools::mem_bench diff --git a/tt_metal/tools/mem_bench/host_utils.cpp b/tt_metal/tools/mem_bench/host_utils.cpp new file mode 100644 index 000000000000..9aad3fe59fa8 --- /dev/null +++ b/tt_metal/tools/mem_bench/host_utils.cpp @@ -0,0 +1,87 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "host_utils.hpp" +#include +#include +#include +#include +#include +#include + +namespace tt::tt_metal::tools::mem_bench { + +void* get_hugepage(int device_id, uint32_t base_offset) { + auto& cluster = tt::Cluster::instance(); + auto mmio_device_id = cluster.get_associated_mmio_device(device_id); + auto channel = cluster.get_assigned_channel_for_device(device_id); + return (void*)(cluster.host_dma_address(base_offset, mmio_device_id, channel)); +} + +uint32_t get_hugepage_size(int device_id) { + auto& cluster = tt::Cluster::instance(); + auto mmio_device_id = cluster.get_associated_mmio_device(device_id); + auto channel = cluster.get_assigned_channel_for_device(device_id); + return cluster.get_host_channel_size(mmio_device_id, channel); +} + +tt::tt_metal::vector_memcpy_aligned generate_random_src_data(uint32_t num_bytes) { + std::uniform_int_distribution distribution( + std::numeric_limits::min(), std::numeric_limits::max()); + std::default_random_engine generator; + + tt::tt_metal::vector_memcpy_aligned vec(num_bytes / sizeof(uint32_t)); + std::generate(vec.begin(), vec.end(), [&]() { return 
distribution(generator); }); + + return vec; +} + +double get_current_time_seconds() { + return std::chrono::duration(std::chrono::high_resolution_clock::now().time_since_epoch()).count(); +} + +std::vector get_mmio_device_ids(int number_of_devices, int numa_node) { + auto& cluster = tt::Cluster::instance(); + const auto pcie_devices = cluster.number_of_pci_devices(); + std::vector device_ids; + + // Assumes PCIe device IDs are iterated first + for (int device_id = 0; device_id < pcie_devices && device_ids.size() < number_of_devices; ++device_id) { + // Not an MMIO device + if (cluster.get_associated_mmio_device(device_id) != device_id) { + continue; + } + + auto associated_node = cluster.get_numa_node_for_device(device_id); + if (numa_node == -1 || associated_node == numa_node) { + device_ids.push_back(device_id); + } + } + + return device_ids; +} + +std::vector get_mmio_device_ids_unique_nodes(int number_of_devices) { + auto& cluster = tt::Cluster::instance(); + const auto pcie_devices = cluster.number_of_pci_devices(); + std::vector device_ids; + std::unordered_set numa_nodes; + + for (int device_id = 0; device_id < pcie_devices && device_ids.size() < number_of_devices; ++device_id) { + auto associated_node = cluster.get_numa_node_for_device(device_id); + if (!numa_nodes.contains(associated_node)) { + device_ids.push_back(device_id); + numa_nodes.insert(associated_node); + } + } + + return device_ids; +} + +int get_number_of_mmio_devices() { + auto& cluster = tt::Cluster::instance(); + return cluster.number_of_pci_devices(); +} + +} // namespace tt::tt_metal::tools::mem_bench diff --git a/tt_metal/tools/mem_bench/host_utils.hpp b/tt_metal/tools/mem_bench/host_utils.hpp new file mode 100644 index 000000000000..cab39c6ecdea --- /dev/null +++ b/tt_metal/tools/mem_bench/host_utils.hpp @@ -0,0 +1,82 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include +#include + +namespace tt::tt_metal::tools::mem_bench { + +// Generate random data aligned for memcpy_to_device. +tt::tt_metal::vector_memcpy_aligned generate_random_src_data(uint32_t num_bytes); + +// Get current host time, in seconds. +double get_current_time_seconds(); + +// Return device ids. If numa_node is specified then only device ids on that +// node will be returned. If numa_node == -1, then the node is not taken into +// consideration. Note: Less than number_of_devices may be returned. +std::vector get_mmio_device_ids(int number_of_devices, int numa_node); + +// Returns device ids. All devices are on different nodes. Note: Less than +// number_of_devices may be returned. +std::vector get_mmio_device_ids_unique_nodes(int number_of_devices); + +// Returns the number of MMIO connected chips. +int get_number_of_mmio_devices(); + +// Returns the hugepage pointer assigned to a device. +void* get_hugepage(int device_id, uint32_t base_offset); + +// Returns the size of the hugepage assigned to a device. +uint32_t get_hugepage_size(int device_id); + +// Copy data to hugepage. Returns the duration. +// repeating_src_vector: Keep copying the same elements to hugepage. This should force the source data in stay in the +// caches. fence: Memory barrier at the end of each copy. 
Returns the time in seconds +template +double copy_to_hugepage( + void* hugepage_base, + uint32_t hugepage_size, + std::span src_data, + size_t total_size, + size_t page_size, + bool repeating_src_vector) { + uint64_t hugepage_addr = reinterpret_cast(hugepage_base); + uint64_t hugepage_end = hugepage_addr + hugepage_size; + uint64_t src_addr = reinterpret_cast(src_data.data()); + size_t num_pages; + if (!page_size) { + num_pages = 1; + page_size = total_size; + } else { + num_pages = total_size / page_size; + } + + auto start = get_current_time_seconds(); + for (int i = 0; i < num_pages; ++i) { + tt::tt_metal::memcpy_to_device((void*)(hugepage_addr), (void*)(src_addr), page_size); + + // 64 bit host address alignment + hugepage_addr = ((hugepage_addr + page_size - 1) | (tt::tt_metal::MEMCPY_ALIGNMENT - 1)) + 1; + + if (!repeating_src_vector) { + src_addr += page_size; + } + + // Wrap back to the beginning of hugepage + if (hugepage_addr + page_size >= hugepage_end) { + hugepage_addr = reinterpret_cast(hugepage_base); + } + } + auto end = get_current_time_seconds(); + + return end - start; +} + +}; // namespace tt::tt_metal::tools::mem_bench diff --git a/tt_metal/tools/mem_bench/kernels/mem_bench_kernel.cpp b/tt_metal/tools/mem_bench/kernels/mem_bench_kernel.cpp new file mode 100644 index 000000000000..e04b02013dec --- /dev/null +++ b/tt_metal/tools/mem_bench/kernels/mem_bench_kernel.cpp @@ -0,0 +1,99 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "dataflow_api.h" +#include +#include +#include "noc_overlay_parameters.h" + +// +// Test Kernel for mem_bench +// +// Performs PCIe reads and/or writes +// + +// reader kernel +constexpr uint32_t my_rd_dst_addr = get_compile_time_arg_val(0); // L1 +constexpr uint32_t pcie_rd_base = get_compile_time_arg_val(1); +constexpr uint32_t pcie_rd_size = get_compile_time_arg_val(2); +constexpr uint32_t pcie_rd_end = pcie_rd_base + pcie_rd_size; +constexpr uint32_t pcie_rd_transfer_size = get_compile_time_arg_val(3); +constexpr uint32_t my_bytes_rd_addr = get_compile_time_arg_val(4); + +// writer kernel +constexpr uint32_t reserved_0 = get_compile_time_arg_val(5); +constexpr uint32_t pcie_wr_base = get_compile_time_arg_val(6); +constexpr uint32_t pcie_wr_size = get_compile_time_arg_val(7); +constexpr uint32_t pcie_wr_end = pcie_wr_base + pcie_wr_size; +constexpr uint32_t pcie_wr_transfer_size = get_compile_time_arg_val(8); +constexpr uint32_t my_bytes_wr_addr = get_compile_time_arg_val(9); + +// common to both +constexpr uint32_t my_total_work = get_compile_time_arg_val(10); // Total bytes to read+write +constexpr uint32_t my_cycles_addr = get_compile_time_arg_val(11); + +static_assert(my_bytes_rd_addr && my_bytes_wr_addr, "Must provide addresses for my_bytes_rd/wr_addr"); +static_assert(my_cycles_addr, "Must provide L1 address for cycles elapsed"); + +uint64_t get_cycles() { + uint32_t timestamp_low = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); + uint32_t timestamp_high = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H); + return (((uint64_t)timestamp_high) << 32) | timestamp_low; +} + +void kernel_main() { + auto my_cycles = reinterpret_cast(my_cycles_addr); + auto my_bytes_read = reinterpret_cast(my_bytes_rd_addr); + auto my_bytes_written = reinterpret_cast(my_bytes_wr_addr); + + my_bytes_read[0] = 0; + my_bytes_written[0] = 0; + my_cycles[0] = 0; + + uint64_t pcie_noc_xy_encoding = 
(uint64_t)NOC_XY_PCIE_ENCODING(PCIE_NOC_X, PCIE_NOC_Y); + uint32_t rd_ptr = pcie_rd_base; + uint32_t wr_ptr = pcie_wr_base; + + const auto start = get_cycles(); + + uint32_t total_bytes_read = 0; + uint32_t total_bytes_written = 0; + while (total_bytes_read + total_bytes_written < my_total_work) { + if constexpr (my_rd_dst_addr) { + uint64_t host_src_addr = pcie_noc_xy_encoding | rd_ptr; + noc_async_read(host_src_addr, my_rd_dst_addr, pcie_rd_transfer_size); + rd_ptr += pcie_rd_transfer_size; + total_bytes_read += pcie_rd_transfer_size; + if (rd_ptr >= pcie_rd_end) { + rd_ptr = pcie_rd_base; + } + } + if constexpr (pcie_wr_size) { + uint64_t host_dst_addr = pcie_noc_xy_encoding | wr_ptr; + noc_async_write( + wr_ptr, // Any data + host_dst_addr, + pcie_wr_transfer_size); + wr_ptr += pcie_wr_transfer_size; + total_bytes_written += pcie_wr_transfer_size; + if (wr_ptr >= pcie_wr_end) { + wr_ptr = pcie_wr_base; + } + } + } + + if constexpr (my_rd_dst_addr) { + noc_async_read_barrier(); + } + if constexpr (pcie_wr_size) { + noc_async_write_barrier(); + } + + auto end = get_cycles(); + my_cycles[0] = end - start; + my_bytes_read[0] = total_bytes_read; + my_bytes_written[0] = total_bytes_written; +} diff --git a/tt_metal/tools/mem_bench/mem_bench.cpp b/tt_metal/tools/mem_bench/mem_bench.cpp new file mode 100644 index 000000000000..da0b2a8a8af5 --- /dev/null +++ b/tt_metal/tools/mem_bench/mem_bench.cpp @@ -0,0 +1,545 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include "context.hpp" +#include "host_utils.hpp" +#include "device_utils.hpp" +#include "work_thread.hpp" +#include "tt_metal/impl/dispatch/util/size_literals.hpp" + +using namespace tt; +using namespace tt::tt_metal; +using namespace tt::tt_metal::tools::mem_bench; + +// Read L1 counters (cycles, bytes rd, bytes wr) and increment test_results +void read_inc_data_from_cores(const Context& ctx, IDevice* device, const CoreRange& cores, TestResult& test_results) { + auto dev_cycles = read_cores(device, cores, ctx.device_address.cycles); + auto dev_bytes_read = read_cores(device, cores, ctx.device_address.rd_bytes); + auto dev_bytes_written = read_cores(device, cores, ctx.device_address.wr_bytes); + auto dev_clk = tt::Cluster::instance().get_device_aiclk(device->id()) * 1e6; // Hz + + double total_cycles = std::reduce(dev_cycles.begin(), dev_cycles.end(), 0ULL); + + test_results.total_cores_cycles += total_cycles; + test_results.total_cores_time += total_cycles / dev_clk; + // Reduce with 64 bits to prevent overflow as values read from device is 32 bits + test_results.total_cores_bytes_rd += std::reduce(dev_bytes_read.begin(), dev_bytes_read.end(), 0ULL); + test_results.total_cores_bytes_wr += std::reduce(dev_bytes_written.begin(), dev_bytes_written.end(), 0ULL); + + test_results.kernel_0_cycles += dev_cycles[0]; + test_results.kernel_0_time += dev_cycles[0] / dev_clk; + test_results.kernel_0_bytes_rd += dev_bytes_read[0]; + test_results.kernel_0_bytes_wr += dev_bytes_written[0]; +} + +// Report device bandwidth to the benchmark state +// Average bw will be reported as "dev_bw" as well as the bw for the +// first core will also be reported by itself as "kernel_0_bw". 
+void report_device_bw(benchmark::State& state, const TestResult& test_results) { + state.counters["dev_bw"] = + (test_results.total_cores_bytes_rd + test_results.total_cores_bytes_wr) / test_results.total_cores_time; + state.counters["dev_rd_bytes"] = test_results.total_cores_bytes_rd; + state.counters["dev_wr_bytes"] = test_results.total_cores_bytes_wr; + state.counters["dev_rd_bw"] = test_results.total_cores_bytes_rd / test_results.total_cores_time; + state.counters["dev_wr_bw"] = test_results.total_cores_bytes_wr / test_results.total_cores_time; + state.counters["dev_cycles"] = test_results.total_cores_cycles; + + state.counters["kernel_0_bw"] = + (test_results.kernel_0_bytes_rd + test_results.kernel_0_bytes_wr) / test_results.kernel_0_time; + state.counters["kernel_0_rd_bw"] = test_results.kernel_0_bytes_rd / test_results.kernel_0_time; + state.counters["kernel_0_wr_bw"] = test_results.kernel_0_bytes_wr / test_results.kernel_0_time; + state.counters["kernel_0_cycles"] = test_results.kernel_0_cycles; +} + +// Benchmark various memcpy_to_device transfer sizes. +// Reports host bw. +TestResult mem_bench_page_sizing(benchmark::State& state) { + constexpr uint32_t k_DeviceId = 0; + TestResult results; + Context ctx{ + {}, + state.range(0), // Total size + state.range(1), // Page size + 0, // Threads + 0, // Readers + 0, // Writers + true, // Enable host copy + 0, // Iterations is managed by the benchmark framework + }; + + auto src_data = generate_random_src_data(ctx.total_size); + auto hugepage = get_hugepage(k_DeviceId, 0); + auto hugepage_size = get_hugepage_size(k_DeviceId); + bool cached = state.range(2); + + for (auto _ : state) { + const double iteration_time = + cached ? 
copy_to_hugepage(hugepage, hugepage_size, src_data, ctx.total_size, ctx.page_size, true) + : copy_to_hugepage(hugepage, hugepage_size, src_data, ctx.total_size, ctx.page_size, false); + results.host_bytes_processed += ctx.total_size; + results.host_time_elapsed += iteration_time; + + state.SetIterationTime(iteration_time); + } + state.SetBytesProcessed(ctx.total_size * state.iterations()); + return results; +} + +// Benchmark memcpy_to_device on multiple threads to try saturating host bandwidth. +// Reports host bw. +TestResult mem_bench_copy_multithread(benchmark::State& state) { + static_assert((MEMCPY_ALIGNMENT & ((MEMCPY_ALIGNMENT)-1)) == 0); + constexpr uint32_t k_DeviceId = 0; + TestResult results; + Context ctx{ + {}, + state.range(0), // Total size + state.range(1), // Page size + state.range(2), // Threads + 0, // Readers + 0, // Writers + true, // Enable host copy + 0, // Iterations is managed by the benchmark framework + }; + auto src_data = generate_random_src_data(ctx.total_size); + auto hugepage = get_hugepage(0, 0); + const auto hugepage_size = get_hugepage_size(0); + const auto bytes_per_thread = ((ctx.total_size / ctx.threads) + (MEMCPY_ALIGNMENT)-1) & -(MEMCPY_ALIGNMENT); + const auto last_thread_bytes = ctx.total_size - (bytes_per_thread * (ctx.threads - 1)); + + for (auto _ : state) { + auto iteration_time = execute_work_synced_start( + ctx.threads, + [&](int thread_idx) { + uint64_t thread_dst = (uint64_t)hugepage + (thread_idx * bytes_per_thread); + uint64_t thread_bytes = (thread_idx == ctx.threads - 1) ? 
last_thread_bytes : bytes_per_thread; + std::span thread_src{src_data}; + thread_src = thread_src.subspan( + (thread_idx * bytes_per_thread) / sizeof(uint32_t), thread_bytes / sizeof(uint32_t)); + copy_to_hugepage( + (void*)thread_dst, hugepage_size, thread_src, thread_bytes, ctx.page_size, false); + }, + []() {}); + + results.host_bytes_processed += ctx.total_size; + results.host_time_elapsed += iteration_time; + + state.SetIterationTime(iteration_time); + } + + state.SetBytesProcessed(ctx.total_size * state.iterations()); + return results; +} + +// Benchmark memcpy_to_device while the device is reading the hugepage. +// Reports host bw and device bw. +TestResult mem_bench_copy_with_active_kernel(benchmark::State& state) { + TestResult results; + auto devices = tt::tt_metal::detail::CreateDevices(get_mmio_device_ids(1, -1)); + IDevice* device = (*(devices.begin())).second; + Context ctx{ + devices, + state.range(0), // Total size + state.range(1), // Page size + 0, // Threads + state.range(2), // Readers + 0, // Writers + state.range(3), // Enable host copy + 0, // Iterations is managed by the benchmark framework + }; + + auto src_data = generate_random_src_data(ctx.total_size); + auto hugepage = get_hugepage(device->id(), 0); + auto hugepage_size = get_hugepage_size(device->id()); + + for (auto _ : state) { + auto pgm = CreateProgram(); + auto configured_cores = configure_kernels(device, pgm, ctx, 0, ctx.number_reader_kernels, false, hugepage_size); + double host_copy_time = 1; // Set to 1 so it doesn't divide by 0 if host copy is disabled + + double wait_for_kernel_time = execute_work_synced_start( + 1, + [device, &pgm](int thread_idx) { + // Program + tt::tt_metal::detail::LaunchProgram(device, pgm, true); + }, + [&]() { + if (ctx.enable_host_copy_with_kernels) { + // Host copy while waiting for program + host_copy_time = + copy_to_hugepage(hugepage, hugepage_size, src_data, ctx.total_size, ctx.page_size, false); + results.host_bytes_processed += 
ctx.total_size; + results.host_time_elapsed += host_copy_time; + } + }); + + results.host_wait_for_kernel_time_elapsed += wait_for_kernel_time; + + read_inc_data_from_cores(ctx, device, configured_cores.value(), results); + + state.SetIterationTime(host_copy_time); + } + if (ctx.enable_host_copy_with_kernels) { + state.SetBytesProcessed(ctx.total_size * state.iterations()); + } else { + state.SetBytesProcessed(0); + } + + report_device_bw(state, results); + tt::tt_metal::detail::CloseDevices(devices); + return results; +} + +// Host writing to a hugepage while the device pulls from another hugepage. +// Reports host bw and device bw. +TestResult mem_bench_copy_active_kernel_different_page(benchmark::State& state) { + TestResult results; + auto devices = tt::tt_metal::detail::CreateDevices(get_mmio_device_ids(1, -1)); + IDevice* device = (*(devices.begin())).second; + Context ctx{ + devices, + state.range(0), // Total size + state.range(1), // Page size + 0, // Threads + state.range(2), // Readers + 0, // Writers + true, // Enable host copy + 0, // Iterations is managed by the benchmark framework + }; + + auto src_data = generate_random_src_data(ctx.total_size); + auto device_hugepage_size = get_hugepage_size(device->id()); + + // 2nd open device is not required + auto host_hugepage = get_hugepage(device->id() + 1, 0); + auto host_hugepage_size = get_hugepage_size(device->id() + 1); + + for (auto _ : state) { + auto pgm = CreateProgram(); + auto configured_cores = + configure_kernels(device, pgm, ctx, 0, ctx.number_reader_kernels, false, device_hugepage_size).value(); + double host_copy_time = 0; + + double wait_for_kernel_time = execute_work_synced_start( + 1, + [device, &pgm](int thread_idx) { + // Program + tt::tt_metal::detail::LaunchProgram(device, pgm, true); + }, + [&]() { + // Host copy while waiting for program + host_copy_time = + copy_to_hugepage(host_hugepage, host_hugepage_size, src_data, ctx.total_size, ctx.page_size, false); + 
results.host_bytes_processed += ctx.total_size; + results.host_time_elapsed += host_copy_time; + }); + + results.host_wait_for_kernel_time_elapsed += wait_for_kernel_time; + + read_inc_data_from_cores(ctx, device, configured_cores, results); + + state.SetIterationTime(host_copy_time); + } + + state.SetBytesProcessed(ctx.total_size * state.iterations()); + + report_device_bw(state, results); + tt::tt_metal::detail::CloseDevices(devices); + return results; +} + +// Common Multi MMIO device test. +TestResult mem_bench_multi_mmio_devices( + benchmark::State& state, std::map& devices, const Context& ctx) { + TestResult results; + + // One thread to wait for program on each device + int num_threads = devices.size(); + + for (auto _ : state) { + std::map programs; // device : programs + std::map configured_core_ranges; // device : cores + for (auto [device_id, device] : devices) { + programs[device_id] = CreateProgram(); + Program& pgm = programs[device_id]; + auto device_hugepage = get_hugepage(device_id, 0); + auto device_hugepage_size = get_hugepage_size(device_id); + configured_core_ranges.insert( + {device_id, + configure_kernels(device, pgm, ctx, 0, ctx.number_reader_kernels, false, device_hugepage_size) + .value()}); + } + + double host_copy_time = 0; + execute_work_synced_start( + 1, + [devices, &programs](int thread_idx) { + // Program + for (auto& [device_id, pgm] : programs) { + tt::tt_metal::detail::LaunchProgram(devices.at(device_id), pgm, false); + } + }, + []() {}); + + // Wait all programs to complete + for (auto& [device_id, pgm] : programs) { + tt::tt_metal::detail::WaitProgramDone(devices.at(device_id), pgm); + } + + // Read counters from each core + for (auto& [device_id, core_range] : configured_core_ranges) { + read_inc_data_from_cores(ctx, devices.at(device_id), core_range, results); + } + + // This test does not report host bw + state.SetIterationTime(1); + } + + state.SetBytesProcessed(0); + report_device_bw(state, results); + 
state.counters["num_mmio_devices"] = devices.size(); + + return results; +} + +// Multi MMIO devices reading on the same NUMA node. +TestResult mem_bench_multi_mmio_devices_reading_same_node(benchmark::State& state) { + // Node 0 + auto devices = tt::tt_metal::detail::CreateDevices(get_mmio_device_ids(get_number_of_mmio_devices(), 0)); + + Context ctx{ + devices, + state.range(0), // Total size + state.range(1), // Page size + 0, // Threads + state.range(2), // Readers on each device + 0, // Writers + false, // Enable host copy + 0, // Iterations is managed by the benchmark framework + }; + + TestResult results = mem_bench_multi_mmio_devices(state, devices, ctx); + tt::tt_metal::detail::CloseDevices(devices); + + return results; +} + +// Multi MMIO devices reading on different NUMA nodes. +TestResult mem_bench_multi_mmio_devices_reading_different_node(benchmark::State& state) { + auto devices = tt::tt_metal::detail::CreateDevices(get_mmio_device_ids_unique_nodes(get_number_of_mmio_devices())); + + Context ctx{ + devices, + state.range(0), // Total size + state.range(1), // Page size + 0, // Threads + state.range(2), // Readers on each device + 0, // Writers + false, // Enable host copy + 0, // Iterations is managed by the benchmark framework + }; + + TestResult results = mem_bench_multi_mmio_devices(state, devices, ctx); + tt::tt_metal::detail::CloseDevices(devices); + + return results; +} + +// Benchmark memcpy_to_device while device is reading (prefetching) and writing (dispatching data back to host) +// First half of hugepage will be written to by host +// Second half will be written to by device +TestResult mem_bench_copy_with_read_and_write_kernel(benchmark::State& state) { + auto devices = tt::tt_metal::detail::CreateDevices(get_mmio_device_ids(1, -1)); + IDevice* device = (*(devices.begin())).second; + Context ctx{ + devices, + state.range(0), // Total size + state.range(1), // Page size + 0, // Threads + state.range(2), // Readers + state.range(3), // 
Writers
+        true,             // Enable host copy
+        0,                // Iterations is managed by the benchmark framework
+    };
+
+    auto src_data = generate_random_src_data(ctx.total_size);
+    auto hugepage = get_hugepage(device->id(), 0);
+    auto hugepage_size = get_hugepage_size(device->id());
+
+    // Don't need to separate device results:
+    // Readers will have 0 bytes written
+    // Writers will have 0 bytes read. Will not mix.
+    TestResult results;
+
+    for (auto _ : state) {
+        auto pgm = CreateProgram();
+        auto configured_read_cores =
+            configure_kernels(device, pgm, ctx, 0, ctx.number_reader_kernels, false, hugepage_size / 2).value();
+        // Offset write cores to second half of PCIe hugepage
+        // Use second row of cores
+        auto configured_write_cores =
+            configure_kernels(
+                device, pgm, ctx, 1, ctx.number_writer_kernels, true, hugepage_size / 2, hugepage_size / 2)
+                .value();
+        double host_copy_time = 0;
+
+        double wait_for_kernel_time = execute_work_synced_start(
+            1,
+            [device, &pgm](int thread_idx) {
+                // Launch the program and block until it completes
+                tt::tt_metal::detail::LaunchProgram(device, pgm, true);
+            },
+            [&]() {
+                // Host copy (into the host-owned first half) while waiting for program
+                host_copy_time =
+                    copy_to_hugepage(hugepage, hugepage_size / 2, src_data, ctx.total_size, ctx.page_size, false);
+                results.host_bytes_processed += ctx.total_size;
+                results.host_time_elapsed += host_copy_time;
+            });
+
+        results.host_wait_for_kernel_time_elapsed += wait_for_kernel_time;
+
+        read_inc_data_from_cores(ctx, device, configured_read_cores, results);
+        read_inc_data_from_cores(ctx, device, configured_write_cores, results);
+
+        state.SetIterationTime(host_copy_time);
+    }
+
+    state.SetBytesProcessed(ctx.total_size * state.iterations());
+    report_device_bw(state, results);
+    tt::tt_metal::detail::CloseDevices(devices);
+    return results;
+}
+
+void global_bench_args(benchmark::internal::Benchmark* b) { b->UseManualTime()->Iterations(5); }
+
+void register_basic_benchmark_suite() {
+    ::benchmark::RegisterBenchmark("Host Copy Page Sizing", mem_bench_page_sizing)
+        
->Apply(global_bench_args) + ->ArgsProduct({ + {1_GB}, + {16, 8_KB, 16_KB, 32_KB}, + {false}, + }); + ::benchmark::RegisterBenchmark("Host Copy (Cached)", mem_bench_page_sizing) + ->Apply(global_bench_args) + ->ArgsProduct({ + {1_GB}, + {16, 8_KB, 16_KB, 32_KB}, + {true}, + }); + ::benchmark::RegisterBenchmark("Host Copy Saturation", mem_bench_copy_multithread) + ->Apply(global_bench_args) + ->ArgsProduct({ + {1_GB}, + {32_KB}, + {1, 2, 3, 4, 5, 6, 7, 8}, + }); + ::benchmark::RegisterBenchmark("Device Reading Host", mem_bench_copy_with_active_kernel) + ->Apply(global_bench_args) + ->ArgsProduct({ + {1_GB}, + {32_KB}, + {1, 2, 3, 4}, + {false}, + }); +} + +void register_full_benchmark_suite() { + ::benchmark::RegisterBenchmark("Host Copy with Active Kernel", mem_bench_copy_with_active_kernel) + ->Apply(global_bench_args) + ->ArgsProduct({ + {1_GB}, + {32_KB}, + {1, 2, 3, 4}, + {false}, + }); + ::benchmark::RegisterBenchmark( + "Host Copy with Active Kernel on Different Hugepages", mem_bench_copy_active_kernel_different_page) + ->Apply(global_bench_args) + ->ArgsProduct({ + {1_GB}, + {32_KB}, + {1, 2, 3, 4}, + }); + ::benchmark::RegisterBenchmark( + "Host Copy with Active Kernel Reading and Writing", mem_bench_copy_with_read_and_write_kernel) + ->Apply(global_bench_args) + ->ArgsProduct({ + {1_GB}, + {32_KB}, + {1, 2}, + {1, 2}, + }); + ::benchmark::RegisterBenchmark( + "Multiple MMIO Devices Reading (Same NUMA node)", mem_bench_multi_mmio_devices_reading_same_node) + ->Apply(global_bench_args) + ->ArgsProduct({ + {1_GB}, + {32_KB}, + {1, 2}, + }); + ::benchmark::RegisterBenchmark( + "Multiple MMIO Devices Reading (Different NUMA node)", mem_bench_multi_mmio_devices_reading_different_node) + ->Apply(global_bench_args) + ->ArgsProduct({ + {1_GB}, + {32_KB}, + {1, 2}, + }); +} + +void print_help() { + ::benchmark::PrintDefaultHelp(); + std::cout << " [--help] Shows this help message\n"; + std::cout << " [--full] Run all tests\n"; + std::cout << "\nCounters\n"; + 
std::cout << " bytes_per_second: Aggregate Host copy to hugepage bandwidth. 0 if not measured.\n"; + std::cout << " dev_bw: Average device core PCIe pull bandwidth. 0 if not measured.\n"; +} + +int main(int argc, char* argv[]) { + std::vector input_args(argv, argv + argc); + if (test_args::has_command_option(input_args, "--help")) { + print_help(); + return 0; + } + + // Force TT_METAL options + setenv("TT_METAL_SLOW_DISPATCH_MODE", "true", true); + setenv("TT_METAL_CLEAR_L1", "1", true); + // May be overridden by the user + setenv("TT_METAL_LOGGER_LEVEL", "FATAL", false); + + char arg0_default[] = "benchmark"; + char* args_default = arg0_default; + if (!argv) { + argc = 1; + argv = &args_default; + } + + // Run basic benchmarks + register_basic_benchmark_suite(); + + // Run all benchmarks + if (test_args::has_command_option(input_args, "--full")) { + register_full_benchmark_suite(); + } + + ::benchmark::Initialize(&argc, argv); + ::benchmark::RunSpecifiedBenchmarks(); + ::benchmark::Shutdown(); + return 0; +} diff --git a/tt_metal/tools/mem_bench/work_thread.hpp b/tt_metal/tools/mem_bench/work_thread.hpp new file mode 100644 index 000000000000..6e47308a5cb3 --- /dev/null +++ b/tt_metal/tools/mem_bench/work_thread.hpp @@ -0,0 +1,75 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include +#include "host_utils.hpp" + +namespace tt::tt_metal::tools::mem_bench { + +// Execute work_fn on num_threads threads and also do intermediate_fn on the side. +// Returns time taken in seconds for all work_fn to complete. Time is calculated by latest thread end - earliest thread +// start. +template +double execute_work_synced_start(int num_threads, F&& work_fn, IntermediateF&& intermediate_fn, Args&&... 
args) { + std::mutex m; + int threads_ready{0}; + std::condition_variable go_cv; // Signal to all threads to go + auto total_threads = num_threads + 1; // Including intermediate + std::vector thread_start_times(num_threads); + std::vector thread_end_times(num_threads); + std::vector threads(total_threads); + + for (int i = 0; i < num_threads; ++i) { + threads[i] = std::thread([i, + &m, + &go_cv, + &threads_ready, + &thread_start_times, + &thread_end_times, + total_threads, + work_fn = std::forward(work_fn), + ... args = std::forward(args)]() mutable { + { + std::unique_lock lk{m}; + threads_ready++; + if (threads_ready == total_threads) { + go_cv.notify_all(); + } + go_cv.wait(lk, [&] { return threads_ready == total_threads; }); + } + + thread_start_times[i] = get_current_time_seconds(); + work_fn(i, std::forward(args)...); + thread_end_times[i] = get_current_time_seconds(); + }); + } + + threads[num_threads] = std::thread([&]() mutable { + std::unique_lock lk{m}; + threads_ready++; + if (threads_ready == total_threads) { + go_cv.notify_all(); + } + go_cv.wait(lk, [&] { return threads_ready == total_threads; }); + + intermediate_fn(); + }); + + for (auto& thread : threads) { + thread.join(); + } + + // Calculate work time based on earliest start and latest end + double earliest_start = *std::min_element(thread_start_times.begin(), thread_start_times.end()); + double latest_end = *std::max_element(thread_end_times.begin(), thread_end_times.end()); + + return latest_end - earliest_start; +} + +}; // namespace tt::tt_metal::tools::mem_bench