#0: comprehensive mem benchmark tool
Reland the previously reverted commit 532dd26
nhuang-tt committed Feb 22, 2025
1 parent c9feb5d commit f92eee1
Showing 14 changed files with 1,179 additions and 10 deletions.
7 changes: 2 additions & 5 deletions tt_metal/api/tt-metalium/command_queue_interface.hpp
@@ -3,19 +3,16 @@
// SPDX-License-Identifier: Apache-2.0

#pragma once
#include <climits>
#include <magic_enum/magic_enum.hpp>
#include <mutex>
#include <tt-metalium/tt_align.hpp>
#include <unordered_map>

#include "cq_commands.hpp"
#include "dispatch_core_manager.hpp"
#include "launch_message_ring_buffer_state.hpp"
#include "memcpy.hpp"
#include "hal.hpp"
#include "dispatch_settings.hpp"
#include "helpers.hpp"
#include "buffer.hpp"
#include "umd/device/tt_core_coordinates.h"

@@ -193,8 +190,8 @@ class DispatchMemMap {
uint32_t prefetch_dispatch_unreserved_base =
device_cq_addrs_[tt::utils::underlying_type<CommandQueueDeviceAddrType>(
CommandQueueDeviceAddrType::UNRESERVED)];
cmddat_q_base_ = prefetch_dispatch_unreserved_base + round_size(settings.prefetch_q_size_, pcie_alignment);
scratch_db_base_ = cmddat_q_base_ + round_size(settings.prefetch_cmddat_q_size_, pcie_alignment);
cmddat_q_base_ = align(prefetch_dispatch_unreserved_base + settings.prefetch_q_size_, pcie_alignment);
scratch_db_base_ = align(cmddat_q_base_ + settings.prefetch_cmddat_q_size_, pcie_alignment);
dispatch_buffer_base_ = align(prefetch_dispatch_unreserved_base, 1 << DispatchSettings::DISPATCH_BUFFER_LOG_PAGE_SIZE);
dispatch_buffer_block_size_pages_ = settings.dispatch_pages_ / DispatchSettings::DISPATCH_BUFFER_SIZE_BLOCKS;
const uint32_t dispatch_cb_end = dispatch_buffer_base_ + settings.dispatch_size_;
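
For context on the hunk above: the old code rounded each size up and then added it to the base, while the new code adds the raw size and aligns the resulting address itself. Below is a minimal standalone sketch of that arithmetic difference, using made-up numbers and local stand-ins (`round_size`, `align_up`) rather than the actual tt-metalium helpers.

```cpp
#include <cstdint>
#include <iostream>

// Local stand-ins for the helpers used in the hunk above.
static uint32_t round_size(uint32_t sz, uint32_t alignment) {
    return (sz + alignment - 1) / alignment * alignment;
}
static uint32_t align_up(uint32_t addr, uint32_t alignment) {
    return (addr + alignment - 1) / alignment * alignment;
}

int main() {
    // Hypothetical values, not taken from the commit.
    const uint32_t unreserved_base = 100;  // deliberately unaligned base
    const uint32_t prefetch_q_size = 24;
    const uint32_t pcie_alignment = 64;

    // Old layout: round the size first, then add it to the (possibly unaligned) base.
    uint32_t old_cmddat_q_base = unreserved_base + round_size(prefetch_q_size, pcie_alignment);

    // New layout: add the raw size, then align the resulting address itself.
    uint32_t new_cmddat_q_base = align_up(unreserved_base + prefetch_q_size, pcie_alignment);

    std::cout << old_cmddat_q_base << "\n";  // 100 + 64 = 164, not 64-byte aligned
    std::cout << new_cmddat_q_base << "\n";  // align(124, 64) = 128, aligned and more compact
    return 0;
}
```
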
2 changes: 1 addition & 1 deletion tt_metal/impl/dispatch/util/dispatch_settings.cpp
@@ -8,7 +8,7 @@
#include "magic_enum/magic_enum.hpp"
#include "umd/device/tt_core_coordinates.h"
#include <dispatch_settings.hpp>
#include <helpers.hpp>
#include "size_literals.hpp"

namespace tt::tt_metal {

@@ -1,4 +1,4 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

@@ -18,8 +18,6 @@ constexpr auto operator""_MB(const unsigned long long v) -> uint32_t { return 10
constexpr auto operator""_GB(const unsigned long long v) -> uint32_t { return 1024 * 1024 * 1024 * v; }

// Returns the size rounded up to the given alignment
inline uint32_t round_size(uint32_t sz, uint32_t alignment) {
return ((sz + alignment - 1) / alignment * alignment);
}
inline uint32_t round_size(uint32_t sz, uint32_t alignment) { return ((sz + alignment - 1) / alignment * alignment); }

} // namespace tt::tt_metal
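
A brief usage sketch of the size literal and the one-line `round_size` shown above; the helpers are re-declared locally so the snippet stands alone and are not the project's actual header.

```cpp
#include <cstdint>

// Re-declared locally for a self-contained sketch; in the tree these live
// in the tt::tt_metal size-literal header touched above.
constexpr auto operator""_MB(const unsigned long long v) -> uint32_t { return 1024 * 1024 * v; }

inline uint32_t round_size(uint32_t sz, uint32_t alignment) { return (sz + alignment - 1) / alignment * alignment; }

int main() {
    static_assert(2_MB == 2u * 1024 * 1024, "literal expands to bytes");
    // Pad an odd size up to a 4 KiB boundary.
    return round_size(1_MB + 123, 4096) == 1_MB + 4096 ? 0 : 1;
}
```
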
2 changes: 2 additions & 0 deletions tt_metal/tools/CMakeLists.txt
@@ -1,6 +1,7 @@
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/profiler)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/watcher_dump)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/lightmetal_runner)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/mem_bench)

set(TOOLS_SRC ${CMAKE_CURRENT_SOURCE_DIR}/memset.cpp)

@@ -10,6 +11,7 @@ target_link_libraries(
PUBLIC
profiler
Metalium::Metal::LLRT
Metalium::Metal
PRIVATE
TT::Metalium::HostDevCommon
)
42 changes: 42 additions & 0 deletions tt_metal/tools/mem_bench/CMakeLists.txt
@@ -0,0 +1,42 @@
set(IMPL_SRC
${CMAKE_CURRENT_SOURCE_DIR}/mem_bench.cpp
${CMAKE_CURRENT_SOURCE_DIR}/host_utils.cpp
${CMAKE_CURRENT_SOURCE_DIR}/device_utils.cpp
)

set(HEADERS_SRC
${CMAKE_CURRENT_SOURCE_DIR}/host_utils.hpp
${CMAKE_CURRENT_SOURCE_DIR}/device_utils.hpp
${CMAKE_CURRENT_SOURCE_DIR}/work_thread.hpp
)

add_executable(
mem_bench
${IMPL_SRC}
${HEADERS_SRC}
)
target_link_libraries(
mem_bench
PRIVATE
Metalium::Metal
TT::Metalium::Common
Metalium::Metal::Impl
Metalium::Metal::LLRT
numa
benchmark::benchmark
)
target_include_directories(
mem_bench
PRIVATE
${PROJECT_SOURCE_DIR}
${PROJECT_SOURCE_DIR}/tt_metal
${PROJECT_SOURCE_DIR}/tt_metal/common
${PROJECT_SOURCE_DIR}/tests
${CMAKE_CURRENT_SOURCE_DIR}
)
set_target_properties(
mem_bench
PROPERTIES
RUNTIME_OUTPUT_DIRECTORY
${PROJECT_BINARY_DIR}/tools
)
46 changes: 46 additions & 0 deletions tt_metal/tools/mem_bench/README.md
@@ -0,0 +1,46 @@
# tt mem_bench

Utility to measure host and device bandwidth on Tenstorrent devices.

## Build

Tools are included in `tt_metal` builds. A release build is required for accurate performance measurements.

## Usage

By default, each test is run for 5 iterations and only basic tests are executed. All test patterns can be executed by specifying `--full`. Additional run parameters are listed below.

Tests report host bandwidth and/or device bandwidth. When device bandwidth is reported, both the average across all cores and the bandwidth of a single core are shown.

> [!NOTE]
> The `tt_metal` library log level can be adjusted by exporting `TT_METAL_LOGGER_LEVEL=fatal|info|error|debug`.

> [!NOTE]
> On NUMA systems, the host page for the device's command queue data is pinned on the memory node closest to the device. If `tt_metal` runs on a different node, bandwidth degrades because traffic must cross sockets, so it is important to run `tt_metal` on the node closest to the device. On Linux, the execution policy can be set with `numactl`. E.g., if the device is attached to node 0, `numactl --cpubind=0 --membind=0 <command>` allocates resources closer to the device.

```
./build/tools/mem_bench --help
benchmark [--benchmark_list_tests={true|false}]
[--benchmark_filter=<regex>]
[--benchmark_min_time=`<integer>x` OR `<float>s` ]
[--benchmark_min_warmup_time=<min_warmup_time>]
[--benchmark_repetitions=<num_repetitions>]
[--benchmark_dry_run={true|false}]
[--benchmark_enable_random_interleaving={true|false}]
[--benchmark_report_aggregates_only={true|false}]
[--benchmark_display_aggregates_only={true|false}]
[--benchmark_format=<console|json|csv>]
[--benchmark_out=<filename>]
[--benchmark_out_format=<json|console|csv>]
[--benchmark_color={auto|true|false}]
[--benchmark_counters_tabular={true|false}]
[--benchmark_context=<key>=<value>,...]
[--benchmark_time_unit={ns|us|ms|s}]
[--v=<verbosity>]
[--help] Shows this help message
[--full] Run all tests
Counters
bytes_per_second: Aggregate Host copy to hugepage bandwidth. 0 if not measured.
dev_bw: Average device core PCIe pull bandwidth. 0 if not measured.
```
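
The `bytes_per_second` and `dev_bw` counters above are Google Benchmark counters (the tool links against `benchmark::benchmark`). The sketch below is illustrative only, not the real mem_bench implementation: a fake host-copy benchmark between heap buffers that reports the same two counters, with `BM_HostCopy` being a hypothetical name.

```cpp
#include <benchmark/benchmark.h>
#include <cstring>
#include <vector>

// Illustrative stand-in: the real mem_bench measures hugepage and PCIe
// traffic; this sketch just memcpy's between ordinary heap buffers.
static void BM_HostCopy(benchmark::State& state) {
    const size_t bytes = static_cast<size_t>(state.range(0));
    std::vector<char> src(bytes, 1), dst(bytes, 0);

    for (auto _ : state) {
        std::memcpy(dst.data(), src.data(), bytes);
        benchmark::DoNotOptimize(dst.data());
    }

    // bytes_per_second: aggregate host copy bandwidth over all iterations.
    state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * bytes);
    // dev_bw: would be filled in from kernel-side counters on a real run;
    // 0 mirrors the "0 if not measured" behaviour described above.
    state.counters["dev_bw"] = 0;
}
BENCHMARK(BM_HostCopy)->Arg(1 << 20);

BENCHMARK_MAIN();
```
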
78 changes: 78 additions & 0 deletions tt_metal/tools/mem_bench/context.hpp
@@ -0,0 +1,78 @@
// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include <string>
#include <map>
#include <tt-metalium/device.hpp>
#include <tt-metalium/hal_exp.hpp>
#include <tt-metalium/tt_align.hpp>

namespace tt::tt_metal::tools::mem_bench {

struct TestResult {
double host_bytes_processed{0};
double host_time_elapsed{0};
double host_wait_for_kernel_time_elapsed{0};

double total_cores_cycles{0};
double total_cores_time{0};
double total_cores_bytes_rd{0};
double total_cores_bytes_wr{0};

double kernel_0_cycles{0};
double kernel_0_time{0};
double kernel_0_bytes_rd{0};
double kernel_0_bytes_wr{0};

// Any additional values to be included in benchmark reports
std::map<std::string, double> arb_counters;
};

struct L1MemoryMap {
uint32_t cycles;
uint32_t rd_bytes;
uint32_t wr_bytes;
uint32_t unreserved;
};

struct Context {
std::map<chip_id_t, IDevice*> devices;
L1MemoryMap device_address;
uint32_t total_size{0};
uint32_t page_size{0};
int threads{0};
int number_reader_kernels{0};
int number_writer_kernels{0};
bool enable_host_copy_with_kernels{0};
int iterations{0};

Context(
const std::map<chip_id_t, IDevice*>& devices_,
uint32_t total_size_,
uint32_t page_size_,
int threads_,
int readers_,
int writers_,
bool enable_host_copy_with_kernels_,
int iterations_) {
auto l1_alignment = experimental::hal::get_l1_alignment();
auto l1_base = experimental::hal::get_tensix_l1_unreserved_base();
device_address.cycles = l1_base;
device_address.rd_bytes = align(device_address.cycles + sizeof(uint32_t), l1_alignment);
device_address.wr_bytes = align(device_address.rd_bytes + sizeof(uint32_t), l1_alignment);
device_address.unreserved = align(device_address.wr_bytes + sizeof(uint32_t), l1_alignment);
devices = devices_;
total_size = total_size_;
page_size = page_size_;
threads = threads_;
number_reader_kernels = readers_;
number_writer_kernels = writers_;
enable_host_copy_with_kernels = enable_host_copy_with_kernels_;
iterations = iterations_;
}
};

} // namespace tt::tt_metal::tools::mem_bench
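
A minimal sketch of the L1 counter layout the `Context` constructor builds: each 4-byte counter gets its own aligned slot, followed by the unreserved scratch region. The L1 base and alignment below are assumptions; the real values come from `experimental::hal` at runtime, and `align_up` is a local stand-in for `tt::align`.

```cpp
#include <cstdint>
#include <iostream>

// Local stand-in for tt::align.
static uint32_t align_up(uint32_t addr, uint32_t alignment) {
    return (addr + alignment - 1) / alignment * alignment;
}

int main() {
    const uint32_t l1_base = 0x10000;  // hypothetical tensix L1 unreserved base
    const uint32_t l1_alignment = 16;  // hypothetical L1 alignment

    // Same layout logic as the Context constructor above.
    uint32_t cycles     = l1_base;                                              // 0x10000
    uint32_t rd_bytes   = align_up(cycles + sizeof(uint32_t), l1_alignment);    // 0x10010
    uint32_t wr_bytes   = align_up(rd_bytes + sizeof(uint32_t), l1_alignment);  // 0x10020
    uint32_t unreserved = align_up(wr_bytes + sizeof(uint32_t), l1_alignment);  // 0x10030

    std::cout << std::hex << cycles << " " << rd_bytes << " "
              << wr_bytes << " " << unreserved << "\n";
    return 0;
}
```
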
92 changes: 92 additions & 0 deletions tt_metal/tools/mem_bench/device_utils.cpp
@@ -0,0 +1,92 @@
// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <tt-metalium/host_api.hpp>
#include <tt-metalium/tt_metal.hpp>
#include "device_utils.hpp"
#include "context.hpp"

namespace tt::tt_metal::tools::mem_bench {

std::vector<uint32_t> read_cores(tt::tt_metal::IDevice* device, const CoreRange& cores, uint32_t addr) {
std::vector<uint32_t> data;
for (int xi = cores.start_coord.x; xi <= cores.end_coord.x; ++xi) {
for (int yi = cores.start_coord.y; yi <= cores.end_coord.y; ++yi) {
std::vector<uint32_t> single_data;
tt::tt_metal::detail::ReadFromDeviceL1(device, CoreCoord{xi, yi}, addr, sizeof(uint32_t), single_data);
data.push_back(single_data[0]);
}
}
return data;
}

std::optional<CoreRange> configure_kernels(
tt::tt_metal::IDevice* device,
tt::tt_metal::Program& program,
const Context& context,
uint32_t start_y,
uint32_t num_kernels,
bool is_writer,
uint32_t pcie_size,
uint32_t pcie_offset) {
constexpr std::string_view k_PcieBenchKernel = "tt_metal/tools/mem_bench/kernels/mem_bench_kernel.cpp";
const auto grid_size = device->logical_grid_size();
const auto max_x = grid_size.x;
const auto max_y = grid_size.y;
uint32_t total_kernel_transfer = context.total_size;
uint32_t kernel_transfer_size = context.page_size;

if (!kernel_transfer_size) {
kernel_transfer_size = total_kernel_transfer;
} else if (!num_kernels) {
return {};
}

// The kernels either fit within a single (possibly partial) row
// or span a whole number of full rows.
CoreCoord start_coord{0, start_y};
CoreCoord end_coord;
if (num_kernels <= max_x) {
end_coord.x = start_coord.x + num_kernels - 1;
end_coord.y = start_coord.y;
} else {
const auto number_of_rows = num_kernels / max_x;
const auto last_row_width = (num_kernels % max_x) ? num_kernels % max_x : max_x;
end_coord.x = start_coord.x + last_row_width - 1;
end_coord.y = number_of_rows - 1;
}
CoreRange core_range{start_coord, end_coord};

std::vector<uint32_t> pcie_bench_compile_args(12, 0);
if (is_writer) {
pcie_bench_compile_args[5] = 0; // reserved_0
pcie_bench_compile_args[6] = pcie_offset; // pcie_wr_base
pcie_bench_compile_args[7] = pcie_size; // pcie_wr_size
pcie_bench_compile_args[8] = kernel_transfer_size; // pcie_wr_transfer_size
} else {
pcie_bench_compile_args[0] = context.device_address.unreserved; // my_rd_dst_addr
pcie_bench_compile_args[1] = pcie_offset; // pcie_rd_base
pcie_bench_compile_args[2] = pcie_size; // pcie_rd_size
pcie_bench_compile_args[3] = kernel_transfer_size; // pcie_rd_transfer_size
}
pcie_bench_compile_args[4] = context.device_address.rd_bytes; // my_bytes_rd_addr
pcie_bench_compile_args[9] = context.device_address.wr_bytes; // my_bytes_wr_addr
pcie_bench_compile_args[10] = total_kernel_transfer;
pcie_bench_compile_args[11] = context.device_address.cycles;

[[maybe_unused]] auto kernel = tt::tt_metal::CreateKernel(
program,
std::string{k_PcieBenchKernel},
core_range,
DataMovementConfig{
.processor = DataMovementProcessor::RISCV_1,
.noc = tt::tt_metal::NOC_0,
.compile_args = pcie_bench_compile_args,
.defines = {},
});

return core_range;
}

} // namespace tt::tt_metal::tools::mem_bench
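
To make the row math in `configure_kernels` concrete, here is a standalone sketch of the same placement logic with an assumed grid width; `Coord` and `place` are simplified stand-ins, not the real `CoreCoord`/grid types.

```cpp
#include <cstdint>
#include <cstdio>

struct Coord { uint32_t x, y; };

static void place(uint32_t num_kernels, uint32_t max_x, uint32_t start_y) {
    Coord start{0, start_y};
    Coord end{};
    if (num_kernels <= max_x) {
        // All kernels fit in one (possibly partial) row.
        end = {start.x + num_kernels - 1, start.y};
    } else {
        // Otherwise the count is expected to span whole rows.
        const uint32_t rows = num_kernels / max_x;
        const uint32_t last_row_width = (num_kernels % max_x) ? num_kernels % max_x : max_x;
        end = {start.x + last_row_width - 1, rows - 1};
    }
    std::printf("%u kernels on %u-wide grid -> (%u,%u)..(%u,%u)\n",
                num_kernels, max_x, start.x, start.y, end.x, end.y);
}

int main() {
    place(3, 8, 0);   // partial row:   (0,0)..(2,0)
    place(16, 8, 0);  // two full rows: (0,0)..(7,1)
    return 0;
}
```
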
26 changes: 26 additions & 0 deletions tt_metal/tools/mem_bench/device_utils.hpp
@@ -0,0 +1,26 @@
// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include <optional>
#include <vector>
#include <tt-metalium/device.hpp>
#include <tt-metalium/core_coord.hpp>
#include "context.hpp"

namespace tt::tt_metal::tools::mem_bench {

std::vector<uint32_t> read_cores(tt::tt_metal::IDevice* device, const CoreRange& cores, uint32_t addr);

std::optional<CoreRange> configure_kernels(
tt::tt_metal::IDevice* device,
tt::tt_metal::Program& program,
const Context& context,
uint32_t start_y,
uint32_t num_kernels,
bool is_writer,
uint32_t pcie_size,
uint32_t pcie_offset = 0);

} // namespace tt::tt_metal::tools::mem_bench