-
Notifications
You must be signed in to change notification settings - Fork 111
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
#0: comprehensive mem benchmark tool
Reland the previously reverted commit 532dd26
- Loading branch information
Showing
14 changed files
with
1,179 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
# Translation units compiled into the mem_bench executable.
set(IMPL_SRC
    ${CMAKE_CURRENT_SOURCE_DIR}/mem_bench.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/host_utils.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/device_utils.cpp
)

# Headers belonging to the tool. They are attached to the target below but are
# not compiled on their own. context.hpp is listed because device_utils.cpp and
# device_utils.hpp include it.
set(HEADERS_SRC
    ${CMAKE_CURRENT_SOURCE_DIR}/context.hpp
    ${CMAKE_CURRENT_SOURCE_DIR}/host_utils.hpp
    ${CMAKE_CURRENT_SOURCE_DIR}/device_utils.hpp
    ${CMAKE_CURRENT_SOURCE_DIR}/work_thread.hpp
)

add_executable(
    mem_bench
    ${IMPL_SRC}
    ${HEADERS_SRC}
)

# numa and benchmark::benchmark are linked in addition to the Metalium targets.
target_link_libraries(
    mem_bench
    PRIVATE
        Metalium::Metal
        TT::Metalium::Common
        Metalium::Metal::Impl
        Metalium::Metal::LLRT
        numa
        benchmark::benchmark
)

target_include_directories(
    mem_bench
    PRIVATE
        ${PROJECT_SOURCE_DIR}
        ${PROJECT_SOURCE_DIR}/tt_metal
        ${PROJECT_SOURCE_DIR}/tt_metal/common
        ${PROJECT_SOURCE_DIR}/tests
        ${CMAKE_CURRENT_SOURCE_DIR}
)

# Place the binary under <build>/tools.
set_target_properties(
    mem_bench
    PROPERTIES
        RUNTIME_OUTPUT_DIRECTORY
            ${PROJECT_BINARY_DIR}/tools
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
# tt mem_bench | ||
|
||
Utility to measure host and device bandwidth on Tenstorrent devices. | ||
|
||
## Build | ||
|
||
Tools are included in `tt_metal` builds. Using a release build is required for accurate perf measurements. | ||
|
||
## Usage | ||
|
||
By default, each test is run for 5 iterations and only basic tests are executed. All test patterns can be executed by specifying `--full`. Additional run parameters are listed below. | ||
|
||
Tests will report host bandwidth and/or device bandwidth. If device bandwidth is reported, then the average of all cores is reported as well as bandwidth for just a single core. | ||
|
||
> [!NOTE]
> The `tt_metal` library log level can be adjusted by exporting `TT_METAL_LOGGER_LEVEL=fatal|info|error|debug`.
|
||
> [!NOTE]
> On NUMA systems, the host page for the device's command queue data is pinned on the memory node closest to where the device is located. If `tt_metal` runs on a different node, bandwidth will degrade because traffic has to cross sockets, so it is important to run `tt_metal` on the node closest to the device. On Linux, the execution policy can be set using `numactl`. For example, if the device is located on node 0, then `numactl --cpubind=0 --membind=0 <command>` will allocate resources closer to the device.
``` | ||
./build/tools/mem_bench --help | ||
benchmark [--benchmark_list_tests={true|false}] | ||
[--benchmark_filter=<regex>] | ||
[--benchmark_min_time=`<integer>x` OR `<float>s` ] | ||
[--benchmark_min_warmup_time=<min_warmup_time>] | ||
[--benchmark_repetitions=<num_repetitions>] | ||
[--benchmark_dry_run={true|false}] | ||
[--benchmark_enable_random_interleaving={true|false}] | ||
[--benchmark_report_aggregates_only={true|false}] | ||
[--benchmark_display_aggregates_only={true|false}] | ||
[--benchmark_format=<console|json|csv>] | ||
[--benchmark_out=<filename>] | ||
[--benchmark_out_format=<json|console|csv>] | ||
[--benchmark_color={auto|true|false}] | ||
[--benchmark_counters_tabular={true|false}] | ||
[--benchmark_context=<key>=<value>,...] | ||
[--benchmark_time_unit={ns|us|ms|s}] | ||
[--v=<verbosity>] | ||
[--help] Shows this help message | ||
[--full] Run all tests | ||
Counters | ||
bytes_per_second: Aggregate Host copy to hugepage bandwidth. 0 if not measured. | ||
dev_bw: Average device core PCIe pull bandwidth. 0 if not measured. | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. | ||
// | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
#pragma once | ||
|
||
#include <string> | ||
#include <map> | ||
#include <tt-metalium/device.hpp> | ||
#include <tt-metalium/hal_exp.hpp> | ||
#include <tt-metalium/tt_align.hpp> | ||
|
||
namespace tt::tt_metal::tools::mem_bench { | ||
|
||
// Numeric results collected for one benchmark run. Every field starts at zero.
// NOTE(review): units (bytes, seconds, cycles) are implied by the field names
// only; confirm against the code that fills this struct.
struct TestResult {
    // Host-side measurements.
    double host_bytes_processed = 0.0;
    double host_time_elapsed = 0.0;
    double host_wait_for_kernel_time_elapsed = 0.0;

    // Measurements covering all participating cores.
    double total_cores_cycles = 0.0;
    double total_cores_time = 0.0;
    double total_cores_bytes_rd = 0.0;
    double total_cores_bytes_wr = 0.0;

    // Measurements for a single kernel ("kernel 0").
    double kernel_0_cycles = 0.0;
    double kernel_0_time = 0.0;
    double kernel_0_bytes_rd = 0.0;
    double kernel_0_bytes_wr = 0.0;

    // Any additional values to be included in benchmark reports, keyed by name.
    std::map<std::string, double> arb_counters;
};
|
||
// L1 addresses used by the benchmark on each core: one word each for the cycle
// count, the bytes-read counter and the bytes-written counter, followed by the
// first address left for general use.
//
// Every member is zero-initialized so that a default-constructed map never
// carries indeterminate addresses. The struct stays an aggregate, so
// `L1MemoryMap{a, b, c, d}` keeps working.
struct L1MemoryMap {
    uint32_t cycles{0};      // address of the cycle counter word
    uint32_t rd_bytes{0};    // address of the bytes-read counter word
    uint32_t wr_bytes{0};    // address of the bytes-written counter word
    uint32_t unreserved{0};  // first address past the counter words
};
|
||
struct Context { | ||
std::map<chip_id_t, IDevice*> devices; | ||
L1MemoryMap device_address; | ||
uint32_t total_size{0}; | ||
uint32_t page_size{0}; | ||
int threads{0}; | ||
int number_reader_kernels{0}; | ||
int number_writer_kernels{0}; | ||
bool enable_host_copy_with_kernels{0}; | ||
int iterations{0}; | ||
|
||
Context( | ||
const std::map<chip_id_t, IDevice*>& devices_, | ||
uint32_t total_size_, | ||
uint32_t page_size_, | ||
int threads_, | ||
int readers_, | ||
int writers_, | ||
bool enable_host_copy_with_kernels_, | ||
int iterations_) { | ||
auto l1_alignment = experimental::hal::get_l1_alignment(); | ||
auto l1_base = experimental::hal::get_tensix_l1_unreserved_base(); | ||
device_address.cycles = l1_base; | ||
device_address.rd_bytes = align(device_address.cycles + sizeof(uint32_t), l1_alignment); | ||
device_address.wr_bytes = align(device_address.rd_bytes + sizeof(uint32_t), l1_alignment); | ||
device_address.unreserved = align(device_address.wr_bytes + sizeof(uint32_t), l1_alignment); | ||
devices = devices_; | ||
total_size = total_size_; | ||
page_size = page_size_; | ||
threads = threads_; | ||
number_reader_kernels = readers_; | ||
number_writer_kernels = writers_; | ||
enable_host_copy_with_kernels = enable_host_copy_with_kernels_; | ||
iterations = iterations_; | ||
} | ||
}; | ||
|
||
} // namespace tt::tt_metal::tools::mem_bench |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. | ||
// | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
#include <tt-metalium/host_api.hpp> | ||
#include <tt-metalium/tt_metal.hpp> | ||
#include "device_utils.hpp" | ||
#include "context.hpp" | ||
|
||
namespace tt::tt_metal::tools::mem_bench { | ||
|
||
// Reads one 32-bit word at L1 address `addr` from every core in `cores`.
//
// Cores are visited column by column (x outer, y inner) and the returned vector
// holds one word per core in that order.
//
// The loop indices take the type of the coordinate members, so there is no
// signed/unsigned comparison and no narrowing when the CoreCoord is built.
// A read that yields no data raises std::out_of_range instead of indexing an
// empty vector.
std::vector<uint32_t> read_cores(tt::tt_metal::IDevice* device, const CoreRange& cores, uint32_t addr) {
    const auto& first = cores.start_coord;
    const auto& last = cores.end_coord;

    std::vector<uint32_t> data;
    for (auto xi = first.x; xi <= last.x; ++xi) {
        for (auto yi = first.y; yi <= last.y; ++yi) {
            std::vector<uint32_t> single_data;
            tt::tt_metal::detail::ReadFromDeviceL1(device, CoreCoord{xi, yi}, addr, sizeof(uint32_t), single_data);
            data.push_back(single_data.at(0));
        }
    }
    return data;
}
|
||
// Adds the PCIe benchmark data-movement kernel to `program` on a rectangle of
// cores whose first core is (0, start_y).
//
// Parameters:
//   device       - provides the logical grid width used to wrap kernels into rows.
//   program      - program that receives the kernel.
//   context      - supplies total_size, page_size and the L1 counter addresses.
//   start_y      - logical row of the first core.
//   num_kernels  - number of kernels requested. Expected to be at most one row
//                  wide or a whole multiple of the row width; for other values
//                  the rectangular range does not contain exactly num_kernels
//                  cores.
//   is_writer    - true fills the writer compile args, false the reader ones.
//   pcie_size    - passed to the kernel as pcie_rd_size / pcie_wr_size.
//   pcie_offset  - passed to the kernel as pcie_rd_base / pcie_wr_base.
//
// Returns the core range the kernel was created on, or std::nullopt when
// num_kernels is zero.
std::optional<CoreRange> configure_kernels(
    tt::tt_metal::IDevice* device,
    tt::tt_metal::Program& program,
    const Context& context,
    uint32_t start_y,
    uint32_t num_kernels,
    bool is_writer,
    uint32_t pcie_size,
    uint32_t pcie_offset) {
    constexpr std::string_view k_PcieBenchKernel = "tt_metal/tools/mem_bench/kernels/mem_bench_kernel.cpp";

    // Nothing to place. This is checked regardless of the page size: with zero
    // kernels the end coordinate below would be start + 0 - 1 and wrap around.
    if (num_kernels == 0) {
        return std::nullopt;
    }

    const uint32_t total_kernel_transfer = context.total_size;
    // A page size of zero falls back to the total transfer size.
    const uint32_t kernel_transfer_size = context.page_size ? context.page_size : total_kernel_transfer;

    const auto max_x = device->logical_grid_size().x;

    // Number of kernels is either at most one row
    // or a multiple of the row width.
    const CoreCoord start_coord{0, start_y};
    CoreCoord end_coord;
    if (num_kernels <= max_x) {
        end_coord.x = start_coord.x + num_kernels - 1;
        end_coord.y = start_coord.y;
    } else {
        const auto number_of_rows = num_kernels / max_x;
        const auto remainder = num_kernels % max_x;
        const auto last_row_width = remainder ? remainder : max_x;
        end_coord.x = start_coord.x + last_row_width - 1;
        // Rows are counted from start_y, not from row 0.
        end_coord.y = start_coord.y + number_of_rows - 1;
    }
    CoreRange core_range{start_coord, end_coord};

    // Slots 0-3 describe the read path and 5-8 the write path; the slots of the
    // unused path stay zero.
    std::vector<uint32_t> pcie_bench_compile_args(12, 0);
    if (is_writer) {
        pcie_bench_compile_args[5] = 0;                     // reserved_0
        pcie_bench_compile_args[6] = pcie_offset;           // pcie_wr_base
        pcie_bench_compile_args[7] = pcie_size;             // pcie_wr_size
        pcie_bench_compile_args[8] = kernel_transfer_size;  // pcie_wr_transfer_size
    } else {
        pcie_bench_compile_args[0] = context.device_address.unreserved;  // my_rd_dst_addr
        pcie_bench_compile_args[1] = pcie_offset;                        // pcie_rd_base
        pcie_bench_compile_args[2] = pcie_size;                          // pcie_rd_size
        pcie_bench_compile_args[3] = kernel_transfer_size;               // pcie_rd_transfer_size
    }
    pcie_bench_compile_args[4] = context.device_address.rd_bytes;  // my_bytes_rd_addr
    pcie_bench_compile_args[9] = context.device_address.wr_bytes;  // my_bytes_wr_addr
    pcie_bench_compile_args[10] = total_kernel_transfer;
    pcie_bench_compile_args[11] = context.device_address.cycles;

    [[maybe_unused]] auto kernel = tt::tt_metal::CreateKernel(
        program,
        std::string{k_PcieBenchKernel},
        core_range,
        DataMovementConfig{
            .processor = DataMovementProcessor::RISCV_1,
            .noc = tt::tt_metal::NOC_0,
            .compile_args = pcie_bench_compile_args,
            .defines = {},
        });

    return core_range;
}
|
||
} // namespace tt::tt_metal::tools::mem_bench |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. | ||
// | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
#pragma once | ||
|
||
#include <vector> | ||
#include <tt-metalium/device.hpp> | ||
#include <tt-metalium/core_coord.hpp> | ||
#include "context.hpp" | ||
|
||
namespace tt::tt_metal::tools::mem_bench { | ||
|
||
// Reads one 32-bit word at L1 address `addr` from each core in `cores` and
// returns the words, one per core, visiting x in the outer loop and y in the
// inner loop.
std::vector<uint32_t> read_cores(tt::tt_metal::IDevice* device, const CoreRange& cores, uint32_t addr); | ||
|
||
// Creates the PCIe benchmark kernel in `program` on a range of cores whose
// first core is (0, start_y).
//   context      - supplies the transfer sizes and the L1 counter addresses.
//   num_kernels  - number of kernels requested.
//   is_writer    - selects the writer compile args when true, the reader ones
//                  when false.
//   pcie_size    - PCIe region size passed to the kernel.
//   pcie_offset  - PCIe base offset passed to the kernel (defaults to 0).
// Returns the core range the kernel was created on, or an empty optional when
// no kernel is created.
std::optional<CoreRange> configure_kernels( | ||
tt::tt_metal::IDevice* device, | ||
tt::tt_metal::Program& program, | ||
const Context& context, | ||
uint32_t start_y, | ||
uint32_t num_kernels, | ||
bool is_writer, | ||
uint32_t pcie_size, | ||
uint32_t pcie_offset = 0); | ||
|
||
} // namespace tt::tt_metal::tools::mem_bench |
Oops, something went wrong.