diff --git a/tt_metal/api/tt-metalium/command_queue_interface.hpp b/tt_metal/api/tt-metalium/command_queue_interface.hpp index 30de4f2e6312..53f6eb068ea3 100644 --- a/tt_metal/api/tt-metalium/command_queue_interface.hpp +++ b/tt_metal/api/tt-metalium/command_queue_interface.hpp @@ -3,11 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once -#include #include #include #include -#include #include "cq_commands.hpp" #include "dispatch_core_manager.hpp" @@ -15,7 +13,6 @@ #include "memcpy.hpp" #include "hal.hpp" #include "dispatch_settings.hpp" -#include "helpers.hpp" #include "buffer.hpp" #include "umd/device/tt_core_coordinates.h" @@ -193,8 +190,8 @@ class DispatchMemMap { uint32_t prefetch_dispatch_unreserved_base = device_cq_addrs_[tt::utils::underlying_type( CommandQueueDeviceAddrType::UNRESERVED)]; - cmddat_q_base_ = prefetch_dispatch_unreserved_base + round_size(settings.prefetch_q_size_, pcie_alignment); - scratch_db_base_ = cmddat_q_base_ + round_size(settings.prefetch_cmddat_q_size_, pcie_alignment); + cmddat_q_base_ = align(prefetch_dispatch_unreserved_base + settings.prefetch_q_size_, pcie_alignment); + scratch_db_base_ = align(cmddat_q_base_ + settings.prefetch_cmddat_q_size_, pcie_alignment); dispatch_buffer_base_ = align(prefetch_dispatch_unreserved_base, 1 << DispatchSettings::DISPATCH_BUFFER_LOG_PAGE_SIZE); dispatch_buffer_block_size_pages_ = settings.dispatch_pages_ / DispatchSettings::DISPATCH_BUFFER_SIZE_BLOCKS; const uint32_t dispatch_cb_end = dispatch_buffer_base_ + settings.dispatch_size_; diff --git a/tt_metal/impl/dispatch/util/dispatch_settings.cpp b/tt_metal/impl/dispatch/util/dispatch_settings.cpp index 7912a1f825d9..a6003177a968 100644 --- a/tt_metal/impl/dispatch/util/dispatch_settings.cpp +++ b/tt_metal/impl/dispatch/util/dispatch_settings.cpp @@ -8,7 +8,7 @@ #include "magic_enum/magic_enum.hpp" #include "umd/device/tt_core_coordinates.h" #include -#include +#include "size_literals.hpp" namespace tt::tt_metal { diff --git 
// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

// Size literals for readable byte-count constants.
// NOTE(review): results truncate to uint32_t, so 4_GB and above wrap — confirm
// callers never exceed 32 bits.
constexpr auto operator""_MB(const unsigned long long v) -> uint32_t { return 1024 * 1024 * v; }
constexpr auto operator""_GB(const unsigned long long v) -> uint32_t { return 1024 * 1024 * 1024 * v; }

// Rounds `sz` up to the nearest multiple of `alignment`.
// `alignment` must be non-zero; a size that is already a multiple of the
// alignment is returned unchanged.
inline uint32_t round_size(uint32_t sz, uint32_t alignment) {
    const uint32_t remainder = sz % alignment;
    return (remainder == 0) ? sz : (sz + alignment - remainder);
}
${CMAKE_CURRENT_SOURCE_DIR}/device_utils.cpp +) + +set(HEADERS_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/host_utils.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/device_utils.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/work_thread.hpp +) + +add_executable( + mem_bench + ${IMPL_SRC} + ${HEADERS_SRC} +) +target_link_libraries( + mem_bench + PRIVATE + Metalium::Metal + TT::Metalium::Common + Metalium::Metal::Impl + Metalium::Metal::LLRT + numa + benchmark::benchmark +) +target_include_directories( + mem_bench + PRIVATE + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/tt_metal + ${PROJECT_SOURCE_DIR}/tt_metal/common + ${PROJECT_SOURCE_DIR}/tests + ${CMAKE_CURRENT_SOURCE_DIR} +) +set_target_properties( + mem_bench + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY + ${PROJECT_BINARY_DIR}/tools +) diff --git a/tt_metal/tools/mem_bench/README.md b/tt_metal/tools/mem_bench/README.md new file mode 100644 index 000000000000..03f2731d0d08 --- /dev/null +++ b/tt_metal/tools/mem_bench/README.md @@ -0,0 +1,46 @@ +# tt mem_bench + +Utility to measure host and device bandwidth on Tenstorrent devices. + +## Build + +Tools are included in `tt_metal` builds. Using a release build is required for accurate perf measurements. + +## Usage + +By default, each test is run for 5 iterations and only basic tests are executed. All test patterns can be executed by specifying `--full`. Additional run parameters are listed below. + +Tests will report host bandwidth and/or device bandwidth. If device bandwidth is reported, then the average of all cores is reported as well as bandwidth for just a single core. + +> [!NOTE] +The `tt_metal` library log level can be adjusted by exporting `TT_METAL_LOGGER_LEVEL=fatal|info|error|debug`. + +> [!NOTE] +On NUMA systems, the host page for the device's command queue data is pinned on the memory node closest to where the device is located. If `tt_metal` is run on a different node then bandwidth will degrade because it'll need to cross sockets. 
Therefore, it's important to run `tt_metal` on the closest node. On Linux, the execution policy can be set using `numactl`. E.g., if the device is located on node 0, then `numactl --cpubind=0 --membind=0 ` will allocate resources closer to the device. + +``` +./build/tools/mem_bench --help +benchmark [--benchmark_list_tests={true|false}] + [--benchmark_filter=] + [--benchmark_min_time=`x` OR `s` ] + [--benchmark_min_warmup_time=] + [--benchmark_repetitions=] + [--benchmark_dry_run={true|false}] + [--benchmark_enable_random_interleaving={true|false}] + [--benchmark_report_aggregates_only={true|false}] + [--benchmark_display_aggregates_only={true|false}] + [--benchmark_format=] + [--benchmark_out=] + [--benchmark_out_format=] + [--benchmark_color={auto|true|false}] + [--benchmark_counters_tabular={true|false}] + [--benchmark_context==,...] + [--benchmark_time_unit={ns|us|ms|s}] + [--v=] + [--help] Shows this help message + [--full] Run all tests + +Counters + bytes_per_second: Aggregate Host copy to hugepage bandwidth. 0 if not measured. + dev_bw: Average device core PCIe pull bandwidth. 0 if not measured. +``` diff --git a/tt_metal/tools/mem_bench/context.hpp b/tt_metal/tools/mem_bench/context.hpp new file mode 100644 index 000000000000..4bf8d8ff4503 --- /dev/null +++ b/tt_metal/tools/mem_bench/context.hpp @@ -0,0 +1,78 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include +#include + +namespace tt::tt_metal::tools::mem_bench { + +struct TestResult { + double host_bytes_processed{0}; + double host_time_elapsed{0}; + double host_wait_for_kernel_time_elapsed{0}; + + double total_cores_cycles{0}; + double total_cores_time{0}; + double total_cores_bytes_rd{0}; + double total_cores_bytes_wr{0}; + + double kernel_0_cycles{0}; + double kernel_0_time{0}; + double kernel_0_bytes_rd{0}; + double kernel_0_bytes_wr{0}; + + // Any additional values to be included in benchmark reports + std::map arb_counters; +}; + +struct L1MemoryMap { + uint32_t cycles; + uint32_t rd_bytes; + uint32_t wr_bytes; + uint32_t unreserved; +}; + +struct Context { + std::map devices; + L1MemoryMap device_address; + uint32_t total_size{0}; + uint32_t page_size{0}; + int threads{0}; + int number_reader_kernels{0}; + int number_writer_kernels{0}; + bool enable_host_copy_with_kernels{0}; + int iterations{0}; + + Context( + const std::map& devices_, + uint32_t total_size_, + uint32_t page_size_, + int threads_, + int readers_, + int writers_, + bool enable_host_copy_with_kernels_, + int iterations_) { + auto l1_alignment = experimental::hal::get_l1_alignment(); + auto l1_base = experimental::hal::get_tensix_l1_unreserved_base(); + device_address.cycles = l1_base; + device_address.rd_bytes = align(device_address.cycles + sizeof(uint32_t), l1_alignment); + device_address.wr_bytes = align(device_address.rd_bytes + sizeof(uint32_t), l1_alignment); + device_address.unreserved = align(device_address.wr_bytes + sizeof(uint32_t), l1_alignment); + devices = devices_; + total_size = total_size_; + page_size = page_size_; + threads = threads_; + number_reader_kernels = readers_; + number_writer_kernels = writers_; + enable_host_copy_with_kernels = enable_host_copy_with_kernels_; + iterations = iterations_; + } +}; + +} // namespace tt::tt_metal::tools::mem_bench diff --git 
a/tt_metal/tools/mem_bench/device_utils.cpp b/tt_metal/tools/mem_bench/device_utils.cpp new file mode 100644 index 000000000000..bd650a3c0527 --- /dev/null +++ b/tt_metal/tools/mem_bench/device_utils.cpp @@ -0,0 +1,92 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include "device_utils.hpp" +#include "context.hpp" + +namespace tt::tt_metal::tools::mem_bench { + +std::vector read_cores(tt::tt_metal::IDevice* device, const CoreRange& cores, uint32_t addr) { + std::vector data; + for (int xi = cores.start_coord.x; xi <= cores.end_coord.x; ++xi) { + for (int yi = cores.start_coord.y; yi <= cores.end_coord.y; ++yi) { + std::vector single_data; + tt::tt_metal::detail::ReadFromDeviceL1(device, CoreCoord{xi, yi}, addr, sizeof(uint32_t), single_data); + data.push_back(single_data[0]); + } + } + return data; +} + +std::optional configure_kernels( + tt::tt_metal::IDevice* device, + tt::tt_metal::Program& program, + const Context& context, + uint32_t start_y, + uint32_t num_kernels, + bool is_writer, + uint32_t pcie_size, + uint32_t pcie_offset) { + constexpr std::string_view k_PcieBenchKernel = "tt_metal/tools/mem_bench/kernels/mem_bench_kernel.cpp"; + const auto grid_size = device->logical_grid_size(); + const auto max_x = grid_size.x; + const auto max_y = grid_size.y; + uint32_t total_kernel_transfer = context.total_size; + uint32_t kernel_transfer_size = context.page_size; + + if (!kernel_transfer_size) { + kernel_transfer_size = total_kernel_transfer; + } else if (!num_kernels) { + return {}; + } + + // Number readers either less than one row + // or a multiple of the rows + CoreCoord start_coord{0, start_y}; + CoreCoord end_coord; + if (num_kernels <= max_x) { + end_coord.x = start_coord.x + num_kernels - 1; + end_coord.y = start_coord.y; + } else { + const auto number_of_rows = num_kernels / max_x; + const auto last_row_width = (num_kernels % max_x) ? 
num_kernels % max_x : max_x; + end_coord.x = start_coord.x + last_row_width - 1; + end_coord.y = number_of_rows - 1; + } + CoreRange core_range{start_coord, end_coord}; + + std::vector pcie_bench_compile_args(12, 0); + if (is_writer) { + pcie_bench_compile_args[5] = 0; // reserved_0 + pcie_bench_compile_args[6] = pcie_offset; // pcie_wr_base + pcie_bench_compile_args[7] = pcie_size; // pcie_wr_size + pcie_bench_compile_args[8] = kernel_transfer_size; // pcie_wr_transfer_size + } else { + pcie_bench_compile_args[0] = context.device_address.unreserved; // my_rd_dst_addr + pcie_bench_compile_args[1] = pcie_offset; // pcie_rd_base + pcie_bench_compile_args[2] = pcie_size; // pcie_rd_size + pcie_bench_compile_args[3] = kernel_transfer_size; // pcie_rd_transfer_size + } + pcie_bench_compile_args[4] = context.device_address.rd_bytes; // my_bytes_rd_addr + pcie_bench_compile_args[9] = context.device_address.wr_bytes; // my_bytes_wr_addr + pcie_bench_compile_args[10] = total_kernel_transfer; + pcie_bench_compile_args[11] = context.device_address.cycles; + + [[maybe_unused]] auto kernel = tt::tt_metal::CreateKernel( + program, + std::string{k_PcieBenchKernel}, + core_range, + DataMovementConfig{ + .processor = DataMovementProcessor::RISCV_1, + .noc = tt::tt_metal::NOC_0, + .compile_args = pcie_bench_compile_args, + .defines = {}, + }); + + return core_range; +} + +} // namespace tt::tt_metal::tools::mem_bench diff --git a/tt_metal/tools/mem_bench/device_utils.hpp b/tt_metal/tools/mem_bench/device_utils.hpp new file mode 100644 index 000000000000..ab20ebfc3cc8 --- /dev/null +++ b/tt_metal/tools/mem_bench/device_utils.hpp @@ -0,0 +1,26 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include "context.hpp" + +namespace tt::tt_metal::tools::mem_bench { + +std::vector read_cores(tt::tt_metal::IDevice* device, const CoreRange& cores, uint32_t addr); + +std::optional configure_kernels( + tt::tt_metal::IDevice* device, + tt::tt_metal::Program& program, + const Context& context, + uint32_t start_y, + uint32_t num_kernels, + bool is_writer, + uint32_t pcie_size, + uint32_t pcie_offset = 0); + +} // namespace tt::tt_metal::tools::mem_bench diff --git a/tt_metal/tools/mem_bench/host_utils.cpp b/tt_metal/tools/mem_bench/host_utils.cpp new file mode 100644 index 000000000000..9aad3fe59fa8 --- /dev/null +++ b/tt_metal/tools/mem_bench/host_utils.cpp @@ -0,0 +1,87 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "host_utils.hpp" +#include +#include +#include +#include +#include +#include + +namespace tt::tt_metal::tools::mem_bench { + +void* get_hugepage(int device_id, uint32_t base_offset) { + auto& cluster = tt::Cluster::instance(); + auto mmio_device_id = cluster.get_associated_mmio_device(device_id); + auto channel = cluster.get_assigned_channel_for_device(device_id); + return (void*)(cluster.host_dma_address(base_offset, mmio_device_id, channel)); +} + +uint32_t get_hugepage_size(int device_id) { + auto& cluster = tt::Cluster::instance(); + auto mmio_device_id = cluster.get_associated_mmio_device(device_id); + auto channel = cluster.get_assigned_channel_for_device(device_id); + return cluster.get_host_channel_size(mmio_device_id, channel); +} + +tt::tt_metal::vector_memcpy_aligned generate_random_src_data(uint32_t num_bytes) { + std::uniform_int_distribution distribution( + std::numeric_limits::min(), std::numeric_limits::max()); + std::default_random_engine generator; + + tt::tt_metal::vector_memcpy_aligned vec(num_bytes / sizeof(uint32_t)); + std::generate(vec.begin(), vec.end(), [&]() { return 
distribution(generator); }); + + return vec; +} + +double get_current_time_seconds() { + return std::chrono::duration(std::chrono::high_resolution_clock::now().time_since_epoch()).count(); +} + +std::vector get_mmio_device_ids(int number_of_devices, int numa_node) { + auto& cluster = tt::Cluster::instance(); + const auto pcie_devices = cluster.number_of_pci_devices(); + std::vector device_ids; + + // Assumes PCIe device IDs are iterated first + for (int device_id = 0; device_id < pcie_devices && device_ids.size() < number_of_devices; ++device_id) { + // Not an MMIO device + if (cluster.get_associated_mmio_device(device_id) != device_id) { + continue; + } + + auto associated_node = cluster.get_numa_node_for_device(device_id); + if (numa_node == -1 || associated_node == numa_node) { + device_ids.push_back(device_id); + } + } + + return device_ids; +} + +std::vector get_mmio_device_ids_unique_nodes(int number_of_devices) { + auto& cluster = tt::Cluster::instance(); + const auto pcie_devices = cluster.number_of_pci_devices(); + std::vector device_ids; + std::unordered_set numa_nodes; + + for (int device_id = 0; device_id < pcie_devices && device_ids.size() < number_of_devices; ++device_id) { + auto associated_node = cluster.get_numa_node_for_device(device_id); + if (!numa_nodes.contains(associated_node)) { + device_ids.push_back(device_id); + numa_nodes.insert(associated_node); + } + } + + return device_ids; +} + +int get_number_of_mmio_devices() { + auto& cluster = tt::Cluster::instance(); + return cluster.number_of_pci_devices(); +} + +} // namespace tt::tt_metal::tools::mem_bench diff --git a/tt_metal/tools/mem_bench/host_utils.hpp b/tt_metal/tools/mem_bench/host_utils.hpp new file mode 100644 index 000000000000..cab39c6ecdea --- /dev/null +++ b/tt_metal/tools/mem_bench/host_utils.hpp @@ -0,0 +1,82 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include +#include + +namespace tt::tt_metal::tools::mem_bench { + +// Generate random data aligned for memcpy_to_device. +tt::tt_metal::vector_memcpy_aligned generate_random_src_data(uint32_t num_bytes); + +// Get current host time, in seconds. +double get_current_time_seconds(); + +// Return device ids. If numa_node is specified then only device ids on that +// node will be returned. If numa_node == -1, then the node is not taken into +// consideration. Note: Less than number_of_devices may be returned. +std::vector get_mmio_device_ids(int number_of_devices, int numa_node); + +// Returns device ids. All devices are on different nodes. Note: Less than +// number_of_devices may be returned. +std::vector get_mmio_device_ids_unique_nodes(int number_of_devices); + +// Returns the number of MMIO connected chips. +int get_number_of_mmio_devices(); + +// Returns the hugepage pointer assigned to a device. +void* get_hugepage(int device_id, uint32_t base_offset); + +// Returns the size of the hugepage assigned to a device. +uint32_t get_hugepage_size(int device_id); + +// Copy data to hugepage. Returns the duration. +// repeating_src_vector: Keep copying the same elements to hugepage. This should force the source data in stay in the +// caches. fence: Memory barrier at the end of each copy. 
Returns the time in seconds +template +double copy_to_hugepage( + void* hugepage_base, + uint32_t hugepage_size, + std::span src_data, + size_t total_size, + size_t page_size, + bool repeating_src_vector) { + uint64_t hugepage_addr = reinterpret_cast(hugepage_base); + uint64_t hugepage_end = hugepage_addr + hugepage_size; + uint64_t src_addr = reinterpret_cast(src_data.data()); + size_t num_pages; + if (!page_size) { + num_pages = 1; + page_size = total_size; + } else { + num_pages = total_size / page_size; + } + + auto start = get_current_time_seconds(); + for (int i = 0; i < num_pages; ++i) { + tt::tt_metal::memcpy_to_device((void*)(hugepage_addr), (void*)(src_addr), page_size); + + // 64 bit host address alignment + hugepage_addr = ((hugepage_addr + page_size - 1) | (tt::tt_metal::MEMCPY_ALIGNMENT - 1)) + 1; + + if (!repeating_src_vector) { + src_addr += page_size; + } + + // Wrap back to the beginning of hugepage + if (hugepage_addr + page_size >= hugepage_end) { + hugepage_addr = reinterpret_cast(hugepage_base); + } + } + auto end = get_current_time_seconds(); + + return end - start; +} + +}; // namespace tt::tt_metal::tools::mem_bench diff --git a/tt_metal/tools/mem_bench/kernels/mem_bench_kernel.cpp b/tt_metal/tools/mem_bench/kernels/mem_bench_kernel.cpp new file mode 100644 index 000000000000..e04b02013dec --- /dev/null +++ b/tt_metal/tools/mem_bench/kernels/mem_bench_kernel.cpp @@ -0,0 +1,99 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "dataflow_api.h" +#include +#include +#include "noc_overlay_parameters.h" + +// +// Test Kernel for mem_bench +// +// Performs PCIe reads and/or writes +// + +// reader kernel +constexpr uint32_t my_rd_dst_addr = get_compile_time_arg_val(0); // L1 +constexpr uint32_t pcie_rd_base = get_compile_time_arg_val(1); +constexpr uint32_t pcie_rd_size = get_compile_time_arg_val(2); +constexpr uint32_t pcie_rd_end = pcie_rd_base + pcie_rd_size; +constexpr uint32_t pcie_rd_transfer_size = get_compile_time_arg_val(3); +constexpr uint32_t my_bytes_rd_addr = get_compile_time_arg_val(4); + +// writer kernel +constexpr uint32_t reserved_0 = get_compile_time_arg_val(5); +constexpr uint32_t pcie_wr_base = get_compile_time_arg_val(6); +constexpr uint32_t pcie_wr_size = get_compile_time_arg_val(7); +constexpr uint32_t pcie_wr_end = pcie_wr_base + pcie_wr_size; +constexpr uint32_t pcie_wr_transfer_size = get_compile_time_arg_val(8); +constexpr uint32_t my_bytes_wr_addr = get_compile_time_arg_val(9); + +// common to both +constexpr uint32_t my_total_work = get_compile_time_arg_val(10); // Total bytes to read+write +constexpr uint32_t my_cycles_addr = get_compile_time_arg_val(11); + +static_assert(my_bytes_rd_addr && my_bytes_wr_addr, "Must provide addresses for my_bytes_rd/wr_addr"); +static_assert(my_cycles_addr, "Must provide L1 address for cycles elapsed"); + +uint64_t get_cycles() { + uint32_t timestamp_low = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_L); + uint32_t timestamp_high = reg_read(RISCV_DEBUG_REG_WALL_CLOCK_H); + return (((uint64_t)timestamp_high) << 32) | timestamp_low; +} + +void kernel_main() { + auto my_cycles = reinterpret_cast(my_cycles_addr); + auto my_bytes_read = reinterpret_cast(my_bytes_rd_addr); + auto my_bytes_written = reinterpret_cast(my_bytes_wr_addr); + + my_bytes_read[0] = 0; + my_bytes_written[0] = 0; + my_cycles[0] = 0; + + uint64_t pcie_noc_xy_encoding = 
(uint64_t)NOC_XY_PCIE_ENCODING(PCIE_NOC_X, PCIE_NOC_Y); + uint32_t rd_ptr = pcie_rd_base; + uint32_t wr_ptr = pcie_wr_base; + + const auto start = get_cycles(); + + uint32_t total_bytes_read = 0; + uint32_t total_bytes_written = 0; + while (total_bytes_read + total_bytes_written < my_total_work) { + if constexpr (my_rd_dst_addr) { + uint64_t host_src_addr = pcie_noc_xy_encoding | rd_ptr; + noc_async_read(host_src_addr, my_rd_dst_addr, pcie_rd_transfer_size); + rd_ptr += pcie_rd_transfer_size; + total_bytes_read += pcie_rd_transfer_size; + if (rd_ptr >= pcie_rd_end) { + rd_ptr = pcie_rd_base; + } + } + if constexpr (pcie_wr_size) { + uint64_t host_dst_addr = pcie_noc_xy_encoding | wr_ptr; + noc_async_write( + wr_ptr, // Any data + host_dst_addr, + pcie_wr_transfer_size); + wr_ptr += pcie_wr_transfer_size; + total_bytes_written += pcie_wr_transfer_size; + if (wr_ptr >= pcie_wr_end) { + wr_ptr = pcie_wr_base; + } + } + } + + if constexpr (my_rd_dst_addr) { + noc_async_read_barrier(); + } + if constexpr (pcie_wr_size) { + noc_async_write_barrier(); + } + + auto end = get_cycles(); + my_cycles[0] = end - start; + my_bytes_read[0] = total_bytes_read; + my_bytes_written[0] = total_bytes_written; +} diff --git a/tt_metal/tools/mem_bench/mem_bench.cpp b/tt_metal/tools/mem_bench/mem_bench.cpp new file mode 100644 index 000000000000..da0b2a8a8af5 --- /dev/null +++ b/tt_metal/tools/mem_bench/mem_bench.cpp @@ -0,0 +1,545 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include "context.hpp" +#include "host_utils.hpp" +#include "device_utils.hpp" +#include "work_thread.hpp" +#include "tt_metal/impl/dispatch/util/size_literals.hpp" + +using namespace tt; +using namespace tt::tt_metal; +using namespace tt::tt_metal::tools::mem_bench; + +// Read L1 counters (cycles, bytes rd, bytes wr) and increment test_results +void read_inc_data_from_cores(const Context& ctx, IDevice* device, const CoreRange& cores, TestResult& test_results) { + auto dev_cycles = read_cores(device, cores, ctx.device_address.cycles); + auto dev_bytes_read = read_cores(device, cores, ctx.device_address.rd_bytes); + auto dev_bytes_written = read_cores(device, cores, ctx.device_address.wr_bytes); + auto dev_clk = tt::Cluster::instance().get_device_aiclk(device->id()) * 1e6; // Hz + + double total_cycles = std::reduce(dev_cycles.begin(), dev_cycles.end(), 0ULL); + + test_results.total_cores_cycles += total_cycles; + test_results.total_cores_time += total_cycles / dev_clk; + // Reduce with 64 bits to prevent overflow as values read from device is 32 bits + test_results.total_cores_bytes_rd += std::reduce(dev_bytes_read.begin(), dev_bytes_read.end(), 0ULL); + test_results.total_cores_bytes_wr += std::reduce(dev_bytes_written.begin(), dev_bytes_written.end(), 0ULL); + + test_results.kernel_0_cycles += dev_cycles[0]; + test_results.kernel_0_time += dev_cycles[0] / dev_clk; + test_results.kernel_0_bytes_rd += dev_bytes_read[0]; + test_results.kernel_0_bytes_wr += dev_bytes_written[0]; +} + +// Report device bandwidth to the benchmark state +// Average bw will be reported as "dev_bw" as well as the bw for the +// first core will also be reported by itself as "kernel_0_bw". 
+void report_device_bw(benchmark::State& state, const TestResult& test_results) { + state.counters["dev_bw"] = + (test_results.total_cores_bytes_rd + test_results.total_cores_bytes_wr) / test_results.total_cores_time; + state.counters["dev_rd_bytes"] = test_results.total_cores_bytes_rd; + state.counters["dev_wr_bytes"] = test_results.total_cores_bytes_wr; + state.counters["dev_rd_bw"] = test_results.total_cores_bytes_rd / test_results.total_cores_time; + state.counters["dev_wr_bw"] = test_results.total_cores_bytes_wr / test_results.total_cores_time; + state.counters["dev_cycles"] = test_results.total_cores_cycles; + + state.counters["kernel_0_bw"] = + (test_results.kernel_0_bytes_rd + test_results.kernel_0_bytes_wr) / test_results.kernel_0_time; + state.counters["kernel_0_rd_bw"] = test_results.kernel_0_bytes_rd / test_results.kernel_0_time; + state.counters["kernel_0_wr_bw"] = test_results.kernel_0_bytes_wr / test_results.kernel_0_time; + state.counters["kernel_0_cycles"] = test_results.kernel_0_cycles; +} + +// Benchmark various memcpy_to_device transfer sizes. +// Reports host bw. +TestResult mem_bench_page_sizing(benchmark::State& state) { + constexpr uint32_t k_DeviceId = 0; + TestResult results; + Context ctx{ + {}, + state.range(0), // Total size + state.range(1), // Page size + 0, // Threads + 0, // Readers + 0, // Writers + true, // Enable host copy + 0, // Iterations is managed by the benchmark framework + }; + + auto src_data = generate_random_src_data(ctx.total_size); + auto hugepage = get_hugepage(k_DeviceId, 0); + auto hugepage_size = get_hugepage_size(k_DeviceId); + bool cached = state.range(2); + + for (auto _ : state) { + const double iteration_time = + cached ? 
copy_to_hugepage(hugepage, hugepage_size, src_data, ctx.total_size, ctx.page_size, true) + : copy_to_hugepage(hugepage, hugepage_size, src_data, ctx.total_size, ctx.page_size, false); + results.host_bytes_processed += ctx.total_size; + results.host_time_elapsed += iteration_time; + + state.SetIterationTime(iteration_time); + } + state.SetBytesProcessed(ctx.total_size * state.iterations()); + return results; +} + +// Benchmark memcpy_to_device on multiple threads to try saturating host bandwidth. +// Reports host bw. +TestResult mem_bench_copy_multithread(benchmark::State& state) { + static_assert((MEMCPY_ALIGNMENT & ((MEMCPY_ALIGNMENT)-1)) == 0); + constexpr uint32_t k_DeviceId = 0; + TestResult results; + Context ctx{ + {}, + state.range(0), // Total size + state.range(1), // Page size + state.range(2), // Threads + 0, // Readers + 0, // Writers + true, // Enable host copy + 0, // Iterations is managed by the benchmark framework + }; + auto src_data = generate_random_src_data(ctx.total_size); + auto hugepage = get_hugepage(0, 0); + const auto hugepage_size = get_hugepage_size(0); + const auto bytes_per_thread = ((ctx.total_size / ctx.threads) + (MEMCPY_ALIGNMENT)-1) & -(MEMCPY_ALIGNMENT); + const auto last_thread_bytes = ctx.total_size - (bytes_per_thread * (ctx.threads - 1)); + + for (auto _ : state) { + auto iteration_time = execute_work_synced_start( + ctx.threads, + [&](int thread_idx) { + uint64_t thread_dst = (uint64_t)hugepage + (thread_idx * bytes_per_thread); + uint64_t thread_bytes = (thread_idx == ctx.threads - 1) ? 
last_thread_bytes : bytes_per_thread; + std::span thread_src{src_data}; + thread_src = thread_src.subspan( + (thread_idx * bytes_per_thread) / sizeof(uint32_t), thread_bytes / sizeof(uint32_t)); + copy_to_hugepage( + (void*)thread_dst, hugepage_size, thread_src, thread_bytes, ctx.page_size, false); + }, + []() {}); + + results.host_bytes_processed += ctx.total_size; + results.host_time_elapsed += iteration_time; + + state.SetIterationTime(iteration_time); + } + + state.SetBytesProcessed(ctx.total_size * state.iterations()); + return results; +} + +// Benchmark memcpy_to_device while the device is reading the hugepage. +// Reports host bw and device bw. +TestResult mem_bench_copy_with_active_kernel(benchmark::State& state) { + TestResult results; + auto devices = tt::tt_metal::detail::CreateDevices(get_mmio_device_ids(1, -1)); + IDevice* device = (*(devices.begin())).second; + Context ctx{ + devices, + state.range(0), // Total size + state.range(1), // Page size + 0, // Threads + state.range(2), // Readers + 0, // Writers + state.range(3), // Enable host copy + 0, // Iterations is managed by the benchmark framework + }; + + auto src_data = generate_random_src_data(ctx.total_size); + auto hugepage = get_hugepage(device->id(), 0); + auto hugepage_size = get_hugepage_size(device->id()); + + for (auto _ : state) { + auto pgm = CreateProgram(); + auto configured_cores = configure_kernels(device, pgm, ctx, 0, ctx.number_reader_kernels, false, hugepage_size); + double host_copy_time = 1; // Set to 1 so it doesn't divide by 0 if host copy is disabled + + double wait_for_kernel_time = execute_work_synced_start( + 1, + [device, &pgm](int thread_idx) { + // Program + tt::tt_metal::detail::LaunchProgram(device, pgm, true); + }, + [&]() { + if (ctx.enable_host_copy_with_kernels) { + // Host copy while waiting for program + host_copy_time = + copy_to_hugepage(hugepage, hugepage_size, src_data, ctx.total_size, ctx.page_size, false); + results.host_bytes_processed += 
ctx.total_size; + results.host_time_elapsed += host_copy_time; + } + }); + + results.host_wait_for_kernel_time_elapsed += wait_for_kernel_time; + + read_inc_data_from_cores(ctx, device, configured_cores.value(), results); + + state.SetIterationTime(host_copy_time); + } + if (ctx.enable_host_copy_with_kernels) { + state.SetBytesProcessed(ctx.total_size * state.iterations()); + } else { + state.SetBytesProcessed(0); + } + + report_device_bw(state, results); + tt::tt_metal::detail::CloseDevices(devices); + return results; +} + +// Host writing to a hugepage while the device pulls from another hugepage. +// Reports host bw and device bw. +TestResult mem_bench_copy_active_kernel_different_page(benchmark::State& state) { + TestResult results; + auto devices = tt::tt_metal::detail::CreateDevices(get_mmio_device_ids(1, -1)); + IDevice* device = (*(devices.begin())).second; + Context ctx{ + devices, + state.range(0), // Total size + state.range(1), // Page size + 0, // Threads + state.range(2), // Readers + 0, // Writers + true, // Enable host copy + 0, // Iterations is managed by the benchmark framework + }; + + auto src_data = generate_random_src_data(ctx.total_size); + auto device_hugepage_size = get_hugepage_size(device->id()); + + // 2nd open device is not required + auto host_hugepage = get_hugepage(device->id() + 1, 0); + auto host_hugepage_size = get_hugepage_size(device->id() + 1); + + for (auto _ : state) { + auto pgm = CreateProgram(); + auto configured_cores = + configure_kernels(device, pgm, ctx, 0, ctx.number_reader_kernels, false, device_hugepage_size).value(); + double host_copy_time = 0; + + double wait_for_kernel_time = execute_work_synced_start( + 1, + [device, &pgm](int thread_idx) { + // Program + tt::tt_metal::detail::LaunchProgram(device, pgm, true); + }, + [&]() { + // Host copy while waiting for program + host_copy_time = + copy_to_hugepage(host_hugepage, host_hugepage_size, src_data, ctx.total_size, ctx.page_size, false); + 
results.host_bytes_processed += ctx.total_size; + results.host_time_elapsed += host_copy_time; + }); + + results.host_wait_for_kernel_time_elapsed += wait_for_kernel_time; + + read_inc_data_from_cores(ctx, device, configured_cores, results); + + state.SetIterationTime(host_copy_time); + } + + state.SetBytesProcessed(ctx.total_size * state.iterations()); + + report_device_bw(state, results); + tt::tt_metal::detail::CloseDevices(devices); + return results; +} + +// Common Multi MMIO device test. +TestResult mem_bench_multi_mmio_devices( + benchmark::State& state, std::map& devices, const Context& ctx) { + TestResult results; + + // One thread to wait for program on each device + int num_threads = devices.size(); + + for (auto _ : state) { + std::map programs; // device : programs + std::map configured_core_ranges; // device : cores + for (auto [device_id, device] : devices) { + programs[device_id] = CreateProgram(); + Program& pgm = programs[device_id]; + auto device_hugepage = get_hugepage(device_id, 0); + auto device_hugepage_size = get_hugepage_size(device_id); + configured_core_ranges.insert( + {device_id, + configure_kernels(device, pgm, ctx, 0, ctx.number_reader_kernels, false, device_hugepage_size) + .value()}); + } + + double host_copy_time = 0; + execute_work_synced_start( + 1, + [devices, &programs](int thread_idx) { + // Program + for (auto& [device_id, pgm] : programs) { + tt::tt_metal::detail::LaunchProgram(devices.at(device_id), pgm, false); + } + }, + []() {}); + + // Wait all programs to complete + for (auto& [device_id, pgm] : programs) { + tt::tt_metal::detail::WaitProgramDone(devices.at(device_id), pgm); + } + + // Read counters from each core + for (auto& [device_id, core_range] : configured_core_ranges) { + read_inc_data_from_cores(ctx, devices.at(device_id), core_range, results); + } + + // This test does not report host bw + state.SetIterationTime(1); + } + + state.SetBytesProcessed(0); + report_device_bw(state, results); + 
state.counters["num_mmio_devices"] = devices.size(); + + return results; +} + +// Multi MMIO devices reading on the same NUMA node. +TestResult mem_bench_multi_mmio_devices_reading_same_node(benchmark::State& state) { + // Node 0 + auto devices = tt::tt_metal::detail::CreateDevices(get_mmio_device_ids(get_number_of_mmio_devices(), 0)); + + Context ctx{ + devices, + state.range(0), // Total size + state.range(1), // Page size + 0, // Threads + state.range(2), // Readers on each device + 0, // Writers + false, // Enable host copy + 0, // Iterations is managed by the benchmark framework + }; + + TestResult results = mem_bench_multi_mmio_devices(state, devices, ctx); + tt::tt_metal::detail::CloseDevices(devices); + + return results; +} + +// Multi MMIO devices reading on different NUMA nodes. +TestResult mem_bench_multi_mmio_devices_reading_different_node(benchmark::State& state) { + auto devices = tt::tt_metal::detail::CreateDevices(get_mmio_device_ids_unique_nodes(get_number_of_mmio_devices())); + + Context ctx{ + devices, + state.range(0), // Total size + state.range(1), // Page size + 0, // Threads + state.range(2), // Readers on each device + 0, // Writers + false, // Enable host copy + 0, // Iterations is managed by the benchmark framework + }; + + TestResult results = mem_bench_multi_mmio_devices(state, devices, ctx); + tt::tt_metal::detail::CloseDevices(devices); + + return results; +} + +// Benchmark memcpy_to_device while device is reading (prefetching) and writing (dispatching data back to host) +// First half of hugepage will be written to by host +// Second half will be written to by device +TestResult mem_bench_copy_with_read_and_write_kernel(benchmark::State& state) { + auto devices = tt::tt_metal::detail::CreateDevices(get_mmio_device_ids(1, -1)); + IDevice* device = (*(devices.begin())).second; + Context ctx{ + devices, + state.range(0), // Total size + state.range(1), // Page size + 0, // Threads + state.range(2), // Readers + state.range(3), // 
Writers
+        true,             // Enable host copy
+        0,                // Iterations is managed by the benchmark framework
+    };
+
+    auto src_data = generate_random_src_data(ctx.total_size);
+    auto hugepage = get_hugepage(device->id(), 0);
+    auto hugepage_size = get_hugepage_size(device->id());
+
+    // Don't need to separate device results:
+    // Readers will have 0 bytes written
+    // Writers will have 0 bytes read. Will not mix.
+    TestResult results;
+
+    for (auto _ : state) {
+        auto pgm = CreateProgram();
+        auto configured_read_cores =
+            configure_kernels(device, pgm, ctx, 0, ctx.number_reader_kernels, false, hugepage_size / 2).value();
+        // Offset write cores to second half of PCIe hugepage
+        // Use second row of cores
+        auto configured_write_cores =
+            configure_kernels(
+                device, pgm, ctx, 1, ctx.number_writer_kernels, true, hugepage_size / 2, hugepage_size / 2)
+                .value();
+        double host_copy_time = 0;
+
+        double wait_for_kernel_time = execute_work_synced_start(
+            1,
+            [device, &pgm](int thread_idx) {
+                // Launch the program and block until it completes
+                tt::tt_metal::detail::LaunchProgram(device, pgm, true);
+            },
+            [&]() {
+                // Host copy (into the host-owned first half) while waiting for program
+                host_copy_time =
+                    copy_to_hugepage(hugepage, hugepage_size / 2, src_data, ctx.total_size, ctx.page_size, false);
+                results.host_bytes_processed += ctx.total_size;
+                results.host_time_elapsed += host_copy_time;
+            });
+
+        results.host_wait_for_kernel_time_elapsed += wait_for_kernel_time;
+
+        read_inc_data_from_cores(ctx, device, configured_read_cores, results);
+        read_inc_data_from_cores(ctx, device, configured_write_cores, results);
+
+        state.SetIterationTime(host_copy_time);
+    }
+
+    state.SetBytesProcessed(ctx.total_size * state.iterations());
+    report_device_bw(state, results);
+    tt::tt_metal::detail::CloseDevices(devices);
+    return results;
+}
+
+void global_bench_args(benchmark::internal::Benchmark* b) { b->UseManualTime()->Iterations(5); }
+
+void register_basic_benchmark_suite() {
+    ::benchmark::RegisterBenchmark("Host Copy Page Sizing", mem_bench_page_sizing)
+        
->Apply(global_bench_args) + ->ArgsProduct({ + {1_GB}, + {16, 8_KB, 16_KB, 32_KB}, + {false}, + }); + ::benchmark::RegisterBenchmark("Host Copy (Cached)", mem_bench_page_sizing) + ->Apply(global_bench_args) + ->ArgsProduct({ + {1_GB}, + {16, 8_KB, 16_KB, 32_KB}, + {true}, + }); + ::benchmark::RegisterBenchmark("Host Copy Saturation", mem_bench_copy_multithread) + ->Apply(global_bench_args) + ->ArgsProduct({ + {1_GB}, + {32_KB}, + {1, 2, 3, 4, 5, 6, 7, 8}, + }); + ::benchmark::RegisterBenchmark("Device Reading Host", mem_bench_copy_with_active_kernel) + ->Apply(global_bench_args) + ->ArgsProduct({ + {1_GB}, + {32_KB}, + {1, 2, 3, 4}, + {false}, + }); +} + +void register_full_benchmark_suite() { + ::benchmark::RegisterBenchmark("Host Copy with Active Kernel", mem_bench_copy_with_active_kernel) + ->Apply(global_bench_args) + ->ArgsProduct({ + {1_GB}, + {32_KB}, + {1, 2, 3, 4}, + {false}, + }); + ::benchmark::RegisterBenchmark( + "Host Copy with Active Kernel on Different Hugepages", mem_bench_copy_active_kernel_different_page) + ->Apply(global_bench_args) + ->ArgsProduct({ + {1_GB}, + {32_KB}, + {1, 2, 3, 4}, + }); + ::benchmark::RegisterBenchmark( + "Host Copy with Active Kernel Reading and Writing", mem_bench_copy_with_read_and_write_kernel) + ->Apply(global_bench_args) + ->ArgsProduct({ + {1_GB}, + {32_KB}, + {1, 2}, + {1, 2}, + }); + ::benchmark::RegisterBenchmark( + "Multiple MMIO Devices Reading (Same NUMA node)", mem_bench_multi_mmio_devices_reading_same_node) + ->Apply(global_bench_args) + ->ArgsProduct({ + {1_GB}, + {32_KB}, + {1, 2}, + }); + ::benchmark::RegisterBenchmark( + "Multiple MMIO Devices Reading (Different NUMA node)", mem_bench_multi_mmio_devices_reading_different_node) + ->Apply(global_bench_args) + ->ArgsProduct({ + {1_GB}, + {32_KB}, + {1, 2}, + }); +} + +void print_help() { + ::benchmark::PrintDefaultHelp(); + std::cout << " [--help] Shows this help message\n"; + std::cout << " [--full] Run all tests\n"; + std::cout << "\nCounters\n"; + 
std::cout << " bytes_per_second: Aggregate Host copy to hugepage bandwidth. 0 if not measured.\n"; + std::cout << " dev_bw: Average device core PCIe pull bandwidth. 0 if not measured.\n"; +} + +int main(int argc, char* argv[]) { + std::vector input_args(argv, argv + argc); + if (test_args::has_command_option(input_args, "--help")) { + print_help(); + return 0; + } + + // Force TT_METAL options + setenv("TT_METAL_SLOW_DISPATCH_MODE", "true", true); + setenv("TT_METAL_CLEAR_L1", "1", true); + // May be overridden by the user + setenv("TT_METAL_LOGGER_LEVEL", "FATAL", false); + + char arg0_default[] = "benchmark"; + char* args_default = arg0_default; + if (!argv) { + argc = 1; + argv = &args_default; + } + + // Run basic benchmarks + register_basic_benchmark_suite(); + + // Run all benchmarks + if (test_args::has_command_option(input_args, "--full")) { + register_full_benchmark_suite(); + } + + ::benchmark::Initialize(&argc, argv); + ::benchmark::RunSpecifiedBenchmarks(); + ::benchmark::Shutdown(); + return 0; +} diff --git a/tt_metal/tools/mem_bench/work_thread.hpp b/tt_metal/tools/mem_bench/work_thread.hpp new file mode 100644 index 000000000000..6e47308a5cb3 --- /dev/null +++ b/tt_metal/tools/mem_bench/work_thread.hpp @@ -0,0 +1,75 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include +#include "host_utils.hpp" + +namespace tt::tt_metal::tools::mem_bench { + +// Execute work_fn on num_threads threads and also do intermediate_fn on the side. +// Returns time taken in seconds for all work_fn to complete. Time is calculated by latest thread end - earliest thread +// start. +template +double execute_work_synced_start(int num_threads, F&& work_fn, IntermediateF&& intermediate_fn, Args&&... 
args) { + std::mutex m; + int threads_ready{0}; + std::condition_variable go_cv; // Signal to all threads to go + auto total_threads = num_threads + 1; // Including intermediate + std::vector thread_start_times(num_threads); + std::vector thread_end_times(num_threads); + std::vector threads(total_threads); + + for (int i = 0; i < num_threads; ++i) { + threads[i] = std::thread([i, + &m, + &go_cv, + &threads_ready, + &thread_start_times, + &thread_end_times, + total_threads, + work_fn = std::forward(work_fn), + ... args = std::forward(args)]() mutable { + { + std::unique_lock lk{m}; + threads_ready++; + if (threads_ready == total_threads) { + go_cv.notify_all(); + } + go_cv.wait(lk, [&] { return threads_ready == total_threads; }); + } + + thread_start_times[i] = get_current_time_seconds(); + work_fn(i, std::forward(args)...); + thread_end_times[i] = get_current_time_seconds(); + }); + } + + threads[num_threads] = std::thread([&]() mutable { + std::unique_lock lk{m}; + threads_ready++; + if (threads_ready == total_threads) { + go_cv.notify_all(); + } + go_cv.wait(lk, [&] { return threads_ready == total_threads; }); + + intermediate_fn(); + }); + + for (auto& thread : threads) { + thread.join(); + } + + // Calculate work time based on earliest start and latest end + double earliest_start = *std::min_element(thread_start_times.begin(), thread_start_times.end()); + double latest_end = *std::max_element(thread_end_times.begin(), thread_end_times.end()); + + return latest_end - earliest_start; +} + +}; // namespace tt::tt_metal::tools::mem_bench