diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/filt_pgm_dispatch.pl b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/filt_pgm_dispatch.pl index 29863d6a850..a1083042cde 100755 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/filt_pgm_dispatch.pl +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/filt_pgm_dispatch.pl @@ -22,7 +22,8 @@ $j = 0; } - if ($line =~ /per iteration/) { + # output line from test_pgm_dispatch + if ($line =~ /us per iteration/) { my @parts = split(' ', $line); my $us = $parts[8]; my $index = index($parts[8], "."); @@ -32,6 +33,22 @@ $data->[$j][$i] = $us; $j++; } + + # output line from test_bw_and_latency + if ($line =~ /BW:/) { + my @parts = split(' ', $line); + my $bw = $parts[7]; + $data->[$j][$i] = $bw; + $j++; + } + + # output latency from test_bw_and_latency + if ($line =~ /Latency:/) { + my @parts = split(' ', $line); + my $bw = $parts[7]; + $data->[$j][$i] = $bw; + $j++; + } } for (my $y = 0; $y < $maxj; $y++) { diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/kernels/bw_and_latency.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/kernels/bw_and_latency.cpp index 731ac407e0f..fec10557331 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/kernels/bw_and_latency.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/kernels/bw_and_latency.cpp @@ -15,7 +15,6 @@ void kernel_main() { uint32_t read_ptr = cb_addr; uint32_t write_ptr = cb_addr; for (int j = 0; j < PAGE_COUNT; j++) { - #if DRAM_BANKED uint64_t noc_addr = get_dram_noc_addr(j, page_size, 0); #else @@ -24,8 +23,11 @@ void kernel_main() { #if ISSUE_MCAST uint64_t dst_noc_multicast_addr = - get_noc_multicast_addr(NOC_ADDR_X, NOC_ADDR_Y, MCAST_NOC_END_ADDR_X, MCAST_NOC_END_ADDR_Y, NOC_MEM_ADDR); - noc_async_write_multicast(write_ptr, dst_noc_multicast_addr, page_size, NUM_MCAST_DESTS); + get_noc_multicast_addr(NOC_ADDR_X, NOC_ADDR_Y, MCAST_NOC_END_ADDR_X, MCAST_NOC_END_ADDR_Y, write_ptr); + 
noc_async_write_multicast(read_ptr, dst_noc_multicast_addr, page_size, NUM_MCAST_DESTS, LINKED); +#elif WRITE + uint64_t noc_write_addr = NOC_XY_ADDR(NOC_X(NOC_ADDR_X), NOC_Y(NOC_ADDR_Y), write_ptr); + noc_async_write(NOC_MEM_ADDR, noc_write_addr, page_size); #elif READ_ONE_PACKET noc_async_read_one_packet(noc_addr, read_ptr, page_size); #else @@ -33,16 +35,29 @@ void kernel_main() { #endif #if LATENCY - noc_async_read_barrier(); +#if WRITE +#if LINKED && ISSUE_MCAST + noc_async_write_multicast(cb_addr, dst_noc_multicast_addr, page_size, NUM_MCAST_DESTS, false); +#endif noc_async_write_barrier(); +#else + noc_async_read_barrier(); +#endif #endif - read_ptr += page_size; write_ptr += page_size; } } #if !LATENCY - noc_async_read_barrier(); +#if WRITE +#if LINKED && ISSUE_MCAST + uint64_t dst_noc_multicast_addr = + get_noc_multicast_addr(NOC_ADDR_X, NOC_ADDR_Y, MCAST_NOC_END_ADDR_X, MCAST_NOC_END_ADDR_Y, cb_addr); + noc_async_write_multicast(cb_addr, dst_noc_multicast_addr, page_size, NUM_MCAST_DESTS, false); +#endif noc_async_write_barrier(); +#else + noc_async_read_barrier(); +#endif #endif } diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/run_bw_and_latency.sh b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/run_bw_and_latency.sh deleted file mode 100755 index 90a4972019b..00000000000 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/run_bw_and_latency.sh +++ /dev/null @@ -1,80 +0,0 @@ -#!/bin/bash - -if [ "$ARCH_NAME" = "grayskull" ]; then - echo "Configured core range for grayskull" - max_x=11 - max_y=8 -elif [ "$ARCH_NAME" = "wormhole_b0" ]; then - echo "Configured core range for wormhole_b0" - max_x=7 - max_y=6 -elif [ "$ARCH_NAME" = "blackhole" ]; then - echo "Configured core range for blackhole" - max_x=12 - max_y=9 -else - echo "Unknown arch: $ARCH_NAME" - exit 1 -fi - -function get_half_way_away_core_x() { - half_way_away_core_x=$(( ($1 + (($max_x + 1) / 2)) % ($max_x + 1) )) - echo $half_way_away_core_x -} - -function get_half_way_away_core_y() { - 
half_way_away_core_y=$(( ($1 + (($max_y + 1) / 2)) % ($max_y + 1) )) - echo $half_way_away_core_y -} - -function read_from_half_way_away_core() { - half_way_away_core_x=$(get_half_way_away_core_x $1) - half_way_away_core_y=$(get_half_way_away_core_y $2) - echo "./build/test/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency -m 2 -rx $1 -ry $2 -sx $half_way_away_core_x -sy $half_way_away_core_y" - ./build/test/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency -m 2 -rx $1 -ry $2 -sx $half_way_away_core_x -sy $half_way_away_core_y -} - -function mcast_write_to_half_way_away_core() { - half_way_away_core_x=$(get_half_way_away_core_x $1) - half_way_away_core_y=$(get_half_way_away_core_y $2) - echo "./build/test/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency -m 6 -rx $1 -ry $2 -sx $half_way_away_core_x -sy $half_way_away_core_y -tx $half_way_away_core_x -ty $half_way_away_core_y" - ./build/test/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency -m 6 -rx $1 -ry $2 -sx $half_way_away_core_x -sy $half_way_away_core_y -tx $half_way_away_core_x -ty $half_way_away_core_y -} - -function mcast_write_to_adjacent_core() { - adj_core_y=$(($2 + 1)) - if [ $adj_core_y -gt $max_y ]; then - adj_core_y=$(($2 - 1)) - fi - echo "./build/test/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency -m 6 -rx $1 -ry $2 -sx $1 -sy $adj_core_y -tx $1 -ty $adj_core_y" - ./build/test/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency -m 6 -rx $1 -ry $2 -sx $1 -sy $adj_core_y -tx $1 -ty $adj_core_y -} - -function mcast_write_from_core_after_curr_core_to_half_way_away_core() { - half_way_away_core_x=$(get_half_way_away_core_x $1) - half_way_away_core_y=$(get_half_way_away_core_y $2) - mcast_start_y=$(($2 + 1)) - echo "./build/test/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency -m 6 -rx $1 -ry $2 -sx $1 -sy $mcast_start_y -tx $half_way_away_core_x -ty $half_way_away_core_y" - ./build/test/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency 
-m 6 -rx $1 -ry $2 -sx $1 -sy $mcast_start_y -tx $half_way_away_core_x -ty $half_way_away_core_y -} - -for ((x=0; x<=max_x; x++)); do - for ((y=0; y<=max_y; y++)); do - read_from_half_way_away_core $x $y - mcast_write_to_half_way_away_core $x $y - mcast_write_to_adjacent_core $x $y - mcast_write_from_core_after_curr_core_to_half_way_away_core $x $y - - if [ $y -eq 0 ]; then - mcast_start_y=$(($y + 1)) - echo "./build/test/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency -m 6 -rx $x -ry $y -sx 0 -sy $mcast_start_y -tx $max_x -ty $max_y" - ./build/test/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency -m 6 -rx $x -ry $y -sx 0 -sy $mcast_start_y -tx $max_x -ty $max_y - fi - - if [ $y -eq $max_y ]; then - mcast_end_y=$(($y - 1)) - echo "./build/test/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency -m 6 -rx $x -ry $y -sx 0 -sy 0 -tx $max_x -ty $mcast_end_y" - ./build/test/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency -m 6 -rx $x -ry $y -sx 0 -sy 0 -tx $max_x -ty $mcast_end_y - fi - done -done diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/sweep_bw_and_latency.sh b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/sweep_bw_and_latency.sh new file mode 100755 index 00000000000..ccc3ec1e8d2 --- /dev/null +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/sweep_bw_and_latency.sh @@ -0,0 +1,108 @@ +#!/bin/bash + +# Run this test w/ +# sweep_bw_and_latency.sh 2>&1 | tee log +# then run +# filt_pgm_dispatch.pl +# then paste the results into the BW spreadsheet + +if [ "$ARCH_NAME" = "grayskull" ]; then + echo "Configured core range for grayskull" + max_x=11 + max_y=8 +elif [ "$ARCH_NAME" = "wormhole_b0" ]; then + echo "Configured core range for wormhole_b0" + max_x=7 + max_y=6 +elif [ "$ARCH_NAME" = "blackhole" ]; then + echo "Configured core range for blackhole" + max_x=12 + max_y=9 +else + echo "Unknown arch: $ARCH_NAME" + exit 1 +fi + +function get_half_way_away_core_x() { + half_way_away_core_x=$(( ($1 + 
(($max_x + 1) / 2)) % ($max_x + 1) )) + echo $half_way_away_core_x +} + +function get_half_way_away_core_y() { + half_way_away_core_y=$(( ($1 + (($max_y + 1) / 2)) % ($max_y + 1) )) + echo $half_way_away_core_y +} + +hx=$(get_half_way_away_core_x 0); +hy=$(get_half_way_away_core_y 0); + +function run_one() { + echo "Running $@" + build/test/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency $@ +} + +function bw_test() { + run_one -bs 8 -p 16 $@ + run_one -bs 8 -p 32 $@ + run_one -bs 16 -p 64 $@ + run_one -bs 32 -p 128 $@ + run_one -bs 64 -p 256 $@ + run_one -bs 128 -p 512 $@ + run_one -bs 256 -p 1024 $@ + run_one -bs 256 -p 2048 $@ + run_one -bs 256 -p 4096 $@ + run_one -bs 256 -p 8192 $@ + run_one -bs 256 -p 16384 $@ + run_one -bs 256 -p 32768 $@ + run_one -bs 256 -p 65536 $@ +} + +function latency_test() { + run_one -bs 8 -p 16 -l $@ +} + +echo "###" read pcie +bw_test "-m 0" +latency_test "-m 0" + +echo "###" read dram +bw_test "-m 1" +latency_test "-m 1" + +echo "###" read drams +bw_test "-m 3" +latency_test "-m 3" + +echo "###" read l1 adjacent +bw_test "-m 2" +latency_test "-m 2" + +echo "###" read l1 far halfway away +bw_test "-m 2 -rx 0 -ry 0 -sx $hx -sy $hy" +latency_test "-m 2 -rx 0 -ry 0 -sx $hx -sy $hy" + +echo "###" read local +bw_test "-m 2 -rx 0" +latency_test "-m 2 -rx 0" + +echo "###" write l1 far halfway away +bw_test "-m 2 -rx 0 -ry 0 -sx $hx -sy $hy -wr" +latency_test "-m 2 -rx 0 -ry 0 -sx $hx -sy $hy -wr" + +echo "###" mcast write to adjacent +bw_test "-m 6 -rx 0 -ry 0 -sx 1 -sy 0 -tx 1 -ty 0" +latency_test "-m 6 -rx 0 -ry 0 -sx 1 -sy 0 -tx 1 -ty 0" + +echo "###" mcast write to halfway away +bw_test "-m 6 -rx 0 -ry 0 -sx $hx -sy $hy -tx $hx -ty $hy" +latency_test "-m 6 -rx 0 -ry 0 -sx $hx -sy $hy -tx $hx -ty $hy" + +echo "###" mcast write to all +bw_test "-m 6 -rx 0 -ry 0 -sx 0 -sy 1 -tx $max_x -ty $max_y" +latency_test "-m 6 -rx 0 -ry 0 -sx 0 -sy 1 -tx $max_x -ty $max_y" + +echo "###" mcast write to all, linked +bw_test "-m 6 -rx 0 -ry 
0 -sx 0 -sy 1 -tx $max_x -ty $max_y -link" +latency_test "-m 6 -rx 0 -ry 0 -sx 0 -sy 1 -tx $max_x -ty $max_y -link" + +echo "###" done diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp index d13994ded6e..34283f67937 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp @@ -48,6 +48,8 @@ bool page_size_as_runtime_arg_g; // useful particularly on GS multi-dram tests ( bool hammer_write_reg_g = false; bool hammer_pcie_g = false; bool hammer_pcie_type_g = false; +bool test_write = false; +bool linked = false; void init(int argc, char **argv) { std::vector input_args(argv, argv + argc); @@ -67,10 +69,12 @@ void init(int argc, char **argv) { log_info(LogTest, " -sy: when reading from L1, Y of core to read from. when issuing a multicast write, Y of start core to write to. (default {})", 0); log_info(LogTest, " -tx: when issuing a multicast write, X of end core to write to (default {})", 0); log_info(LogTest, " -ty: when issuing a multicast write, Y of end core to write to (default {})", 0); + log_info(LogTest, " -wr: issue unicast write instead of read (default false)"); log_info(LogTest, " -c: when reading from dram, DRAM channel (default 0)"); log_info(LogTest, " -f: time just the finish call (use w/ lazy mode) (default disabled)"); log_info(LogTest, " -o: use read_one_packet API. 
restricts page size to 8K max (default {})", 0); log_info(LogTest, " -z: enable dispatch lazy mode (default disabled)"); + log_info(LogTest, "-link: link mcast transactions"); log_info(LogTest, " -hr: hammer write_reg while executing (for PCIe test)"); log_info(LogTest, " -hp: hammer hugepage PCIe memory while executing (for PCIe test)"); log_info(LogTest, " -hpt:hammer hugepage PCIe hammer type: 0:32bit writes 1:128bit non-temporal writes"); @@ -104,6 +108,14 @@ void init(int argc, char **argv) { } page_count_g = size_bytes / page_size_g; + test_write = test_args::has_command_option(input_args, "-wr"); + if (test_write && (source_mem_g != 2 && source_mem_g != 6)) { + log_info(LogTest, "Writing only tested w/ L1 destination\n"); + exit(-1); + } + + linked = test_args::has_command_option(input_args, "-link"); + worker_g = CoreRange({core_x, core_y}, {core_x, core_y}); src_worker_g = {src_core_x, src_core_y}; @@ -200,7 +212,7 @@ int main(int argc, char **argv) { break; case 2: { - src_mem = "FROM_L1"; + src_mem = test_write ? 
"TO_L1" : "FROM_L1"; CoreCoord w = device->physical_core_from_logical_core(src_worker_g, CoreType::WORKER); noc_addr_x = w.x; noc_addr_y = w.y; @@ -245,6 +257,7 @@ int main(int argc, char **argv) { noc_addr_y = start.y; mcast_noc_addr_end_x = end.x; mcast_noc_addr_end_y = end.y; + test_write = true; } break; } @@ -259,6 +272,8 @@ int main(int argc, char **argv) { {"READ_ONE_PACKET", std::to_string(read_one_packet_g)}, {"DRAM_BANKED", std::to_string(dram_banked)}, {"ISSUE_MCAST", std::to_string(issue_mcast)}, + {"WRITE", std::to_string(test_write)}, + {"LINKED", std::to_string(linked)}, {"NUM_MCAST_DESTS", std::to_string(num_mcast_dests)}, {"MCAST_NOC_END_ADDR_X", std::to_string(mcast_noc_addr_end_x)}, {"MCAST_NOC_END_ADDR_Y", std::to_string(mcast_noc_addr_end_y)} @@ -284,27 +299,29 @@ int main(int argc, char **argv) { CoreCoord w = device->physical_core_from_logical_core(worker_g.start_coord, CoreType::WORKER); log_info(LogTest, "Master core: {}", w.str()); + string direction = test_write ? 
"Writing" : "Reading"; if (source_mem_g == 3) { - log_info(LogTest, "Reading: {}", src_mem); + log_info(LogTest, "{}: {}", direction, src_mem); } else if (source_mem_g == 4) { - log_info(LogTest, "Reading: {} - core ({}, {})", src_mem, w.x, w.y); + log_info(LogTest, "{}: {} - core ({}, {})", direction, src_mem, w.x, w.y); } else if (source_mem_g == 5) { - log_info(LogTest, "Writing: {} - core ({}, {})", src_mem, w.x, w.y); + log_info(LogTest, "Writing: {} - core ({}, {})", src_mem, w.x, w.y); } else if (source_mem_g == 6) { - log_info(LogTest, "Writing: {} - core grid [({}, {}) - ({}, {})]", src_mem, noc_addr_x, noc_addr_y, mcast_noc_addr_end_x, mcast_noc_addr_end_y); + log_info(LogTest, "{}: {} - core grid [({}, {}) - ({}, {})]", direction, src_mem, noc_addr_x, noc_addr_y, mcast_noc_addr_end_x, mcast_noc_addr_end_y); } else { - log_info(LogTest, "Reading: {} - core ({}, {})", src_mem, noc_addr_x, noc_addr_y); + log_info(LogTest, "{}: {} - core ({}, {})", direction, src_mem, noc_addr_x, noc_addr_y); } if (source_mem_g < 4 || source_mem_g == 6) { std::string api; + string read_write = test_write ? "write" : "read"; if (issue_mcast) { - api = "noc_async_write_multicast"; + api = "noc_async_" + read_write + "_multicast"; } else if (read_one_packet_g) { - api = "noc_async_read_one_packet"; + api = test_write ? "noc_async_write" : "noc_async_read_one_packet"; } else { - api = "noc_async_read"; + api = "noc_async_" + read_write; } log_info(LogTest, "Using API: {}", api); log_info(LogTest, "Lazy: {}", lazy_g);