diff --git a/.github/workflows/bisect-dispatch.yaml b/.github/workflows/bisect-dispatch.yaml index dce44222ea7..3905c86cd0c 100644 --- a/.github/workflows/bisect-dispatch.yaml +++ b/.github/workflows/bisect-dispatch.yaml @@ -50,12 +50,12 @@ on: required: true type: string description: "Timeout (eg: 5m, 1h)" - description: - type: string - default: "Git bisect dispatch" + patch: required: false + type: string + description: "Commit-ish to cherry-pick for each step" -run-name: ${{ inputs.description }} +run-name: "Bisect on ${{ inputs.runner-label }}" jobs: build-artifact: uses: ./.github/workflows/build-artifact.yaml @@ -88,8 +88,11 @@ jobs: - uses: ./.github/actions/install-python-deps - name: Run Git Bisect shell: bash + env: + GIT_COMMITTER_NAME: "GitHub Actions" + GIT_COMMITTER_EMAIL: "actions@github.com" run: | source ${{ github.workspace }}/python_env/bin/activate cd $TT_METAL_HOME export PYTHONPATH=$TT_METAL_HOME - ./tests/scripts/tt_bisect.sh -t ${{ inputs.timeout }} -f "${{ inputs.command }}" -b ${{ inputs.bad-commit }} -g ${{ inputs.good-commit }} + ./tests/scripts/tt_bisect.sh -t ${{ inputs.timeout }} -f "${{ inputs.command }}" -b ${{ inputs.bad-commit }} -g ${{ inputs.good-commit }} -p "${{ inputs.patch }}" diff --git a/.github/workflows/tgg-unit-tests-impl.yaml b/.github/workflows/tgg-unit-tests-impl.yaml index 140230c82b2..22a56b63189 100644 --- a/.github/workflows/tgg-unit-tests-impl.yaml +++ b/.github/workflows/tgg-unit-tests-impl.yaml @@ -2,6 +2,13 @@ name: "[internal] TGG unit tests impl" on: workflow_call: + inputs: + docker-image: + required: true + type: string + wheel-artifact-name: + required: true + type: string jobs: TGG-tests: @@ -17,26 +24,60 @@ jobs: }, ] name: ${{ matrix.test-group.name }} - env: - ARCH_NAME: ${{ matrix.test-group.arch }} - LOGURU_LEVEL: INFO - LD_LIBRARY_PATH: ${{ github.workspace }}/build/lib runs-on: ${{ matrix.test-group.runs-on }} + container: + image: ${{ inputs.docker-image }} + env: + TT_METAL_HOME: /work + 
PYTHONPATH: /work + LD_LIBRARY_PATH: /work/build/lib + LOGURU_LEVEL: INFO + ARCH_NAME: ${{ matrix.test-group.arch }} + volumes: + - ${{ github.workspace }}/docker-job:/work # Subdir to workaround https://github.com/actions/runner/issues/691 + - /dev/hugepages-1G:/dev/hugepages-1G + - /mnt/MLPerf:/mnt/MLPerf + options: "--device /dev/tenstorrent" + defaults: + run: + shell: bash + working-directory: /work # https://github.com/actions/runner/issues/878 steps: - - uses: tenstorrent/tt-metal/.github/actions/checkout-with-submodule-lfs@main - - name: Set up dynamic env vars for build - run: | - echo "TT_METAL_HOME=$(pwd)" >> $GITHUB_ENV + - name: ⬇️ Checkout + uses: actions/checkout@v4 + with: + submodules: recursive + path: docker-job # Here be dragons; keep it scoped to our desired volume, yet must be under github.workspace and be sure to clean up at the end - uses: actions/download-artifact@v4 with: name: TTMetal_build_any + path: /work - name: Extract files run: tar -xvf ttm_any.tar - - uses: ./.github/actions/install-python-deps + - name: ⬇️ Download Wheel + uses: actions/download-artifact@v4 + with: + name: ${{ inputs.wheel-artifact-name }} + path: /work + - name: Install Wheel + run: | + WHEEL_FILENAME=$(ls -1 *.whl) + pip3 install $WHEEL_FILENAME - name: Run unit regression tests timeout-minutes: 60 run: | - source ${{ github.workspace }}/python_env/bin/activate - cd $TT_METAL_HOME - export PYTHONPATH=$TT_METAL_HOME + set -x + pwd + echo $PYTHONPATH + ls -al ${{ matrix.test-group.cmd }} + - name: Cleanup + if: always() + run: | + # We are forced to checkout the repo into a subdir of the host's workdir; this pollutes the host + # with root-owned files. Be sure to clean up after ourselves in case we're on a non-ephemeral runner. 
+ echo "pre rm" + ls -al /__w/tt-metal/tt-metal + rm -rf /__w/tt-metal/tt-metal/docker-job + echo "post rm" + ls -al /__w/tt-metal/tt-metal diff --git a/.github/workflows/tgg-unit-tests.yaml b/.github/workflows/tgg-unit-tests.yaml index 6c42ff61f4f..9d1bba42a64 100644 --- a/.github/workflows/tgg-unit-tests.yaml +++ b/.github/workflows/tgg-unit-tests.yaml @@ -9,7 +9,12 @@ jobs: build-artifact: uses: ./.github/workflows/build-artifact.yaml secrets: inherit + with: + build-wheel: true TGG-tests: needs: build-artifact secrets: inherit uses: ./.github/workflows/tgg-unit-tests-impl.yaml + with: + docker-image: ${{ needs.build-artifact.outputs.ci-build-docker-image }} + wheel-artifact-name: ${{ needs.build-artifact.outputs.wheel-artifact-name }} diff --git a/tests/scripts/run_tests.sh b/tests/scripts/run_tests.sh index 0f4d4480a11..a048cd440c5 100755 --- a/tests/scripts/run_tests.sh +++ b/tests/scripts/run_tests.sh @@ -431,7 +431,13 @@ set_up_chdir() { return fi done - echo "Could not find the 'tt-metal' directory in your PYTHONPATH." 1>&2 + for ENTRY in "${ENTRIES[@]}"; do + if [[ -d "$ENTRY/tt_metal" ]]; then + cd "$ENTRY" + return + fi + done + echo "Could not find the 'tt-metal' directory in your PYTHONPATH." 
1>&2 exit 1 } diff --git a/tests/scripts/t3000/run_t3000_unit_tests.sh b/tests/scripts/t3000/run_t3000_unit_tests.sh index e4e54a510b1..7f709db3316 100755 --- a/tests/scripts/t3000/run_t3000_unit_tests.sh +++ b/tests/scripts/t3000/run_t3000_unit_tests.sh @@ -26,6 +26,7 @@ run_t3000_ttmetal_tests() { ./build/programming_examples/distributed/distributed_program_dispatch ./build/programming_examples/distributed/distributed_buffer_rw ./build/programming_examples/distributed/distributed_eltwise_add + ./build/programming_examples/distributed/distributed_trace_and_events # Record the end time end_time=$(date +%s) diff --git a/tests/scripts/tt_bisect.sh b/tests/scripts/tt_bisect.sh index 5304803d18b..077656456d8 100755 --- a/tests/scripts/tt_bisect.sh +++ b/tests/scripts/tt_bisect.sh @@ -7,6 +7,8 @@ Flags: -f | --file : test file to run, also the test that broke -g | --good : good commit to start bisect -b | --bad : bad commit to start bisect + -p | --patch : commit-ish to cherry-pick onto each commit before building + -t | --timeout : timeout duration for the test Example: ./tests/scripts/tt_bisect.sh -f ./build/test/tt_metal/test_add_two_ints -b HEAD -g 1eb7930 If the test involves multiple words you have to do "test_file": @@ -19,7 +21,8 @@ source python_env/bin/activate export PYTHONPATH=$TT_METAL_HOME timeout_duration=2m -while getopts "f:g:b:t:" opt; do +patch="" +while getopts "f:g:b:t:p:" opt; do case $opt in f | file) test=$OPTARG @@ -33,6 +36,9 @@ while getopts "f:g:b:t:" opt; do t | timeout) timeout_duration=$OPTARG ;; + p | patch) + patch=$OPTARG + ;; \?) echo "Invalid option: -$OPTARG" >&2 exit 1 @@ -48,14 +54,20 @@ fi echo "Time to find who broke it :)" echo "Good commit:" $good_commit echo "Bad commit:" $bad_commit +if ([ ! 
-z "$patch" ]); then + echo "Cherry-pick commit:" $patch +fi found=false git bisect start $bad_commit $good_commit -- while [[ "$found" = "false" ]]; do - git submodule update --recursive echo "::group::Building `git rev-parse HEAD`" + if ([ ! -z "$patch" ]); then + git cherry-pick $patch + fi + git submodule update --recursive build_rc=0 ./build_metal.sh --build-tests > /dev/null || build_rc=$? echo "::endgroup::" @@ -70,6 +82,11 @@ while [[ "$found" = "false" ]]; do timeout_rc=0 timeout "$timeout_duration" bash -c "$test" || timeout_rc=$? echo "Exit code: $timeout_rc" + + if ([ ! -z "$patch" ]); then + # Must reset HEAD or git bisect good/bad will retry the merge base and we'll be stuck in a loop + git reset --hard HEAD^ + fi echo "::endgroup::" if [ $timeout_rc -eq 0 ]; then diff --git a/tests/tt_metal/distributed/CMakeLists.txt b/tests/tt_metal/distributed/CMakeLists.txt index 922e19ef993..88890c7eded 100644 --- a/tests/tt_metal/distributed/CMakeLists.txt +++ b/tests/tt_metal/distributed/CMakeLists.txt @@ -1,7 +1,8 @@ set(UNIT_TESTS_DISTRIBUTED_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/test_distributed.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_buffer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_coord.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_device.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_device_reshape.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_workload.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_sub_device.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_allocator.cpp diff --git a/tests/tt_metal/distributed/test_distributed.cpp b/tests/tt_metal/distributed/test_distributed.cpp deleted file mode 100644 index bf8877879e3..00000000000 --- a/tests/tt_metal/distributed/test_distributed.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include "tests/tt_metal/tt_metal/common/multi_device_fixture.hpp" - -namespace tt::tt_metal::distributed::test { -namespace { - -TEST_F(T3000MeshDeviceFixture, SimpleMeshDeviceTest) { - EXPECT_EQ(mesh_device_->num_devices(), 8); - EXPECT_EQ(mesh_device_->num_rows(), 2); - EXPECT_EQ(mesh_device_->num_cols(), 4); -} - -TEST(MeshDeviceSuite, Test1x1SystemMeshInitialize) { - auto& sys = tt::tt_metal::distributed::SystemMesh::instance(); - - auto config = tt::tt_metal::distributed::MeshDeviceConfig{.mesh_shape = MeshShape(1, 1)}; - - EXPECT_NO_THROW({ - auto mesh = tt::tt_metal::distributed::MeshDevice::create( - config, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); - mesh->close(); - }); -} - -} // namespace -} // namespace tt::tt_metal::distributed::test diff --git a/tests/tt_metal/distributed/test_mesh_device.cpp b/tests/tt_metal/distributed/test_mesh_device.cpp new file mode 100644 index 00000000000..c87c87cae35 --- /dev/null +++ b/tests/tt_metal/distributed/test_mesh_device.cpp @@ -0,0 +1,93 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#include "mesh_device.hpp" +#include "system_mesh.hpp" + +#include "tests/tt_metal/tt_metal/common/multi_device_fixture.hpp" + +namespace tt::tt_metal::distributed { +namespace { + +using ::testing::IsEmpty; +using ::testing::SizeIs; +using ::tt::tt_metal::distributed::MeshContainer; + +TEST(MeshDeviceInitTest, Init1x1Mesh) { + auto& sys = SystemMesh::instance(); + + auto config = tt::tt_metal::distributed::MeshDeviceConfig{.mesh_shape = MeshShape(1, 1)}; + + EXPECT_NO_THROW({ + auto mesh = tt::tt_metal::distributed::MeshDevice::create( + config, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); + mesh->close(); + }); +} + +using MeshDeviceTest = T3000MeshDeviceFixture; + +TEST_F(MeshDeviceTest, SystemMeshTearDownWithoutClose) { + auto& sys = SystemMesh::instance(); + + const auto system_shape = sys.get_shape(); + ASSERT_EQ(system_shape.dims(), 2); + EXPECT_EQ(system_shape[0], 2); + EXPECT_EQ(system_shape[1], 4); +} + +TEST_F(MeshDeviceTest, MemoryAllocationStatistics) { + auto stats = mesh_device_->allocator()->get_statistics(tt::tt_metal::BufferType::DRAM); + for (auto* device : mesh_device_->get_devices()) { + auto device_stats = device->allocator()->get_statistics(tt::tt_metal::BufferType::DRAM); + EXPECT_EQ(stats.total_allocatable_size_bytes, device_stats.total_allocatable_size_bytes); + } +} + +TEST_F(MeshDeviceTest, NumDramChannels) { + EXPECT_EQ(mesh_device_->num_dram_channels(), 96); // 8 devices * 12 channels +} + +TEST_F(MeshDeviceTest, ViewIs2D) { + std::vector devices = mesh_device_->get_devices(); + + MeshContainer container_1d(SimpleMeshShape(8), devices); + MeshDeviceView view_1d(container_1d); + EXPECT_FALSE(view_1d.is_mesh_2d()); + + MeshContainer container_2d(SimpleMeshShape(2, 4), devices); + MeshDeviceView view_2d(container_2d); + EXPECT_TRUE(view_2d.is_mesh_2d()); + + MeshContainer container_3d(SimpleMeshShape(2, 2, 2), devices); + 
MeshDeviceView view_3d(container_3d); + EXPECT_FALSE(view_3d.is_mesh_2d()); +} + +TEST_F(MeshDeviceTest, Submesh) { + EXPECT_EQ(mesh_device_->shape().num_rows, 2); + EXPECT_EQ(mesh_device_->shape().num_cols, 4); + EXPECT_THAT(mesh_device_->get_devices(), SizeIs(8)); + EXPECT_TRUE(mesh_device_->is_parent_mesh()); + EXPECT_THAT(mesh_device_->get_submeshes(), IsEmpty()); + + auto submesh = mesh_device_->create_submesh(MeshShape{1, 2}, MeshOffset{1, 1}); + EXPECT_THAT(mesh_device_->get_submeshes(), SizeIs(1)); + EXPECT_EQ(submesh->shape().num_rows, 1); + EXPECT_EQ(submesh->shape().num_cols, 2); + EXPECT_THAT(submesh->get_devices(), SizeIs(2)); + EXPECT_FALSE(submesh->is_parent_mesh()); + EXPECT_THAT(submesh->get_submeshes(), IsEmpty()); + + // Verify coordinates are correct. + EXPECT_EQ(mesh_device_->get_device(MeshCoordinate{1, 1})->id(), submesh->get_device(MeshCoordinate{0, 0})->id()); + EXPECT_EQ(mesh_device_->get_device(MeshCoordinate{1, 2})->id(), submesh->get_device(MeshCoordinate{0, 1})->id()); + EXPECT_EQ(submesh->get_device(1, 1), nullptr); +} + +} // namespace +} // namespace tt::tt_metal::distributed diff --git a/tests/ttnn/distributed/test_distributed_reshape.cpp b/tests/tt_metal/distributed/test_mesh_device_reshape.cpp similarity index 62% rename from tests/ttnn/distributed/test_distributed_reshape.cpp rename to tests/tt_metal/distributed/test_mesh_device_reshape.cpp index f3a085d0700..893ad9aca1a 100644 --- a/tests/ttnn/distributed/test_distributed_reshape.cpp +++ b/tests/tt_metal/distributed/test_mesh_device_reshape.cpp @@ -6,29 +6,20 @@ #include #include #include -#include -#include + +#include "host_api.hpp" +#include "mesh_config.hpp" +#include "mesh_device.hpp" #include "mesh_coord.hpp" + +#include "system_mesh.hpp" #include "tests/tt_metal/test_utils/env_vars.hpp" -namespace ttnn::distributed::test { +namespace tt::tt_metal::distributed { namespace { using ::testing::SizeIs; -// Helper function to check test environment -void 
check_t3k_test_environment() { - auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); - const auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); - const size_t num_devices = tt::tt_metal::GetNumAvailableDevices(); - if (slow_dispatch) { - GTEST_SKIP() << "Skipping Multi-Device test suite, since it can only be run in Fast Dispatch Mode."; - } - if (num_devices < 8 or arch != tt::ARCH::WORMHOLE_B0) { - GTEST_SKIP() << "Skipping T3K Multi-Device test suite on non T3K machine."; - } -} - std::vector get_physical_device_ids(const MeshDevice& mesh) { std::vector device_ids; for (auto* device : mesh.get_devices()) { @@ -37,46 +28,56 @@ std::vector get_physical_device_ids(const MeshDevice& mesh) { return device_ids; } -static constexpr std::array kMeshShapes{ - {{1, 1}, {1, 2}, {1, 3}, {1, 4}, {1, 5}, {1, 6}, {1, 7}, {1, 8}, {2, 1}, {2, 2}, {2, 3}, {2, 4}, - {3, 1}, {3, 2}, {4, 1}, {4, 2}, {8, 1}, {7, 1}, {6, 1}, {5, 1}, {4, 1}, {3, 1}, {2, 1}, {1, 1}}}; - -class MeshConfigurationTest : public ::testing::TestWithParam { -protected: - void SetUp() override { check_t3k_test_environment(); } +class T3KTestFixture : public ::testing::Test { +public: + void SetUp() override { + auto slow_dispatch = getenv("TT_METAL_SLOW_DISPATCH_MODE"); + const auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); + const size_t num_devices = tt::tt_metal::GetNumAvailableDevices(); + if (slow_dispatch) { + GTEST_SKIP() << "Skipping Multi-Device test suite, since it can only be run in Fast Dispatch Mode."; + } + if (num_devices < 8 or arch != tt::ARCH::WORMHOLE_B0) { + GTEST_SKIP() << "Skipping T3K Multi-Device test suite on non T3K machine."; + } + } }; +constexpr std::array kMeshShapes{{{1, 1}, {1, 2}, {1, 3}, {1, 4}, {1, 5}, {1, 6}, {1, 7}, {1, 8}, + {2, 1}, {2, 2}, {2, 3}, {2, 4}, {3, 1}, {3, 2}, {4, 1}, {4, 2}, + {8, 1}, {7, 1}, {6, 1}, {5, 1}, {4, 1}, {3, 1}, {2, 1}, {1, 1}}}; + +class MeshConfigurationTest : public T3KTestFixture, public 
::testing::WithParamInterface {}; + TEST_P(MeshConfigurationTest, MeshConfigurations) { const auto& shape = GetParam(); - auto mesh = ttnn::distributed::open_mesh_device( - {shape.num_rows, shape.num_cols}, + auto mesh = tt::tt_metal::distributed::MeshDevice::create( + MeshDeviceConfig{.mesh_shape = SimpleMeshShape(shape.num_rows, shape.num_cols)}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); EXPECT_EQ(mesh->num_rows(), shape.num_rows); EXPECT_EQ(mesh->num_cols(), shape.num_cols); - ttnn::distributed::close_mesh_device(mesh); + mesh->close(); } TEST_P(MeshConfigurationTest, GetPhysicalDeviceIds) { const auto& shape = GetParam(); - auto& system_mesh = tt::tt_metal::distributed::SystemMesh::instance(); + auto& system_mesh = SystemMesh::instance(); EXPECT_THAT( system_mesh.get_mapped_physical_device_ids(MeshDeviceConfig{.mesh_shape = SimpleMeshShape(shape)}), SizeIs(shape.num_cols * shape.num_rows)); } // Test all possible mesh configurations on T3000 -INSTANTIATE_TEST_SUITE_P(MeshShapes, MeshConfigurationTest, ::testing::ValuesIn(kMeshShapes)); +INSTANTIATE_TEST_SUITE_P(AllMeshShapes, MeshConfigurationTest, ::testing::ValuesIn(kMeshShapes)); -class MeshReshapeTest : public ::testing::TestWithParam> { -protected: - void SetUp() override { check_t3k_test_environment(); } -}; +class MeshDeviceReshapeRoundtripTest : public T3KTestFixture, + public ::testing::WithParamInterface> {}; -TEST_P(MeshReshapeTest, ReshapeBetweenConfigurations) { +TEST_P(MeshDeviceReshapeRoundtripTest, ReshapeBetweenConfigurations) { const auto& [old_shape, new_shape] = GetParam(); if ((old_shape.num_rows * old_shape.num_cols) != (new_shape.num_rows * new_shape.num_cols)) { @@ -86,8 +87,8 @@ TEST_P(MeshReshapeTest, ReshapeBetweenConfigurations) { GTEST_SKIP() << "Old shape is 1xN or Nx1; we test this in From1x4To2x2Invalid"; } - auto mesh = ttnn::distributed::open_mesh_device( - {old_shape.num_rows, old_shape.num_cols}, + auto mesh = 
tt::tt_metal::distributed::MeshDevice::create( + MeshDeviceConfig{.mesh_shape = SimpleMeshShape(old_shape.num_rows, old_shape.num_cols)}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, @@ -112,17 +113,14 @@ TEST_P(MeshReshapeTest, ReshapeBetweenConfigurations) { // Generate all possible combinations of shapes from kMeshShapes INSTANTIATE_TEST_SUITE_P( - ReshapeConfigurations, - MeshReshapeTest, + AllMeshShapes, + MeshDeviceReshapeRoundtripTest, ::testing::Combine(::testing::ValuesIn(kMeshShapes), ::testing::ValuesIn(kMeshShapes))); // Base class for non-parameterized tests -class T3000ReshapeTest : public ::testing::Test { -protected: - void SetUp() override { check_t3k_test_environment(); } -}; +using MeshDeviceReshapeTest = T3KTestFixture; -TEST_F(T3000ReshapeTest, InvalidRequestedShape) { +TEST_F(MeshDeviceReshapeTest, InvalidRequestedShape) { auto& system_mesh = tt::tt_metal::distributed::SystemMesh::instance(); // Shape too big. @@ -144,9 +142,13 @@ TEST_F(T3000ReshapeTest, InvalidRequestedShape) { MeshDeviceConfig{.mesh_shape = SimpleMeshShape(8), .offset = MeshCoordinate(1)})); } -TEST_F(T3000ReshapeTest, InvalidReshapeDimensions) { - auto mesh = ttnn::distributed::open_mesh_device( - {1, 8}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); +TEST_F(MeshDeviceReshapeTest, InvalidReshapeDimensions) { + auto mesh = tt::tt_metal::distributed::MeshDevice::create( + MeshDeviceConfig{.mesh_shape = SimpleMeshShape(1, 8)}, + DEFAULT_L1_SMALL_SIZE, + DEFAULT_TRACE_REGION_SIZE, + 1, + tt::tt_metal::DispatchCoreType::WORKER); // Test reshaping to dimensions that don't match total device count EXPECT_THROW(mesh->reshape({3, 3}), std::runtime_error); // 9 devices != 8 @@ -157,9 +159,13 @@ TEST_F(T3000ReshapeTest, InvalidReshapeDimensions) { EXPECT_EQ(mesh->num_cols(), 8); } -TEST_F(T3000ReshapeTest, From1x8To2x4ThenBackTo1x8) { - auto mesh = ttnn::distributed::open_mesh_device( - {1, 8}, DEFAULT_L1_SMALL_SIZE, 
DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); +TEST_F(MeshDeviceReshapeTest, From1x8To2x4ThenBackTo1x8) { + auto mesh = tt::tt_metal::distributed::MeshDevice::create( + MeshDeviceConfig{.mesh_shape = SimpleMeshShape(1, 8)}, + DEFAULT_L1_SMALL_SIZE, + DEFAULT_TRACE_REGION_SIZE, + 1, + tt::tt_metal::DispatchCoreType::WORKER); EXPECT_EQ(mesh->num_rows(), 1); EXPECT_EQ(mesh->num_cols(), 8); @@ -187,9 +193,13 @@ TEST_F(T3000ReshapeTest, From1x8To2x4ThenBackTo1x8) { EXPECT_EQ(mesh->get_device_ids(), original_order); } -TEST_F(T3000ReshapeTest, InvalidTotalDeviceCount) { - auto mesh = ttnn::distributed::open_mesh_device( - {1, 8}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); +TEST_F(MeshDeviceReshapeTest, InvalidTotalDeviceCount) { + auto mesh = tt::tt_metal::distributed::MeshDevice::create( + MeshDeviceConfig{.mesh_shape = SimpleMeshShape(1, 8)}, + DEFAULT_L1_SMALL_SIZE, + DEFAULT_TRACE_REGION_SIZE, + 1, + tt::tt_metal::DispatchCoreType::WORKER); // Test reshaping to dimensions that don't match total device count EXPECT_THROW(mesh->reshape({3, 3}), std::runtime_error); // 9 devices != 8 @@ -200,15 +210,19 @@ TEST_F(T3000ReshapeTest, InvalidTotalDeviceCount) { EXPECT_EQ(mesh->num_cols(), 8); } -TEST_F(T3000ReshapeTest, From1x4To2x2Invalid) { - auto mesh = ttnn::distributed::open_mesh_device( - {1, 4}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); +TEST_F(MeshDeviceReshapeTest, From1x4To2x2Invalid) { + auto mesh = tt::tt_metal::distributed::MeshDevice::create( + MeshDeviceConfig{.mesh_shape = SimpleMeshShape(1, 4)}, + DEFAULT_L1_SMALL_SIZE, + DEFAULT_TRACE_REGION_SIZE, + 1, + tt::tt_metal::DispatchCoreType::WORKER); // This is an invalid reshape because the 1x4 mesh does not fully cover the 2x2 mesh EXPECT_THROW(mesh->reshape({2, 2}), std::runtime_error); } -TEST_F(T3000ReshapeTest, From1x4To2x2Valid) { +TEST_F(MeshDeviceReshapeTest, From1x4To2x2Valid) 
{ auto& system_mesh = tt::tt_metal::distributed::SystemMesh::instance(); // Fetch the device ids for a physically connected 2x2 mesh. @@ -218,14 +232,12 @@ TEST_F(T3000ReshapeTest, From1x4To2x2Valid) { // Supply the physical device ids to the mesh constructor that we know we know is 2x2 physically connected. // We will create a 1x4 mesh and then reshape it to 2x2. - auto mesh = ttnn::distributed::open_mesh_device( - {1, 4}, + auto mesh = tt::tt_metal::distributed::MeshDevice::create( + MeshDeviceConfig{.mesh_shape = SimpleMeshShape(1, 4), .physical_device_ids = physical_device_ids}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, - tt::tt_metal::DispatchCoreType::WORKER, - MeshOffset{0, 0}, - physical_device_ids); + tt::tt_metal::DispatchCoreType::WORKER); mesh->reshape({2, 2}); EXPECT_EQ(mesh->num_rows(), 2); @@ -236,9 +248,13 @@ TEST_F(T3000ReshapeTest, From1x4To2x2Valid) { } } -TEST_F(T3000ReshapeTest, From2x2To1x4) { - auto mesh = ttnn::distributed::open_mesh_device( - {2, 2}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); +TEST_F(MeshDeviceReshapeTest, From2x2To1x4) { + auto mesh = tt::tt_metal::distributed::MeshDevice::create( + MeshDeviceConfig{.mesh_shape = SimpleMeshShape(2, 2)}, + DEFAULT_L1_SMALL_SIZE, + DEFAULT_TRACE_REGION_SIZE, + 1, + tt::tt_metal::DispatchCoreType::WORKER); auto mesh_2x2_device_ids = mesh->get_device_ids(); @@ -258,4 +274,4 @@ TEST_F(T3000ReshapeTest, From2x2To1x4) { } } // namespace -} // namespace ttnn::distributed::test +} // namespace tt::tt_metal::distributed diff --git a/tt_metal/api/tt-metalium/test_common.hpp b/tests/tt_metal/test_utils/test_common.hpp similarity index 99% rename from tt_metal/api/tt-metalium/test_common.hpp rename to tests/tt_metal/test_utils/test_common.hpp index 7a81c7a0732..dbcf2c50e25 100644 --- a/tt_metal/api/tt-metalium/test_common.hpp +++ b/tests/tt_metal/test_utils/test_common.hpp @@ -23,7 +23,7 @@ template constexpr std::false_type always_false{}; 
template -T parse(std::string const& s) { +T parse(const std::string& s) { if constexpr (std::is_same_v) { return std::stoul(s, 0, 0); } else if constexpr (std::is_same_v) { @@ -39,7 +39,7 @@ T parse(std::string const& s) { } } -inline std::string strip(std::string const& s) { +inline std::string strip(const std::string& s) { std::string whitespace = " \t\n"; std::size_t start = s.find_first_not_of(whitespace); std::size_t end = s.find_last_not_of(whitespace); diff --git a/tests/tt_metal/tt_metal/CMakeLists.txt b/tests/tt_metal/tt_metal/CMakeLists.txt index bafab7885dd..9065d45acd5 100644 --- a/tests/tt_metal/tt_metal/CMakeLists.txt +++ b/tests/tt_metal/tt_metal/CMakeLists.txt @@ -48,6 +48,7 @@ foreach(TEST_SRC ${TT_METAL_TESTS_SRCS}) PRIVATE "$" ${PROJECT_SOURCE_DIR}/tests + ${PROJECT_SOURCE_DIR}/tests/tt_metal/test_utils ${CMAKE_CURRENT_SOURCE_DIR} ) set_target_properties( diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp index ff359239b1e..4ab8453a76d 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/10_dram_read_remote_cb_sync/test_dram_read_remote_cb.cpp @@ -28,6 +28,8 @@ #include "tt_metal/tt_metal/common/matmul_test_utils.hpp" #include +#include "test_common.hpp" + using std::vector; using namespace tt; using std::chrono::duration_cast; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/test_remote_cb_sync_matmul.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/test_remote_cb_sync_matmul.cpp index 16ceb8092cd..784f8814af0 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/test_remote_cb_sync_matmul.cpp +++ 
b/tests/tt_metal/tt_metal/perf_microbenchmark/11_remote_cb_sync_matmul_single_core/test_remote_cb_sync_matmul.cpp @@ -27,6 +27,8 @@ #include "tt_metal/test_utils/deprecated/tensor.hpp" #include "tt_metal/tt_metal/common/matmul_test_utils.hpp" +#include "test_common.hpp" + using std::vector; using namespace tt; using std::chrono::duration_cast; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp index 38b82e910e7..24382b4ff73 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/1_compute_mm/test_compute_mm.cpp @@ -28,6 +28,8 @@ #include "tt_metal/tt_metal/common/matmul_test_utils.hpp" #include +#include "test_common.hpp" + using std::vector; using namespace tt; //////////////////////////////////////////////////////////////////////////////// diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/2_noc_adjacent/test_noc_adjacent.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/2_noc_adjacent/test_noc_adjacent.cpp index 11944860693..a877ef09d0a 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/2_noc_adjacent/test_noc_adjacent.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/2_noc_adjacent/test_noc_adjacent.cpp @@ -13,6 +13,8 @@ #include #include "tt_metal/tt_metal/perf_microbenchmark/common/util.hpp" +#include "test_common.hpp" + using namespace tt; using namespace tt::tt_metal; using std::chrono::duration_cast; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/2_noc_rtor/test_noc_rtor.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/2_noc_rtor/test_noc_rtor.cpp index 661d0018769..27cb5adcff2 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/2_noc_rtor/test_noc_rtor.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/2_noc_rtor/test_noc_rtor.cpp @@ -14,6 +14,8 @@ #include #include "tt_metal/tt_metal/perf_microbenchmark/common/util.hpp" 
+#include "test_common.hpp" + using namespace tt; using namespace tt::tt_metal; using std::chrono::duration_cast; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_pull_from_pcie.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_pull_from_pcie.cpp index 8d83a1b175b..9e7ff0e7f05 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_pull_from_pcie.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_pull_from_pcie.cpp @@ -16,6 +16,8 @@ #include #include "tt_metal/tt_metal/perf_microbenchmark/common/util.hpp" +#include "test_common.hpp" + using namespace tt; using namespace tt::tt_metal; using std::chrono::duration_cast; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_rw_buffer.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_rw_buffer.cpp index 306c3463bd3..6ce45cc0efe 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_rw_buffer.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/3_pcie_transfer/test_rw_buffer.cpp @@ -17,6 +17,8 @@ #include "tt_cluster.hpp" #include "tt_metal/tt_metal/perf_microbenchmark/common/util.hpp" +#include "test_common.hpp" + using namespace tt; using namespace tt::tt_metal; using std::chrono::duration_cast; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/6_dram_offchip/test_dram_offchip.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/6_dram_offchip/test_dram_offchip.cpp index df8fe9407aa..3a9589bc218 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/6_dram_offchip/test_dram_offchip.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/6_dram_offchip/test_dram_offchip.cpp @@ -19,6 +19,8 @@ #include "tt_metal/tt_metal/perf_microbenchmark/common/util.hpp" #include +#include "test_common.hpp" + using namespace tt; using std::chrono::duration_cast; using std::chrono::microseconds; diff --git 
a/tests/tt_metal/tt_metal/perf_microbenchmark/7_kernel_launch/test_kernel_launch.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/7_kernel_launch/test_kernel_launch.cpp index 9889aa430b9..2bc2d18553f 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/7_kernel_launch/test_kernel_launch.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/7_kernel_launch/test_kernel_launch.cpp @@ -13,6 +13,8 @@ #include #include "tt_metal/tt_metal/perf_microbenchmark/common/util.hpp" +#include "test_common.hpp" + using std::vector; using namespace tt; using namespace tt::tt_metal; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp index d40e9384635..554c85e559c 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/8_dram_adjacent_core_read/test_dram_read.cpp @@ -21,6 +21,8 @@ #include #include +#include "test_common.hpp" + using namespace tt; using std::chrono::duration_cast; using std::chrono::microseconds; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp index 301ceea8c21..9340465fe2c 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/9_dram_adjacent_read_remote_l1_write/test_dram_read_l1_write.cpp @@ -24,6 +24,8 @@ #include #include +#include "test_common.hpp" + using namespace tt; using std::chrono::duration_cast; using std::chrono::microseconds; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt b/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt index e4178cba02b..598e4125424 100644 --- 
a/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt @@ -79,6 +79,7 @@ foreach(arch ${ARCHITECTURES}) "$" ${PROJECT_SOURCE_DIR}/ttnn/cpp/ttnn/deprecated # this all should go away and be replaced with link to ttnn ${PROJECT_SOURCE_DIR}/tests + ${PROJECT_SOURCE_DIR}/tests/tt_metal/test_utils ${CMAKE_CURRENT_SOURCE_DIR} ) set_target_properties( diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp index 3053fd4c7ed..31f7c2296ed 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_bw_and_latency.cpp @@ -12,7 +12,7 @@ #include "logger.hpp" #include #include -#include +#include "test_common.hpp" #include #include #include diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp index e751187a2ab..d0f0fea005b 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_dispatcher.cpp @@ -14,6 +14,8 @@ #include #include "common.h" +#include "test_common.hpp" + constexpr uint32_t DEFAULT_ITERATIONS = 10000; constexpr uint32_t DEFAULT_WARMUP_ITERATIONS = 100; constexpr uint32_t DEFAULT_DISPATCH_BUFFER_LOG_PAGE_SIZE = 12; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp index b9e3aaaf083..0d9c0eefd8f 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_pgm_dispatch.cpp @@ -6,7 +6,7 @@ #include #include #include -#include +#include "test_common.hpp" #include #include #include diff --git 
a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp index 0b1dc88bec3..0c6b581e7c3 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/dispatch/test_prefetcher.cpp @@ -21,6 +21,8 @@ #include #include "llrt.hpp" +#include "test_common.hpp" + #define CQ_PREFETCH_CMD_BARE_MIN_SIZE tt::tt_metal::hal.get_alignment(tt::tt_metal::HalMemType::HOST) constexpr uint32_t DEFAULT_TEST_TYPE = 0; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp index 13eb1015602..73c0fb19225 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_global_l1.cpp @@ -12,7 +12,7 @@ #include #include #include -#include +#include "test_common.hpp" #include #include #include "dprint_server.hpp" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_local_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_local_l1.cpp index b15d222a21d..acef9bfcd07 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_local_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/matmul/matmul_local_l1.cpp @@ -11,7 +11,7 @@ #include #include #include -#include +#include "test_common.hpp" #include #include "dprint_server.hpp" #include "tt_metal/test_utils/deprecated/tensor.hpp" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1.cpp index 24580476130..20ce9327a65 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_global_l1.cpp @@ -11,7 
+11,7 @@ #include #include #include -#include +#include "test_common.hpp" #include #include "dprint_server.hpp" #include "tt_metal/test_utils/deprecated/tensor.hpp" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_local_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_local_l1.cpp index a08ec04c278..9ae53cb1e28 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_local_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/noc/test_noc_read_local_l1.cpp @@ -11,7 +11,7 @@ #include #include #include -#include +#include "test_common.hpp" #include #include "dprint_server.hpp" #include "tt_metal/test_utils/deprecated/tensor.hpp" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_enqueue_rw_buffer.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_enqueue_rw_buffer.cpp index caa962ab89e..da12baa481f 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_enqueue_rw_buffer.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_enqueue_rw_buffer.cpp @@ -8,7 +8,7 @@ #include #include -#include +#include "test_common.hpp" #include #include diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_buffer.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_buffer.cpp index 714e0b2af26..c1f5b1426f9 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_buffer.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_buffer.cpp @@ -9,7 +9,7 @@ #include #include #include -#include +#include "test_common.hpp" #include using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_dram.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_dram.cpp index 4ab4568663b..89dedffba0a 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_dram.cpp +++ 
b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_dram.cpp @@ -9,7 +9,7 @@ #include #include -#include +#include "test_common.hpp" #include using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_l1.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_l1.cpp index 04ae58dc362..844d2e4bb9e 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_l1.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/old/pcie/test_rw_device_l1.cpp @@ -9,7 +9,7 @@ #include #include -#include +#include "test_common.hpp" #include using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/routing_test_common.hpp similarity index 80% rename from tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp rename to tests/tt_metal/tt_metal/perf_microbenchmark/routing/routing_test_common.hpp index ad6c6eff13b..1dcd801b127 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_common.hpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/routing_test_common.hpp @@ -17,7 +17,8 @@ static inline std::string to_string(pkt_dest_size_choices_t choice) { } } -static inline void log_phys_coord_to_json(nlohmann::json& config, const std::vector& phys_cores, const std::string& name) { +static inline void log_phys_coord_to_json( + nlohmann::json& config, const std::vector& phys_cores, const std::string& name) { for (int i = 0; i < phys_cores.size(); ++i) { config[fmt::format("{}_{}", name, i)] = fmt::format("({}, {})", phys_cores[i].x, phys_cores[i].y); } @@ -28,9 +29,9 @@ static inline void log_phys_coord_to_json(nlohmann::json& config, const CoreCoor } inline uint64_t get_64b_result(uint32_t* buf, uint32_t index) { - return (((uint64_t)buf[index]) << 32) | buf[index+1]; + return (((uint64_t)buf[index]) << 32) | buf[index + 1]; } inline uint64_t get_64b_result(const 
std::vector& vec, uint32_t index) { - return (((uint64_t)vec[index]) << 32) | vec[index+1]; + return (((uint64_t)vec[index]) << 32) | vec[index + 1]; } diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp index f267a746382..eda89407079 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux.cpp @@ -9,6 +9,7 @@ #include #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "test_common.hpp" +#include "routing_test_common.hpp" #include "llrt.hpp" using std::vector; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp index dc4a8f132fd..2834227a93e 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_mux_demux_2level.cpp @@ -8,6 +8,7 @@ #include #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "test_common.hpp" +#include "routing_test_common.hpp" #include "llrt.hpp" using std::vector; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp index bacca186d10..00761a5843a 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_multi_hop_sanity.cpp @@ -11,6 +11,7 @@ // #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "tt_metal/fabric/hw/inc/tt_fabric_status.h" #include "test_common.hpp" +#include "routing_test_common.hpp" #include "eth_l1_address_map.h" #include "tt_metal/fabric/hw/inc/tt_fabric_interface.h" diff --git 
a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp index f495c0b5e7b..c6d48b3f670 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_sanity.cpp @@ -13,6 +13,7 @@ //#include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "tt_metal/fabric/hw/inc/tt_fabric_status.h" #include "test_common.hpp" +#include "routing_test_common.hpp" #include "eth_l1_address_map.h" #include "tt_metal/fabric/hw/inc/tt_fabric_interface.h" #include diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp index b6a5e0182c8..198246ce0da 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tt_fabric_socket_sanity.cpp @@ -11,6 +11,7 @@ // #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "tt_metal/fabric/hw/inc/tt_fabric_status.h" #include "test_common.hpp" +#include "routing_test_common.hpp" #include "eth_l1_address_map.h" #include "tt_metal/fabric/hw/inc/tt_fabric_interface.h" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp index a645b972fa6..d8a5c7263bd 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_tx_rx.cpp @@ -8,6 +8,7 @@ #include #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "test_common.hpp" +#include "routing_test_common.hpp" #include "utils.hpp" #include "llrt.hpp" diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_2ep.cpp 
b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_2ep.cpp index 99d271f3ce0..bfaaadb2a0c 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_2ep.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_2ep.cpp @@ -9,6 +9,7 @@ #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include #include "test_common.hpp" +#include "routing_test_common.hpp" using std::vector; using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_4ep.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_4ep.cpp index 8c70290d9c3..23a4e9db4f7 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_4ep.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_bi_tunnel_4ep.cpp @@ -9,6 +9,7 @@ #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include #include "test_common.hpp" +#include "routing_test_common.hpp" using std::vector; using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_loopback_tunnel.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_loopback_tunnel.cpp index 0b9cf4ae5b4..c34eea39242 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_loopback_tunnel.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_loopback_tunnel.cpp @@ -9,6 +9,7 @@ #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include #include "test_common.hpp" +#include "routing_test_common.hpp" using std::vector; using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp index 805ea48ca01..28a89013e54 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_mux_demux.cpp @@ -9,6 +9,7 @@ 
#include #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "test_common.hpp" +#include "routing_test_common.hpp" using std::vector; using namespace tt; diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp index 32d69fb8586..b4c37a1ff14 100644 --- a/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp +++ b/tests/tt_metal/tt_metal/perf_microbenchmark/routing/test_vc_uni_tunnel.cpp @@ -9,6 +9,7 @@ #include #include "tt_metal/impl/dispatch/kernels/packet_queue_ctrl.hpp" #include "test_common.hpp" +#include "routing_test_common.hpp" using std::vector; using namespace tt; diff --git a/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp b/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp index f6ffce0e797..9cb9cf85c0c 100644 --- a/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp +++ b/tests/tt_metal/tt_metal/test_interleaved_layouts.cpp @@ -15,6 +15,8 @@ #include "dprint_server.hpp" +#include "test_common.hpp" + ////////////////////////////////////////////////////////////////////////////////////////// // TODO: explain what test does ////////////////////////////////////////////////////////////////////////////////////////// diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp index 2b11027b701..2affd969e68 100644 --- a/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_multi_core_multi_dram.cpp @@ -13,6 +13,8 @@ #include #include +#include "test_common.hpp" + ////////////////////////////////////////////////////////////////////////////////////////// // TODO: explain what test does ////////////////////////////////////////////////////////////////////////////////////////// diff --git a/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp 
b/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp index 2d457de3e58..b50fdd0f708 100644 --- a/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp +++ b/tests/tt_metal/tt_metal/test_matmul_multi_tile.cpp @@ -12,6 +12,8 @@ #include "tt_metal/test_utils/deprecated/tensor.hpp" #include +#include "test_common.hpp" + ////////////////////////////////////////////////////////////////////////////////////////// // TODO: explain what test does ////////////////////////////////////////////////////////////////////////////////////////// diff --git a/tests/tt_metal/tt_metal/test_stress_noc_mcast.cpp b/tests/tt_metal/tt_metal/test_stress_noc_mcast.cpp index 2ab7e642602..d69f71d3588 100644 --- a/tests/tt_metal/tt_metal/test_stress_noc_mcast.cpp +++ b/tests/tt_metal/tt_metal/test_stress_noc_mcast.cpp @@ -18,7 +18,7 @@ #include "logger.hpp" #include #include -#include +#include "test_common.hpp" #include #include #include diff --git a/tests/ttnn/CMakeLists.txt b/tests/ttnn/CMakeLists.txt index 3117e6b8920..7e3c43ea023 100644 --- a/tests/ttnn/CMakeLists.txt +++ b/tests/ttnn/CMakeLists.txt @@ -25,5 +25,4 @@ function(setup_ttnn_test_target target_name) ) endfunction() -add_subdirectory(distributed) add_subdirectory(unit_tests/gtests) diff --git a/tests/ttnn/distributed/CMakeLists.txt b/tests/ttnn/distributed/CMakeLists.txt deleted file mode 100644 index 5823925eec3..00000000000 --- a/tests/ttnn/distributed/CMakeLists.txt +++ /dev/null @@ -1,13 +0,0 @@ -add_executable( - test_distributed - test_distributed.cpp - test_distributed_reshape.cpp -) -add_executable(test_distributed_atexit test_distributed_atexit.cpp) - -# Set up properties for the target -setup_ttnn_test_target(test_distributed) -setup_ttnn_test_target(test_distributed_atexit) -# Add test to CTest -add_test(NAME test_distributed COMMAND test_distributed) -add_test(NAME test_distributed_atexit COMMAND test_distributed_atexit) diff --git a/tests/ttnn/distributed/test_distributed.cpp b/tests/ttnn/distributed/test_distributed.cpp 
deleted file mode 100644 index ee9d2f83fb4..00000000000 --- a/tests/ttnn/distributed/test_distributed.cpp +++ /dev/null @@ -1,99 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. -// -// SPDX-License-Identifier: Apache-2.0 - -#include -#include - -#include - -#include -#include -#include "ttnn/distributed/types.hpp" - -namespace ttnn::distributed::test { - -using ::testing::IsEmpty; -using ::testing::SizeIs; -using ::tt::tt_metal::distributed::MeshContainer; - -class DistributedTest : public ::testing::Test { -protected: - void SetUp() override {} - void TearDown() override {} -}; - -TEST_F(DistributedTest, TestSystemMeshTearDownWithoutClose) { - auto& sys = SystemMesh::instance(); - auto mesh = ttnn::distributed::open_mesh_device( - /*mesh_shape=*/{2, 4}, - DEFAULT_L1_SMALL_SIZE, - DEFAULT_TRACE_REGION_SIZE, - 1, - tt::tt_metal::DispatchCoreType::WORKER); - - const auto system_shape = sys.get_shape(); - ASSERT_EQ(system_shape.dims(), 2); - EXPECT_EQ(system_shape[0], 2); - EXPECT_EQ(system_shape[1], 4); -} - -TEST_F(DistributedTest, TestMemoryAllocationStatistics) { - auto mesh = ttnn::distributed::open_mesh_device( - {2, 4}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); - auto stats = mesh->allocator()->get_statistics(tt::tt_metal::BufferType::DRAM); - for (auto* device : mesh->get_devices()) { - auto device_stats = device->allocator()->get_statistics(tt::tt_metal::BufferType::DRAM); - EXPECT_EQ(stats.total_allocatable_size_bytes, device_stats.total_allocatable_size_bytes); - } -} - -TEST_F(DistributedTest, TestNumDramChannels) { - auto mesh = ttnn::distributed::open_mesh_device( - {2, 4}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); - EXPECT_EQ(mesh->num_dram_channels(), 96); // 8 devices * 12 channels -} - -TEST_F(DistributedTest, ViewIs2D) { - auto mesh = ttnn::distributed::open_mesh_device( - {2, 4}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, 
tt::tt_metal::DispatchCoreType::WORKER); - std::vector devices = mesh->get_devices(); - - MeshContainer container_1d(SimpleMeshShape(8), devices); - MeshDeviceView view_1d(container_1d); - EXPECT_FALSE(view_1d.is_mesh_2d()); - - MeshContainer container_2d(SimpleMeshShape(2, 4), devices); - MeshDeviceView view_2d(container_2d); - EXPECT_TRUE(view_2d.is_mesh_2d()); - - MeshContainer container_3d(SimpleMeshShape(2, 2, 2), devices); - MeshDeviceView view_3d(container_3d); - EXPECT_FALSE(view_3d.is_mesh_2d()); -} - -TEST_F(DistributedTest, Submesh) { - auto mesh = ttnn::distributed::open_mesh_device( - {2, 4}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); - - EXPECT_EQ(mesh->shape().num_rows, 2); - EXPECT_EQ(mesh->shape().num_cols, 4); - EXPECT_THAT(mesh->get_devices(), SizeIs(8)); - EXPECT_TRUE(mesh->is_parent_mesh()); - EXPECT_THAT(mesh->get_submeshes(), IsEmpty()); - - auto submesh = mesh->create_submesh(MeshShape{1, 2}, MeshOffset{1, 1}); - EXPECT_THAT(mesh->get_submeshes(), SizeIs(1)); - EXPECT_EQ(submesh->shape().num_rows, 1); - EXPECT_EQ(submesh->shape().num_cols, 2); - EXPECT_THAT(submesh->get_devices(), SizeIs(2)); - EXPECT_FALSE(submesh->is_parent_mesh()); - EXPECT_THAT(submesh->get_submeshes(), IsEmpty()); - - // Verify coordinates are correct. - EXPECT_EQ(mesh->get_device(MeshCoordinate{1, 1})->id(), submesh->get_device(MeshCoordinate{0, 0})->id()); - EXPECT_EQ(mesh->get_device(MeshCoordinate{1, 2})->id(), submesh->get_device(MeshCoordinate{0, 1})->id()); - EXPECT_EQ(submesh->get_device(1, 1), nullptr); - -} // namespace ttnn::distributed::test -} // namespace ttnn::distributed::test diff --git a/tests/ttnn/distributed/test_distributed_atexit.cpp b/tests/ttnn/distributed/test_distributed_atexit.cpp deleted file mode 100644 index 6d4461f7386..00000000000 --- a/tests/ttnn/distributed/test_distributed_atexit.cpp +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
-// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include -#include -#include -#include - -namespace ttnn::distributed::test { - -// Simplified test without fixture, and mesh variable moved inside test -TEST(DistributedTestStandalone, TestSystemMeshTearDownWithoutClose) { - static std::shared_ptr mesh; - auto& sys = tt::tt_metal::distributed::SystemMesh::instance(); - mesh = ttnn::distributed::open_mesh_device( - {2, 4}, DEFAULT_L1_SMALL_SIZE, DEFAULT_TRACE_REGION_SIZE, 1, tt::tt_metal::DispatchCoreType::WORKER); - - const auto system_shape = sys.get_shape(); - ASSERT_EQ(system_shape.dims(), 2); - EXPECT_EQ(system_shape[0], 2); - EXPECT_EQ(system_shape[1], 4); -} - -} // namespace ttnn::distributed::test diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp index 91fe40d181e..c22ae1d57f3 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/edm_fabric_writer.cpp @@ -18,8 +18,8 @@ static constexpr bool enable_any_synchronization = enable_start_synchronization FORCE_INLINE void line_sync( FabricConnectionManager& fabric_connection, - volatile tt::fabric::PacketHeader* mcast_fwd_packet_header, - volatile tt::fabric::PacketHeader* mcast_bwd_packet_header, + volatile PACKET_HEADER_TYPE* mcast_fwd_packet_header, + volatile PACKET_HEADER_TYPE* mcast_bwd_packet_header, size_t sync_bank_addr, size_t sync_noc_x, size_t sync_noc_y, @@ -33,7 +33,7 @@ FORCE_INLINE void line_sync( fabric_connection.get_forward_connection().wait_for_empty_write_slot(); print_pkt_header(mcast_fwd_packet_header); fabric_connection.get_forward_connection().send_payload_flush_non_blocking_from_address( - (uint32_t)mcast_fwd_packet_header, sizeof(tt::fabric::PacketHeader)); + (uint32_t)mcast_fwd_packet_header, sizeof(PACKET_HEADER_TYPE)); } if (fabric_connection.has_backward_connection()) { @@ -41,7 +41,7 @@ FORCE_INLINE void line_sync( 
fabric_connection.get_backward_connection().wait_for_empty_write_slot(); print_pkt_header(mcast_bwd_packet_header); fabric_connection.get_backward_connection().send_payload_flush_non_blocking_from_address( - (uint32_t)mcast_bwd_packet_header, sizeof(tt::fabric::PacketHeader)); + (uint32_t)mcast_bwd_packet_header, sizeof(PACKET_HEADER_TYPE)); } noc_semaphore_inc(get_noc_addr(sync_noc_x, sync_noc_y, sync_bank_addr), 1); if (sync_noc_x == my_x[0] && sync_noc_y == my_y[0]) { @@ -98,11 +98,11 @@ void kernel_main() { const auto source_l1_buffer_address = get_write_ptr(source_l1_cb_index); const auto packet_header_buffer_address = get_write_ptr(packet_header_cb); - auto* mcast_fwd_packet_header = reinterpret_cast(packet_header_buffer_address); + auto* mcast_fwd_packet_header = reinterpret_cast(packet_header_buffer_address); auto* mcast_bwd_packet_header = - reinterpret_cast(packet_header_buffer_address + sizeof(tt::fabric::PacketHeader)); + reinterpret_cast(packet_header_buffer_address + sizeof(PACKET_HEADER_TYPE)); auto* unicast_packet_header = - reinterpret_cast(packet_header_buffer_address + sizeof(tt::fabric::PacketHeader) * 2); + reinterpret_cast(packet_header_buffer_address + sizeof(PACKET_HEADER_TYPE) * 2); mcast_fwd_packet_header->to_chip_multicast(MulticastRoutingCommandHeader{1, static_cast(mcast_fwd_hops)}); mcast_bwd_packet_header->to_chip_multicast(MulticastRoutingCommandHeader{1, static_cast(mcast_bwd_hops)}); @@ -146,7 +146,7 @@ void kernel_main() { fabric_connection.get_forward_connection().send_payload_without_header_non_blocking_from_address( source_l1_buffer_address, packet_payload_size_bytes); fabric_connection.get_forward_connection().send_payload_flush_non_blocking_from_address( - (uint32_t)mcast_fwd_packet_header, sizeof(tt::fabric::PacketHeader)); + (uint32_t)mcast_fwd_packet_header, sizeof(PACKET_HEADER_TYPE)); } if (fabric_connection.has_backward_connection()) { @@ -157,7 +157,7 @@ void kernel_main() { 
fabric_connection.get_backward_connection().send_payload_without_header_non_blocking_from_address( source_l1_buffer_address, packet_payload_size_bytes); fabric_connection.get_backward_connection().send_payload_flush_non_blocking_from_address( - (uint32_t)mcast_bwd_packet_header, sizeof(tt::fabric::PacketHeader)); + (uint32_t)mcast_bwd_packet_header, sizeof(PACKET_HEADER_TYPE)); } { noc_async_writes_flushed(); @@ -174,8 +174,7 @@ void kernel_main() { fabric_conn.wait_for_empty_write_slot(); fabric_conn.send_payload_without_header_non_blocking_from_address( source_l1_buffer_address, packet_payload_size_bytes); - fabric_conn.send_payload_blocking_from_address( - (uint32_t)unicast_packet_header, sizeof(tt::fabric::PacketHeader)); + fabric_conn.send_payload_blocking_from_address((uint32_t)unicast_packet_header, sizeof(PACKET_HEADER_TYPE)); } if (enable_finish_synchronization) { diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_reader.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_reader.cpp index 976f579ab4d..46c421049f0 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_reader.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_reader.cpp @@ -30,7 +30,7 @@ void kernel_main() { uint32_t pages_to_read = std::min(pages_per_edm_buffer, num_pages_to_read_total - num_pages_read); cb_reserve_back(cb_id_in0, pages_to_read); uint32_t local_l1_read_addr = get_write_ptr(cb_id_in0); - local_l1_read_addr += sizeof(tt::fabric::PacketHeader); + local_l1_read_addr += sizeof(PACKET_HEADER_TYPE); for (uint32_t p = 0; p < pages_to_read; ++p) { uint64_t src_noc_addr = get_noc_addr(num_pages_read + p, source_address_generator); diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp index 
b210f32efb5..7bc4ad00b90 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_erisc_datamover_sender_worker_sender.cpp @@ -122,9 +122,9 @@ void kernel_main() { // bit of a hack to extract X/Y const auto dest_noc_address = get_noc_addr(p, dest_addr_gen, 0, NORMALIZED_NOC_INDEX); - const size_t packet_size = page_size + sizeof(tt::fabric::PacketHeader); + const size_t packet_size = page_size + sizeof(PACKET_HEADER_TYPE); auto packet_addr = get_read_ptr(cb_id_in0); - auto* packet_header = reinterpret_cast(packet_addr); + auto* packet_header = reinterpret_cast(packet_addr); if constexpr (mcast_mode) { packet_header ->to_chip_multicast( @@ -145,7 +145,7 @@ void kernel_main() { if constexpr (!mcast_mode) { sender.wait_for_empty_write_slot(); - auto& packet_header = *reinterpret_cast(a_packet_header_addr); + auto& packet_header = *reinterpret_cast(a_packet_header_addr); ASSERT(*last_message_semaphore_address == 0); uint64_t last_message_semaphore_noc0_addr = safe_get_noc_addr(my_x[0], my_y[0], (uint32_t)last_message_semaphore_address, 0); diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp index eaa14a0e40f..23b9789b998 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/fabric_worker_sender_multi_input.cpp @@ -52,10 +52,10 @@ auto forward_to_fabric_from_cb( // bit of a hack to extract X/Y const auto noc0_dest_address = get_noc_addr(current_page, dest_addr_gen, 0, NORMALIZED_NOC_INDEX); - const size_t packet_size = page_size + sizeof(tt::fabric::PacketHeader); + const size_t packet_size = page_size + sizeof(PACKET_HEADER_TYPE); auto packet_addr = get_read_ptr(cb_id); - auto &packet_header = *reinterpret_cast(packet_addr); + auto& packet_header = 
*reinterpret_cast(packet_addr); if constexpr (mcast_mode) { packet_header .to_chip_multicast(tt::fabric::MulticastRoutingCommandHeader{config.mcast.distance, config.mcast.range}) @@ -182,7 +182,7 @@ void kernel_main() { sender.wait_for_empty_write_slot(); constexpr size_t kLoopbackNumHopsToMyChip = 2; - auto &packet_header = *reinterpret_cast(a_packet_header_addr); + auto& packet_header = *reinterpret_cast(a_packet_header_addr); ASSERT(*last_message_semaphore_address == 0); packet_header.reserved = 0xE; packet_header.reserved2 = 0xFFFF; diff --git a/tests/ttnn/unit_tests/gtests/ccl/kernels/test_kernels.common.hpp b/tests/ttnn/unit_tests/gtests/ccl/kernels/test_kernels.common.hpp index ae5e9135a2b..8f5287ee0d7 100644 --- a/tests/ttnn/unit_tests/gtests/ccl/kernels/test_kernels.common.hpp +++ b/tests/ttnn/unit_tests/gtests/ccl/kernels/test_kernels.common.hpp @@ -29,13 +29,14 @@ bool terminate_fabric_endpoints_farthest_to_nearest ( get_noc_addr(edm_noc_x, edm_noc_y, termination_addr), tt::fabric::TerminationSignal::GRACEFULLY_TERMINATE); } else { - auto &packet_header = *reinterpret_cast(a_packet_header_addr); - reinterpret_cast(a_packet_header_addr)[sizeof(tt::fabric::PacketHeader) >> 2] = tt::fabric::TerminationSignal::GRACEFULLY_TERMINATE; + auto& packet_header = *reinterpret_cast(a_packet_header_addr); + reinterpret_cast(a_packet_header_addr)[sizeof(PACKET_HEADER_TYPE) >> 2] = + tt::fabric::TerminationSignal::GRACEFULLY_TERMINATE; sender.wait_for_empty_write_slot(); packet_header.to_chip_unicast(static_cast(distance)) .to_noc_unicast_write( tt::fabric::NocUnicastCommandHeader{termination_sig_noc_addr}, - sizeof(tt::fabric::PacketHeader) + sizeof(uint32_t)); + sizeof(PACKET_HEADER_TYPE) + sizeof(uint32_t)); sender.send_payload_blocking_from_address(a_packet_header_addr, packet_header.get_payload_size_including_header()); noc_async_writes_flushed(); } diff --git a/tt_metal/CMakeLists.txt b/tt_metal/CMakeLists.txt index 7d96a44a239..9dce6002708 100644 --- 
a/tt_metal/CMakeLists.txt +++ b/tt_metal/CMakeLists.txt @@ -142,7 +142,7 @@ if(BUILD_PROGRAMMING_EXAMPLES) endif() # Allow internal files to access the public API "by default" and without the -# scoping that external consumers must use. Scaoping may still be used if desired. +# scoping that external consumers must use. Scoping may still be used if desired. include_directories( api api/tt-metalium diff --git a/tt_metal/api/tt-metalium/device_impl.hpp b/tt_metal/api/tt-metalium/device_impl.hpp index 21d017789c0..878569038d2 100644 --- a/tt_metal/api/tt-metalium/device_impl.hpp +++ b/tt_metal/api/tt-metalium/device_impl.hpp @@ -270,6 +270,8 @@ class Device : public IDevice { program_cache::detail::ProgramCache program_cache_; uint32_t trace_buffers_size_ = 0; + bool uninitialized_error_fired_ = + false; // To avoid spam with warnings about calling Device methods when it's not initialized. }; } // namespace v0 diff --git a/tt_metal/api/tt-metalium/system_mesh.hpp b/tt_metal/api/tt-metalium/system_mesh.hpp index 1ee91588dcc..f904de46044 100644 --- a/tt_metal/api/tt-metalium/system_mesh.hpp +++ b/tt_metal/api/tt-metalium/system_mesh.hpp @@ -9,7 +9,7 @@ #include "mesh_config.hpp" #include "mesh_coord.hpp" - +#include "indestructible.hpp" namespace tt::tt_metal::distributed { // SystemMesh creates a virtualization over the physical devices in the system. 
@@ -21,6 +21,8 @@ class SystemMesh { std::unique_ptr pimpl_; SystemMesh(); + friend class tt::stl::Indestructible; + public: static SystemMesh& instance(); SystemMesh(const SystemMesh&) = delete; @@ -28,12 +30,13 @@ class SystemMesh { SystemMesh(SystemMesh&&) = delete; SystemMesh& operator=(SystemMesh&&) = delete; + // Returns the shape of the system mesh const SimpleMeshShape& get_shape() const; - // Gets the physical device ID for a given logical row and column index + // Returns the physical device ID for a given logical row and column index chip_id_t get_physical_device_id(const MeshCoordinate& coord) const; - // Get the physical device IDs mapped to a MeshDevice + // Returns the physical device IDs mapped to a MeshDevice std::vector get_mapped_physical_device_ids(const MeshDeviceConfig& config) const; std::vector request_available_devices(const MeshDeviceConfig& config) const; }; diff --git a/tt_metal/common/mesh_coord.cpp b/tt_metal/common/mesh_coord.cpp index 19dab608c35..88f4309cd90 100644 --- a/tt_metal/common/mesh_coord.cpp +++ b/tt_metal/common/mesh_coord.cpp @@ -82,9 +82,12 @@ bool operator==(const MeshCoordinate& lhs, const MeshCoordinate& rhs) { bool operator!=(const MeshCoordinate& lhs, const MeshCoordinate& rhs) { return !(lhs == rhs); } std::ostream& operator<<(std::ostream& os, const MeshCoordinate& coord) { - os << "MeshCoordinate(" << coord.dims() << ", ["; - for (size_t dim : coord.coords()) { - os << dim << ", "; + os << "MeshCoordinate(["; + for (size_t i = 0; i < coord.dims(); ++i) { + if (i > 0) { + os << ", "; + } + os << coord[i]; } os << "])"; return os; diff --git a/tt_metal/distributed/mesh_device.cpp b/tt_metal/distributed/mesh_device.cpp index 7190e8e3806..80535e32674 100644 --- a/tt_metal/distributed/mesh_device.cpp +++ b/tt_metal/distributed/mesh_device.cpp @@ -69,9 +69,7 @@ MeshDevice::ScopedDevices::ScopedDevices( size_t num_command_queues, const DispatchCoreConfig& dispatch_core_config, const MeshDeviceConfig& config) { - auto& 
system_mesh = SystemMesh::instance(); - auto physical_device_ids = system_mesh.request_available_devices(config); - + auto physical_device_ids = SystemMesh::instance().request_available_devices(config); opened_devices_ = tt::tt_metal::detail::CreateDevices( physical_device_ids, num_command_queues, l1_small_size, trace_region_size, dispatch_core_config); diff --git a/tt_metal/distributed/system_mesh.cpp b/tt_metal/distributed/system_mesh.cpp index b2eff3b89d2..10a20b6e433 100644 --- a/tt_metal/distributed/system_mesh.cpp +++ b/tt_metal/distributed/system_mesh.cpp @@ -8,6 +8,7 @@ #include "umd/device/types/cluster_descriptor_types.h" #include "tt_metal/distributed/coordinate_translation.hpp" +#include "indestructible.hpp" #include "mesh_coord.hpp" #include "tt_cluster.hpp" @@ -29,8 +30,6 @@ class SystemMesh::Impl { const SimpleMeshShape& get_shape() const; std::vector get_mapped_physical_device_ids(const MeshDeviceConfig& config) const; std::vector request_available_devices(const MeshDeviceConfig& config) const; - - IDevice* get_device(const chip_id_t physical_device_id) const; chip_id_t get_physical_device_id(const MeshCoordinate& coord) const; }; @@ -128,7 +127,7 @@ std::vector SystemMesh::Impl::get_mapped_physical_device_ids(const Me auto line_length = config.mesh_shape.mesh_size(); for (const auto& logical_coordinate : MeshDeviceView::get_line_coordinates(line_length, shape_2d)) { - auto physical_device_id = logical_to_device_id_.at(logical_coordinate); + auto physical_device_id = get_physical_device_id(logical_coordinate); physical_device_ids.push_back(physical_device_id); log_debug( @@ -176,14 +175,9 @@ std::vector SystemMesh::Impl::get_mapped_physical_device_ids(const Me MeshCoordinateRange system_range(system_offset, MeshCoordinate(end_coord)); for (const auto& system_coord : system_range) { - auto physical_device_id = logical_to_device_id_.find(system_coord); - TT_FATAL( - physical_device_id != logical_to_device_id_.end(), - "Logical coordinate: {} not found 
in SystemMesh of shape {}", - system_coord, - logical_mesh_shape_); - physical_device_ids.push_back(physical_device_id->second); - log_debug(LogMetal, "Logical coordinate: {}, Physical device ID: {}", system_coord, physical_device_id->second); + auto physical_device_id = get_physical_device_id(system_coord); + physical_device_ids.push_back(physical_device_id); + log_debug(LogMetal, "Logical coordinate: {}, Physical device ID: {}", system_coord, physical_device_id); } return physical_device_ids; } @@ -201,11 +195,11 @@ std::vector SystemMesh::Impl::request_available_devices(const MeshDev SystemMesh::SystemMesh() : pimpl_(std::make_unique()) {} SystemMesh& SystemMesh::instance() { - static SystemMesh instance; - if (!instance.pimpl_->is_system_mesh_initialized()) { - instance.pimpl_->initialize(); + static tt::stl::Indestructible instance; + if (!instance.get().pimpl_->is_system_mesh_initialized()) { + instance.get().pimpl_->initialize(); } - return instance; + return instance.get(); } chip_id_t SystemMesh::get_physical_device_id(const MeshCoordinate& coord) const { diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index 8df3eb90854..4afa1b342a7 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -961,7 +961,10 @@ bool Device::initialize(const uint8_t num_hw_cqs, size_t l1_small_size, size_t t void Device::push_work(std::function work, bool blocking) { if (not this->initialized_) { - log_warning("Attempting to push work to Device {} which is not initialized. Ignoring...", this->id_); + if (!uninitialized_error_fired_) { + log_fatal("Attempting to push work to Device {} which is not initialized. 
Ignoring...", this->id_); + uninitialized_error_fired_ = true; + } return; } this->work_executor_.push_work(std::move(work), blocking); diff --git a/tt_metal/llrt/tt_cluster.cpp b/tt_metal/llrt/tt_cluster.cpp index afa0a600254..b7cecc47732 100644 --- a/tt_metal/llrt/tt_cluster.cpp +++ b/tt_metal/llrt/tt_cluster.cpp @@ -23,7 +23,6 @@ #include "fmt/base.h" #include #include -#include #include #include "umd/device/types/arch.h" #include "umd/device/tt_cluster_descriptor.h" diff --git a/tt_metal/llrt/tt_cluster.hpp b/tt_metal/llrt/tt_cluster.hpp index 1b54e3a1213..6f91b01300e 100644 --- a/tt_metal/llrt/tt_cluster.hpp +++ b/tt_metal/llrt/tt_cluster.hpp @@ -8,7 +8,6 @@ #include #include "metal_soc_descriptor.h" -#include "test_common.hpp" #include "tt_backend_api_types.hpp" #include "umd/device/device_api_metal.h" #include "umd/device/tt_cluster_descriptor.h" diff --git a/tt_metal/programming_examples/distributed/4_distributed_trace_and_events/CMakeLists.txt b/tt_metal/programming_examples/distributed/4_distributed_trace_and_events/CMakeLists.txt new file mode 100644 index 00000000000..736e5ddcf76 --- /dev/null +++ b/tt_metal/programming_examples/distributed/4_distributed_trace_and_events/CMakeLists.txt @@ -0,0 +1,18 @@ +set(DISTRIBUTED_TRCE_AND_EVENTS ${CMAKE_CURRENT_SOURCE_DIR}/distributed_trace_and_events.cpp) +add_executable(distributed_trace_and_events ${DISTRIBUTED_TRCE_AND_EVENTS}) + +target_link_libraries( + distributed_trace_and_events + PUBLIC + tt_metal + pthread +) + +target_include_directories(distributed_trace_and_events PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) + +set_target_properties( + distributed_trace_and_events + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY + ${PROJECT_BINARY_DIR}/programming_examples/distributed +) diff --git a/tt_metal/programming_examples/distributed/4_distributed_trace_and_events/distributed_trace_and_events.cpp b/tt_metal/programming_examples/distributed/4_distributed_trace_and_events/distributed_trace_and_events.cpp new file mode 100644 
index 00000000000..f64154f3c74 --- /dev/null +++ b/tt_metal/programming_examples/distributed/4_distributed_trace_and_events/distributed_trace_and_events.cpp @@ -0,0 +1,285 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +using namespace tt; +using namespace tt::tt_metal; +using namespace tt::tt_metal::distributed; + +// The following is an advanced programming example that demonstrates: +// +// 1. Initializing a MeshDevice with 2 MeshCommandQueues and a dedicated memory region to store MeshWorkload Traces +// 2. Loading a SubDevice configuration on a Virtual Mesh, and how this configuration gets replicated across all +// physical devices +// 3. Allocating MeshBuffers in the distributed memory space exposed by the Virtual Mesh, to shard data across physical +// devices +// 4. Constructing programs targeting different SubDevices +// 5. Constructing homogenous (same program dispatched to all physical devices) and heterogenous (different programs +// dispatched +// to physical different devices) MeshWorkloads from programs +// 6. Capturing the execution of MeshWorkloads inside a MeshTrace that gets loaded onto the Virtual Mesh +// 7. Performing IO and MeshTrace execution on different MeshCommandQueues and using MeshEvents for MeshCQ <--> MeshCQ +// synchronization + +std::shared_ptr EltwiseBinaryProgramGenerator( + const std::shared_ptr& src0_buf, + const std::shared_ptr& src1_buf, + const std::shared_ptr& output_buf, + const SubDevice& sub_device_for_program, + uint32_t num_tiles, + uint32_t single_tile_size, + uint32_t eltwise_op_index) { + // Program Generation helper function: Can be used to run addition, multiplication and subtraction + // on a SubDevice. + // Requires: + // 1. The src (input) and output buffers + // 2. The SubDevice being targeted + // 3. The number of tiles that must be processed by the op + // 4. 
The size of the tile in bytes + // The op specifier: Addition (0), Multiplication (1), Subtraction (2) + const std::vector op_id_to_op_define = {"add_tiles", "mul_tiles", "sub_tiles"}; + const std::vector op_id_to_op_type_define = { + "EltwiseBinaryType::ELWADD", "EltwiseBinaryType::ELWMUL", "EltwiseBinaryType::ELWSUB"}; + + const auto cores_for_program = sub_device_for_program.cores(HalProgrammableCoreType::TENSIX); + + std::shared_ptr program = std::make_shared(); + + uint32_t src0_cb_index = tt::CBIndex::c_0; + uint32_t num_input_tiles = 2; + tt_metal::CircularBufferConfig cb_src0_config = + tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src0_cb_index, tt::DataFormat::Float16_b}}) + .set_page_size(src0_cb_index, single_tile_size); + auto cb_src0 = tt_metal::CreateCircularBuffer(*program, cores_for_program, cb_src0_config); + + uint32_t src1_cb_index = tt::CBIndex::c_1; + tt_metal::CircularBufferConfig cb_src1_config = + tt_metal::CircularBufferConfig(num_input_tiles * single_tile_size, {{src1_cb_index, tt::DataFormat::Float16_b}}) + .set_page_size(src1_cb_index, single_tile_size); + auto cb_src1 = tt_metal::CreateCircularBuffer(*program, cores_for_program, cb_src1_config); + + uint32_t output_cb_index = tt::CBIndex::c_16; + uint32_t num_output_tiles = 2; + tt_metal::CircularBufferConfig cb_output_config = + tt_metal::CircularBufferConfig( + num_output_tiles * single_tile_size, {{output_cb_index, tt::DataFormat::Float16_b}}) + .set_page_size(output_cb_index, single_tile_size); + auto cb_output = tt_metal::CreateCircularBuffer(*program, cores_for_program, cb_output_config); + + auto binary_reader_kernel = tt_metal::CreateKernel( + *program, + "tests/tt_metal/tt_metal/test_kernels/dataflow/reader_dual_8bank.cpp", + cores_for_program, + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_1, .noc = tt_metal::NOC::RISCV_1_default}); + + auto unary_writer_kernel = tt_metal::CreateKernel( + *program, + 
"tests/tt_metal/tt_metal/test_kernels/dataflow/writer_unary_8bank.cpp", + cores_for_program, + tt_metal::DataMovementConfig{ + .processor = tt_metal::DataMovementProcessor::RISCV_0, .noc = tt_metal::NOC::RISCV_0_default}); + + std::vector compute_kernel_args = {}; + + bool fp32_dest_acc_en = false; + bool math_approx_mode = false; + std::map binary_defines = { + {"ELTWISE_OP", op_id_to_op_define[eltwise_op_index]}, + {"ELTWISE_OP_TYPE", op_id_to_op_type_define[eltwise_op_index]}}; + auto eltwise_binary_kernel = tt_metal::CreateKernel( + *program, + "tt_metal/kernels/compute/eltwise_binary.cpp", + cores_for_program, + tt_metal::ComputeConfig{.compile_args = compute_kernel_args, .defines = binary_defines}); + + SetRuntimeArgs(*program, eltwise_binary_kernel, cores_for_program, {num_tiles, 1}); + + const std::array reader_args = { + src0_buf->address(), 0, num_tiles, src1_buf->address(), 0, num_tiles, 0}; + + const std::array writer_args = {output_buf->address(), 0, num_tiles}; + + SetRuntimeArgs(*program, unary_writer_kernel, cores_for_program, writer_args); + SetRuntimeArgs(*program, binary_reader_kernel, cores_for_program, reader_args); + + return program; +} + +int main(int argc, char** argv) { + using tt::constants::TILE_HEIGHT; + using tt::constants::TILE_WIDTH; + // Initialize constants used to define the workload + constexpr uint32_t ADD_OP_ID = 0; + constexpr uint32_t MULTIPLY_OP_ID = 1; + constexpr uint32_t SUBTRACT_OP_ID = 2; + // Create a 2x4 MeshDevice with 2 MeshCQs, 16MB allocated to the trace region and Ethernet Dispatch enabled + auto mesh_device = MeshDevice::create( + MeshDeviceConfig{.mesh_shape = MeshShape(2, 4)}, // Shape of MeshDevice + 0, // l1 small size + 16 << 20, // trace region size + 2, // num MeshCQs + DispatchCoreType::ETH /* Dispatch Configuration: 8 Chip Wormhole systems can only support 2 MeshCQs when Ethernet Dispatch is enabled */); + + // Initialize command queue ids used for data movement and workload dispatch + constexpr uint8_t 
data_movement_cq_id = 1; + constexpr uint8_t workload_cq_id = 0; + auto data_movement_cq = mesh_device->mesh_command_queue(data_movement_cq_id); + auto workload_cq = mesh_device->mesh_command_queue(workload_cq_id); + + // =========== Step 1: Initialize and load two SubDevices =========== + // Each SubDevice contains a single core. This SubDevice configuration is loaded on each physical device + // in the Virtual Mesh + SubDevice sub_device_1(std::array{CoreRangeSet(CoreRange({0, 0}, {0, 0}))}); + SubDevice sub_device_2(std::array{CoreRangeSet(CoreRange({1, 1}, {1, 1}))}); + auto sub_device_manager = mesh_device->create_sub_device_manager( + {sub_device_1, sub_device_2}, 3200 /* size of L1 region allocated for the SubDevices */); + mesh_device->load_sub_device_manager(sub_device_manager); + + // =========== Step 2: Initialize IO Buffers and Workload parameters =========== + uint32_t single_tile_size = sizeof(bfloat16) * TILE_HEIGHT * TILE_WIDTH; // Using bfloat16 in this example + uint32_t num_tiles_per_device = 2048; // Number of tiles sent to each physical device + uint32_t num_tiles_in_mesh = + num_tiles_per_device * mesh_device->num_devices(); // The total number of tiles in the distributed memory space + + // Specify data layout in distributed memory space - Data will be sharded in row major order across the Virtual Mesh + tt::tt_metal::distributed::ShardedBufferConfig global_buffer_config{ + .global_size = single_tile_size * num_tiles_in_mesh, // Total size of the sharded buffer + .global_buffer_shape = + {num_tiles_in_mesh * TILE_WIDTH, TILE_HEIGHT}, // Data represents horizontally concatenated tiles + .shard_shape = {num_tiles_per_device * TILE_WIDTH, TILE_HEIGHT}, // Row major sharding + .shard_orientation = ShardOrientation::ROW_MAJOR // Row major sharding + }; + // Specify data layout on a single physical device + DeviceLocalBufferConfig per_device_buffer_config{ + .page_size = single_tile_size, + .buffer_type = tt_metal::BufferType::DRAM, + 
.buffer_layout = TensorMemoryLayout::INTERLEAVED, + .bottom_up = true}; + // Allocate buffers in distributed memory space for first MeshWorkload + auto add_src0_buf = MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device.get()); + auto add_src1_buf = MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device.get()); + auto add_output_buf = MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device.get()); + // Allocate buffers in distributed memory space for second MeshWorkload + auto mul_sub_src0_buf = MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device.get()); + auto mul_sub_src1_buf = MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device.get()); + auto mul_sub_output_buf = MeshBuffer::create(global_buffer_config, per_device_buffer_config, mesh_device.get()); + + // =========== Step 3: Create Workloads to run on the Virtual Mesh =========== + // Specify Device Ranges on which the Workloads will run + LogicalDeviceRange all_devices({0, 0}, {mesh_device->num_cols() - 1, mesh_device->num_rows() - 1}); + LogicalDeviceRange top_row({0, 0}, {mesh_device->num_cols() - 1, 0}); + LogicalDeviceRange bottom_row( + {0, mesh_device->num_rows() - 1}, {mesh_device->num_cols() - 1, mesh_device->num_rows() - 1}); + // Create three eltwise binary ops using a simple program generation function + auto add_program = EltwiseBinaryProgramGenerator( + add_src0_buf, + add_src1_buf, + add_output_buf, + sub_device_1, // Addition runs on the first SubDevice + num_tiles_per_device, + single_tile_size, + ADD_OP_ID); + auto multiply_program = EltwiseBinaryProgramGenerator( + mul_sub_src0_buf, + mul_sub_src1_buf, + mul_sub_output_buf, + sub_device_2, // Multiplication runs on the second SubDevice + num_tiles_per_device, + single_tile_size, + MULTIPLY_OP_ID); + auto subtract_program = EltwiseBinaryProgramGenerator( + mul_sub_src0_buf, + mul_sub_src1_buf, + mul_sub_output_buf, + 
sub_device_2, // Subtraction runs on the second SubDevice + num_tiles_per_device, + single_tile_size, + SUBTRACT_OP_ID); + // Create MeshWorkloads and add programs to them. A MeshWorkload allows a program to target + // multiple Physical Devices in the Virtual Mesh. + auto add_mesh_workload = CreateMeshWorkload(); + auto multiply_and_subtract_mesh_workload = CreateMeshWorkload(); + AddProgramToMeshWorkload( + add_mesh_workload, *add_program, all_devices); // Addition runs on the full grid (sub_device 1) + AddProgramToMeshWorkload( + multiply_and_subtract_mesh_workload, + *multiply_program, + top_row); // Multiplication runs on the top row (sub_device 2) + AddProgramToMeshWorkload( + multiply_and_subtract_mesh_workload, + *subtract_program, + bottom_row); // Subtraction runs on the bottom row (sub device 2) + + // =========== Step 4: Compile and Load Workloads on the Mesh =========== + EnqueueMeshWorkload(mesh_device->mesh_command_queue(), add_mesh_workload, true); + EnqueueMeshWorkload(mesh_device->mesh_command_queue(), multiply_and_subtract_mesh_workload, true); + // =========== Step 5: Trace the MeshWorkloads using the Workload Dispatch CQ =========== + auto trace_id = BeginTraceCapture(mesh_device.get(), workload_cq_id); + EnqueueMeshWorkload(mesh_device->mesh_command_queue(), add_mesh_workload, false); + EnqueueMeshWorkload(mesh_device->mesh_command_queue(), multiply_and_subtract_mesh_workload, false); + EndTraceCapture(mesh_device.get(), workload_cq_id, trace_id); + + // =========== Step 6: Populate inputs =========== + uint32_t workload_0_src0_val = 2; + uint32_t workload_0_src1_val = 3; + uint32_t workload_1_src0_val = 7; + uint32_t workload_1_src1_val = 5; + // Uniform values passed to the add operation + std::vector add_src0_vec = create_constant_vector_of_bfloat16(add_src0_buf->size(), workload_0_src0_val); + std::vector add_src1_vec = create_constant_vector_of_bfloat16(add_src1_buf->size(), workload_0_src1_val); + // Uniform values passed to the multiply 
and subtract operations (the top row runs multiplication with subtraction + // on the bottom row of the Virtual Mesh) + std::vector mul_sub_src0_vec = + create_constant_vector_of_bfloat16(mul_sub_src0_buf->size(), workload_1_src0_val); + std::vector mul_sub_src1_vec = + create_constant_vector_of_bfloat16(mul_sub_src1_buf->size(), workload_1_src1_val); + + // =========== Step 7: Write inputs on MeshCQ1 =========== + // IO is done through MeshCQ1 and Workload dispatch is done through MeshCQ0. Use MeshEvents to synchronize the + // independent MeshCQs. + std::shared_ptr write_event = std::make_shared(); + std::shared_ptr trace_event = std::make_shared(); + + EnqueueWriteMeshBuffer(data_movement_cq, add_src0_buf, add_src0_vec); + EnqueueWriteMeshBuffer(data_movement_cq, add_src1_buf, add_src1_vec); + EnqueueWriteMeshBuffer(data_movement_cq, mul_sub_src0_buf, mul_sub_src0_vec); + EnqueueWriteMeshBuffer(data_movement_cq, mul_sub_src1_buf, mul_sub_src1_vec); + // Synchronize + EnqueueRecordEvent(data_movement_cq, write_event); + EnqueueWaitForEvent(workload_cq, write_event); + // =========== Step 8: Run MeshTrace on MeshCQ0 =========== + ReplayTrace(mesh_device.get(), workload_cq_id, trace_id, false); + // Synchronize + EnqueueRecordEvent(workload_cq, trace_event); + EnqueueWaitForEvent(data_movement_cq, trace_event); + // =========== Step 9: Read Outputs on MeshCQ1 =========== + std::vector add_dst_vec = {}; + std::vector mul_sub_dst_vec = {}; + EnqueueReadMeshBuffer(data_movement_cq, add_dst_vec, add_output_buf); + EnqueueReadMeshBuffer(data_movement_cq, mul_sub_dst_vec, mul_sub_output_buf); + + // =========== Step 10: Verify Outputs =========== + bool pass = true; + for (int i = 0; i < add_dst_vec.size(); i++) { + pass &= (add_dst_vec[i].to_float() == workload_0_src0_val + workload_0_src1_val); + } + for (int i = 0; i < mul_sub_dst_vec.size(); i++) { + if (i < mul_sub_dst_vec.size() / 2) { + pass &= (mul_sub_dst_vec[i].to_float() == workload_1_src0_val * 
workload_1_src1_val); + } else { + pass &= (mul_sub_dst_vec[i].to_float() == workload_1_src0_val - workload_1_src1_val); + } + } + ReleaseTrace(mesh_device.get(), trace_id); + if (pass) { + std::cout << "Running EltwiseBinary MeshTraces on 2 MeshCQs Passed!" << std::endl; + return 0; + } else { + std::cout << "Running EltwiseBinary MeshTraces on 2 MeshCQs Failed with Incorrect Outputs!" << std::endl; + return 1; + } +} diff --git a/tt_metal/programming_examples/distributed/CMakeLists.txt b/tt_metal/programming_examples/distributed/CMakeLists.txt index e887109662d..7dcd7fc8583 100644 --- a/tt_metal/programming_examples/distributed/CMakeLists.txt +++ b/tt_metal/programming_examples/distributed/CMakeLists.txt @@ -1,3 +1,4 @@ add_subdirectory(1_distributed_program_dispatch) add_subdirectory(2_distributed_buffer_rw) add_subdirectory(3_distributed_eltwise_add) +add_subdirectory(4_distributed_trace_and_events) diff --git a/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/kernel_writers.hpp b/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/kernel_writers.hpp index fd6bae7f5ee..aa8fd3f04f0 100644 --- a/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/kernel_writers.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/interpreter_backends/kernel_common/kernel_writers.hpp @@ -28,7 +28,7 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write( uint32_t payload_size_bytes) { const size_t payload_l1_address = l1_read_addr; - auto pkt_hdr = reinterpret_cast(packet_header_buffer_addr); + auto pkt_hdr = reinterpret_cast(packet_header_buffer_addr); #ifdef DEBUG_PRINT_ENABLED pkt_hdr->reserved2 = my_chip_id; #endif @@ -44,7 +44,7 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write( pkt_hdr->to_chip_unicast(unicast_args.distance_in_hops); fabric_conn.wait_for_empty_write_slot(); fabric_conn.send_payload_without_header_non_blocking_from_address(l1_read_addr, payload_size_bytes); - 
fabric_conn.send_payload_flush_blocking_from_address((uint32_t)pkt_hdr, sizeof(tt::fabric::PacketHeader)); + fabric_conn.send_payload_flush_blocking_from_address((uint32_t)pkt_hdr, sizeof(PACKET_HEADER_TYPE)); } break; case ttnn::ccl::cmd::CclCommandDestType::CHIP_MULTICAST: { noc_async_write( @@ -57,7 +57,7 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write( fabric_connection.get_forward_connection().send_payload_without_header_non_blocking_from_address( l1_read_addr, payload_size_bytes); fabric_connection.get_forward_connection().send_payload_flush_blocking_from_address( - (uint32_t)pkt_hdr, sizeof(tt::fabric::PacketHeader)); + (uint32_t)pkt_hdr, sizeof(PACKET_HEADER_TYPE)); } if (fabric_connection.has_backward_connection()) { @@ -67,7 +67,7 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write( fabric_connection.get_backward_connection().send_payload_without_header_non_blocking_from_address( l1_read_addr, payload_size_bytes); fabric_connection.get_backward_connection().send_payload_flush_blocking_from_address( - (uint32_t)pkt_hdr, sizeof(tt::fabric::PacketHeader)); + (uint32_t)pkt_hdr, sizeof(PACKET_HEADER_TYPE)); } } break; default: { @@ -87,8 +87,8 @@ FORCE_INLINE void write_payload_then_advance_read_address( size_t& l1_read_addr, size_t payload_size_bytes) { static_assert( - ((sizeof(tt::fabric::PacketHeader) - 1) & sizeof(tt::fabric::PacketHeader)) == 0, - "sizeof(sizeof(tt::fabric::PacketHeader)) is not a power of two which violates the below assertion"); + is_power_of_2(sizeof(PACKET_HEADER_TYPE)), + "sizeof(tt::fabric::PacketHeader) is not a power of two which violates the below assertion"); switch (current_cmd_header.dest_type) { case ttnn::ccl::cmd::CclCommandDestType::CHIP_UNICAST: [[fallthrough]]; diff --git a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader.cpp b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader.cpp index bb62676afbf..172222d7abf 100644 --- 
a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader.cpp @@ -162,7 +162,7 @@ void kernel_main() { for (uint32_t p = 0; p < command_tensor.worker_pages_per_slice; p += packet_size_in_pages) { cb_reserve_back(cb_id, packet_size_in_pages); const uint32_t local_l1_scratch_buffer_address = - get_write_ptr(cb_id) + sizeof(tt::fabric::PacketHeader); + get_write_ptr(cb_id) + sizeof(PACKET_HEADER_TYPE); uint32_t n_pages = std::min(packet_size_in_pages, command_tensor.worker_pages_per_slice - p); ASSERT(command_tensor.worker_start_offset_in_slice.w == 0); diff --git a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp index 731ed70359e..8107d2d992e 100644 --- a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_reader_two_input.cpp @@ -437,7 +437,7 @@ void try_advance_inline_write_or_atomic_inc(command_context_t& cmd_ctx) ASSERT(cmd_ctx.core_desc_type == ttnn::ccl::cmd::CclCommandCoreDescriptorType::NOC_XY); ASSERT(cmd_ctx.packet_header_buffer_addr != 0); - auto* pkt_hdr = reinterpret_cast(cmd_ctx.packet_header_buffer_addr); + auto* pkt_hdr = reinterpret_cast(cmd_ctx.packet_header_buffer_addr); uint64_t dest_noc_addr_for_pkt = safe_get_noc_addr(dest_noc0_x, dest_noc0_y, dest_bank_addr, 0); if (cmd_ctx.current_cmd_header.code == ttnn::ccl::cmd::CclCommandCode::ATOMIC_INC) { @@ -457,7 +457,7 @@ void try_advance_inline_write_or_atomic_inc(command_context_t& cmd_ctx) : cmd_ctx.fabric_connection.get_backward_connection(); fabric_connection.wait_for_empty_write_slot(); fabric_connection.send_payload_flush_blocking_from_address( - cmd_ctx.packet_header_buffer_addr, sizeof(tt::fabric::PacketHeader)); + cmd_ctx.packet_header_buffer_addr, sizeof(PACKET_HEADER_TYPE)); } break; case 
ttnn::ccl::cmd::CclCommandDestType::CHIP_MULTICAST: { write_local = true; @@ -467,7 +467,7 @@ void try_advance_inline_write_or_atomic_inc(command_context_t& cmd_ctx) 1, static_cast(mcast_args.num_targets_forward_direction)}); cmd_ctx.fabric_connection.get_forward_connection().wait_for_empty_write_slot(); cmd_ctx.fabric_connection.get_forward_connection().send_payload_flush_blocking_from_address( - cmd_ctx.packet_header_buffer_addr, sizeof(tt::fabric::PacketHeader)); + cmd_ctx.packet_header_buffer_addr, sizeof(PACKET_HEADER_TYPE)); } // Write the mcast packet (backward) @@ -476,7 +476,7 @@ void try_advance_inline_write_or_atomic_inc(command_context_t& cmd_ctx) 1, static_cast(mcast_args.num_targets_backward_direction)}); cmd_ctx.fabric_connection.get_backward_connection().wait_for_empty_write_slot(); cmd_ctx.fabric_connection.get_backward_connection().send_payload_non_blocking_from_address( - cmd_ctx.packet_header_buffer_addr, sizeof(tt::fabric::PacketHeader)); + cmd_ctx.packet_header_buffer_addr, sizeof(PACKET_HEADER_TYPE)); } } break; @@ -559,7 +559,7 @@ void write_and_advance_local_read_address_for_fabric_write( uint32_t payload_size_bytes) { const size_t payload_l1_address = l1_read_addr; - auto pkt_hdr = reinterpret_cast(packet_header_buffer_addr); + auto pkt_hdr = reinterpret_cast(packet_header_buffer_addr); pkt_hdr->to_noc_unicast_write(tt::fabric::NocUnicastCommandHeader{noc0_dest_noc_addr}, payload_size_bytes); @@ -573,7 +573,7 @@ void write_and_advance_local_read_address_for_fabric_write( fabric_conn.wait_for_empty_write_slot(); fabric_conn.send_payload_without_header_non_blocking_from_address(l1_read_addr, payload_size_bytes); - fabric_conn.send_payload_flush_blocking_from_address((uint32_t)pkt_hdr, sizeof(tt::fabric::PacketHeader)); + fabric_conn.send_payload_flush_blocking_from_address((uint32_t)pkt_hdr, sizeof(PACKET_HEADER_TYPE)); } break; case ttnn::ccl::cmd::CclCommandDestType::CHIP_MULTICAST: { const auto [dest_noc_xy, dest_addr] = 
get_noc_address_components(noc0_dest_noc_addr); @@ -588,7 +588,7 @@ void write_and_advance_local_read_address_for_fabric_write( fabric_connection.get_forward_connection().send_payload_without_header_non_blocking_from_address( l1_read_addr, payload_size_bytes); fabric_connection.get_forward_connection().send_payload_flush_blocking_from_address( - (uint32_t)pkt_hdr, sizeof(tt::fabric::PacketHeader)); + (uint32_t)pkt_hdr, sizeof(PACKET_HEADER_TYPE)); } if (fabric_connection.has_backward_connection()) { @@ -598,7 +598,7 @@ void write_and_advance_local_read_address_for_fabric_write( fabric_connection.get_backward_connection().send_payload_without_header_non_blocking_from_address( l1_read_addr, payload_size_bytes); fabric_connection.get_backward_connection().send_payload_flush_blocking_from_address( - (uint32_t)pkt_hdr, sizeof(tt::fabric::PacketHeader)); + (uint32_t)pkt_hdr, sizeof(PACKET_HEADER_TYPE)); } } break; default: { @@ -618,8 +618,8 @@ FORCE_INLINE void write_payload_then_advance_read_address( size_t& l1_read_addr, size_t payload_size_bytes) { static_assert( - ((sizeof(tt::fabric::PacketHeader) - 1) & sizeof(tt::fabric::PacketHeader)) == 0, - "sizeof(sizeof(tt::fabric::PacketHeader)) is not a power of two which violates the below assertion"); + is_power_of_2(sizeof(PACKET_HEADER_TYPE)), + "sizeof(PACKET_HEADER_TYPE) is not a power of two which violates the below assertion"); switch (current_cmd_header.dest_type) { case ttnn::ccl::cmd::CclCommandDestType::CHIP_UNICAST: [[fallthrough]]; @@ -933,7 +933,7 @@ void kernel_main() { cb_reserve_back(reserved_packet_header_cb_id, num_packet_headers_storable); auto packet_header_buffer_addr0 = get_write_ptr(reserved_packet_header_cb_id); auto packet_header_buffer_addr1 = - packet_header_buffer_addr0 + (num_packet_headers_storable >> 2) * sizeof(tt::fabric::PacketHeader); + packet_header_buffer_addr0 + (num_packet_headers_storable >> 2) * sizeof(PACKET_HEADER_TYPE); auto operand_0_cmd_ctx = command_context_t( 
fabric_connection, diff --git a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_utils.hpp b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_utils.hpp index 904cd775a9a..decb79c8070 100644 --- a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_utils.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_utils.hpp @@ -96,7 +96,7 @@ void mcast_contig_pages_to_noc_address( size_t backward_direction_num_hops) { const size_t payload_size_bytes = contig_pages_advanced * payload_page_size; const auto [dest_noc_xy, dest_addr] = get_noc_address_components(noc0_dest_addr); - const size_t payload_l1_address = l1_read_addr + sizeof(tt::fabric::PacketHeader); + const size_t payload_l1_address = l1_read_addr + sizeof(PACKET_HEADER_TYPE); // Local chip write noc_async_write( @@ -106,15 +106,15 @@ void mcast_contig_pages_to_noc_address( // coords it is necessary get_noc_addr(dest_noc_xy.x, dest_noc_xy.y, dest_addr, noc_index), payload_size_bytes); - size_t packet_send_size_bytes = payload_size_bytes + sizeof(tt::fabric::PacketHeader); + size_t packet_send_size_bytes = payload_size_bytes + sizeof(PACKET_HEADER_TYPE); // Forward fabric connection if (has_forward_fabric_connection) { static_assert( - ((sizeof(tt::fabric::PacketHeader) - 1) & sizeof(tt::fabric::PacketHeader)) == 0, - "sizeof(sizeof(tt::fabric::PacketHeader)) is not a power of two which violates the below assertion"); + is_power_of_2(sizeof(PACKET_HEADER_TYPE)), + "sizeof(tt::fabric::PacketHeader) is not a power of two which violates the below assertion"); - auto& pkt_hdr = *reinterpret_cast(l1_read_addr); + auto& pkt_hdr = *reinterpret_cast(l1_read_addr); pkt_hdr .to_chip_multicast( tt::fabric::MulticastRoutingCommandHeader{1, static_cast(forward_direction_num_hops)}) @@ -125,7 +125,7 @@ void mcast_contig_pages_to_noc_address( // Backward fabric connection if (has_backward_fabric_connection) { - auto& pkt_hdr = *reinterpret_cast(l1_read_addr); + auto& pkt_hdr = *reinterpret_cast(l1_read_addr); 
pkt_hdr .to_chip_multicast( tt::fabric::MulticastRoutingCommandHeader{1, static_cast(backward_direction_num_hops)}) @@ -286,11 +286,11 @@ void mcast_sync_signal_to_addr( size_t remote_sem_l1_addr, size_t directional_num_hops) { static_assert( - ((sizeof(tt::fabric::PacketHeader) - 1) & sizeof(tt::fabric::PacketHeader)) == 0, - "sizeof(sizeof(tt::fabric::PacketHeader)) is not a power of two which violates the below assertion"); - ASSERT((pkt_addr & (sizeof(tt::fabric::PacketHeader) - 1)) == 0); + is_power_of_2(sizeof(PACKET_HEADER_TYPE)), + "sizeof(tt::fabric::PacketHeader) is not a power of two which violates the below assertion"); + ASSERT((pkt_addr & (sizeof(PACKET_HEADER_TYPE) - 1)) == 0); - auto& pkt_hdr = *reinterpret_cast(pkt_addr); + auto& pkt_hdr = *reinterpret_cast(pkt_addr); pkt_hdr .to_chip_multicast(tt::fabric::MulticastRoutingCommandHeader{1, static_cast(directional_num_hops)}) .to_noc_unicast_atomic_inc(tt::fabric::NocUnicastAtomicIncCommandHeader{ diff --git a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_writer.cpp b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_writer.cpp index 71865c224e5..766cdd0b688 100644 --- a/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_writer.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/common/kernels/ccl_send_writer.cpp @@ -125,7 +125,7 @@ void kernel_main() { // out when we start enabling other modes const size_t packet_size_in_pages = get_arg_val(arg_idx++); const size_t payload_page_size = get_arg_val(arg_idx++); - const size_t l1_scratch_page_size = payload_page_size + sizeof(tt::fabric::PacketHeader); + const size_t l1_scratch_page_size = payload_page_size + sizeof(PACKET_HEADER_TYPE); const size_t forward_direction_num_hops = get_arg_val(arg_idx++); const size_t backward_direction_num_hops = get_arg_val(arg_idx++); const bool has_forward_fabric_connection = get_arg_val(arg_idx++) != 0; @@ -248,7 +248,7 @@ void kernel_main() { DPRINT << "ccl_send_writer Sending payload completion sync signals\n"; 
ASSERT(some_buffering_addr != 0); some_buffering_addr = - (some_buffering_addr + (sizeof(tt::fabric::PacketHeader))) & ~(sizeof(tt::fabric::PacketHeader) - 1); + (some_buffering_addr + (sizeof(PACKET_HEADER_TYPE))) & ~(sizeof(PACKET_HEADER_TYPE) - 1); mcast_sync_signal_to_addr( some_buffering_addr, diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_worker_adapters.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_worker_adapters.hpp index 564ed163999..87ba5ea5fba 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_worker_adapters.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/edm_fabric_worker_adapters.hpp @@ -326,7 +326,7 @@ struct WorkerToFabricEdmSenderImpl { FORCE_INLINE void send_packet_header_and_notify_fabric(uint32_t source_address) { uint64_t buffer_address = this->compute_dest_buffer_slot_noc_addr(); - send_chunk_from_address(source_address, 1, sizeof(tt::fabric::PacketHeader), buffer_address); + send_chunk_from_address(source_address, 1, sizeof(PACKET_HEADER_TYPE), buffer_address); post_send_payload_increment_pointers(); } @@ -335,23 +335,23 @@ struct WorkerToFabricEdmSenderImpl { uint64_t buffer_address = this->compute_dest_buffer_slot_noc_addr(); // skip past the first part of the buffer which will be occupied by the packet header - send_chunk_from_address(source_address, 1, size_bytes, buffer_address + sizeof(tt::fabric::PacketHeader)); + send_chunk_from_address(source_address, 1, size_bytes, buffer_address + sizeof(PACKET_HEADER_TYPE)); } template FORCE_INLINE void send_payload_from_address_impl(uint32_t source_address, size_t size_bytes) { uint64_t buffer_address = this->compute_dest_buffer_slot_noc_addr(); ASSERT(size_bytes <= this->buffer_size_bytes); - ASSERT(tt::fabric::is_valid(*const_cast( - reinterpret_cast(source_address)))); + ASSERT(tt::fabric::is_valid(*const_cast( + reinterpret_cast(source_address)))); send_chunk_from_address(source_address, 1, size_bytes, 
buffer_address); post_send_payload_increment_pointers(); } template FORCE_INLINE void send_payload_from_address_with_trid_impl(uint32_t source_address, size_t size_bytes, uint8_t trid) { ASSERT(size_bytes <= this->buffer_size_bytes); - ASSERT(tt::fabric::is_valid(*const_cast( - reinterpret_cast(source_address)))); + ASSERT(tt::fabric::is_valid(*const_cast( + reinterpret_cast(source_address)))); send_chunk_from_address_with_trid(source_address, 1, size_bytes, this->edm_buffer_addr, trid, this->edm_noc_cmd_buf); post_send_payload_increment_pointers(); } diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp index af3c53f27b5..468777220e8 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header.hpp @@ -89,11 +89,11 @@ struct NocMulticastAtomicIncCommandHeader { uint8_t size_x; uint8_t size_y; }; -static_assert(sizeof(NocUnicastCommandHeader) == 8, "NocUnicastCommandHeader size is not 1 byte"); -static_assert(sizeof(NocMulticastCommandHeader) == 8, "NocMulticastCommandHeader size is not 1 byte"); -static_assert(sizeof(NocUnicastInlineWriteCommandHeader) == 16, "NocMulticastCommandHeader size is not 1 byte"); -static_assert(sizeof(NocUnicastAtomicIncCommandHeader) == 16, "NocUnicastCommandHeader size is not 1 byte"); -static_assert(sizeof(NocMulticastAtomicIncCommandHeader) == 12, "NocAtomicIncCommandHeader size is not 1 byte"); +static_assert(sizeof(NocUnicastCommandHeader) == 8, "NocUnicastCommandHeader size is not 8 bytes"); +static_assert(sizeof(NocMulticastCommandHeader) == 8, "NocMulticastCommandHeader size is not 8 bytes"); +static_assert(sizeof(NocUnicastInlineWriteCommandHeader) == 16, "NocMulticastCommandHeader size is not 16 bytes"); +static_assert(sizeof(NocUnicastAtomicIncCommandHeader) == 16, "NocUnicastCommandHeader size is not 16 
bytes"); +static_assert(sizeof(NocMulticastAtomicIncCommandHeader) == 12, "NocAtomicIncCommandHeader size is not 12 bytes"); union NocCommandFields{ NocUnicastCommandHeader unicast_write; NocUnicastInlineWriteCommandHeader unicast_inline_write; @@ -104,116 +104,107 @@ union NocCommandFields{ static_assert(sizeof(NocCommandFields) <= 16, "CommandFields size is not 16 bytes"); // TODO: wrap this in a debug version that holds type info so we can assert for field/command/ -struct PacketHeader { +template +struct PacketHeaderBase { + NocCommandFields command_fields; // size = 16B due to uint64_t alignment + uint16_t payload_size_bytes; // TODO: trim this down noc_send_type 2 bits (4 values): // -> unicast_write, mcast_write, unicast_seminc, mcast_seminc // For now, kept it separate so I could do reads which would be handled differently // but for our purposes we shouldn't need read so we should be able to omit the support NocSendType noc_send_type : 3; + // ChipSendType only used by PacketHeader, but keep here for now for bit-fields ChipSendType chip_send_type : 1; - // Used only by the EDM sender and receiver channels. Populated by EDM sender channel to // indicate to the receiver channel what channel was the source of this packet. Reserved // otherwise. uint8_t src_ch_id : 4; - RoutingFields routing_fields; - uint16_t payload_size_bytes; // excludes header size - NocCommandFields command_fields; // size = 16B due to uint64_t alignment - - // Sort of hack to work-around DRAM read alignment issues that must be 32B aligned - // To simplify worker kernel code, we for now decide to pad up the packet header - // to 32B so the user can simplify shift into their CB chunk by sizeof(tt::fabric::PacketHeader) - // and automatically work around the DRAM read alignment bug. - // - // Future changes will remove this padding and require the worker kernel to be aware of this bug - // and pad their own CBs conditionally when reading from DRAM. 
It'll be up to the users to - // manage this complexity. - uint32_t padding0; - uint32_t padding1; - - inline void set_chip_send_type(ChipSendType &type) { this->chip_send_type = type; } - inline void set_noc_send_type(NocSendType &type) { this->noc_send_type = type; } - inline void set_routing_fields(RoutingFields &fields) { this->routing_fields = fields; } - inline void set_command_fields(NocCommandFields &fields) { this->command_fields = fields; } - // Returns size of payload in bytes - TODO: convert to words (4B) size_t get_payload_size_excluding_header() volatile const { return this->payload_size_bytes; } + inline size_t get_payload_size_including_header() volatile const { - return get_payload_size_excluding_header() + sizeof(PacketHeader); + return get_payload_size_excluding_header() + sizeof(Derived); } - inline PacketHeader &to_chip_unicast(uint8_t distance_in_hops) { - this->chip_send_type = CHIP_UNICAST; - this->routing_fields.value = RoutingFields::LAST_CHIP_IN_MCAST_VAL | distance_in_hops; - return *this; + // Setters for noc_send_type, routing_fields, and command_fields + inline void set_noc_send_type(NocSendType &type) { this->noc_send_type = type; } + inline void set_command_fields(NocCommandFields &fields) { this->command_fields = fields; } + + inline Derived &to_chip_unicast(uint8_t distance_in_hops) { + static_cast(this)->to_chip_unicast_impl(distance_in_hops); + return *static_cast(this); } - inline PacketHeader &to_chip_multicast(MulticastRoutingCommandHeader const &chip_multicast_command_header) { - this->chip_send_type = CHIP_MULTICAST; - this->routing_fields.value = ((static_cast(chip_multicast_command_header.range_hops) << RoutingFields::START_DISTANCE_FIELD_BIT_WIDTH)) | static_cast(chip_multicast_command_header.start_distance_in_hops); - return *this; + + inline Derived &to_chip_multicast(MulticastRoutingCommandHeader const &mcast_routing_command_header) { + static_cast(this)->to_chip_multicast_impl(mcast_routing_command_header); + return 
*static_cast(this); } - inline PacketHeader &to_noc_unicast_write(NocUnicastCommandHeader const &noc_unicast_command_header, size_t payload_size_bytes) { + inline Derived &to_noc_unicast_write(NocUnicastCommandHeader const &noc_unicast_command_header, size_t payload_size_bytes) { this->noc_send_type = NOC_UNICAST_WRITE; this->command_fields.unicast_write = noc_unicast_command_header; this->payload_size_bytes = payload_size_bytes; - return *this; + return *static_cast(this); } - inline PacketHeader &to_noc_unicast_inline_write(NocUnicastInlineWriteCommandHeader const &noc_unicast_command_header) { + + inline Derived &to_noc_unicast_inline_write(NocUnicastInlineWriteCommandHeader const &noc_unicast_command_header) { this->noc_send_type = NOC_UNICAST_INLINE_WRITE; this->command_fields.unicast_inline_write = noc_unicast_command_header; this->payload_size_bytes = 0; - return *this; + return *static_cast(this); } - inline PacketHeader &to_noc_multicast_write(NocMulticastCommandHeader const &noc_multicast_command_header, size_t payload_size_bytes) { + + inline Derived &to_noc_multicast(NocMulticastCommandHeader const &noc_multicast_command_header, size_t payload_size_bytes) { this->noc_send_type = NOC_MULTICAST_WRITE; this->command_fields.mcast_write = noc_multicast_command_header; this->payload_size_bytes = payload_size_bytes; - return *this; + return *static_cast(this); } - inline PacketHeader &to_noc_unicast_atomic_inc(NocUnicastAtomicIncCommandHeader const &noc_unicast_atomic_inc_command_header) { + + inline Derived &to_noc_unicast_atomic_inc(NocUnicastAtomicIncCommandHeader const &noc_unicast_atomic_inc_command_header) { this->noc_send_type = NOC_UNICAST_ATOMIC_INC; this->command_fields.unicast_seminc = noc_unicast_atomic_inc_command_header; this->payload_size_bytes = 0; - return *this; + return *static_cast(this); } - inline PacketHeader &to_noc_multicast_atomic_inc(NocMulticastAtomicIncCommandHeader const &noc_multicast_command_header, size_t payload_size_bytes) { 
- #if defined(KERNEL_BUILD) || defined(FW_BUILD) - ASSERT(false); - while (1) {}; - #endif + + inline Derived &to_noc_multicast_atomic_inc( + NocMulticastAtomicIncCommandHeader const &noc_multicast_atomic_inc_command_header, size_t payload_size_bytes) { + this->noc_send_type = NOC_MULTICAST_ATOMIC_INC; + this->command_fields.mcast_seminc = noc_multicast_atomic_inc_command_header; this->payload_size_bytes = payload_size_bytes; - return *this; + return *static_cast(this); } - inline volatile PacketHeader *to_chip_unicast(uint8_t distance_in_hops) volatile { - this->chip_send_type = CHIP_UNICAST; - this->routing_fields.value = RoutingFields::LAST_CHIP_IN_MCAST_VAL | distance_in_hops; - return this; + inline volatile Derived* to_chip_unicast(uint8_t distance_in_hops) volatile { + static_cast(this)->to_chip_unicast_impl(distance_in_hops); + return static_cast(this); } - inline volatile PacketHeader *to_chip_multicast(MulticastRoutingCommandHeader const &chip_multicast_command_header) volatile { - this->chip_send_type = CHIP_MULTICAST; - this->routing_fields.value = (static_cast(chip_multicast_command_header.range_hops) << RoutingFields::START_DISTANCE_FIELD_BIT_WIDTH) | chip_multicast_command_header.start_distance_in_hops; - return this; + + inline volatile Derived* to_chip_multicast(MulticastRoutingCommandHeader const &mcast_routing_command_header) volatile { + static_cast(this)->to_chip_multicast_impl(mcast_routing_command_header); + return static_cast(this); } - inline volatile PacketHeader *to_noc_unicast_write(NocUnicastCommandHeader const &noc_unicast_command_header, size_t payload_size_bytes) volatile { + + inline volatile Derived* to_noc_unicast_write(NocUnicastCommandHeader const &noc_unicast_command_header, size_t payload_size_bytes) volatile { this->noc_send_type = NOC_UNICAST_WRITE; this->command_fields.unicast_write.noc_address = noc_unicast_command_header.noc_address; this->payload_size_bytes = payload_size_bytes; - - return this; + return 
static_cast(this); } - inline volatile PacketHeader &to_noc_unicast_inline_write(NocUnicastInlineWriteCommandHeader const &noc_unicast_command_header) volatile { + + inline volatile Derived* to_noc_unicast_inline_write(NocUnicastInlineWriteCommandHeader const &noc_unicast_command_header) volatile { this->noc_send_type = NOC_UNICAST_INLINE_WRITE; this->command_fields.unicast_inline_write.noc_address = noc_unicast_command_header.noc_address; this->command_fields.unicast_inline_write.value = noc_unicast_command_header.value; this->payload_size_bytes = 0; - return *this; + return static_cast(this); } - inline volatile PacketHeader *to_noc_multicast(NocMulticastCommandHeader const &noc_multicast_command_header, size_t payload_size_bytes) volatile { + + inline volatile Derived* to_noc_multicast(NocMulticastCommandHeader const &noc_multicast_command_header, size_t payload_size_bytes) volatile { this->noc_send_type = NOC_MULTICAST_WRITE; this->command_fields.mcast_write.mcast_rect_size_x = noc_multicast_command_header.mcast_rect_size_x; this->command_fields.mcast_write.mcast_rect_size_y = noc_multicast_command_header.mcast_rect_size_y; @@ -221,20 +212,19 @@ struct PacketHeader { this->command_fields.mcast_write.noc_y_start = noc_multicast_command_header.noc_y_start; this->payload_size_bytes = payload_size_bytes; this->command_fields.mcast_write.address = noc_multicast_command_header.address; - - return this; + return static_cast(this); } - inline volatile PacketHeader *to_noc_unicast_atomic_inc( - NocUnicastAtomicIncCommandHeader const &noc_unicast_atomic_inc_command_header) volatile { + + inline volatile Derived* to_noc_unicast_atomic_inc(NocUnicastAtomicIncCommandHeader const &noc_unicast_atomic_inc_command_header) volatile { this->noc_send_type = NOC_UNICAST_ATOMIC_INC; this->command_fields.unicast_seminc.noc_address = noc_unicast_atomic_inc_command_header.noc_address; this->command_fields.unicast_seminc.val = noc_unicast_atomic_inc_command_header.val; 
this->command_fields.unicast_seminc.wrap = noc_unicast_atomic_inc_command_header.wrap; this->payload_size_bytes = 0; - - return this; + return static_cast(this); } - inline volatile PacketHeader *to_noc_multicast_atomic_inc( + + inline volatile Derived *to_noc_multicast_atomic_inc( NocMulticastAtomicIncCommandHeader const &noc_multicast_atomic_inc_command_header, size_t payload_size_bytes) volatile { this->noc_send_type = NOC_MULTICAST_ATOMIC_INC; this->command_fields.mcast_seminc.address = noc_multicast_atomic_inc_command_header.address; @@ -245,17 +235,145 @@ struct PacketHeader { this->command_fields.mcast_seminc.val = noc_multicast_atomic_inc_command_header.val; this->command_fields.mcast_seminc.wrap = noc_multicast_atomic_inc_command_header.wrap; this->payload_size_bytes = payload_size_bytes; + return static_cast(this); + } - return this; + inline void set_src_ch_id(uint8_t ch_id) volatile { + this->src_ch_id = ch_id; } - inline void set_src_ch_id(uint8_t ch_id) volatile { this->src_ch_id = ch_id; } }; +struct PacketHeader : public PacketHeaderBase { + RoutingFields routing_fields; + // Sort of hack to work-around DRAM read alignment issues that must be 32B aligned + // To simplify worker kernel code, we for now decide to pad up the packet header + // to 32B so the user can simplify shift into their CB chunk by sizeof(tt::fabric::PacketHeader) + // and automatically work around the DRAM read alignment bug. + // + // Future changes will remove this padding and require the worker kernel to be aware of this bug + // and pad their own CBs conditionally when reading from DRAM. It'll be up to the users to + // manage this complexity. 
+ uint32_t padding0; + uint32_t padding1; + + private: + + inline static uint32_t calculate_chip_unicast_routing_fields_value(uint8_t distance_in_hops) { + return RoutingFields::LAST_CHIP_IN_MCAST_VAL | distance_in_hops; + } + inline static uint32_t calculate_chip_multicast_routing_fields_value( + const MulticastRoutingCommandHeader& chip_multicast_command_header) { + return ((static_cast(chip_multicast_command_header.range_hops) << RoutingFields::START_DISTANCE_FIELD_BIT_WIDTH)) | static_cast(chip_multicast_command_header.start_distance_in_hops); + } + + public: + + // Setters for PacketHeader-specific fields + inline void set_chip_send_type(ChipSendType &type) { this->chip_send_type = type; } + + inline void set_routing_fields(RoutingFields &fields) { this->routing_fields = fields; } + + inline void to_chip_unicast_impl(uint8_t distance_in_hops) { + this->chip_send_type = CHIP_UNICAST; + this->routing_fields.value = PacketHeader::calculate_chip_unicast_routing_fields_value(distance_in_hops); + } + inline void to_chip_multicast_impl(MulticastRoutingCommandHeader const &chip_multicast_command_header) { + this->chip_send_type = CHIP_MULTICAST; + this->routing_fields.value = PacketHeader::calculate_chip_multicast_routing_fields_value(chip_multicast_command_header); + } + + inline void to_chip_unicast_impl(uint8_t distance_in_hops) volatile { + this->chip_send_type = CHIP_UNICAST; + this->routing_fields.value = PacketHeader::calculate_chip_unicast_routing_fields_value(distance_in_hops); + } + inline void to_chip_multicast_impl(MulticastRoutingCommandHeader const &chip_multicast_command_header) volatile{ + this->chip_send_type = CHIP_MULTICAST; + this->routing_fields.value = PacketHeader::calculate_chip_multicast_routing_fields_value(chip_multicast_command_header); + } +}; + +struct LowLatencyRoutingFields { + static constexpr uint32_t FIELD_WIDTH = 2; + static constexpr uint32_t FIELD_MASK = 0b11; + static constexpr uint32_t NOOP = 0b00; + static constexpr uint32_t 
WRITE_ONLY = 0b01; + static constexpr uint32_t FORWARD_ONLY = 0b10; + static constexpr uint32_t WRITE_AND_FORWARD = 0b11; + static constexpr uint32_t FWD_ONLY_FIELD = 0xAAAAAAAA; + static constexpr uint32_t WR_AND_FWD_FIELD = 0xFFFFFFFF; + uint32_t value; +}; + +struct LowLatencyPacketHeader : public PacketHeaderBase { + uint8_t padding0; + LowLatencyRoutingFields routing_fields; + uint32_t padding1; + + private: + + inline static uint32_t calculate_chip_unicast_routing_fields_value(uint8_t distance_in_hops) { + // Example of unicast 3 hops away + // First line will do 0xAAAAAAAA & 0b1111 = 0b1010. This means starting from our neighbor, we will forward twice (forward to neighbor is not encoded in the field) + // Last line will do 0b01 << 4 = 0b010000. This means that on the 3rd chip, we will write only + // Together this means the final encoding is 0b011010 + return + (LowLatencyRoutingFields::FWD_ONLY_FIELD & ((1 << (distance_in_hops - 1) * LowLatencyRoutingFields::FIELD_WIDTH) - 1)) | + (LowLatencyRoutingFields::WRITE_ONLY << (distance_in_hops - 1) * LowLatencyRoutingFields::FIELD_WIDTH); + } + inline static uint32_t calculate_chip_multicast_routing_fields_value( + const MulticastRoutingCommandHeader& chip_multicast_command_header) { + // Example of starting 3 hops away mcasting to 2 chips + // First line will do 0xAAAAAAAA & 0b1111 = 0b1010. This means starting from our neighbor, we will forward twice (forward to neighbor is not encoded in the field) + // Second line will do 0xFFFFFFFF & 0b11 = 0b11. 0b11 << 4 = 0b110000. This means starting from the 3rd chip, we will write and forward once + // Last line will do 0b01 << 6 = 0b01000000. 
This means that on the 5th chip, we will write only + // Together this means the final encoding is 0b01111010 + return + (LowLatencyRoutingFields::FWD_ONLY_FIELD & ((1 << (chip_multicast_command_header.start_distance_in_hops - 1) * LowLatencyRoutingFields::FIELD_WIDTH) - 1)) | + (LowLatencyRoutingFields::WR_AND_FWD_FIELD & ((1 << (chip_multicast_command_header.range_hops - 1) * LowLatencyRoutingFields::FIELD_WIDTH) - 1) << + ((chip_multicast_command_header.start_distance_in_hops - 1) * LowLatencyRoutingFields::FIELD_WIDTH)) | + (LowLatencyRoutingFields::WRITE_ONLY << (chip_multicast_command_header.start_distance_in_hops + chip_multicast_command_header.range_hops - 2) * LowLatencyRoutingFields::FIELD_WIDTH); + } + + public: + + // Specialized implementations for LowLatencyPacketHeader + inline void set_routing_fields(LowLatencyRoutingFields &fields) { + this->routing_fields = fields; + } + + inline void to_chip_unicast_impl(uint8_t distance_in_hops) { + this->routing_fields.value = LowLatencyPacketHeader::calculate_chip_unicast_routing_fields_value(distance_in_hops); + } + inline void to_chip_multicast_impl( + const MulticastRoutingCommandHeader& chip_multicast_command_header) { + this->routing_fields.value = LowLatencyPacketHeader::calculate_chip_multicast_routing_fields_value(chip_multicast_command_header); + } + + inline void to_chip_unicast_impl(uint8_t distance_in_hops) volatile { + this->routing_fields.value = LowLatencyPacketHeader::calculate_chip_unicast_routing_fields_value(distance_in_hops); + } + inline void to_chip_multicast_impl( + const MulticastRoutingCommandHeader& chip_multicast_command_header) volatile { + this->routing_fields.value = LowLatencyPacketHeader::calculate_chip_multicast_routing_fields_value(chip_multicast_command_header); + } +}; // TODO: When we remove the 32B padding requirement, reduce to 16B size check static_assert(sizeof(PacketHeader) == 32, "sizeof(PacketHeader) is not equal to 32B"); +// Host code still hardcoded to 
sizeof(PacketHeader) so we need to keep this check +static_assert(sizeof(LowLatencyPacketHeader) == sizeof(PacketHeader), "sizeof(LowLatencyPacketHeader) is not equal to 32B"); static constexpr size_t header_size_bytes = sizeof(PacketHeader); +#define FABRIC_LOW_LATENCY_MODE 1 + +#if defined FABRIC_LOW_LATENCY_MODE and FABRIC_LOW_LATENCY_MODE == 1 +#define PACKET_HEADER_TYPE tt::fabric::LowLatencyPacketHeader +#define ROUTING_FIELDS_TYPE tt::fabric::LowLatencyRoutingFields +#else +#define PACKET_HEADER_TYPE tt::fabric::PacketHeader +#define ROUTING_FIELDS_TYPE tt::fabric::RoutingFields +#endif + } // namespace tt::fabric diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header_validate.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header_validate.hpp index 2589c8f526a..a284320d4d1 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header_validate.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_header_validate.hpp @@ -16,4 +16,9 @@ FORCE_INLINE bool is_valid(PacketHeader const& packet_header) { return (packet_header.chip_send_type <= CHIP_SEND_TYPE_LAST) && (packet_header.noc_send_type <= NOC_SEND_TYPE_LAST); } +FORCE_INLINE void validate(const LowLatencyPacketHeader& packet_header) {} +FORCE_INLINE bool is_valid(const LowLatencyPacketHeader& packet_header) { + return (packet_header.noc_send_type <= NOC_SEND_TYPE_LAST); +} + } // namespace tt::fabric diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp index 85553bf6dab..5e8f59954c2 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_edm_packet_transmission.hpp @@ -32,7 +32,13 @@ FORCE_INLINE void print_pkt_hdr_routing_fields(volatile tt::fabric::PacketHeader #endif } 
-FORCE_INLINE void print_pkt_header_noc_fields(volatile tt::fabric::PacketHeader *const packet_start) { +FORCE_INLINE void print_pkt_hdr_routing_fields(volatile tt::fabric::LowLatencyPacketHeader *const packet_start) { + #ifdef DEBUG_PRINT_ENABLED + DPRINT << "ROUTE:" << packet_start->routing_fields.value << "\n"; + #endif +} + +FORCE_INLINE void print_pkt_header_noc_fields(volatile PACKET_HEADER_TYPE *const packet_start) { #ifdef DEBUG_PRINT_ENABLED switch (packet_start->noc_send_type) { case tt::fabric::NocSendType::NOC_UNICAST_WRITE: { @@ -62,12 +68,23 @@ FORCE_INLINE void print_pkt_header(volatile tt::fabric::PacketHeader *const pack #endif } +FORCE_INLINE void print_pkt_header(volatile tt::fabric::LowLatencyPacketHeader *const packet_start) { + #ifdef DEBUG_PRINT_ENABLED + auto const& header = *packet_start; + DPRINT << "PKT: nsnd_t:" << (uint32_t) packet_start->noc_send_type << + ", src_chip:" << (uint32_t) packet_start->src_ch_id << + ", payload_size_bytes:" << (uint32_t) packet_start->payload_size_bytes << "\n"; + print_pkt_hdr_routing_fields(packet_start); + print_pkt_header_noc_fields(packet_start); + #endif + } + // Since we unicast to local, we must omit the packet header FORCE_INLINE void execute_chip_unicast_to_local_chip( - volatile tt::fabric::PacketHeader *const packet_start, uint16_t payload_size_bytes, uint32_t transaction_id) { + volatile PACKET_HEADER_TYPE *const packet_start, uint16_t payload_size_bytes, uint32_t transaction_id) { auto const& header = *packet_start; - uint32_t payload_start_address = reinterpret_cast(packet_start) + sizeof(tt::fabric::PacketHeader); + uint32_t payload_start_address = reinterpret_cast(packet_start) + sizeof(PACKET_HEADER_TYPE); tt::fabric::NocSendType noc_send_type = packet_start->noc_send_type; switch (noc_send_type) { @@ -116,6 +133,10 @@ FORCE_INLINE void update_packet_header_for_next_hop(volatile tt::fabric::PacketH packet_header->routing_fields.value = cached_routing_fields.value - decrement_val; } 
+FORCE_INLINE void update_packet_header_for_next_hop(volatile tt::fabric::LowLatencyPacketHeader * packet_header, tt::fabric::LowLatencyRoutingFields cached_routing_fields) { + packet_header->routing_fields.value >>= tt::fabric::LowLatencyRoutingFields::FIELD_WIDTH; +} + // This function forwards a packet to the downstream EDM channel for eventual sending // to the next chip in the line/ring // @@ -127,9 +148,9 @@ FORCE_INLINE void update_packet_header_for_next_hop(volatile tt::fabric::PacketH // !!!WARNING!!! template FORCE_INLINE void forward_payload_to_downstream_edm( - volatile tt::fabric::PacketHeader *packet_header, + volatile PACKET_HEADER_TYPE *packet_header, uint16_t payload_size_bytes, - tt::fabric::RoutingFields cached_routing_fields, + ROUTING_FIELDS_TYPE cached_routing_fields, tt::fabric::EdmToEdmSender &downstream_edm_interface, uint8_t transaction_id ) { @@ -141,6 +162,6 @@ FORCE_INLINE void forward_payload_to_downstream_edm( update_packet_header_for_next_hop(packet_header, cached_routing_fields); downstream_edm_interface.send_payload_non_blocking_from_address_with_trid( reinterpret_cast(packet_header), - payload_size_bytes + sizeof(tt::fabric::PacketHeader), + payload_size_bytes + sizeof(PACKET_HEADER_TYPE), transaction_id); } diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp index be1ec45d50d..f80505d936d 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover.cpp @@ -433,18 +433,18 @@ struct ReceiverChannelPointers { }; struct PacketHeaderRecorder { - volatile tt::fabric::PacketHeader *buffer_ptr; + volatile uint32_t *buffer_ptr; size_t buffer_n_headers; size_t buffer_index; - PacketHeaderRecorder(volatile tt::fabric::PacketHeader *buffer_ptr, size_t buffer_n_headers) : buffer_ptr(buffer_ptr), 
buffer_n_headers(buffer_n_headers), buffer_index(0) {} + PacketHeaderRecorder(volatile uint32_t *buffer_ptr, size_t buffer_n_headers) : buffer_ptr(buffer_ptr), buffer_n_headers(buffer_n_headers), buffer_index(0) {} - void record_packet_header(volatile tt::fabric::PacketHeader *packet_header_ptr) { - uint32_t dest_l1_addr = (uint32_t)buffer_ptr + buffer_index * sizeof(tt::fabric::PacketHeader); + void record_packet_header(volatile uint32_t *packet_header_ptr) { + uint32_t dest_l1_addr = (uint32_t)buffer_ptr + buffer_index * sizeof(PACKET_HEADER_TYPE); noc_async_write( (uint32_t)packet_header_ptr, get_noc_addr(my_x[0], my_y[0], dest_l1_addr), - sizeof(tt::fabric::PacketHeader), + sizeof(PACKET_HEADER_TYPE), 1 - noc_index // avoid the contention on main noc ); buffer_index++; @@ -541,8 +541,8 @@ FORCE_INLINE void send_next_data( // NOTE: if we always send full packet, then we don't need the second branch below dedicated for // channel sync auto volatile *pkt_header = - reinterpret_cast(sender_buffer_channel.get_buffer_address(local_sender_wrptr_buffer_index)); - ASSERT(tt::fabric::is_valid(*const_cast(pkt_header))); + reinterpret_cast(sender_buffer_channel.get_buffer_address(local_sender_wrptr_buffer_index)); + ASSERT(tt::fabric::is_valid(*const_cast(pkt_header))); size_t payload_size_bytes = pkt_header->get_payload_size_including_header(); pkt_header->src_ch_id = sender_channel_index; @@ -582,7 +582,7 @@ FORCE_INLINE void receiver_send_received_ack( // Set the acknowledgement bits. 
We have a different location than the auto receiver_buffer_index = receiver_channel_ptr.get_buffer_index(); - auto volatile *pkt_header = reinterpret_cast(local_receiver_buffer_channel.get_buffer_address(receiver_buffer_index)); + auto volatile *pkt_header = reinterpret_cast(local_receiver_buffer_channel.get_buffer_address(receiver_buffer_index)); const auto src_id = pkt_header->src_ch_id; remote_update_ptr_val(to_sender_packets_acked_streams[src_id], 1); } @@ -597,7 +597,7 @@ FORCE_INLINE void receiver_send_completion_ack( auto receiver_buffer_index = receiver_channel_ptr.get_buffer_index(); - auto volatile *pkt_header = reinterpret_cast(local_receiver_buffer_channel.get_buffer_address(receiver_buffer_index)); + auto volatile *pkt_header = reinterpret_cast(local_receiver_buffer_channel.get_buffer_address(receiver_buffer_index)); const auto src_id = pkt_header->src_ch_id; remote_update_ptr_val(to_sender_packets_completed_streams[src_id], 1); receiver_channel_ptr.increment(); @@ -607,11 +607,16 @@ FORCE_INLINE void receiver_send_completion_ack( template FORCE_INLINE bool can_forward_packet_completely( - tt::fabric::RoutingFields cached_routing_fields, + ROUTING_FIELDS_TYPE cached_routing_fields, tt::fabric::EdmToEdmSender& downstream_edm_interface) { // We always check if it is the terminal mcast packet value. We can do this because all unicast packets have the // mcast terminal value masked in to the routing field. This simplifies the check here to a single compare. 
- bool deliver_locally_only = cached_routing_fields.value == tt::fabric::RoutingFields::LAST_MCAST_VAL; + bool deliver_locally_only; + if constexpr (std::is_same_v) { + deliver_locally_only = cached_routing_fields.value == tt::fabric::RoutingFields::LAST_MCAST_VAL; + } else if constexpr (std::is_same_v) { + deliver_locally_only = (cached_routing_fields.value & tt::fabric::LowLatencyRoutingFields::FIELD_MASK) == tt::fabric::LowLatencyRoutingFields::WRITE_ONLY; + } return deliver_locally_only || downstream_edm_interface.edm_has_space_for_packet(); } @@ -619,19 +624,39 @@ FORCE_INLINE bool can_forward_packet_completely( template FORCE_INLINE void receiver_forward_packet( // TODO: have a separate cached copy of the packet header to save some additional L1 loads - volatile tt::fabric::PacketHeader *packet_start, - tt::fabric::RoutingFields cached_routing_fields, + volatile PACKET_HEADER_TYPE *packet_start, + ROUTING_FIELDS_TYPE cached_routing_fields, tt::fabric::EdmToEdmSender &downstream_edm_interface, uint8_t transaction_id) { - bool start_distance_is_terminal_value = (cached_routing_fields.value & tt::fabric::RoutingFields::HOP_DISTANCE_MASK) == tt::fabric::RoutingFields::LAST_HOP_DISTANCE_VAL; - uint16_t payload_size_bytes = packet_start->payload_size_bytes; - if (start_distance_is_terminal_value) { - execute_chip_unicast_to_local_chip(packet_start, payload_size_bytes, transaction_id); - } - bool not_last_destination_device = cached_routing_fields.value != tt::fabric::RoutingFields::LAST_MCAST_VAL; - if (not_last_destination_device) { - forward_payload_to_downstream_edm(packet_start, payload_size_bytes, cached_routing_fields, downstream_edm_interface, transaction_id); + if constexpr (std::is_same_v) { + // If the packet is a terminal packet, then we can just deliver it locally + bool start_distance_is_terminal_value = (cached_routing_fields.value & tt::fabric::RoutingFields::HOP_DISTANCE_MASK) == tt::fabric::RoutingFields::LAST_HOP_DISTANCE_VAL; + uint16_t 
payload_size_bytes = packet_start->payload_size_bytes; + if (start_distance_is_terminal_value) { + execute_chip_unicast_to_local_chip(packet_start, payload_size_bytes, transaction_id); + } + bool not_last_destination_device = cached_routing_fields.value != tt::fabric::RoutingFields::LAST_MCAST_VAL; + if (not_last_destination_device) { + forward_payload_to_downstream_edm(packet_start, payload_size_bytes, cached_routing_fields, downstream_edm_interface, transaction_id); + } + } else if constexpr (std::is_same_v) { + uint32_t routing = cached_routing_fields.value & tt::fabric::LowLatencyRoutingFields::FIELD_MASK; + uint16_t payload_size_bytes = packet_start->payload_size_bytes; + switch (routing) { + case tt::fabric::LowLatencyRoutingFields::WRITE_ONLY: + execute_chip_unicast_to_local_chip(packet_start, payload_size_bytes, transaction_id); + break; + case tt::fabric::LowLatencyRoutingFields::FORWARD_ONLY: + forward_payload_to_downstream_edm(packet_start, payload_size_bytes, cached_routing_fields, downstream_edm_interface, transaction_id); + break; + case tt::fabric::LowLatencyRoutingFields::WRITE_AND_FORWARD: + execute_chip_unicast_to_local_chip(packet_start, payload_size_bytes, transaction_id); + forward_payload_to_downstream_edm(packet_start, payload_size_bytes, cached_routing_fields, downstream_edm_interface, transaction_id); + break; + default: + ASSERT(false); + } } } @@ -663,10 +688,10 @@ FORCE_INLINE bool run_sender_channel_step( bool sender_backpressured_from_sender_side = !(local_sender_channel_worker_interface.local_rdptr.distance_behind(local_sender_channel_worker_interface.local_wrptr) < SENDER_NUM_BUFFERS); if (!sender_backpressured_from_sender_side) { did_something = true; - auto packet_header = reinterpret_cast(local_sender_channel.get_buffer_address(local_sender_channel_worker_interface.local_wrptr.get_buffer_index())); + auto packet_header = 
reinterpret_cast(local_sender_channel.get_buffer_address(local_sender_channel_worker_interface.local_wrptr.get_buffer_index())); if constexpr (enable_packet_header_recording) { tt::fabric::validate(*packet_header); - packet_header_recorder.record_packet_header(packet_header); + packet_header_recorder.record_packet_header(reinterpret_cast(packet_header)); } send_next_data( local_sender_channel, @@ -780,9 +805,9 @@ FORCE_INLINE void run_receiver_channel_step( bool unwritten_packets = !wr_sent_ptr.is_caught_up_to(ack_ptr); if (unwritten_packets) { auto receiver_buffer_index = wr_sent_ptr.get_buffer_index(); - volatile auto packet_header = local_receiver_channel.get_packet_header(receiver_buffer_index); + volatile auto packet_header = local_receiver_channel.template get_packet_header(receiver_buffer_index); - tt::fabric::RoutingFields cached_routing_fields = const_cast(packet_header)->routing_fields; + ROUTING_FIELDS_TYPE cached_routing_fields = const_cast(packet_header)->routing_fields; bool can_send_to_all_local_chip_receivers = can_forward_packet_completely( cached_routing_fields, downstream_edm_interface); @@ -1054,14 +1079,14 @@ void kernel_main() { std::array sender_channel_packet_recorders{ PacketHeaderRecorder( - reinterpret_cast(sender_0_completed_packet_header_cb_address), + reinterpret_cast(sender_0_completed_packet_header_cb_address), sender_0_completed_packet_header_cb_size_headers), PacketHeaderRecorder( - reinterpret_cast(sender_1_completed_packet_header_cb_address), + reinterpret_cast(sender_1_completed_packet_header_cb_address), sender_1_completed_packet_header_cb_size_headers) }; PacketHeaderRecorder receiver_channel_packet_recorder( - reinterpret_cast(receiver_completed_packet_header_cb_address), + reinterpret_cast(receiver_completed_packet_header_cb_address), receiver_completed_packet_header_cb_size_headers); static_assert(SENDER_NUM_BUFFERS > 0, "compile time argument [1]: SENDER_NUM_BUFFERS must be > 0"); @@ -1178,14 +1203,14 @@ void kernel_main() 
{ auto local_receiver_channel = tt::fabric::EthChannelBuffer( local_receiver_channel_buffer_address, channel_buffer_size, - tt::fabric::header_size_bytes, + sizeof(PACKET_HEADER_TYPE), eth_transaction_ack_word_addr, // Assume for receiver channel, this address points to a chunk of memory that // can fit 2 eth_channel_syncs cfor ack receiver_channel_id); auto remote_receiver_channel = tt::fabric::EthChannelBuffer( remote_receiver_channel_buffer_address, channel_buffer_size, - tt::fabric::header_size_bytes, + sizeof(PACKET_HEADER_TYPE), eth_transaction_ack_word_addr, // Assume for receiver channel, this address points to a chunk of memory that // can fit 2 eth_channel_syncs cfor ack receiver_channel_id); @@ -1196,13 +1221,13 @@ void kernel_main() { new (&local_sender_channels[i]) tt::fabric::EthChannelBuffer( local_sender_buffer_addresses[i], channel_buffer_size, - tt::fabric::header_size_bytes, + sizeof(PACKET_HEADER_TYPE), 0, // For sender channels there is no eth_transaction_ack_word_addr because they don't send acks i); new (&remote_sender_channels[i]) tt::fabric::EthChannelBuffer( remote_sender_buffer_addresses[i], channel_buffer_size, - tt::fabric::header_size_bytes, + sizeof(PACKET_HEADER_TYPE), 0, // For sender channels there is no eth_transaction_ack_word_addr because they don't send acks i); diff --git a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp index 369c4f57f33..4bf3cad530e 100644 --- a/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp +++ b/ttnn/cpp/ttnn/operations/ccl/kernels/edm_fabric/fabric_erisc_datamover_channels.hpp @@ -64,12 +64,14 @@ class EthChannelBuffer final { return this->buffer_addresses[buffer_index]; } - [[nodiscard]] FORCE_INLINE volatile PacketHeader *get_packet_header(BufferIndex const& buffer_index) const { - return reinterpret_cast(this->buffer_addresses[buffer_index]); + 
template + [[nodiscard]] FORCE_INLINE volatile T *get_packet_header(BufferIndex const& buffer_index) const { + return reinterpret_cast(this->buffer_addresses[buffer_index]); } + template [[nodiscard]] FORCE_INLINE size_t get_payload_size(BufferIndex const& buffer_index) const { - return get_packet_header(buffer_index)->get_payload_size_including_header(); + return get_packet_header(buffer_index)->get_payload_size_including_header(); } [[nodiscard]] FORCE_INLINE size_t get_channel_buffer_max_size_in_bytes(BufferIndex const& buffer_index) const { return this->buffer_size_in_bytes; diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/interleaved_dim3_1_1_32_any_writer.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/interleaved_dim3_1_1_32_any_writer.cpp index a8dbeb8ade7..487df3be943 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/interleaved_dim3_1_1_32_any_writer.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/interleaved_dim3_1_1_32_any_writer.cpp @@ -94,10 +94,10 @@ void kernel_main() { DPRINT << "packet_header_buffer_seminc: " << (uint32_t)packet_header_buffer_seminc << "\n"; // pre-populate packet headers - volatile tt::fabric::PacketHeader* pkt_hdr_forward = - reinterpret_cast(packet_header_buffer_addr_forward); - volatile tt::fabric::PacketHeader* pkt_hdr_backward = - reinterpret_cast(packet_header_buffer_addr_backward); + volatile PACKET_HEADER_TYPE* pkt_hdr_forward = + reinterpret_cast(packet_header_buffer_addr_forward); + volatile PACKET_HEADER_TYPE* pkt_hdr_backward = + reinterpret_cast(packet_header_buffer_addr_backward); pkt_hdr_forward->to_chip_multicast( tt::fabric::MulticastRoutingCommandHeader{1, static_cast(num_targets_forward_direction)}); pkt_hdr_backward->to_chip_multicast( @@ -152,7 +152,7 @@ void kernel_main() { // 2. 
mcast output ready semaphore uint64_t out_ready_sem_noc_addr_in_pkt = safe_get_noc_addr(out_ready_sem_noc0_x, out_ready_sem_noc0_y, out_ready_sem_bank_addr, 0); - auto* pkt_hdr = reinterpret_cast(packet_header_buffer_seminc); + auto* pkt_hdr = reinterpret_cast(packet_header_buffer_seminc); pkt_hdr->to_noc_unicast_atomic_inc(tt::fabric::NocUnicastAtomicIncCommandHeader{ out_ready_sem_noc_addr_in_pkt, static_cast(1), // increment 1 @@ -163,7 +163,7 @@ void kernel_main() { pkt_hdr->to_chip_multicast( tt::fabric::MulticastRoutingCommandHeader{1, static_cast(num_targets_forward_direction)}); fabric_connection.get_forward_connection().send_payload_flush_blocking_from_address( - packet_header_buffer_seminc, sizeof(tt::fabric::PacketHeader)); + packet_header_buffer_seminc, sizeof(PACKET_HEADER_TYPE)); } // Write the mcast packet (backward) if (fabric_connection.has_backward_connection()) { @@ -171,7 +171,7 @@ void kernel_main() { tt::fabric::MulticastRoutingCommandHeader{1, static_cast(num_targets_backward_direction)}); fabric_connection.get_backward_connection().wait_for_empty_write_slot(); fabric_connection.get_backward_connection().send_payload_non_blocking_from_address( - packet_header_buffer_seminc, sizeof(tt::fabric::PacketHeader)); + packet_header_buffer_seminc, sizeof(PACKET_HEADER_TYPE)); } // increment locally uint64_t out_ready_sem_noc_addr = diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/llama_post_binary_matmul_shape_writer.cpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/llama_post_binary_matmul_shape_writer.cpp index b9f306cc42b..aad1e889c68 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/llama_post_binary_matmul_shape_writer.cpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/llama_post_binary_matmul_shape_writer.cpp @@ -103,10 +103,10 @@ void kernel_main() { DPRINT << "packet_header_buffer_seminc: " << 
(uint32_t)packet_header_buffer_seminc << "\n"; // pre-populate packet headers - volatile tt::fabric::PacketHeader* pkt_hdr_forward = - reinterpret_cast(packet_header_buffer_addr_forward); - volatile tt::fabric::PacketHeader* pkt_hdr_backward = - reinterpret_cast(packet_header_buffer_addr_backward); + volatile PACKET_HEADER_TYPE* pkt_hdr_forward = + reinterpret_cast(packet_header_buffer_addr_forward); + volatile PACKET_HEADER_TYPE* pkt_hdr_backward = + reinterpret_cast(packet_header_buffer_addr_backward); pkt_hdr_forward->to_chip_multicast( tt::fabric::MulticastRoutingCommandHeader{1, static_cast(num_targets_forward_direction)}); pkt_hdr_backward->to_chip_multicast( @@ -158,7 +158,7 @@ void kernel_main() { } // 2. mcast output ready semaphore - auto* pkt_hdr = reinterpret_cast(packet_header_buffer_seminc); + auto* pkt_hdr = reinterpret_cast(packet_header_buffer_seminc); uint64_t out_ready_sem_noc_addr_in_pkt = safe_get_noc_addr(out_ready_sem_noc0_x, out_ready_sem_noc0_y, out_ready_sem_bank_addr, 0); pkt_hdr->to_noc_unicast_atomic_inc(tt::fabric::NocUnicastAtomicIncCommandHeader{ @@ -171,7 +171,7 @@ void kernel_main() { pkt_hdr->to_chip_multicast( tt::fabric::MulticastRoutingCommandHeader{1, static_cast(num_targets_forward_direction)}); fabric_connection.get_forward_connection().send_payload_flush_blocking_from_address( - packet_header_buffer_seminc, sizeof(tt::fabric::PacketHeader)); + packet_header_buffer_seminc, sizeof(PACKET_HEADER_TYPE)); } // Write the mcast packet (backward) if (fabric_connection.has_backward_connection()) { @@ -179,7 +179,7 @@ void kernel_main() { tt::fabric::MulticastRoutingCommandHeader{1, static_cast(num_targets_backward_direction)}); fabric_connection.get_backward_connection().wait_for_empty_write_slot(); fabric_connection.get_backward_connection().send_payload_non_blocking_from_address( - packet_header_buffer_seminc, sizeof(tt::fabric::PacketHeader)); + packet_header_buffer_seminc, sizeof(PACKET_HEADER_TYPE)); } // increment locally 
uint64_t out_ready_sem_noc_addr = diff --git a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/minimal_ccl_common.hpp b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/minimal_ccl_common.hpp index 641e6cee244..55e2668d5d1 100644 --- a/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/minimal_ccl_common.hpp +++ b/ttnn/cpp/ttnn/operations/experimental/ccl/all_gather_async/device/kernels/minimal_ccl_common.hpp @@ -12,8 +12,8 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write( uint64_t noc0_dest_noc_addr, - volatile tt::fabric::PacketHeader* pkt_hdr_forward, - volatile tt::fabric::PacketHeader* pkt_hdr_backward, + volatile PACKET_HEADER_TYPE* pkt_hdr_forward, + volatile PACKET_HEADER_TYPE* pkt_hdr_backward, FabricConnectionManager& fabric_connection, size_t& l1_read_addr, uint32_t payload_size_bytes) { @@ -29,7 +29,7 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write( fabric_connection.get_forward_connection().send_payload_without_header_non_blocking_from_address( l1_read_addr, payload_size_bytes); fabric_connection.get_forward_connection().send_payload_flush_blocking_from_address( - (uint32_t)pkt_hdr_forward, sizeof(tt::fabric::PacketHeader)); + (uint32_t)pkt_hdr_forward, sizeof(PACKET_HEADER_TYPE)); } if (fabric_connection.has_backward_connection()) { @@ -37,7 +37,7 @@ FORCE_INLINE void write_and_advance_local_read_address_for_fabric_write( fabric_connection.get_backward_connection().send_payload_without_header_non_blocking_from_address( l1_read_addr, payload_size_bytes); fabric_connection.get_backward_connection().send_payload_flush_blocking_from_address( - (uint32_t)pkt_hdr_backward, sizeof(tt::fabric::PacketHeader)); + (uint32_t)pkt_hdr_backward, sizeof(PACKET_HEADER_TYPE)); } noc_async_writes_flushed();